Commit ab62128

Merge 'origin/master' into hipblas

2 parents d91456a + f5bfea0, commit ab62128

38 files changed: +4230 -2798 lines changed

CMakeLists.txt

+5-5
@@ -73,7 +73,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -266,8 +266,8 @@ if (LLAMA_CUBLAS)
     if (DEFINED LLAMA_CUDA_DMMV_Y)
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
     endif()
-    if (LLAMA_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_DMMV_F16)
+    if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
     endif()
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

@@ -281,8 +281,8 @@ if (LLAMA_CUBLAS)
     # 52 == lowest CUDA 12 standard
     # 60 == f16 CUDA intrinsics
     # 61 == integer CUDA intrinsics
-    # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
-    if (LLAMA_CUDA_DMMV_F16)
+    # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+    if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
         set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
     else()
         set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics

Makefile

+31-28
@@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#CXXFLAGS += -mssse3
 endif

+ifneq ($(filter aarch64%,$(UNAME_M)),)
+	# Apple M1, M2, etc.
+	# Raspberry Pi 3, 4, Zero 2 (64-bit)
+	CFLAGS += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+
+ifneq ($(filter armv6%,$(UNAME_M)),)
+	# Raspberry Pi 1, Zero
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter armv7%,$(UNAME_M)),)
+	# Raspberry Pi 2
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+
+ifneq ($(filter armv8%,$(UNAME_M)),)
+	# Raspberry Pi 3, 4, Zero 2 (32-bit)
+	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -243,7 +265,7 @@ ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@@ -293,28 +315,6 @@ ifdef LLAMA_METAL
 	OBJS += ggml-metal.o
 endif # LLAMA_METAL

-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# Apple M1, M2, etc.
-	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	CFLAGS += -mcpu=native
-	CXXFLAGS += -mcpu=native
-endif
-
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, Zero
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 2
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
@@ -363,6 +363,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-ut
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+console.o: examples/console.cpp examples/console.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -376,7 +379,7 @@ clean:
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
@@ -400,7 +403,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
@@ -434,13 +437,13 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

 tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)

README.md

+8-1
@@ -80,14 +80,15 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] LLaMA 2 🦙🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
-- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
+- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)

 **Bindings:**

@@ -522,13 +523,19 @@ Building the program with BLAS support may lead to some performance improvements
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
+# [Optional] for models using BPE tokenizers
+ls ./models
+65B 30B 13B 7B vocab.json

 # install Python dependencies
 python3 -m pip install -r requirements.txt

 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/

+# [Optional] for models using BPE tokenizers
+python convert.py models/7B/ --vocabtype bpe
+
 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
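
The README hunk above documents a BPE tokenizer layout (vocab.json instead of tokenizer.model) and the --vocabtype bpe flag for convert.py. As a rough illustration only, here is a hypothetical Python helper (not part of the commit or the repository) that builds the convert command from whichever tokenizer file is present:

from pathlib import Path

def convert_cmd(model_root: str, model_dir: str) -> list[str]:
    # vocab.json is the marker the README associates with BPE tokenizers;
    # tokenizer.model is the usual SentencePiece tokenizer file.
    root = Path(model_root)
    cmd = ["python3", "convert.py", model_dir]
    if (root / "vocab.json").exists() and not (root / "tokenizer.model").exists():
        cmd += ["--vocabtype", "bpe"]
    return cmd

print(convert_cmd("./models", "models/7B/"))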

build.zig

+77-58
@@ -1,68 +1,87 @@
+// Compatible with Zig Version 0.11.0
 const std = @import("std");
-const commit_hash = @embedFile(".git/refs/heads/master");
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;

-// Zig Version: 0.11.0-dev.3986+e05c242cd
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+const Maker = struct {
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    config_header: *ConfigHeader,
+
+    const cflags = .{"-std=c11"};
+    const cxxflags = .{"-std=c++11"};
+
+    fn init(builder: *std.build.Builder) Maker {
+        const commit_hash = @embedFile(".git/refs/heads/master");
+        const config_header = builder.addConfigHeader(
+            .{ .style = .blank, .include_path = "build-info.h" },
+            .{
+                .BUILD_NUMBER = 0,
+                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+            },
+        );
+        return Maker{
+            .builder = builder,
+            .target = builder.standardTargetOptions(.{}),
+            .optimize = builder.standardOptimizeOption(.{}),
+            .config_header = config_header,
+        };
+    }

-    const config_header = b.addConfigHeader(
-        .{ .style = .blank, .include_path = "build-info.h" },
-        .{
-            .BUILD_NUMBER = 0,
-            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
-        },
-    );
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (std.mem.endsWith(u8, src, ".c")) {
+            o.addCSourceFiles(&.{src}, &cflags);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.linkLibCpp();
+        }
+        o.addIncludePath(.{ .path = "." });
+        o.addIncludePath(.{ .path = "./examples" });
+        return o;
+    }
+
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addIncludePath(.{ .path = "." });
+        e.addIncludePath(.{ .path = "./examples" });
+        e.addCSourceFiles(&.{src}, &cxxflags);
+        for (deps) |d| e.addObject(d);
+        e.linkLibC();
+        e.linkLibCpp();
+        e.addConfigHeader(m.config_header);
+        m.builder.installArtifact(e);

-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
-    lib.linkLibC();
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("./examples");
-    lib.addConfigHeader(config_header);
-    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
-    b.installArtifact(lib);
+        // Currently a bug is preventing correct linking for optimized builds for Windows:
+        // https://github.com/ziglang/zig/issues/15958
+        if (e.target.isWindows()) {
+            e.want_lto = false;
+        }
+        return e;
+    }
+};

-    const examples = .{
-        "main",
-        "baby-llama",
-        "embedding",
-        "metal",
-        "perplexity",
-        "quantize",
-        "quantize-stats",
-        "save-load-state",
-        "server",
-        "simple",
-        "train-text-from-scratch",
-    };
+pub fn build(b: *std.build.Builder) void {
+    const make = Maker.init(b);

-    inline for (examples) |example_name| {
-        const exe = b.addExecutable(.{
-            .name = example_name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(".");
-        exe.addIncludePath("./examples");
-        exe.addConfigHeader(config_header);
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
-            "examples/common.cpp",
-        }, &.{"-std=c++11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
+    const ggml = make.obj("ggml", "ggml.c");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const llama = make.obj("llama", "llama.cpp");
+    const common = make.obj("common", "examples/common.cpp");
+    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");

-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });

-        const run_step = b.step("run-" ++ example_name, "Run the app");
-        run_step.dependOn(&run_cmd.step);
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    if (server.target.isWindows()) {
+        server.linkSystemLibrary("ws2_32");
     }
 }

convert.py

+7
@@ -465,6 +465,13 @@ def to_ggml(self) -> 'GGMLQuantizedTensor':
     def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
         return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)

+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

 GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
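
For orientation, both helpers added to convert.py take one third of a tensor along dimension 0, and permute_part additionally re-permutes that slice for the given number of heads. A minimal numpy sketch of the slicing, with an illustrative array shape that is not from the repository:

import numpy as np

def part(ndarray: np.ndarray, n_part: int) -> np.ndarray:
    # same slice as the new convert.py helpers: rows [n_part * r, n_part * r + r)
    r = ndarray.shape[0] // 3
    return ndarray[r * n_part : r * n_part + r, ...]

fused = np.arange(6 * 4).reshape(6, 4)        # hypothetical fused weight with 3 stacked parts
q, k, v = (part(fused, i) for i in range(3))
print(q.shape, k.shape, v.shape)              # (2, 4) (2, 4) (2, 4)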

examples/CMakeLists.txt

+2
@@ -13,6 +13,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    console.h
+    console.cpp
     grammar-parser.h
     grammar-parser.cpp
     )
