
Commit 85f902d

Merge 'origin/master' into hipblas
2 parents 4362e80 + b50b570

15 files changed: 500 additions & 258 deletions

CMakeLists.txt

Lines changed: 7 additions & 2 deletions

@@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_K_QUANTS "llama: use k-quants" ON)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -227,6 +228,11 @@ if (LLAMA_METAL)
 )
 endif()

+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -428,11 +434,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ggml-quants-k.h
-            ggml-quants-k.c
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_EXTRA}
             )

 target_include_directories(ggml PUBLIC .)
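
With LLAMA_K_QUANTS defaulting to ON, k_quants.c is now compiled into the ggml object library via GGML_SOURCES_EXTRA unless the option is switched off at configure time. A minimal sketch of the opt-out, assuming a standard out-of-tree CMake build (the build directory name is illustrative):

    # configure with k-quants disabled, then build
    cmake -B build -DLLAMA_K_QUANTS=OFF
    cmake --build build --config Release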

Makefile

Lines changed: 22 additions & 15 deletions

@@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 endif
 endif

+ifndef LLAMA_NO_K_QUANTS
+CFLAGS += -DGGML_USE_K_QUANTS
+OBJS += k_quants.o
+endif
+
 ifndef LLAMA_NO_ACCELERATE
     # Mac M1 - include Accelerate framework.
     # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
@@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS
 endif # LLAMA_OPENBLAS

 ifdef LLAMA_BLIS
-    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
     LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

@@ -230,6 +235,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
     CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

+ifndef LLAMA_NO_K_QUANTS
+k_quants.o: k_quants.c k_quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_NO_K_QUANTS
+
 #
 # Print build information
 #
@@ -249,10 +259,7 @@ $(info )
 # Build library
 #

-ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
-	$(CC) $(CFLAGS) -c $< -o $@
-
-ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
+ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@

 llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
@@ -261,7 +268,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
+libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
@@ -271,28 +278,28 @@ clean:
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

 build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -307,11 +314,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@

-vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

 .PHONY: tests clean
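
The Makefile takes the opposite default: k-quants are compiled in (k_quants.o plus -DGGML_USE_K_QUANTS) unless LLAMA_NO_K_QUANTS is defined. A hedged example of both invocations from the repository root:

    make                                      # default build, includes k-quants
    make clean && make LLAMA_NO_K_QUANTS=1    # opt out of k-quants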

README.md

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
 - GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
 - High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
 - Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607

examples/common.cpp

Lines changed: 3 additions & 0 deletions

@@ -132,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.path_prompt_cache = argv[i];
         } else if (arg == "--prompt-cache-all") {
             params.prompt_cache_all = true;
+        } else if (arg == "--prompt-cache-ro") {
+            params.prompt_cache_ro = true;
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -432,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
     fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
     fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");

examples/common.h

Lines changed: 1 addition & 0 deletions

@@ -62,6 +62,7 @@ struct gpt_params {
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
+    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

     bool embedding         = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately

examples/main/main.cpp

Lines changed: 2 additions & 2 deletions

@@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
     const bool penalize_nl = params.penalize_nl;

     // optionally save the session on first sample (for faster prompt loading next time)
-    if (!path_session.empty() && need_to_save_session) {
+    if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
         need_to_save_session = false;
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
@@ -630,7 +630,7 @@ int main(int argc, char ** argv) {
         }
     }

-    if (!path_session.empty() && params.prompt_cache_all) {
+    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
         fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
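
Combined with the parsing and usage changes in examples/common.cpp and examples/common.h above, --prompt-cache-ro loads an existing session file but skips both save points, so the cache on disk is never modified. A hypothetical invocation (model and cache file names are placeholders):

    # reuse a previously written prompt cache without updating it
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --prompt-cache-ro -p "Once upon a time"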

flake.lock

Lines changed: 24 additions & 6 deletions
Some generated files are not rendered by default.

flake.nix

Lines changed: 18 additions & 8 deletions

@@ -6,6 +6,13 @@
   outputs = { self, nixpkgs, flake-utils }:
     flake-utils.lib.eachDefaultSystem (system:
       let
+        inherit (pkgs.stdenv) isAarch64 isDarwin;
+        inherit (pkgs.lib) optionals;
+        isM1 = isAarch64 && isDarwin;
+        osSpecific =
+          if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
+          else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
+          else [ ];
         pkgs = import nixpkgs {
           inherit system;
         };
@@ -18,17 +25,22 @@
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
+          postPatch =
+            if isM1 then ''
+              substituteInPlace ./ggml-metal.m \
+                --replace '[[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
+            '' else "";
           nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
+          buildInputs = osSpecific;
+          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
+            "-DLLAMA_METAL=ON"
+          ]);
           installPhase = ''
             mkdir -p $out/bin
             mv bin/* $out/bin/
             mv $out/bin/main $out/bin/llama
+            mv $out/bin/server $out/bin/llama-server

             echo "#!${llama-python}/bin/python" > $out/bin/convert.py
             cat ${./convert.py} >> $out/bin/convert.py
@@ -40,9 +52,7 @@
           packages = with pkgs; [
             cmake
             llama-python
-          ] ++ lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
+          ] ++ osSpecific;
         };
       }
     );
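
Because the flake now configures with -DLLAMA_BUILD_SERVER=ON and renames the binaries during installPhase, a flake build should expose both llama and llama-server under result/bin. A sketch, assuming Nix with flakes enabled:

    nix build          # builds packages.default into ./result
    ls ./result/bin    # expect llama and llama-server among the installed binaries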
