
Commit 85f902d

Merge 'origin/master' into hipblas
2 parents 4362e80 + b50b570

15 files changed: 500 additions & 258 deletions

CMakeLists.txt

Lines changed: 7 additions & 2 deletions

@@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_K_QUANTS "llama: use k-quants" ON)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -227,6 +228,11 @@ if (LLAMA_METAL)
 )
 endif()

+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -428,11 +434,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ggml-quants-k.h
-            ggml-quants-k.c
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_EXTRA}
             )

 target_include_directories(ggml PUBLIC .)
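
With LLAMA_K_QUANTS defaulting to ON, k_quants.c is now compiled into the ggml object library via GGML_SOURCES_EXTRA unless the option is switched off at configure time. A minimal sketch of the opt-out, assuming a standard out-of-tree CMake build (the build directory name is illustrative):

    # configure with k-quants disabled, then build
    cmake -B build -DLLAMA_K_QUANTS=OFF
    cmake --build build --config Release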

Makefile

Lines changed: 22 additions & 15 deletions

@@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 endif
 endif

+ifndef LLAMA_NO_K_QUANTS
+CFLAGS += -DGGML_USE_K_QUANTS
+OBJS += k_quants.o
+endif
+
 ifndef LLAMA_NO_ACCELERATE
     # Mac M1 - include Accelerate framework.
     # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
@@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS
 endif # LLAMA_OPENBLAS

 ifdef LLAMA_BLIS
-    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
     LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

@@ -230,6 +235,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
     CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

+ifndef LLAMA_NO_K_QUANTS
+k_quants.o: k_quants.c k_quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_NO_K_QUANTS
+
 #
 # Print build information
 #
@@ -249,10 +259,7 @@ $(info )
 # Build library
 #

-ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
-	$(CC) $(CFLAGS) -c $< -o $@
-
-ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
+ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@

 llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
@@ -261,7 +268,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
+libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
@@ -271,28 +278,28 @@ clean:
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

 build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -307,11 +314,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@

-vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

 .PHONY: tests clean
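
The Makefile takes the opposite default: k-quants are compiled in (k_quants.o plus -DGGML_USE_K_QUANTS) unless LLAMA_NO_K_QUANTS is defined. A hedged example of both invocations from the repository root:

    make                                      # default build, includes k-quants
    make clean && make LLAMA_NO_K_QUANTS=1    # opt out of k-quants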

README.md

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
 - GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
 - High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
 - Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607

examples/common.cpp

Lines changed: 3 additions & 0 deletions

@@ -132,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.path_prompt_cache = argv[i];
         } else if (arg == "--prompt-cache-all") {
             params.prompt_cache_all = true;
+        } else if (arg == "--prompt-cache-ro") {
+            params.prompt_cache_ro = true;
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -432,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
     fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
     fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");

examples/common.h

Lines changed: 1 addition & 0 deletions

@@ -62,6 +62,7 @@ struct gpt_params {
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
+    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

     bool embedding         = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately

examples/main/main.cpp

Lines changed: 2 additions & 2 deletions

@@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
     const bool penalize_nl = params.penalize_nl;

     // optionally save the session on first sample (for faster prompt loading next time)
-    if (!path_session.empty() && need_to_save_session) {
+    if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
         need_to_save_session = false;
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
@@ -630,7 +630,7 @@ int main(int argc, char ** argv) {
         }
     }

-    if (!path_session.empty() && params.prompt_cache_all) {
+    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
         fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
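
Combined with the parsing and usage changes in examples/common.cpp and examples/common.h above, --prompt-cache-ro loads an existing session file but skips both save points, so the cache on disk is never modified. A hypothetical invocation (model and cache file names are placeholders):

    # reuse a previously written prompt cache without updating it
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --prompt-cache-ro -p "Once upon a time"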

flake.lock

Lines changed: 24 additions & 6 deletions
Some generated files are not rendered by default.

flake.nix

Lines changed: 18 additions & 8 deletions

@@ -6,6 +6,13 @@
   outputs = { self, nixpkgs, flake-utils }:
     flake-utils.lib.eachDefaultSystem (system:
       let
+        inherit (pkgs.stdenv) isAarch64 isDarwin;
+        inherit (pkgs.lib) optionals;
+        isM1 = isAarch64 && isDarwin;
+        osSpecific =
+          if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
+          else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
+          else [ ];
         pkgs = import nixpkgs {
           inherit system;
         };
@@ -18,17 +25,22 @@
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
+          postPatch =
+            if isM1 then ''
+              substituteInPlace ./ggml-metal.m \
+                --replace '[[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
+            '' else "";
           nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
+          buildInputs = osSpecific;
+          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
+            "-DLLAMA_METAL=ON"
+          ]);
           installPhase = ''
             mkdir -p $out/bin
             mv bin/* $out/bin/
             mv $out/bin/main $out/bin/llama
+            mv $out/bin/server $out/bin/llama-server

             echo "#!${llama-python}/bin/python" > $out/bin/convert.py
             cat ${./convert.py} >> $out/bin/convert.py
@@ -40,9 +52,7 @@
           packages = with pkgs; [
             cmake
             llama-python
-          ] ++ lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
+          ] ++ osSpecific;
         };
       }
     );
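
Because the flake now configures with -DLLAMA_BUILD_SERVER=ON and renames the binaries during installPhase, a flake build should expose both llama and llama-server under result/bin. A sketch, assuming Nix with flakes enabled:

    nix build          # builds packages.default into ./result
    ls ./result/bin    # expect llama and llama-server among the installed binaries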
