
Commit 1eb130a: Update llama.cpp

Parent: ba3959e

File tree: 2 files changed (+7, -4 lines)

Diff for: llama_cpp/llama_cpp.py (+6, -3)
@@ -117,6 +117,8 @@ class llama_context_params(Structure):
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
+LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5)  # except 1d tensors
+LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6)  # except 1d tensors

 # Functions

@@ -169,11 +171,12 @@ def llama_free(ctx: llama_context_p):

 # TODO: not great API - very likely to change
 # Returns 0 on success
-def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int:
-    return _lib.llama_model_quantize(fname_inp, fname_out, itype)
+# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
+def llama_model_quantize(fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int) -> c_int:
+    return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)


-_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
+_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
 _lib.llama_model_quantize.restype = c_int


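For reference, a minimal usage sketch of the updated binding (not part of the commit). It assumes the low-level bindings import as llama_cpp.llama_cpp, and the model paths are placeholders; passing nthread <= 0 defers the thread count to std::thread::hardware_concurrency(), per the new comment in the diff above.

    # Hedged sketch: module path and file names are assumptions for illustration.
    from ctypes import c_int
    from llama_cpp import llama_cpp

    fname_inp = b"./models/7B/ggml-model-f16.bin"   # placeholder input model
    fname_out = b"./models/7B/ggml-model-q4_2.bin"  # placeholder output model

    ret = llama_cpp.llama_model_quantize(
        fname_inp,
        fname_out,
        llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_2,  # new ftype constant added in this commit
        c_int(0),                           # nthread <= 0 -> use hardware_concurrency()
    )
    assert ret == 0  # llama_model_quantize returns 0 on success
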
Diff for: vendor/llama.cpp (+1, -1: submodule commit reference updated)
