@@ -54,6 +54,8 @@ enum e_model {
     MODEL_65B,
 };
 
+static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "gptq" };
+
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -100,7 +102,7 @@ struct llama_hparams {
     int32_t n_head  = 32;
     int32_t n_layer = 32;
     int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    int32_t f16     = LLAMA_FTYPE_MOSTLY_F16;
 };
 
 struct llama_layer {
@@ -435,7 +437,7 @@ static bool llama_model_load(
     }
 
     // temp warning to tell the user to use "--n_parts"
-    if (hparams.f16 == 4 && n_parts != 1) {
+    if (hparams.f16 == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
         fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
         fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
     }
@@ -508,11 +510,14 @@ static bool llama_model_load(
     // wtype is for per-layer weights, while vtype is for other weights
     ggml_type wtype, vtype;
     switch (model.hparams.f16) {
-        case 0: wtype = vtype = GGML_TYPE_F32;  break;
-        case 1: wtype = vtype = GGML_TYPE_F16;  break;
-        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32:     wtype = vtype = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_F16:  wtype = vtype = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
+            wtype = GGML_TYPE_Q4_1;
+            vtype = GGML_TYPE_F16;
+            break;
         default:
                 {
                     fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
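
Note: the LLAMA_FTYPE_* constants used throughout this diff are declared in llama.h, which is not shown here. A minimal sketch of what that declaration presumably looks like, with numeric values inferred from the old f16 cases replaced above; names, values, and comments in this sketch are assumptions, not part of the diff:

    // Sketch only: llama_ftype as implied by the old numeric f16 values (0..4) above.
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32           = 0,
        LLAMA_FTYPE_MOSTLY_F16        = 1,
        LLAMA_FTYPE_MOSTLY_Q4_0       = 2,
        LLAMA_FTYPE_MOSTLY_Q4_1       = 3,
        LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // per-layer weights q4_1, other weights f16 (GPTQ-style), per the case above
    };
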
@@ -684,16 +689,15 @@ static bool llama_model_load(
                 return false;
             }
             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                 fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }
 
             switch (ftype) {
-                case 0:  // f32
-                case 1:  // f16
+                case LLAMA_FTYPE_ALL_F32:
+                case LLAMA_FTYPE_MOSTLY_F16:
                     break;
-                case 2:  // q4_0
-                case 3:  // q4_1
+                case LLAMA_FTYPE_MOSTLY_Q4_0:
+                case LLAMA_FTYPE_MOSTLY_Q4_1:
                     assert(ne[0] % 64 == 0);
                     break;
                 default:
@@ -1273,20 +1277,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
 //
 
 // TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype itype) {
+    ggml_type type;
 
     switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: type = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
-        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
-        return false;
-    }
-
     llama_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1438,7 +1437,6 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         {
-            static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
             printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
@@ -1459,12 +1457,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (n_dims == 2);
 
         if (quantize) {
-            if (ftype != 0 && ftype != 1) {
+            if (ftype != LLAMA_FTYPE_ALL_F32 && ftype != LLAMA_FTYPE_MOSTLY_F16) {
                 fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
                 return false;
             }
 
-            if (ftype == 1) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_F16) {
                 data_f16.resize(nelements);
                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                 data_f32.resize(nelements);
@@ -1478,7 +1476,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
 
             ftype = itype;
         } else {
-            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+            const int bpe = (ftype == LLAMA_FTYPE_ALL_F32) ? sizeof(float) : sizeof(uint16_t);
 
             data_u8.resize(nelements*bpe);
             finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
@@ -1659,7 +1657,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        int itype) {
+        enum llama_ftype itype) {
     if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
         fprintf(stderr, "%s: failed to quantize\n", __func__);
         return 1;
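
With the public entry point now taking the enum, a caller would presumably look like the sketch below. The model paths are hypothetical placeholders, and the 0/1 return convention is inferred from the failure path shown above:

    // Sketch only: quantizing an f16 model to q4_0 through the updated API.
    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        // Hypothetical input/output paths; not files referenced by this PR.
        const char * fname_inp = "models/7B/ggml-model-f16.bin";
        const char * fname_out = "models/7B/ggml-model-q4_0.bin";

        // LLAMA_FTYPE_MOSTLY_Q4_0 replaces the old magic number 2.
        if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }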