Commit d1259b7

llama : do not quantize expert gating tensors
Parent: 6cfb31f

File tree: 1 file changed (+3, -0 lines)


Diff for: llama.cpp

@@ -8443,6 +8443,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;

+        // do not quantize expert gating tensors
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
