Commit cf42919

Kawrakow (ikawrakow) authored and committed
Fix ffn_down quantization mix for MoE models (ggml-org#4927)
* Fix ffn_down quantization mix for MoE models

  In ggml-org#4872 I did not consider the part where every third tensor is quantized with more bits. For MoE models this leads to tensors of the same layer being quantized with different numbers of bits, which the inference implementation does not account for (it assumes all experts use the same quantization).

* Fix the fix

* Review suggestion

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
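For context, here is a minimal sketch (not part of the commit) of the failure mode described above: when the "use more bits" decision runs over a per-tensor counter while an MoE model has n_expert ffn_down tensors per layer, the experts of one layer can land on different sides of a boundary and receive different quantization types. The use_more_bits() rule below (first/last eighth plus every third in between) is my reading of the llama.cpp k-quant mix, and the consecutive expert ordering is purely hypothetical (the diff notes the real layout is not consecutive); treat both as illustrative assumptions.

```cpp
// Minimal, self-contained sketch of the per-tensor vs per-layer counting problem.
// Assumption: use_more_bits() follows the "first/last eighth + every third"
// pattern of the k-quant mixes; the exact formula is illustrative.
#include <cstdio>

static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

int main() {
    const int n_layer  = 32, n_expert = 8;       // Mixtral-8x7B-like shape
    const int n_tensors = n_layer * n_expert;    // one ffn_down tensor per expert per layer

    // Before the fix: the decision is taken per tensor, so the 8 experts of a
    // layer can straddle a boundary and end up with different quantization types.
    // After the fix: the decision is taken per layer, so they always agree.
    for (int layer = 0; layer < n_layer; ++layer) {
        int more = 0;
        for (int expert = 0; expert < n_expert; ++expert) {
            const int i_tensor = layer*n_expert + expert;   // hypothetical consecutive order
            more += use_more_bits(i_tensor, n_tensors);
        }
        if (more != 0 && more != n_expert) {
            printf("layer %2d: %d of %d experts get more bits -> mixed quantization\n",
                   layer, more, n_expert);
        }
    }
    return 0;
}
```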
1 parent 3bab66a commit cf42919

File tree (1 file changed, +26 −8 lines)

llama.cpp (+26 −8)
```diff
@@ -8480,28 +8480,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        int i_layer, n_layer;
+        if (n_expert == 1) {
+            i_layer = qs.i_feed_forward_w2;
+            n_layer = qs.n_feed_forward_w2;
+        } else {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer = qs.n_feed_forward_w2 / n_expert;
+            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+            }
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
```
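As a usage note for the name-based layer lookup introduced above, here is a standalone sketch that mirrors the sscanf call from the diff. The tensor name "blk.17.ffn_down.3.weight" and the helper name layer_from_tensor_name are illustrative assumptions, and the error handling is simplified relative to the real format()-based messages.

```cpp
// Standalone sketch of the layer extraction used in the diff above.
// The tensor name is an assumed GGUF-style name ("blk.<layer>.ffn_down...").
#include <cstdio>
#include <stdexcept>
#include <string>

static int layer_from_tensor_name(const std::string & name, int n_layer) {
    int i_layer = -1;
    if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        throw std::runtime_error("Failed to determine layer for tensor " + name);
    }
    if (i_layer < 0 || i_layer >= n_layer) {
        throw std::runtime_error("Bad layer for tensor " + name);
    }
    return i_layer;
}

int main() {
    // Hypothetical expert tensor of layer 17; all experts of that layer now map
    // to the same i_layer, so they receive the same quantization type.
    printf("i_layer = %d\n", layer_from_tensor_name("blk.17.ffn_down.3.weight", 32));
    return 0;
}
```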
