@@ -8480,28 +8480,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        int i_layer, n_layer;
+        if (n_expert == 1) {
+            i_layer = qs.i_feed_forward_w2;
+            n_layer = qs.n_feed_forward_w2;
+        } else {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer = qs.n_feed_forward_w2 / n_expert;
+            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+            }
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
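
For context on the name-parsing step above: in MoE models each layer contributes n_expert separate ffn_down tensors, so the running counter qs.i_feed_forward_w2 no longer maps directly to a layer index, and the patch instead recovers the layer from the tensor name. Below is a minimal standalone sketch (not part of llama.cpp; the helper name parse_ffn_down_layer and the example tensor names are hypothetical) showing the same sscanf-based recovery in isolation.

// Standalone illustration of the layer-index recovery used in the patch.
// The helper name and example tensor names are hypothetical.
#include <cstdio>
#include <stdexcept>
#include <string>

static int parse_ffn_down_layer(const std::string & name, int n_layer) {
    int i_layer = -1;
    // Works for both dense ("blk.17.ffn_down.weight") and per-expert ("blk.17.ffn_down.3.weight")
    // tensor names, since only the "blk.%d.ffn_down" prefix needs to match.
    if (std::sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        throw std::runtime_error("Failed to determine layer for tensor " + name);
    }
    if (i_layer < 0 || i_layer >= n_layer) {
        throw std::runtime_error("Bad layer " + std::to_string(i_layer) + " for tensor " + name);
    }
    return i_layer;
}

int main() {
    const int n_layer = 32; // Mixtral-8x7B has 32 transformer layers
    std::printf("%d\n", parse_ffn_down_layer("blk.17.ffn_down.3.weight", n_layer)); // prints 17
    std::printf("%d\n", parse_ffn_down_layer("blk.0.ffn_down.weight",    n_layer)); // prints 0
    return 0;
}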