File tree 3 files changed +8
-4
lines changed
3 files changed +8
-4
lines changed Original file line number Diff line number Diff line change @@ -1035,7 +1035,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
1035
1035
return GGML_TYPE_Q5_1;
1036
1036
}
1037
1037
1038
- throw std::runtime_error (" Invalid cache type: " + s);
1038
+ throw std::runtime_error (" Unsupported cache type: " + s);
1039
1039
}
1040
1040
1041
1041
struct llama_context_params common_context_params_to_llama (const common_params & params) {
@@ -1047,7 +1047,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
1047
1047
cparams.n_ubatch = params.n_ubatch ;
1048
1048
cparams.n_threads = params.cpuparams .n_threads ;
1049
1049
cparams.n_threads_batch = params.cpuparams_batch .n_threads == -1 ?
1050
- params.cpuparams .n_threads : params.cpuparams_batch .n_threads ;
1050
+ params.cpuparams .n_threads : params.cpuparams_batch .n_threads ;
1051
1051
cparams.logits_all = params.logits_all ;
1052
1052
cparams.embeddings = params.embedding ;
1053
1053
cparams.rope_scaling_type = params.rope_scaling_type ;
Original file line number Diff line number Diff line change @@ -324,8 +324,9 @@ struct ggml_logger_state {
324
324
static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
325
325
326
326
static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
327
- if (format == NULL)
327
+ if (format == NULL) {
328
328
return;
329
+ }
329
330
va_list args_copy;
330
331
va_copy(args_copy, args);
331
332
char buffer[128];
@@ -15723,6 +15724,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15723
15724
ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot;
15724
15725
ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
15725
15726
15727
+ GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
15728
+ GGML_ASSERT(v_to_float && "fattn: unsupported V-type");
15729
+
15726
15730
// loop over n_batch and n_head
15727
15731
for (int ir = ir0; ir < ir1; ++ir) {
15728
15732
// q indices
Original file line number Diff line number Diff line change @@ -19243,7 +19243,7 @@ struct llama_context * llama_new_context_with_model(
19243
19243
params.flash_attn = false;
19244
19244
}
19245
19245
19246
- if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
19246
+ if (ggml_is_quantized( params.type_v) && !params.flash_attn) {
19247
19247
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
19248
19248
return nullptr;
19249
19249
}
You can’t perform that action at this time.
0 commit comments