
Commit 50e0535

llama : add Mistral Nemo inference support (#8604)
1 parent: 6281544

2 files changed, 14 insertions(+), 5 deletions(-)

convert_hf_to_gguf.py (+10, -1)
@@ -239,6 +239,10 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
 
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
 
@@ -1481,7 +1485,12 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
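Why the explicit head_dim handling matters: Mistral Nemo's config specifies a head_dim (128) that differs from hidden_size // num_attention_heads (5120 // 32 = 160), so deriving the head size from the hidden size would write wrong key/value lengths and RoPE dimensions into the GGUF metadata. A minimal sketch of the fallback logic, using assumed Mistral Nemo 12B-style config values rather than a real config.json:

    # Sketch of the head_dim fallback; the hparams values below are assumptions
    # (Mistral Nemo 12B-style), not read from an actual config file.
    hparams = {
        "hidden_size": 5120,
        "num_attention_heads": 32,
        "head_dim": 128,
    }

    # old derivation: wrong whenever head_dim * num_attention_heads != hidden_size
    derived = hparams["hidden_size"] // hparams["num_attention_heads"]  # 160

    # new derivation: prefer the explicit head_dim, fall back to the old formula
    rope_dim = hparams.get("head_dim", derived)                         # 128

    print(f"derived={derived}, rope_dim={rope_dim}")

With these values the old formula would have recorded 160 while the checkpoint's attention tensors are laid out for 128-wide heads.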

src/llama.cpp (+4, -4)
@@ -6137,10 +6137,10 @@ static bool llm_load_tensors(
 
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
                     // optional bias tensors
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
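With head_dim decoupled from n_embd / n_head, the attention projections are no longer square, which is why the shapes above change from {n_embd, n_embd} and {n_embd, n_embd_gqa} to products of the per-head dimension and the head counts. A small sketch of that shape arithmetic, using assumed Mistral Nemo-style values (n_embd = 5120, n_head = 32, n_head_kv = 8, per-head dim 128):

    # Sketch of the attention tensor shapes; all values are assumptions
    # chosen to mirror a Mistral Nemo-style GQA configuration.
    n_embd        = 5120  # model (hidden) dimension
    n_head        = 32    # query heads
    n_head_kv     = 8     # key/value heads (grouped-query attention)
    n_embd_head_k = 128   # per-head key dim; note 128 != n_embd // n_head (160)
    n_embd_head_v = 128   # per-head value dim

    n_embd_k_gqa = n_embd_head_k * n_head_kv  # 1024
    n_embd_v_gqa = n_embd_head_v * n_head_kv  # 1024

    shapes = {
        "wq": (n_embd, n_embd_head_k * n_head),  # (5120, 4096)
        "wk": (n_embd, n_embd_k_gqa),            # (5120, 1024)
        "wv": (n_embd, n_embd_v_gqa),            # (5120, 1024)
        "wo": (n_embd_head_k * n_head, n_embd),  # (4096, 5120)
    }
    for name, shape in shapes.items():
        print(name, shape)

The old shapes only coincide with these when n_embd_head_k equals n_embd / n_head, which does not hold for Mistral Nemo.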
