
Commit 2c3f8b8

llama : support BailingMoE (Ling) (#12634)
1 parent 4663bd3 commit 2c3f8b8

13 files changed: +404, -0 lines

Diff for: README.md (+1)

```diff
@@ -113,6 +113,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
+- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
 
 #### Multimodal
```
Diff for: convert_hf_to_gguf.py (+105)

```diff
@@ -711,6 +711,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
             # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
             res = "trillion"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"
 
         if res is None:
             logger.warning("\n")
@@ -5133,6 +5136,108 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)
 
 
+@Model.register("BailingMoeForCausalLM")
+class BailingMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim", n_embd // n_head)
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("ChameleonForConditionalGeneration")
 @Model.register("ChameleonForCausalLM") # obsolete
 class ChameleonModel(Model):
```
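The subtle step above is `permute`, which reorders the rows of the Q and K projections between the two common RoPE weight layouts (pairwise-interleaved vs. half-split) after the fused `query_key_value` tensor is split. A minimal standalone sketch of that split and reorder — not part of the commit, with toy dimensions as assumptions:

```python
# Standalone sketch (not part of the commit) of the fused-QKV handling above.
# Toy dimensions are assumptions; only torch is required.
import torch

n_head, n_kv_head, head_dim, n_embd = 8, 2, 16, 128

# Fused projection as stored in the HF checkpoint: rows are [Q | K | V].
qkv = torch.randn((n_head + 2 * n_kv_head) * head_dim, n_embd)
q, k, v = qkv.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None) -> torch.Tensor:
    # Same row reordering as BailingMoeModel.permute: regroup each head's
    # rows between the two RoPE layouts without changing the tensor shape.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

print(permute(q, n_head, n_head).shape)     # torch.Size([128, 128])
print(permute(k, n_head, n_kv_head).shape)  # torch.Size([32, 128])
```

The shapes are unchanged; only the row order within each head differs, which is also why the value projection `v` needs no permute.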

Diff for: convert_hf_to_gguf_update.py (+1)

```diff
@@ -112,6 +112,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
     {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
     {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
+    {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
 ]
 
```
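The checksum registered for this entry is what `get_vocab_base_pre` matches in the first hunk of convert_hf_to_gguf.py above. A rough sketch of how such a hash is derived — an assumption paraphrasing the update script, with the long fixed probe string elided:

```python
# Assumed paraphrase of the checksum computed by convert_hf_to_gguf_update.py;
# `chktxt` stands in for the long fixed probe string defined in the scripts.
from hashlib import sha256

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ling-lite")
chktxt = "..."  # placeholder: the real probe string is defined in the scripts

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
# With the real probe string, this should yield
# "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224".
print(chkhsh)
```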

Diff for: gguf-py/gguf/constants.py (+24)

```diff
@@ -287,6 +287,7 @@ class MODEL_ARCH(IntEnum):
     CHAMELEON = auto()
     WAVTOKENIZER_DEC = auto()
     PLM = auto()
+    BAILINGMOE = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -490,6 +491,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.CHAMELEON: "chameleon",
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
+    MODEL_ARCH.BAILINGMOE: "bailingmoe",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1667,6 +1669,25 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.POSNET_ATTN_V,
         MODEL_TENSOR.POSNET_ATTN_OUT,
     ],
+    MODEL_ARCH.BAILINGMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     # TODO
 }
 
@@ -1719,6 +1740,9 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.BAILINGMOE: [
+        MODEL_TENSOR.ROPE_FREQS,
+    ],
 }
 
 #
```
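A quick check of the new registrations (a sketch, assuming the gguf-py package from this repo is importable and that the enclosing dicts are `MODEL_ARCH_NAMES` and `MODEL_TENSORS` as elsewhere in constants.py):

```python
# Usage sketch (assumption: gguf-py from this repo is on the path).
import gguf

print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.BAILINGMOE])    # bailingmoe
print(len(gguf.MODEL_TENSORS[gguf.MODEL_ARCH.BAILINGMOE]))  # 17 tensor kinds
```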

Diff for: gguf-py/gguf/tensor_mapping.py (+1)

```diff
@@ -29,6 +29,7 @@ class TensorNameMap:
             "shared", # t5
             "rwkv.embeddings", # rwkv6
             "model.embeddings", # rwkv7
+            "model.word_embeddings", # bailingmoe
         ),
 
         # Token type embeddings
```
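How the new source name resolves, as a usage sketch (assumption: gguf-py from this repo is importable; the hypothetical check is not part of the commit):

```python
# Usage sketch (assumption: gguf-py from this repo is on the path).
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.BAILINGMOE, n_blocks=1)
# The HF name added above now resolves to the GGUF token-embedding tensor.
print(tmap.get_name("model.word_embeddings.weight", try_suffixes=(".weight",)))
# -> token_embd.weight
```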

Diff for: include/llama.h (+1)

```diff
@@ -109,6 +109,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
         LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
         LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
     };
 
     enum llama_rope_type {
```

Diff for: src/llama-arch.cpp (+24)

```diff
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CHAMELEON, "chameleon" },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM, "plm" },
+    { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1409,6 +1410,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
```

Diff for: src/llama-arch.h (+1)

```diff
@@ -70,6 +70,7 @@ enum llm_arch {
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_PLM,
+   LLM_ARCH_BAILINGMOE,
    LLM_ARCH_UNKNOWN,
 };
```

Diff for: src/llama-chat.cpp (+20)

```diff
@@ -60,6 +60,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
     { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
     { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
+    { "bailing", LLM_CHAT_TEMPLATE_BAILING },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -171,6 +172,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_MEGREZ;
     } else if (tmpl_contains(" Ассистент:")) {
         return LLM_CHAT_TEMPLATE_YANDEX;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+        return LLM_CHAT_TEMPLATE_BAILING;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -588,6 +591,23 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << " Ассистент:[SEP]";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+        // Bailing (Ling) template
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content;
+        }
+
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+        }
     } else {
         // template not supported
         return -1;
```
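To make the rendered format concrete, here is a Python paraphrase of the new branch above (a sketch for illustration, not code from the commit):

```python
# Python paraphrase of the LLM_CHAT_TEMPLATE_BAILING branch above.
def bailing_template(chat: list[dict[str, str]], add_ass: bool = True) -> str:
    out = []
    for message in chat:
        # "user" becomes HUMAN; any other role is simply upper-cased.
        role = "HUMAN" if message["role"] == "user" else message["role"].upper()
        out.append(f"<role>{role}</role>{message['content']}")
    if add_ass:
        out.append("<role>ASSISTANT</role>")  # prompt the model to answer
    return "".join(out)

print(bailing_template([
    {"role": "system", "content": "You are Ling."},
    {"role": "user", "content": "Hi!"},
]))
# <role>SYSTEM</role>You are Ling.<role>HUMAN</role>Hi!<role>ASSISTANT</role>
```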

Diff for: src/llama-chat.h (+1)

```diff
@@ -39,6 +39,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GIGACHAT,
     LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_YANDEX,
+    LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
```
4445
