Added support for the ArcticForCausalLM. #7020
```diff
@@ -138,6 +138,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    ARCTIC = auto()


 class MODEL_TENSOR(IntEnum):
```
```diff
@@ -180,6 +181,7 @@ class MODEL_TENSOR(IntEnum):
     SSM_A = auto()
     SSM_D = auto()
     SSM_OUT = auto()
+    FFN_NORM_EXP = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
```

**Reviewer:** Since the actual numbers associated with the enum values of `MODEL_TENSOR` don't matter, this could be placed next to the other `FFN_*` values. If this is changed, it should also be placed similarly in the corresponding llama.cpp code.

**Author:** I changed the order as requested, but in the `llama_layer` struct the order is different, so I didn't touch it. In `llm_load_tensors` I think it was already in the requested order.
```diff
@@ -215,6 +217,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.ARCTIC: "arctic",
 }


 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
```
```diff
@@ -257,6 +260,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
     MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
     MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
 }


 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
```
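For context, the `{bid}` placeholder in these templates is filled in per block when `TensorNameMap` is built (see the `__init__` hunk further down). A minimal standalone illustration, using the new template literally rather than importing `gguf`:

```python
# Mirrors TENSOR_NAMES[MODEL_TENSOR.FFN_NORM_EXP].format(bid=bid) in TensorNameMap.__init__
template = "blk.{bid}.ffn_norm_exps"

names = [template.format(bid=bid) for bid in range(3)]
print(names)  # ['blk.0.ffn_norm_exps', 'blk.1.ffn_norm_exps', 'blk.2.ffn_norm_exps']
```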
```diff
@@ -725,6 +729,27 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.ARCTIC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_NORM_EXP,
+    ],
     # TODO
 }
```
```diff
@@ -370,6 +370,64 @@ class TensorNameMap:
             "model.layers.{bid}.out_proj",
             "backbone.layers.{bid}.mixer.out_proj",
         ),
     }

+    # architecture-specific block mappings
+    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
+        MODEL_ARCH.ARCTIC: {
+            MODEL_TENSOR.TOKEN_EMBD: (
+                "model.embed_tokens",
+            ),
+            MODEL_TENSOR.OUTPUT_NORM: (
+                "model.norm",
+            ),
+            MODEL_TENSOR.OUTPUT: (
+                "lm_head",
+            ),
+            MODEL_TENSOR.ATTN_NORM: (
+                "model.layers.{bid}.input_layernorm",
+            ),
+            MODEL_TENSOR.ATTN_Q: (
+                "model.layers.{bid}.self_attn.q_proj",
+            ),
+            MODEL_TENSOR.ATTN_K: (
+                "model.layers.{bid}.self_attn.k_proj",
+            ),
+            MODEL_TENSOR.ATTN_V: (
+                "model.layers.{bid}.self_attn.v_proj",
+            ),
+            MODEL_TENSOR.ATTN_OUT: (
+                "model.layers.{bid}.self_attn.o_proj",
+            ),
+            MODEL_TENSOR.FFN_GATE_INP: (
+                "model.layers.{bid}.block_sparse_moe.gate",
+            ),
+            MODEL_TENSOR.FFN_NORM: (
+                "model.layers.{bid}.residual_layernorm",
+            ),
+            MODEL_TENSOR.FFN_GATE: (
+                "model.layers.{bid}.residual_mlp.w1",
+            ),
+            MODEL_TENSOR.FFN_DOWN: (
+                "model.layers.{bid}.residual_mlp.w2",
+            ),
+            MODEL_TENSOR.FFN_UP: (
+                "model.layers.{bid}.residual_mlp.w3",
+            ),
+            MODEL_TENSOR.FFN_GATE_EXP: (
+                "layers.{bid}.feed_forward.experts.w1",
+            ),
+            MODEL_TENSOR.FFN_DOWN_EXP: (
+                "layers.{bid}.feed_forward.experts.w2",
+            ),
+            MODEL_TENSOR.FFN_UP_EXP: (
+                "layers.{bid}.feed_forward.experts.w3",
+            ),
+            MODEL_TENSOR.FFN_NORM_EXP: (
+                "model.layers.{bid}.post_attention_layernorm",
+            ),
+        },
+    }
+
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
```
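To see how these mappings are consumed, here is a small usage sketch. It assumes the `gguf` package from this repo is importable with this PR's changes applied (otherwise `MODEL_ARCH.ARCTIC` does not exist yet); `get_name` and its `try_suffixes` parameter come from the existing `TensorNameMap` API:

```python
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import TensorNameMap

# Build the name map for Arctic with, e.g., 4 transformer blocks.
tmap = TensorNameMap(MODEL_ARCH.ARCTIC, 4)

# HF checkpoint tensor names resolve to GGUF names via the arch-specific table:
name = tmap.get_name("model.layers.0.residual_mlp.w1.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # "blk.0.ffn_gate.weight" — residual_mlp.w1 maps to FFN_GATE only for Arctic
```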
```diff
@@ -383,12 +441,16 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int):
             self.mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
                 self.mapping[key] = (tensor, tensor_name)
+        if arch in self.arch_block_mappings_cfg:
+            block_mappings = self.arch_block_mappings_cfg[arch]
+        else:
+            block_mappings = self.block_mappings_cfg
         for bid in range(n_blocks):
-            for tensor, keys in self.block_mappings_cfg.items():
+            for tensor, keys in block_mappings.items():
                 if tensor not in MODEL_TENSORS[arch]:
                     continue
                 # TODO: make this configurable
-                n_experts = 60
+                n_experts = 128
                 for xid in range(n_experts):
                     tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                     self.mapping[tensor_name] = (tensor, tensor_name)
```

**Reviewer:** This means architecture-specific block mappings can't partially override the common mappings (they have to totally re-define everything)? Maybe this is fixable by adding the common mappings first, so using the union operator for dicts would be appropriate here:

```python
if arch in self.arch_block_mappings_cfg:
    block_mappings = self.block_mappings_cfg | self.arch_block_mappings_cfg[arch]
```

But that's only supported since Python 3.9. After that, the architecture-specific mapping would only need to contain the entries that differ from the common mappings.

**Author:** So the idea is to keep only "conflicting" block mappings in the architecture-specific mappings and "non-conflicting" mappings in the general mappings? I think using `dict.update()` is a better idea then. The mappings for the ARCTIC arch would be shortened to just the overriding entries, while in the `TensorNameMap` init we would only have to add the merge step. What do you think?

**Reviewer:** Yes, exactly. I agree with using `dict.update()`.

**Author:** OK, done.
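A minimal sketch of the `dict.update()`-based merge the thread converges on. It is illustrative only: string keys stand in for the `MODEL_TENSOR`/`MODEL_ARCH` enums so the snippet runs without the `gguf` package, and the final code in the PR may differ.

```python
# Stand-ins for block_mappings_cfg (common) and arch_block_mappings_cfg (per-arch).
common = {
    "FFN_GATE": ("model.layers.{bid}.mlp.gate_proj",),
    "FFN_NORM": ("model.layers.{bid}.post_attention_layernorm",),
}
arch_specific = {
    "arctic": {
        # only the entries that differ from the common table need to be listed
        "FFN_NORM": ("model.layers.{bid}.residual_layernorm",),
    },
}

arch = "arctic"
block_mappings = dict(common)                   # shallow copy of the common table
if arch in arch_specific:
    block_mappings.update(arch_specific[arch])  # arch-specific entries win on conflict

print(block_mappings["FFN_NORM"])  # ('model.layers.{bid}.residual_layernorm',)
```

Unlike the `|` union operator, `dict.update()` works on Python versions older than 3.9, which is why it was preferred here.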
**Reviewer:** re: #6877 (comment), this should be:

The assertion exists because `LlamaHfVocab` was primarily written to convert HF "fast" tokenizers that ship a `tokenizer.json`. Since before it existed, "slow" sentencepiece tokenizers with a `tokenizer.model` have (almost?) always been converted using `SentencePieceProcessor`, which doesn't depend on HF transformers and directly preserves the token types and scores.

If you want to start converting slow tokenizers using `HfVocab` as well, I won't stop you, but in order to be consistent you'd have to remove all references to `SentencePieceProcessor` in the convert scripts and make HF transformers a hard requirement for converting models with a Llama vocab. Otherwise, we'd be making an exception for this model for no clear reason.

**Author:** My reason is that the official `tokenizer.model` file for snowflake-arctic-instruct contains wrong BOS and EOS tokens, as confirmed in https://huggingface.co/Snowflake/snowflake-arctic-instruct/discussions/12. That's why I used the llama_hf vocab, which reads the tokens from the JSON files instead. If there is a better solution for this, I'm fully open to suggestions.

**Author:** @cebtenzzre What if I implement `ArcticModel::set_vocab()` myself, like XverseForCausalLM did? Would that be acceptable?

**Author:** @cebtenzzre I now load the vocabulary with `SentencePieceProcessor`, as you suggested, and apply the necessary token modifications based on the `added_tokens_decoder` field from `tokenizer_config.json`.
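A rough sketch of that approach, assuming nothing beyond the thread above: it is not the exact code that landed in `convert-hf-to-gguf.py`, and the helper name, score handling, and out-of-range behavior are assumptions for illustration.

```python
import json
from pathlib import Path

from sentencepiece import SentencePieceProcessor  # slow tokenizer; preserves scores/types


def load_arctic_vocab(model_dir: str):
    """Load tokenizer.model, then patch tokens listed in added_tokens_decoder."""
    d = Path(model_dir)

    sp = SentencePieceProcessor()
    sp.LoadFromFile(str(d / "tokenizer.model"))

    # piece text and score for every id defined by the sentencepiece model
    tokens = [sp.IdToPiece(i) for i in range(sp.vocab_size())]
    scores = [sp.GetScore(i) for i in range(sp.vocab_size())]

    # tokenizer_config.json carries the corrected special tokens for this model
    config = json.loads((d / "tokenizer_config.json").read_text(encoding="utf-8"))
    for token_id, info in config.get("added_tokens_decoder", {}).items():
        tid = int(token_id)
        if tid < len(tokens):
            tokens[tid] = info["content"]  # e.g. corrected BOS/EOS text
            scores[tid] = 0.0              # assumption: neutral score for overridden pieces
        # ids beyond vocab_size() would have to be appended instead (omitted here)

    return tokens, scores
```

In the actual converter, the resulting token list, scores, and token types are then written into the GGUF file; the sketch only shows the override mechanism discussed in the thread.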