
Commit 018fc8d

llama : support upcoming Qwen2 (ggml-org#5037)

simonJJJ authored and hodlen committed
1 parent f4bd5c0 · commit 018fc8d

File tree: 3 files changed, +211 −0 lines changed

  convert-hf-to-gguf.py
  gguf-py/gguf/constants.py
  llama.cpp

convert-hf-to-gguf.py — 4 additions, 0 deletions
@@ -189,6 +189,8 @@ def from_model_architecture(model_architecture):
             return StableLMModel
         if model_architecture == "QWenLMHeadModel":
             return QwenModel
+        if model_architecture == "Qwen2ForCausalLM":
+            return Model
         if model_architecture == "MixtralForCausalLM":
             return MixtralModel
         if model_architecture == "GPT2LMHeadModel":

@@ -236,6 +238,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
             return gguf.MODEL_ARCH.STABLELM
         if arch == "QWenLMHeadModel":
             return gguf.MODEL_ARCH.QWEN
+        if arch == "Qwen2ForCausalLM":
+            return gguf.MODEL_ARCH.QWEN2
         if arch == "MixtralForCausalLM":
             return gguf.MODEL_ARCH.LLAMA
         if arch == "GPT2LMHeadModel":

gguf-py/gguf/constants.py — 16 additions, 0 deletions
@@ -97,6 +97,7 @@ class MODEL_ARCH(IntEnum):
     BLOOM     = auto()
     STABLELM  = auto()
     QWEN      = auto()
+    QWEN2     = auto()
     PHI2      = auto()
     PLAMO     = auto()
     CODESHELL = auto()

@@ -146,6 +147,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
     MODEL_ARCH.QWEN: "qwen",
+    MODEL_ARCH.QWEN2: "qwen2",
     MODEL_ARCH.PHI2: "phi2",
     MODEL_ARCH.PLAMO: "plamo",
     MODEL_ARCH.CODESHELL: "codeshell",

@@ -358,6 +360,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
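A hedged sketch of how these three additions fit together, assuming the gguf package shipped in this repo's gguf-py directory is importable and exposes its lookup tables (MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES); it prints the tensor names a qwen2 GGUF file is expected to carry:

    import gguf  # the gguf-py package from this repository (assumed importable)

    arch = gguf.MODEL_ARCH.QWEN2
    print(gguf.MODEL_ARCH_NAMES[arch])  # -> "qwen2"

    for tensor in gguf.MODEL_TENSORS[arch]:
        # per-block entries carry a {bid} placeholder, e.g. "blk.0.attn_q"
        print(gguf.TENSOR_NAMES[tensor].format(bid=0))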

llama.cpp — 191 additions, 0 deletions
@@ -192,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
+    LLM_ARCH_QWEN2,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
@@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM,     "bloom" },
     { LLM_ARCH_STABLELM,  "stablelm" },
     { LLM_ARCH_QWEN,      "qwen" },
+    { LLM_ARCH_QWEN2,     "qwen2" },
     { LLM_ARCH_PHI2,      "phi2" },
     { LLM_ARCH_PLAMO,     "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
@@ -568,6 +570,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -2869,6 +2888,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PHI2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3704,6 +3734,41 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
                     }
                 } break;
+            case LLM_ARCH_QWEN2:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
             case LLM_ARCH_PHI2:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
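In this loader the Q/K/V projections get bias tensors (bq/bk/bv), and the K/V weights use n_embd_gqa so grouped-query-attention checkpoints load correctly. A hedged sketch with made-up hyperparameters (not taken from any specific Qwen2 checkpoint) of how those dimensions relate, mirroring the create_tensor shapes above:

    # illustrative numbers only -- grouped-query attention when n_head_kv < n_head
    n_embd      = 4096
    n_head      = 32
    n_head_kv   = 8
    n_embd_head = n_embd // n_head            # 128
    n_embd_gqa  = n_embd_head * n_head_kv     # 1024

    expected_shapes = {
        "blk.0.attn_q.weight":      (n_embd, n_embd),      # plus attn_q.bias of size n_embd
        "blk.0.attn_k.weight":      (n_embd, n_embd_gqa),  # plus attn_k.bias of size n_embd_gqa
        "blk.0.attn_v.weight":      (n_embd, n_embd_gqa),  # plus attn_v.bias of size n_embd_gqa
        "blk.0.attn_output.weight": (n_embd, n_embd),      # no bias tensor for the output projection
    }
    print(expected_shapes)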
@@ -5698,6 +5763,128 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_qwen2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                // these nodes are added to the graph together so that they are not reordered
+                // by doing so, the number of splits in the graph is reduced
+                ggml_build_forward_expand(gf, Qcur);
+                ggml_build_forward_expand(gf, Kcur);
+                ggml_build_forward_expand(gf, Vcur);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_phi2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
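For reference, a hedged NumPy sketch of the per-layer computation that build_qwen2() assembles as ggml graph nodes: RMSNorm, biased Q/K/V projections, RoPE, attention, a residual add, then a second RMSNorm and a SiLU-gated feed-forward with another residual. rope() and attention() are stand-ins supplied by the caller (RoPE and masked softmax attention are not re-implemented here), and the weight layout is illustrative rather than llama.cpp's internal one:

    import numpy as np

    def rms_norm(x, weight, eps=1e-6):
        # corresponds to LLM_NORM_RMS with eps = f_norm_rms_eps
        return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) * weight

    def silu(x):
        # x * sigmoid(x)
        return x / (1.0 + np.exp(-x))

    def qwen2_block(x, layer, rope, attention):
        # self-attention with bias on Q/K/V (the bq/bk/bv tensors loaded above)
        h = rms_norm(x, layer["attn_norm"])
        q = rope(h @ layer["wq"].T + layer["bq"])
        k = rope(h @ layer["wk"].T + layer["bk"])
        v = h @ layer["wv"].T + layer["bv"]
        x = x + attention(q, k, v) @ layer["wo"].T          # residual ("ffn_inp")

        # feed-forward: SiLU gate in parallel with the up projection (LLM_FFN_SILU, LLM_FFN_PAR)
        h = rms_norm(x, layer["ffn_norm"])
        ffn = (silu(h @ layer["ffn_gate"].T) * (h @ layer["ffn_up"].T)) @ layer["ffn_down"].T
        return x + ffn                                      # residual ("l_out")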

@@ -6324,6 +6511,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen();
             } break;
+        case LLM_ARCH_QWEN2:
+            {
+                result = llm.build_qwen2();
+            } break;
         case LLM_ARCH_PHI2:
             {
                 result = llm.build_phi2();
