@@ -192,6 +192,7 @@ enum llm_arch {
192
192
LLM_ARCH_BLOOM,
193
193
LLM_ARCH_STABLELM,
194
194
LLM_ARCH_QWEN,
195
+ LLM_ARCH_QWEN2,
195
196
LLM_ARCH_PHI2,
196
197
LLM_ARCH_PLAMO,
197
198
LLM_ARCH_CODESHELL,
@@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
212
213
{ LLM_ARCH_BLOOM, "bloom" },
213
214
{ LLM_ARCH_STABLELM, "stablelm" },
214
215
{ LLM_ARCH_QWEN, "qwen" },
216
+ { LLM_ARCH_QWEN2, "qwen2" },
215
217
{ LLM_ARCH_PHI2, "phi2" },
216
218
{ LLM_ARCH_PLAMO, "plamo" },
217
219
{ LLM_ARCH_CODESHELL, "codeshell" },
@@ -568,6 +570,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
568
570
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
569
571
},
570
572
},
573
+ {
574
+ LLM_ARCH_QWEN2,
575
+ {
576
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
577
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
578
+ { LLM_TENSOR_OUTPUT, "output" },
579
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
580
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
581
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
582
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
583
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
584
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
585
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
586
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
587
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
588
+ },
589
+ },
571
590
{
572
591
LLM_ARCH_PHI2,
573
592
{
@@ -2869,6 +2888,17 @@ static void llm_load_hparams(
2869
2888
default: model.type = e_model::MODEL_UNKNOWN;
2870
2889
}
2871
2890
} break;
2891
+ case LLM_ARCH_QWEN2:
2892
+ {
2893
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2894
+ switch (hparams.n_layer) {
2895
+ case 24: model.type = e_model::MODEL_1B; break;
2896
+ case 32: model.type = e_model::MODEL_7B; break;
2897
+ case 40: model.type = e_model::MODEL_13B; break;
2898
+ case 80: model.type = e_model::MODEL_70B; break;
2899
+ default: model.type = e_model::MODEL_UNKNOWN;
2900
+ }
2901
+ } break;
2872
2902
case LLM_ARCH_PHI2:
2873
2903
{
2874
2904
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3704,6 +3734,41 @@ static bool llm_load_tensors(
3704
3734
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
3705
3735
}
3706
3736
} break;
3737
+ case LLM_ARCH_QWEN2:
3738
+ {
3739
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3740
+
3741
+ // output
3742
+ {
3743
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3744
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3745
+ }
3746
+
3747
+ for (int i = 0; i < n_layer; ++i) {
3748
+ ggml_context * ctx_layer = ctx_for_layer(i);
3749
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3750
+
3751
+ auto & layer = model.layers[i];
3752
+
3753
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3754
+
3755
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3756
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3757
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3758
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3759
+
3760
+ // optional bias tensors
3761
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3762
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3763
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3764
+
3765
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3766
+
3767
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3768
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3769
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3770
+ }
3771
+ } break;
3707
3772
case LLM_ARCH_PHI2:
3708
3773
{
3709
3774
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5698,6 +5763,128 @@ struct llm_build_context {
5698
5763
5699
5764
return gf;
5700
5765
}
5766
+
5767
// Build the ggml compute graph for one forward pass of a Qwen2-architecture model.
// Layer structure as constructed below: RMSNorm -> self-attention (Q/K/V projections
// with bias, NEOX-style RoPE on Q and K) -> residual add -> RMSNorm -> gated FFN
// (SILU, parallel gate) -> residual add; then a final RMSNorm and the output
// (lm_head) projection.
// Returns the completed graph; `cb` tags each intermediate tensor with a name and
// layer index (presumably consumed elsewhere for offloading/debugging — the naming
// contract is not visible here).
struct ggml_cgraph * build_qwen2() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    // Per-head dimension; the asserts pin K/V head sizes equal and full rotary
    // coverage (n_rot == head dim) — RoPE below rotates the entire head.
    const int64_t n_embd_head = hparams.n_embd_head_v;
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
    cb(inpL, "inp_embd", -1);

    // inp_pos - contains the positions
    struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    cb(inp_pos, "inp_pos", -1);

    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    cb(KQ_mask, "KQ_mask", -1);

    // shift the entire K-cache if needed (NEOX rope variant, matching the per-layer
    // RoPE applied below)
    if (do_rope_shift) {
        llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
    }

    for (int il = 0; il < n_layer; ++il) {
        // saved for the post-attention residual connection
        struct ggml_tensor * inpSA = inpL;

        // norm
        cur = llm_build_norm(ctx0, inpL, hparams,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, cb, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            // compute Q and K and RoPE them
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);
            // Qwen2 attention projections carry bias terms (bq/bk/bv)
            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
            cb(Qcur, "Qcur", il);

            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);
            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
            cb(Kcur, "Kcur", il);

            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);
            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
            cb(Vcur, "Vcur", il);

            // these nodes are added to the graph together so that they are not reordered
            // by doing so, the number of splits in the graph is reduced
            ggml_build_forward_expand(gf, Qcur);
            ggml_build_forward_expand(gf, Kcur);
            ggml_build_forward_expand(gf, Vcur);

            // reshape to (head_dim, n_head, n_tokens) and apply rotary embedding;
            // mode 2 here selects the NEOX rope variant — consistent with the
            // LLM_ROPE_NEOX K-shift above
            Qcur = ggml_rope_custom(
                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

            // K uses n_head_kv (may be fewer heads than Q — grouped-query attention
            // when n_head_kv < n_head; presumably handled inside llm_build_kqv)
            Kcur = ggml_rope_custom(
                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
                hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Kcur, "Kcur", il);

            // append this batch's K/V to the cache at kv_head
            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

            // attention + output projection; 1/sqrt(head_dim) is the standard
            // attention scale. NOTE(review): the -1.0f argument presumably disables
            // ALiBi bias — confirm against llm_build_kqv's parameter list.
            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            cb(cur, "kqv_out", il);
        }

        // residual connection around attention
        struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
        cur = llm_build_norm(ctx0, ffn_inp, hparams,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, cb, il);
        cb(cur, "ffn_norm", il);

        // gated FFN: SILU activation, parallel up/gate branches; NULL slots are the
        // unused bias tensors (Qwen2 FFN weights are bias-free per llm_load_tensors)
        cur = llm_build_ffn(ctx0, cur,
                model.layers[il].ffn_up, NULL,
                model.layers[il].ffn_gate, NULL,
                model.layers[il].ffn_down, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
        cb(cur, "ffn_out", il);

        // residual connection around the FFN
        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    // final normalization before the output head
    cur = llm_build_norm(ctx0, cur, hparams,
            model.output_norm, NULL,
            LLM_NORM_RMS, cb, -1);
    cb(cur, "result_norm", -1);

    // lm_head
    cur = ggml_mul_mat(ctx0, model.output, cur);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
5887
+
5701
5888
struct ggml_cgraph * build_phi2() {
5702
5889
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5703
5890
@@ -6324,6 +6511,10 @@ static struct ggml_cgraph * llama_build_graph(
6324
6511
{
6325
6512
result = llm.build_qwen();
6326
6513
} break;
6514
+ case LLM_ARCH_QWEN2:
6515
+ {
6516
+ result = llm.build_qwen2();
6517
+ } break;
6327
6518
case LLM_ARCH_PHI2:
6328
6519
{
6329
6520
result = llm.build_phi2();
0 commit comments