@@ -192,6 +192,7 @@ enum llm_arch {
192
192
LLM_ARCH_BLOOM,
193
193
LLM_ARCH_STABLELM,
194
194
LLM_ARCH_QWEN,
195
+ LLM_ARCH_QWEN2,
195
196
LLM_ARCH_PHI2,
196
197
LLM_ARCH_PLAMO,
197
198
LLM_ARCH_CODESHELL,
@@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
212
213
{ LLM_ARCH_BLOOM, " bloom" },
213
214
{ LLM_ARCH_STABLELM, " stablelm" },
214
215
{ LLM_ARCH_QWEN, " qwen" },
216
+ { LLM_ARCH_QWEN2, " qwen2" },
215
217
{ LLM_ARCH_PHI2, " phi2" },
216
218
{ LLM_ARCH_PLAMO, " plamo" },
217
219
{ LLM_ARCH_CODESHELL, " codeshell" },
@@ -568,6 +570,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
568
570
{ LLM_TENSOR_FFN_UP, " blk.%d.ffn_up" },
569
571
},
570
572
},
573
+ {
574
+ LLM_ARCH_QWEN2,
575
+ {
576
+ { LLM_TENSOR_TOKEN_EMBD, " token_embd" },
577
+ { LLM_TENSOR_OUTPUT_NORM, " output_norm" },
578
+ { LLM_TENSOR_OUTPUT, " output" },
579
+ { LLM_TENSOR_ATTN_NORM, " blk.%d.attn_norm" },
580
+ { LLM_TENSOR_ATTN_Q, " blk.%d.attn_q" },
581
+ { LLM_TENSOR_ATTN_K, " blk.%d.attn_k" },
582
+ { LLM_TENSOR_ATTN_V, " blk.%d.attn_v" },
583
+ { LLM_TENSOR_ATTN_OUT, " blk.%d.attn_output" },
584
+ { LLM_TENSOR_FFN_NORM, " blk.%d.ffn_norm" },
585
+ { LLM_TENSOR_FFN_GATE, " blk.%d.ffn_gate" },
586
+ { LLM_TENSOR_FFN_DOWN, " blk.%d.ffn_down" },
587
+ { LLM_TENSOR_FFN_UP, " blk.%d.ffn_up" },
588
+ },
589
+ },
571
590
{
572
591
LLM_ARCH_PHI2,
573
592
{
@@ -2869,6 +2888,17 @@ static void llm_load_hparams(
2869
2888
default : model.type = e_model::MODEL_UNKNOWN;
2870
2889
}
2871
2890
} break ;
2891
+ case LLM_ARCH_QWEN2:
2892
+ {
2893
+ ml.get_key (LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps );
2894
+ switch (hparams.n_layer ) {
2895
+ case 24 : model.type = e_model::MODEL_1B; break ;
2896
+ case 32 : model.type = e_model::MODEL_7B; break ;
2897
+ case 40 : model.type = e_model::MODEL_13B; break ;
2898
+ case 80 : model.type = e_model::MODEL_70B; break ;
2899
+ default : model.type = e_model::MODEL_UNKNOWN;
2900
+ }
2901
+ } break ;
2872
2902
case LLM_ARCH_PHI2:
2873
2903
{
2874
2904
ml.get_key (LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps );
@@ -3704,6 +3734,41 @@ static bool llm_load_tensors(
3704
3734
layer.ffn_up = ml.create_tensor (ctx_split, tn (LLM_TENSOR_FFN_UP, " weight" , i), {n_embd, n_ff/2 });
3705
3735
}
3706
3736
} break ;
3737
+ case LLM_ARCH_QWEN2:
3738
+ {
3739
+ model.tok_embd = ml.create_tensor (ctx_input, tn (LLM_TENSOR_TOKEN_EMBD, " weight" ), {n_embd, n_vocab});
3740
+
3741
+ // output
3742
+ {
3743
+ model.output_norm = ml.create_tensor (ctx_output, tn (LLM_TENSOR_OUTPUT_NORM, " weight" ), {n_embd});
3744
+ model.output = ml.create_tensor (ctx_output_split, tn (LLM_TENSOR_OUTPUT, " weight" ), {n_embd, n_vocab});
3745
+ }
3746
+
3747
+ for (int i = 0 ; i < n_layer; ++i) {
3748
+ ggml_context * ctx_layer = ctx_for_layer (i);
3749
+ ggml_context * ctx_split = ctx_for_layer_split (i);
3750
+
3751
+ auto & layer = model.layers [i];
3752
+
3753
+ layer.attn_norm = ml.create_tensor (ctx_layer, tn (LLM_TENSOR_ATTN_NORM, " weight" , i), {n_embd});
3754
+
3755
+ layer.wq = ml.create_tensor (ctx_split, tn (LLM_TENSOR_ATTN_Q, " weight" , i), {n_embd, n_embd});
3756
+ layer.wk = ml.create_tensor (ctx_split, tn (LLM_TENSOR_ATTN_K, " weight" , i), {n_embd, n_embd_gqa});
3757
+ layer.wv = ml.create_tensor (ctx_split, tn (LLM_TENSOR_ATTN_V, " weight" , i), {n_embd, n_embd_gqa});
3758
+ layer.wo = ml.create_tensor (ctx_split, tn (LLM_TENSOR_ATTN_OUT, " weight" , i), {n_embd, n_embd});
3759
+
3760
+ // Q/K/V bias tensors (build_qwen2 adds bq/bk/bv unconditionally, so the graph needs all three)
3761
+ layer.bq = ml.create_tensor (ctx_layer, tn (LLM_TENSOR_ATTN_Q, " bias" , i), {n_embd});
3762
+ layer.bk = ml.create_tensor (ctx_layer, tn (LLM_TENSOR_ATTN_K, " bias" , i), {n_embd_gqa});
3763
+ layer.bv = ml.create_tensor (ctx_layer, tn (LLM_TENSOR_ATTN_V, " bias" , i), {n_embd_gqa});
3764
+
3765
+ layer.ffn_norm = ml.create_tensor (ctx_layer, tn (LLM_TENSOR_FFN_NORM, " weight" , i), {n_embd});
3766
+
3767
+ layer.ffn_gate = ml.create_tensor (ctx_split, tn (LLM_TENSOR_FFN_GATE, " weight" , i), {n_embd, n_ff});
3768
+ layer.ffn_down = ml.create_tensor (ctx_split, tn (LLM_TENSOR_FFN_DOWN, " weight" , i), { n_ff, n_embd});
3769
+ layer.ffn_up = ml.create_tensor (ctx_split, tn (LLM_TENSOR_FFN_UP, " weight" , i), {n_embd, n_ff});
3770
+ }
3771
+ } break ;
3707
3772
case LLM_ARCH_PHI2:
3708
3773
{
3709
3774
model.tok_embd = ml.create_tensor (ctx_input, tn (LLM_TENSOR_TOKEN_EMBD, " weight" ), {n_embd, n_vocab});
@@ -5698,6 +5763,128 @@ struct llm_build_context {
5698
5763
5699
5764
return gf;
5700
5765
}
5766
+
5767
+ struct ggml_cgraph * build_qwen2 () {
5768
+ struct ggml_cgraph * gf = ggml_new_graph_custom (ctx0, LLAMA_MAX_NODES, false );
5769
+
5770
+ const int64_t n_embd_head = hparams.n_embd_head_v ;
5771
+ GGML_ASSERT (n_embd_head == hparams.n_embd_head_k );
5772
+ GGML_ASSERT (n_embd_head == hparams.n_rot );
5773
+
5774
+ struct ggml_tensor * cur;
5775
+ struct ggml_tensor * inpL;
5776
+
5777
+ inpL = llm_build_inp_embd (ctx0, hparams, batch, model.tok_embd , cb);
5778
+ cb (inpL, " inp_embd" , -1 );
5779
+
5780
+ // inp_pos - contains the positions
5781
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_tokens);
5782
+ cb (inp_pos, " inp_pos" , -1 );
5783
+
5784
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5785
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1 );
5786
+ cb (KQ_mask, " KQ_mask" , -1 );
5787
+
5788
+ // shift the entire K-cache if needed
5789
+ if (do_rope_shift) {
5790
+ llm_build_k_shift (ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5791
+ }
5792
+
5793
+ for (int il = 0 ; il < n_layer; ++il) {
5794
+ struct ggml_tensor * inpSA = inpL;
5795
+
5796
+ // norm
5797
+ cur = llm_build_norm (ctx0, inpL, hparams,
5798
+ model.layers [il].attn_norm , NULL ,
5799
+ LLM_NORM_RMS, cb, il);
5800
+ cb (cur, " attn_norm" , il);
5801
+
5802
+ // self-attention
5803
+ {
5804
+ // compute Q and K and RoPE them
5805
+ struct ggml_tensor * Qcur = ggml_mul_mat (ctx0, model.layers [il].wq , cur);
5806
+ cb (Qcur, " Qcur" , il);
5807
+ Qcur = ggml_add (ctx0, Qcur, model.layers [il].bq );
5808
+ cb (Qcur, " Qcur" , il);
5809
+
5810
+ struct ggml_tensor * Kcur = ggml_mul_mat (ctx0, model.layers [il].wk , cur);
5811
+ cb (Kcur, " Kcur" , il);
5812
+ Kcur = ggml_add (ctx0, Kcur, model.layers [il].bk );
5813
+ cb (Kcur, " Kcur" , il);
5814
+
5815
+ struct ggml_tensor * Vcur = ggml_mul_mat (ctx0, model.layers [il].wv , cur);
5816
+ cb (Vcur, " Vcur" , il);
5817
+ Vcur = ggml_add (ctx0, Vcur, model.layers [il].bv );
5818
+ cb (Vcur, " Vcur" , il);
5819
+
5820
+ // these nodes are added to the graph together so that they are not reordered
5821
+ // by doing so, the number of splits in the graph is reduced
5822
+ ggml_build_forward_expand (gf, Qcur);
5823
+ ggml_build_forward_expand (gf, Kcur);
5824
+ ggml_build_forward_expand (gf, Vcur);
5825
+
5826
+ Qcur = ggml_rope_custom (
5827
+ ctx0, ggml_reshape_3d (ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5828
+ hparams.n_rot , 2 , 0 , n_orig_ctx, freq_base, freq_scale,
5829
+ ext_factor, attn_factor, beta_fast, beta_slow
5830
+ );
5831
+ cb (Qcur, " Qcur" , il);
5832
+
5833
+ Kcur = ggml_rope_custom (
5834
+ ctx0, ggml_reshape_3d (ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5835
+ hparams.n_rot , 2 , 0 , n_orig_ctx, freq_base, freq_scale,
5836
+ ext_factor, attn_factor, beta_fast, beta_slow
5837
+ );
5838
+ cb (Kcur, " Kcur" , il);
5839
+
5840
+ llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5841
+
5842
+ cur = llm_build_kqv (ctx0, model, hparams, kv_self,
5843
+ model.layers [il].wo , model.layers [il].bo ,
5844
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1 .0f , 1 .0f /sqrtf (float (n_embd_head)), cb, il);
5845
+ cb (cur, " kqv_out" , il);
5846
+ }
5847
+
5848
+ struct ggml_tensor * ffn_inp = ggml_add (ctx0, cur, inpSA);
5849
+ cb (ffn_inp, " ffn_inp" , il);
5850
+
5851
+ // feed-forward network
5852
+ cur = llm_build_norm (ctx0, ffn_inp, hparams,
5853
+ model.layers [il].ffn_norm , NULL ,
5854
+ LLM_NORM_RMS, cb, il);
5855
+ cb (cur, " ffn_norm" , il);
5856
+
5857
+ cur = llm_build_ffn (ctx0, cur,
5858
+ model.layers [il].ffn_up , NULL ,
5859
+ model.layers [il].ffn_gate , NULL ,
5860
+ model.layers [il].ffn_down , NULL ,
5861
+ NULL ,
5862
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5863
+ cb (cur, " ffn_out" , il);
5864
+
5865
+ cur = ggml_add (ctx0, cur, ffn_inp);
5866
+ cb (cur, " l_out" , il);
5867
+
5868
+ // input for next layer
5869
+ inpL = cur;
5870
+ }
5871
+
5872
+ cur = inpL;
5873
+
5874
+ cur = llm_build_norm (ctx0, cur, hparams,
5875
+ model.output_norm , NULL ,
5876
+ LLM_NORM_RMS, cb, -1 );
5877
+ cb (cur, " result_norm" , -1 );
5878
+
5879
+ // lm_head
5880
+ cur = ggml_mul_mat (ctx0, model.output , cur);
5881
+ cb (cur, " result_output" , -1 );
5882
+
5883
+ ggml_build_forward_expand (gf, cur);
5884
+
5885
+ return gf;
5886
+ }
5887
+
5701
5888
struct ggml_cgraph * build_phi2 () {
5702
5889
struct ggml_cgraph * gf = ggml_new_graph_custom (ctx0, LLAMA_MAX_NODES, false );
5703
5890
@@ -6324,6 +6511,10 @@ static struct ggml_cgraph * llama_build_graph(
6324
6511
{
6325
6512
result = llm.build_qwen ();
6326
6513
} break ;
6514
+ case LLM_ARCH_QWEN2:
6515
+ {
6516
+ result = llm.build_qwen2 ();
6517
+ } break ;
6327
6518
case LLM_ARCH_PHI2:
6328
6519
{
6329
6520
result = llm.build_phi2 ();
0 commit comments