@@ -194,6 +194,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
+    LLM_ARCH_CODESHELL,
     LLM_ARCH_UNKNOWN,
 };
@@ -213,6 +214,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN,      "qwen"      },
     { LLM_ARCH_PHI2,      "phi2"      },
     { LLM_ARCH_PLAMO,     "plamo"     },
+    { LLM_ARCH_CODESHELL, "codeshell" },
 };
 
 enum llm_kv {
@@ -600,6 +602,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_CODESHELL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ROPE_FREQS,    "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
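
Aside (not part of the diff): the "%d" placeholders in the tensor table are expanded with the layer index when tensors are created, and a ".weight"/".bias" suffix is appended. A minimal sketch of the resulting GGUF tensor name, assuming only that the tn() helper formats the table entry this way:

// hypothetical illustration, mirroring the table entry "blk.%d.attn_qkv"
#include <cstdio>

int main() {
    char name[64];
    std::snprintf(name, sizeof(name), "blk.%d.attn_qkv", 3); // layer index 3
    std::printf("%s.weight\n", name);                        // -> blk.3.attn_qkv.weight
    return 0;
}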
@@ -2877,6 +2899,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -3784,6 +3814,42 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_CODESHELL:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
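
Aside (not part of the diff): the wqkv shape {n_embd, n_embd + 2*n_embd_gqa} reflects CodeShell's grouped-query attention, where K and V are projected down to n_embd_gqa = n_embd_head * n_head_kv channels instead of the full n_embd. A standalone sketch of the arithmetic, with head counts assumed for illustration rather than taken from the patch:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd      = 4096; // assumed hidden size
    const int64_t n_head      = 32;   // assumed number of query heads
    const int64_t n_head_kv   = 8;    // assumed number of KV heads (GQA)
    const int64_t n_embd_head = n_embd / n_head;         // 128 per head
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv; // 1024 K/V channels

    // wqkv is created as {n_embd, n_embd + 2*n_embd_gqa}: each input row of
    // n_embd maps onto [ Q | K | V ] = 4096 + 1024 + 1024 output channels
    std::printf("fused qkv output dim: %lld\n", (long long)(n_embd + 2*n_embd_gqa)); // 6144
    return 0;
}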
@@ -5965,6 +6031,117 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_codeshell() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(tmpq, "tmpq", il);
+                cb(tmpk, "tmpk", il);
+                cb(Vcur, "Vcur", il);
+
+                struct ggml_tensor * Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
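
Aside (not part of the diff): build_codeshell() slices the fused QKV activation with ggml_view_2d byte offsets and then makes each slice contiguous with ggml_cont (the views stride by cur->nb[1], one full row per token). A small sketch restating the [ Q | K | V ] row layout those offsets encode, reusing the illustrative sizes from the earlier sketch:

#include <cstdio>

int main() {
    const long n_embd     = 4096; // illustrative, not taken from the patch
    const long n_embd_gqa = 1024;

    const size_t off_q = 0;                                     // Q starts the row
    const size_t off_k = sizeof(float) *  n_embd;               // K follows Q
    const size_t off_v = sizeof(float) * (n_embd + n_embd_gqa); // V follows K

    std::printf("Q@%zu K@%zu V@%zu bytes\n", off_q, off_k, off_v);
    return 0;
}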
@@ -6159,6 +6336,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gpt2();
             } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                result = llm.build_codeshell();
+            } break;
         default:
             GGML_ASSERT(false);
     }