Skip to content

Commit 094cd5e

Browse files
chiranko authored and ggerganov committed
llama : add CodeShell support (ggml-org#5016)
* llama: add codeshell support * llama.cpp: fix codeshell with NeoX rope Co-authored-by: Georgi Gerganov <[email protected]> --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 7d6f8e5 commit 094cd5e

File tree

4 files changed

+268
-0
lines changed

4 files changed

+268
-0
lines changed

convert-hf-to-gguf.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ def from_model_architecture(model_architecture):
197197
return Phi2Model
198198
if model_architecture == "PlamoForCausalLM":
199199
return PlamoModel
200+
if model_architecture == "CodeShellForCausalLM":
201+
return CodeShellModel
200202
return Model
201203

202204
def _is_model_safetensors(self) -> bool:
@@ -242,6 +244,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
242244
return gguf.MODEL_ARCH.PHI2
243245
if arch == "PlamoForCausalLM":
244246
return gguf.MODEL_ARCH.PLAMO
247+
if arch == "CodeShellForCausalLM":
248+
return gguf.MODEL_ARCH.CODESHELL
245249

246250
raise NotImplementedError(f'Architecture "{arch}" not supported!')
247251

@@ -1175,6 +1179,69 @@ def write_tensors(self):
11751179

11761180
self.gguf_writer.add_tensor(new_name, data)
11771181

1182+
class CodeShellModel(Model):
    """Converter for ``CodeShellForCausalLM`` checkpoints to GGUF.

    Writes the CodeShell hyperparameters into the GGUF header and streams
    the model tensors, duplicating the token embedding as the output head
    when the checkpoint ships without a separate ``lm_head``.
    """

    def set_gguf_parameters(self):
        """Write CodeShell hyperparameters from ``self.hparams`` to the GGUF header."""
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("CodeShell")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        # FFN hidden size is fixed at 4x the embedding width for this architecture
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
        # fixed RoPE configuration: base 10000, linear scaling with factor 1.0 (i.e. no scaling)
        self.gguf_writer.add_rope_freq_base(10000.0)
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def write_tensors(self):
        """Convert and write all model tensors to the GGUF file.

        Skips the rotary-embedding inverse-frequency buffers, casts dtypes
        according to ``self.ftype`` (0 = f32, 1 = f16 for 2-dim weights),
        and exits with a non-zero status if a tensor name cannot be mapped.
        """
        # checkpoint configs are inconsistent about the layer-count key name
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
        for name, data_torch in tensors.items():
            # we don't need these (recomputed at load time)
            # NOTE: the original wrote endswith((".attn.rotary_emb.inv_freq")) —
            # the extra parens are not a tuple; a plain string argument is equivalent
            if name.endswith(".attn.rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                # FIX: bare sys.exit() exits with status 0, masking the failure
                # from callers/CI; passing the message exits with status 1.
                sys.exit(f"Can not map tensor {name!r}")

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # tied embeddings: reuse the token embedding as the output head
            # when the checkpoint has no separate lm_head / output weight
            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
11781245

11791246
###### CONVERSION LOGIC ######
11801247

gguf-py/gguf/constants.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class MODEL_ARCH(IntEnum):
9999
QWEN = auto()
100100
PHI2 = auto()
101101
PLAMO = auto()
102+
CODESHELL = auto()
102103

103104

104105
class MODEL_TENSOR(IntEnum):
@@ -147,6 +148,7 @@ class MODEL_TENSOR(IntEnum):
147148
MODEL_ARCH.QWEN: "qwen",
148149
MODEL_ARCH.PHI2: "phi2",
149150
MODEL_ARCH.PLAMO: "plamo",
151+
MODEL_ARCH.CODESHELL: "codeshell",
150152
}
151153

152154
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -396,6 +398,19 @@ class MODEL_TENSOR(IntEnum):
396398
MODEL_TENSOR.FFN_NORM,
397399
MODEL_TENSOR.FFN_DOWN,
398400
MODEL_TENSOR.FFN_UP,
401+
],
402+
MODEL_ARCH.CODESHELL: [
403+
MODEL_TENSOR.TOKEN_EMBD,
404+
MODEL_TENSOR.POS_EMBD,
405+
MODEL_TENSOR.OUTPUT_NORM,
406+
MODEL_TENSOR.OUTPUT,
407+
MODEL_TENSOR.ATTN_NORM,
408+
MODEL_TENSOR.ATTN_QKV,
409+
MODEL_TENSOR.ATTN_OUT,
410+
MODEL_TENSOR.ATTN_ROT_EMBD,
411+
MODEL_TENSOR.FFN_NORM,
412+
MODEL_TENSOR.FFN_DOWN,
413+
MODEL_TENSOR.FFN_UP,
399414
]
400415
# TODO
401416
}
@@ -417,6 +432,10 @@ class MODEL_TENSOR(IntEnum):
417432
MODEL_TENSOR.ROPE_FREQS,
418433
MODEL_TENSOR.ATTN_ROT_EMBD,
419434
],
435+
MODEL_ARCH.CODESHELL: [
436+
MODEL_TENSOR.ROPE_FREQS,
437+
MODEL_TENSOR.ATTN_ROT_EMBD,
438+
],
420439
}
421440

422441
#

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ class TensorNameMap:
154154
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
155155
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
156156
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
157+
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
157158
),
158159

159160
# Feed-forward norm

llama.cpp

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ enum llm_arch {
194194
LLM_ARCH_QWEN,
195195
LLM_ARCH_PHI2,
196196
LLM_ARCH_PLAMO,
197+
LLM_ARCH_CODESHELL,
197198
LLM_ARCH_UNKNOWN,
198199
};
199200

@@ -213,6 +214,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
213214
{ LLM_ARCH_QWEN, "qwen" },
214215
{ LLM_ARCH_PHI2, "phi2" },
215216
{ LLM_ARCH_PLAMO, "plamo" },
217+
{ LLM_ARCH_CODESHELL, "codeshell" },
216218
};
217219

218220
enum llm_kv {
@@ -600,6 +602,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
600602
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
601603
},
602604
},
605+
{
606+
LLM_ARCH_CODESHELL,
607+
{
608+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
609+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
610+
{ LLM_TENSOR_OUTPUT, "output" },
611+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
612+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
613+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
614+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
615+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
616+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
617+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
618+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
619+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
620+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
621+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
622+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
623+
},
624+
},
603625

604626
{
605627
LLM_ARCH_UNKNOWN,
@@ -2877,6 +2899,14 @@ static void llm_load_hparams(
28772899
default: model.type = e_model::MODEL_UNKNOWN;
28782900
}
28792901
} break;
2902+
case LLM_ARCH_CODESHELL:
2903+
{
2904+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2905+
switch (hparams.n_layer) {
2906+
case 42: model.type = e_model::MODEL_SMALL; break;
2907+
default: model.type = e_model::MODEL_UNKNOWN;
2908+
}
2909+
} break;
28802910

28812911
default: (void)0;
28822912
}
@@ -3784,6 +3814,42 @@ static bool llm_load_tensors(
37843814
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
37853815
}
37863816
} break;
3817+
case LLM_ARCH_CODESHELL:
3818+
{
3819+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3820+
3821+
// output
3822+
{
3823+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3824+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3825+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3826+
}
3827+
3828+
for (int i = 0; i < n_layer; ++i) {
3829+
ggml_context * ctx_layer = ctx_for_layer(i);
3830+
ggml_context * ctx_split = ctx_for_layer_split(i);
3831+
3832+
auto & layer = model.layers[i];
3833+
3834+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3835+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3836+
3837+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3838+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3839+
3840+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3841+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3842+
3843+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3844+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3845+
3846+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3847+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3848+
3849+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3850+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3851+
}
3852+
} break;
37873853
default:
37883854
throw std::runtime_error("unknown architecture");
37893855
}
@@ -5965,6 +6031,117 @@ struct llm_build_context {
59656031

59666032
return gf;
59676033
}
6034+
6035+
// Build the compute graph for a CodeShell model: a GPT-2-style stack
// (fused QKV projection, LayerNorm, GELU FFN) with NeoX-style RoPE
// (rope type 2) applied to Q and K. Returns the graph ending in the
// "result_output" logits tensor.
struct ggml_cgraph * build_codeshell() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
    // CodeShell requires uniform K/V head size equal to the rotary dimension
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
    cb(inpL, "inp_embd", -1);

    // inp_pos - contains the positions
    struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    cb(inp_pos, "inp_pos", -1);

    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    cb(KQ_mask, "KQ_mask", -1);

    // shift the entire K-cache if needed
    if (do_rope_shift) {
        llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
    }

    for (int il = 0; il < n_layer; ++il) {
        // pre-attention LayerNorm (weight + bias)
        cur = llm_build_norm(ctx0, inpL, hparams,
                model.layers[il].attn_norm,
                model.layers[il].attn_norm_b,
                LLM_NORM, cb, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            // fused QKV projection: one matmul + bias, then sliced into Q/K/V
            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
            cb(cur, "wqkv", il);

            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
            cb(cur, "bqkv", il);

            // slice the fused tensor: Q spans [0, n_embd), K spans
            // [n_embd, n_embd + n_embd_gqa), V spans the remainder
            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

            cb(tmpq, "tmpq", il);
            cb(tmpk, "tmpk", il);
            cb(Vcur, "Vcur", il);

            // RoPE with mode 2 (NeoX style) on the per-head reshaped Q
            struct ggml_tensor * Qcur = ggml_rope_custom(
                ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
                hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

            // same rotation for K, using the (possibly smaller) KV head count
            struct ggml_tensor * Kcur = ggml_rope_custom(
                ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
                hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Kcur, "Kcur", il);

            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

            // attention + output projection; scale is 1/sqrt(head_dim)
            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            cb(cur, "kqv_out", il);
        }

        // add the input (residual connection around attention)
        struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
        cb(ffn_inp, "ffn_inp", il);

        // FF
        {
            cur = llm_build_norm(ctx0, ffn_inp, hparams,
                    model.layers[il].ffn_norm,
                    model.layers[il].ffn_norm_b,
                    LLM_NORM, cb, il);
            cb(cur, "ffn_norm", il);

            // plain up -> GELU -> down MLP (no gate projection)
            cur = llm_build_ffn(ctx0, cur,
                    model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                    NULL, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
            cb(cur, "ffn_out", il);
        }

        // second residual connection; result feeds the next layer
        inpL = ggml_add(ctx0, cur, ffn_inp);
        cb(inpL, "l_out", il);
    }

    // final LayerNorm before the output projection
    cur = llm_build_norm(ctx0, inpL, hparams,
            model.output_norm,
            model.output_norm_b,
            LLM_NORM, cb, -1);
    cb(cur, "result_norm", -1);

    // project to vocabulary logits
    cur = ggml_mul_mat(ctx0, model.output, cur);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
59686145
};
59696146

59706147
static struct ggml_cgraph * llama_build_graph(
@@ -6159,6 +6336,10 @@ static struct ggml_cgraph * llama_build_graph(
61596336
{
61606337
result = llm.build_gpt2();
61616338
} break;
6339+
case LLM_ARCH_CODESHELL:
6340+
{
6341+
result = llm.build_codeshell();
6342+
} break;
61626343
default:
61636344
GGML_ASSERT(false);
61646345
}

0 commit comments

Comments
 (0)