
Commit 06c3bf0

Merge branch 'LostRuins:concedo' into main
2 parents: ea6d320 + 8342fe8

13 files changed: +216 −89 lines

convert.py

+69-22
@@ -130,28 +130,76 @@ def make_tensors_list() -> List[str]:
 TENSORS_SET = set(TENSORS_LIST)
 
 
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
 @dataclass
 class Params:
     n_vocab: int
     n_embd:  int
     n_mult:  int
     n_head:  int
     n_layer: int
-    file_type: GGMLFileType
 
     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head=n_embd // 128 # guessed
 
         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )
 
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"];
+        n_embd = config["hidden_size"];
+        n_head = config["num_attention_heads"];
+        n_layer = config["num_hidden_layers"];
+        n_ff = config["intermediate_size"];
+
+        n_mult = find_n_mult(n_ff, n_embd);
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+
 
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
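Note: as a quick check of the new find_n_mult() logic in the hunk above (a standalone sketch, not part of the diff; the LLaMA-7B dimensions n_embd=4096 and n_ff=11008 are assumed for illustration), the descending search lands back on the historical multiple of 256:

# Standalone sketch of the rounding rule used by find_n_mult in the hunk above.
# The dimensions below are assumed typical LLaMA-7B values, not taken from the diff.
def find_n_mult(n_ff: int, n_embd: int) -> int:
    for n_mult in range(256, 1, -1):
        # round 8*n_embd/3 up to the nearest multiple of n_mult and compare with n_ff
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    return 1

print(find_n_mult(n_ff=11008, n_embd=4096))  # -> 256: (8*4096)//3 = 10922, rounded up to a multiple of 256 gives 11008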
@@ -595,18 +643,17 @@ def load() -> Tensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
 
 
-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]
 
-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
 
@@ -920,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")
 
-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1]) # magic
         values = [
             1, # file version
@@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head, # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))
 
@@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of.fout.close()
 
     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)
 
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
 
-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)
 
     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)
 
     return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
 
-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
         sys.stderr.write(
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         else:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir)
+        params = Params.load(model_plus)
         model = model_plus.model
-        model = do_necessary_conversions(model)
+        model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)
         model = convert_to_output_type(model, output_type)
-        params = Params.guessed(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, params)
-        OutputFile.write_all(outfile, params, model, vocab)
+        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+        OutputFile.write_all(outfile, params, output_type, model, vocab)
         print(f"Wrote {outfile}")
 
 
ggml-opencl.cpp

+3
@@ -184,7 +184,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
     *v0 = vload_half(0, &x[ib + 0]);
     *v1 = vload_half(0, &x[ib + 1]);
 }
+);
 
+static std::string k_quants_source = MULTILINE_QUOTE(
 inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
 {
     if (j < 4)
@@ -856,6 +858,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
 std::string generate_kernels() {
     std::stringstream src;
     src << program_source << '\n';
+    src << k_quants_source << '\n';
     for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
         std::string dequant_kernel = dequant_template;
         std::string dmmv_kernel = dequant_mul_mat_vec_template;

gpttype_adapter.cpp

+17-11
@@ -308,8 +308,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     params.memory_f16 = inputs.f16_kv;
     params.n_ctx = inputs.max_context_length;
 
-    neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
-    = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
+    neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx
+    = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx
+    = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
+    = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
+
+    //this is used for the mem_per_token eval, openblas needs more RAM
+    bool use_scratch = ggml_cpu_has_gpublas();
 
     printf("System Info: %s\n", llama_print_system_info());
     SetQuantsUnshuffled(false);
@@ -546,7 +551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
            return res;
        }
        // determine the required inference memory per token:
-       gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+       gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
        return ModelLoadResult::SUCCESS;
    }
    else
@@ -613,14 +618,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        }
 
        // determine the required inference memory per token:
-       gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+       gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
 
        //if the logits are NAN or duplicated, it means the model is incompatible
        std::vector<float> oldlogits(logits);
 
        //this is another hack because they change the library - we run the eval through the model
        //twice and compare logits. if they give the same logits for different inputs, model is broken
-       gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
+       gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch);
 
        if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
        {
@@ -685,7 +690,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        }
 
        // determine the required inference memory per token:
-       gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+       gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
 
        return ModelLoadResult::SUCCESS;
    }
@@ -742,7 +747,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        }
 
        // determine the required inference memory per token:
-       mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
+       mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch);
        return ModelLoadResult::SUCCESS;
    }
    else
@@ -901,6 +906,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
    concat_output = "";
 
    bool startedsampling = false;
+   bool use_scratch = true; //for normal inference always use scratch
 
    timer_start();
    double time1 = 0, time2 = 0;
@@ -1075,15 +1081,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
        }
        else if(file_format==FileFormat::GPT2_4)
        {
-           evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
+           evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
        }
        else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
        {
            evalres = gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
        }
        else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
        {
-           evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
+           evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
        }
        else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
        {
@@ -1095,11 +1101,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
        }
        else if(file_format==FileFormat::GPTJ_5)
        {
-           evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
+           evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
        }
        else if(file_format==FileFormat::MPT_1)
        {
-           evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token);
+           evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch);
        }
        else
        {

koboldcpp.py

+1-1
@@ -227,7 +227,7 @@ def utfprint(str):
 maxhordelen = 256
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.32"
+KcppVersion = "1.32.3"
 showdebug = True
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):

llama.cpp

+4-4
@@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
        { MODEL_3B,    256ull * MB },
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
-       { MODEL_30B,   512ull * MB },
+       { MODEL_30B,   640ull * MB },
        { MODEL_65B,  1024ull * MB },
    };
    return k_sizes;
@@ -92,7 +92,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
        { MODEL_3B,    256ull * MB },
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
-       { MODEL_30B,   512ull * MB },
+       { MODEL_30B,   640ull * MB },
        { MODEL_65B,  1024ull * MB },
    };
    return k_sizes;
@@ -105,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
        { MODEL_3B,    682ull * MB },
        { MODEL_7B,   1026ull * MB },
        { MODEL_13B,  1608ull * MB },
-       { MODEL_30B,  3124ull * MB },
+       { MODEL_30B,  3224ull * MB },
        { MODEL_65B,  5120ull * MB },
    };
    return k_sizes;
@@ -119,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
        { MODEL_3B,   512ull * MB },
        { MODEL_7B,   800ull * MB },
        { MODEL_13B, 1024ull * MB },
-       { MODEL_30B, 1280ull * MB },
+       { MODEL_30B, 1380ull * MB },
        { MODEL_65B, 1536ull * MB },
    };
    return k_sizes;

model_adapter.cpp

+1-1
@@ -98,7 +98,7 @@ void print_tok_vec(std::vector<float> &embd)
    //we need to read more to determine
    int32_t vocabsiz = 0;
    fin.read((char *) &vocabsiz, sizeof(int32_t));
-   if(vocabsiz==4096) //actually the d_model for mpt
+   if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
    {
        fileformat = FileFormat::MPT_1;
    }
