
Commit 7487137

rework convert.py to read hyper-parameters from config.json (ggml-org#1958)
* Read hyper-parameters from the HuggingFace transformers config.json if it exists, and otherwise fall back to guessing them as before. This allows converting open_llama 3B and other non-standard model designs.
1 parent bbca06e commit 7487137
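For orientation, the loadHFTransformerJson path added below reads five keys from config.json. A minimal sketch of such a file, with illustrative open_llama-3B-style values (assumptions, not taken from this commit), and the Params fields they feed:

# Illustrative config.json contents, written as a Python dict; the key names match
# what the diff below reads, the numbers are assumed open_llama-3B-style values.
example_config = {
    "vocab_size":          32000,  # -> Params.n_vocab
    "hidden_size":          3200,  # -> Params.n_embd
    "num_attention_heads":    32,  # -> Params.n_head
    "num_hidden_layers":      26,  # -> Params.n_layer
    "intermediate_size":    8640,  # -> n_ff, used by find_n_mult() to recover Params.n_mult
}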

1 file changed

convert.py

Lines changed: 69 additions & 22 deletions
@@ -130,28 +130,76 @@ def make_tensors_list() -> List[str]:
 TENSORS_SET = set(TENSORS_LIST)
 
 
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
 @dataclass
 class Params:
     n_vocab: int
     n_embd: int
     n_mult: int
     n_head: int
     n_layer: int
-    file_type: GGMLFileType
 
     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head = n_embd // 128  # guessed
 
         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )
 
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"]
+        n_embd  = config["hidden_size"]
+        n_head  = config["num_attention_heads"]
+        n_layer = config["num_hidden_layers"]
+        n_ff    = config["intermediate_size"]
+
+        n_mult = find_n_mult(n_ff, n_embd)
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+
 
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
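As a sanity check on the find_n_mult round trip added above, here is a standalone sketch; the 7B-style and 3B-style shapes are assumed examples, not values taken from this commit:

def find_n_mult(n_ff: int, n_embd: int) -> int:
    # same search as above: largest n_mult in (1, 256] whose rounded-up
    # feed-forward width reproduces n_ff exactly
    for n_mult in range(256, 1, -1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    return 1

print(find_n_mult(11008, 4096))  # 256, standard LLaMA-7B-style shapes
print(find_n_mult(8640, 3200))   # 240, open_llama-3B-style shapes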
@@ -595,18 +643,17 @@ def load() -> Tensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
 
 
-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]
 
-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
 
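The reason the permutes now take params.n_head rather than re-deriving it from the tensor shape: the removed shape[1] // 128 guess assumes a head dimension of 128, which holds for the standard LLaMA sizes but not for smaller designs. A small sketch with assumed 3B-style shapes (not values from this commit):

hidden_size = 3200            # assumed q_proj width for a 3B-style model
num_attention_heads = 32      # what config.json reports for such a model (assumption)

old_guess   = hidden_size // 128    # 25 -- wrong here, since head_dim is 3200 / 32 = 100
from_config = num_attention_heads   # 32 -- what permute_lazy() now receives via params.n_head
print(old_guess, from_config)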
@@ -920,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")
 
-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
             1,  # file version
@@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head,  # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))
 
@@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of.fout.close()
 
     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)
 
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
 
-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)
 
     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)
 
     return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
 
-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
         sys.stderr.write(
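A quick usage note on the reworked default_outfile: the output name is now keyed off the GGMLFileType chosen by pick_output_type rather than a field on Params. For example (hypothetical paths), default_outfile([Path("models/open_llama_3b/pytorch_model.bin")], GGMLFileType.MostlyF16) would return Path("models/open_llama_3b/ggml-model-f16.bin"), since MostlyF16 maps to the name string "f16".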
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir)
+    params = Params.load(model_plus)
     model = model_plus.model
-    model = do_necessary_conversions(model)
+    model = do_necessary_conversions(model, params)
     output_type = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, output_type)
-    params = Params.guessed(model, output_type)
-    outfile = args.outfile or default_outfile(model_plus.paths, params)
-    OutputFile.write_all(outfile, params, model, vocab)
+    outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+    OutputFile.write_all(outfile, params, output_type, model, vocab)
     print(f"Wrote {outfile}")
 
 
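The ordering change in main() is the crux of the rework: Params.load now runs before do_necessary_conversions, because convert_transformers_to_orig needs params.n_head for the attention permutes, while the GGMLFileType is threaded separately through default_outfile and OutputFile.write_all instead of living on Params.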