Commit 750f051

Applied changes from upstream PR ggml-org#7075 (save memory with lazy evaluation); shameless copy from LlamaModel.
1 parent 9532ce3 commit 750f051
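
The memory win comes from the division of labour this diff adopts: instead of the subclass eagerly loading, converting, and writing every tensor inside its own write_tensors(), it now only describes per-tensor transformations in modify_tensors() and lets the shared base-class loop (reached via super().write_tensors()) drive the conversion one tensor at a time. The sketch below shows that split; it is an illustration of the pattern, not the upstream code, and the base-class name and loop details are assumptions inferred from how the diff uses them.

# A minimal sketch, NOT the upstream implementation: the base-class name and
# the shape of the writing loop are assumed from how the diff calls
# super().write_tensors() and overrides modify_tensors().
from __future__ import annotations

from typing import Iterable

from torch import Tensor


class Model:
    def get_tensors(self) -> Iterable[tuple[str, Tensor]]:
        # yields (name, tensor) pairs one at a time, so a tensor only has to
        # exist in memory while it is being transformed and written
        raise NotImplementedError

    def map_tensor_name(self, name: str) -> str:
        # maps a Hugging Face tensor name to its GGUF name (details omitted)
        raise NotImplementedError

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # default: pass the tensor through under its mapped name; subclasses
        # override this to permute q/k projections or buffer expert weights
        return [(self.map_tensor_name(name), data_torch)]

    def write_tensors(self) -> None:
        for name, data_torch in self.get_tensors():
            bid = None  # the real converter parses the block id out of `name`
            for new_name, tensor in self.modify_tensors(data_torch, name, bid):
                # dtype handling and gguf_writer.add_tensor(...) happen here,
                # once, for every architecture
                print(f"{new_name}, shape = {tuple(tensor.shape)}")

With that split, the only state the converted class keeps around is the per-block _experts buffer, which is drained as soon as a block's experts are complete.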

1 file changed: convert-hf-to-gguf.py (+46, -81 lines)
@@ -2370,104 +2370,69 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])

-    # Same as super class, but permuting q_proj, k_proj
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        n_experts = self.hparams.get("num_local_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.numpy()
-
-            if name.endswith("q_proj.weight"):
-                data = permute(data, n_head, n_head)
-            if name.endswith("k_proj.weight"):
-                data = permute(data, n_head, n_kv_head)
-
-            data = data.squeeze()
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))

-            # process the experts separately
-            if name.find("block_sparse_moe.experts") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for wid in range(1, 4):
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
+    _experts: list[dict[str, Tensor]] | None = None

-                            datas = []
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")

-                            data = np.stack(datas, axis=0)
-                            data_dtype = data.dtype
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

-                            if self.ftype == 0 and data_dtype == np.float16:
-                                data = data.astype(np.float32)
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]

-                            if self.ftype == 1 and data_dtype == np.float32:
-                                data = data.astype(np.float16)
+            assert bid is not None

-                            merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]

-                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
+            self._experts[bid][name] = data_torch

-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []

-                            self.gguf_writer.add_tensor(new_name, data)
-                continue
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []

-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]

-            n_dims = len(data.shape)
-            data_dtype = data.dtype
+                    data_torch = torch.stack(datas, dim=0)

-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"

-            # 1d tensors need to be converted to float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
+                    new_name = self.map_tensor_name(merged_name)

-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+        return [(self.map_tensor_name(name), data_torch)]

-            self.gguf_writer.add_tensor(new_name, data)
+    def write_tensors(self):
+        super().write_tensors()

-        if len(experts) > 0:
-            raise ValueError(f"Unprocessed experts: {experts.keys()}")
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")


 ###### CONVERSION LOGIC ######
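
The permute() staticmethod added above mirrors LlamaModel's helper: it regroups the rows of each attention head so that the two rotary halves end up interleaved, while leaving the tensor's shape unchanged. A small self-contained check follows; the permute body is copied from the diff, the head count and sizes are invented for the example.

# Toy check of permute(): shape-preserving, interleaves the rotary halves.
import torch


def permute(weights, n_head, n_head_kv):
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))


n_head, head_dim, hidden = 2, 4, 3
w = torch.arange(n_head * head_dim * hidden, dtype=torch.float32).reshape(n_head * head_dim, hidden)
out = permute(w, n_head, n_head)

assert out.shape == w.shape
# within each head, the row order changes from [0, 1, 2, 3] to [0, 2, 1, 3],
# i.e. the first and second rotary halves are interleaved
print(out[:, 0].tolist())  # [0.0, 6.0, 3.0, 9.0, 12.0, 18.0, 15.0, 21.0]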

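For the MoE weights, modify_tensors() returns nothing until all n_experts * 3 per-expert tensors of a block have been buffered, then emits one stacked 3D tensor per projection (w1, w2, w3) under the merged layers.{bid}.feed_forward.experts.{wid}.weight name. A toy illustration of the resulting shape; the sizes are invented, the stacking call matches the diff.

# One w1 weight per expert of a single block, as buffered in self._experts[bid].
import torch

n_experts, n_ff, n_embd = 4, 16, 8

buffered = {
    f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight": torch.randn(n_ff, n_embd)
    for xid in range(n_experts)
}

datas = [buffered[f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight"]
         for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)

print(merged.shape)  # torch.Size([4, 16, 8]): one 3D "experts" tensor per block
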