From d5e03e62ce30696f96b042bb223e587c1f9ab0f1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 19 Apr 2025 15:30:06 +0200 Subject: [PATCH 01/10] convert : experimental support for `--mmproj` flag --- convert_hf_to_gguf.py | 232 +++++++++++++++++++++++++++------ examples/llava/clip-impl.h | 3 - gguf-py/gguf/constants.py | 136 ++++++++++++++++++- gguf-py/gguf/tensor_mapping.py | 144 ++++++++++++++++++++ 4 files changed, 472 insertions(+), 43 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 89522dee8b8ad..599ca5b7b8d39 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -67,6 +67,11 @@ class Model: dir_model_card: Path remote_hf_model_id: str | None + # for vision encoders + mmproj: bool + ignore_vision: bool = False # subclasses may overwrite this + mtmd_model: MultimodalModel | None = None + # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -74,7 +79,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, + mmproj: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") @@ -109,6 +115,7 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + self.mmproj = mmproj # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: @@ -125,6 +132,28 @@ def get_remote_tensors() 
-> Iterator[tuple[str, Tensor]]: self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + # vision encoder + if mmproj: + vision_hparams = self.hparams.get("vision_config") + if vision_hparams is None: + raise ValueError("Vision config not found in model config") + elif self.ignore_vision: + raise ValueError("Vision config found, but mmproj conversion for this model is not supported yet") + else: + self.mtmd_model = MultimodalModel( + hparams=vision_hparams, + ftype=self.ftype, + fname_out=self.fname_out, + endianess=self.endianess, + use_temp_file=self.use_temp_file, + ) + + @classmethod + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) + @classmethod def __init_subclass__(cls): # can't use an abstract property, because overriding it without type errors @@ -272,8 +301,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(head_dim) self.gguf_writer.add_value_length(head_dim) - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") + if not self.mmproj: + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + else: + assert self.mtmd_model is not None + self.mtmd_model.set_gguf_parameters(n_embd_text=n_embd) + logger.info(f"mmproj: file type = {self.mtmd_model.ftype}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -311,6 +345,10 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): + # skip adding tensor if we're working with a vision model + if self.mmproj: + continue + # TODO: why do we squeeze 
here? # data = data_torch.squeeze().numpy() data = data_torch.numpy() @@ -455,12 +493,18 @@ def prepare_metadata(self, vocab_only: bool): self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) def write(self): - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() + if self.mtmd_model is not None: + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + logger.info("Writing vision model") + self.mtmd_model.write() + else: + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() def write_vocab(self): if len(self.gguf_writer.tensors) != 1: @@ -485,7 +529,10 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] @staticmethod def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) + hparams = json.load(f) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + return hparams @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: @@ -1024,6 +1071,101 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) +# for converting mmproj file +class MultimodalModel: + hparams: dict + dir_model: Path + ftype: gguf.LlamaFileType + fname_out: Path + tensor_map: gguf.TensorNameMap + gguf_writer: gguf.GGUFWriter + + def __init__(self, hparams: dict, ftype: gguf.LlamaFileType, fname_out: Path, endianess: gguf.GGUFEndian, use_temp_file: bool): + self.hparams = hparams + self.ftype = ftype + self.fname_out = fname_out + self.tensor_map = 
gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) + self.gguf_writer = gguf.GGUFWriter(path=None, + arch="clip", + endianess=endianess, + use_temp_file=use_temp_file) + + def set_gguf_parameters(self, n_embd_text: int): + """Function to be called by Model.set_gguf_parameters()""" + self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, n_embd_text) + self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) + + # vision config + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"])) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def get_quantization(self, mapped_name: str, data_torch: Tensor) -> gguf.GGMLQuantizationType: + is_1d = len(data_torch.shape) == 1 + is_embd = "_embd" in mapped_name + can_quantize = not is_1d and not is_embd + data_qtype = gguf.GGMLQuantizationType.F32 + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = 
gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unsupported file type: {self.ftype}") + return data_qtype + + def add_tensor(self, original_name: str, data_torch: Tensor) -> None: + """Function to be called inside Model.modify_tensors()""" + # name mapping + new_name = self.tensor_map.get_name(key=original_name, try_suffixes=(".weight", ".bias")) + if new_name is None: + raise ValueError(f"Can not map tensor {original_name!r}") + + # process data + # old_dtype = data_torch.dtype + data_qtype = self.get_quantization(new_name, data_torch) + data = data_torch.numpy() + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{new_name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + # reverse shape to make it similar to the internal ggml dimension order + # TODO: we don't print old_dtype because it's not correct, to be fixed later + old_dtype = "" + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + logger.info(f"{f'%-32s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + # add tensor + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def write(self): + """Function to be called by Model.write()""" + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): model_arch = gguf.MODEL_ARCH.GPTNEOX @@ -1781,20 +1923,13 @@ def prepare_tensors(self): @Model.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 - has_vision: bool = False undo_permute = False + ignore_vision = True # TODO @ngxson : avoid 
duplicate this code everywhere by at least support "text_config" # same with llama, but we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] @@ -1824,7 +1959,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name += ".weight" data_torch = data_torch.transpose(-1, -2) - if "multi_modal_projector" in name or "vision_model" in name: + if "multi_modal_projector" in name or "mtmd_model" in name: return [] return super().modify_tensors(data_torch, name, bid) @@ -3474,24 +3609,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") class Gemma3Model(Model): model_arch = gguf.MODEL_ARCH.GEMMA3 - has_vision: bool = False - - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True def write(self): super().write() - if self.has_vision: - logger.info("NOTE: this script only convert the language model to GGUF") - 
logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py") def set_vocab(self): self._set_vocab_sentencepiece() @@ -3524,15 +3644,42 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + if self.mtmd_model is not None: + self.mtmd_model.set_gguf_parameters(n_embd_text=hparams["hidden_size"]) + vgguf = self.mtmd_model.gguf_writer + vgguf.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") + # default values below are taken from HF tranformers code + vgguf.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, self.mtmd_model.hparams.get("layer_norm_eps", 1e-6)) + vgguf.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) + vgguf.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) + vgguf.add_bool (gguf.Keys.ClipVision.USE_GELU, True) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused if name.startswith("language_model."): name = name.replace("language_model.", "") + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ - or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later - # ignore vision tensors - return [] + or name.startswith("multimodal_projector.") or name.startswith("mtmd_model."): + if self.mmproj: + assert self.mtmd_model is not None + # process vision tensors + name = name.replace("_weight", ".weight") + if "fc1" in name: + name = name.replace("fc1", "fc2") + else: + name = name.replace("fc2", "fc1") + + # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + 
data_torch = data_torch + 1 + + self.mtmd_model.add_tensor(name, data_torch) + return [] # vision tensor already handled # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -5554,6 +5701,10 @@ def parse_args() -> argparse.Namespace: "--remote", action="store_true", help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", ) + parser.add_argument( + "--mmproj", action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -5633,6 +5784,10 @@ def main() -> None: hparams = Model.load_hparams(dir_model) + if args.mmproj: + if "mmproj" not in fname_out.name: + fname_out = Model.add_prefix_to_filename(fname_out, "mmproj-") + with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] @@ -5649,7 +5804,8 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=str(args.model) if args.remote else None) + remote_hf_model_id=str(args.model) if args.remote else None, + mmproj=args.mmproj) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 4d7340a56bd0c..180ae9880b124 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -50,7 +50,6 @@ // tensor name constants // -#define TN_TOKEN_EMBD 
"%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat @@ -66,8 +65,6 @@ #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8fcde2626aa7c..70dd13fb731a3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -218,17 +218,36 @@ class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class ClipVision: + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + IMAGE_SIZE = "clip.vision.image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" + FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = "clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + USE_GELU = "clip.use_gelu" + class Attention: + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + # # recommended mapping of model tensor names for storage in gguf # class GGUFType: - MODEL = "model" - ADAPTER = "adapter" + MODEL = "model" + ADAPTER = "adapter" + CLIP_VISION = "clip-vision" class MODEL_ARCH(IntEnum): + CLIP_VISION = auto() # dummy arch for clip.cpp LLAMA = auto() LLAMA4 = auto() DECI = auto() @@ -296,6 +315,14 @@ class MODEL_ARCH(IntEnum): PLM = auto() BAILINGMOE = auto() +class VISION_PROJECTOR_TYPE(IntEnum): + MLP = auto() + LDP = auto() + LDPV2 = auto() + 
RESAMPLER = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3 = auto() class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() @@ -436,9 +463,41 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_DOWN = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.CLIP_VISION: "clip_vision", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", @@ -507,6 +566,16 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BAILINGMOE: "bailingmoe", } +VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", +} + TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", @@ -646,9 +715,72 @@ class MODEL_TENSOR(IntEnum): 
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.CLIP_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + 
MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bc75cf513a9f..22066b2868019 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -886,6 +886,150 @@ class TensorNameMap: MODEL_TENSOR.POSNET_ATTN_OUT: ( "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + + ############################################################################# + ## Vision encoder + + MODEL_TENSOR.V_MMPROJ: ( + "multi_modal_projector.linear_{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_FC: ( + "model.connector.modality_projection.proj", # SmolVLM + ), + + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_PEG: ( + "model.mm_projector.peg.peg.{bid}", + ), + + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM + 
), + + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_V: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM + ), + + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + ), + + MODEL_TENSOR.V_POST_NORM: ( + "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM + ), + + MODEL_TENSOR.V_MM_INP_PROJ: ( + 
"multi_modal_projector.mm_input_projection", + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "multi_modal_projector.mm_soft_emb_norm", + ), + + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( + "resampler.pos_embed_k", + ), + + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( + "resampler.attn.out_proj", + ), + + MODEL_TENSOR.V_RESMPL_KV: ( + "resampler.kv_proj", + ), + + MODEL_TENSOR.V_RESMPL_POST_NORM: ( + "resampler.ln_post", + ), + + MODEL_TENSOR.V_RESMPL_KV_NORM: ( + "resampler.ln_kv", + ), + + MODEL_TENSOR.V_RESMPL_Q_NORM: ( + "resampler.ln_q", + ), + + MODEL_TENSOR.V_RESMPL_PROJ: ( + "resampler.proj", + ), + + MODEL_TENSOR.V_RESMPL_QUERY: ( + "resampler.query", + ), } # architecture-specific block mappings From d59a7bb306d4616cfd4610081a8715e175d7fe70 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 19 Apr 2025 15:46:46 +0200 Subject: [PATCH 02/10] fix bad ctrl+f replace --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 599ca5b7b8d39..3ddcbebc7efe4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1959,7 +1959,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name += ".weight" data_torch = data_torch.transpose(-1, -2) - if "multi_modal_projector" in name or "mtmd_model" in name: + if "multi_modal_projector" in name or "vision_model" in name: return [] return super().modify_tensors(data_torch, name, bid) @@ -3661,7 +3661,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("language_model.", "") elif 
name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ - or name.startswith("multimodal_projector.") or name.startswith("mtmd_model."): + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): if self.mmproj: assert self.mtmd_model is not None # process vision tensors From 55651ad8b2ea062fad793faad9cac3b02aa6106c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 19 Apr 2025 15:58:21 +0200 Subject: [PATCH 03/10] fix style --- gguf-py/gguf/constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 70dd13fb731a3..7eb53695ebd17 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -231,6 +231,7 @@ class ClipVision: IMAGE_MEAN = "clip.vision.image_mean" IMAGE_STD = "clip.vision.image_std" USE_GELU = "clip.use_gelu" + class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" @@ -315,6 +316,7 @@ class MODEL_ARCH(IntEnum): PLM = auto() BAILINGMOE = auto() + class VISION_PROJECTOR_TYPE(IntEnum): MLP = auto() LDP = auto() @@ -324,6 +326,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): MERGER = auto() GEMMA3 = auto() + class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() From ddd7920e3e89075d0a5cf72520e233ac885320a1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 20 Apr 2025 11:37:07 +0200 Subject: [PATCH 04/10] split into subclasses TextModel and VisionModel --- convert_hf_to_gguf.py | 550 +++++++++++++++++++----------------------- 1 file changed, 248 insertions(+), 302 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3ddcbebc7efe4..9326301badb04 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -42,11 +42,19 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 +class ModelType(IntEnum): + TEXT = 1 + VISION = 2 + + AnyModel = TypeVar("AnyModel", bound="type[Model]") class Model: - _model_classes: 
dict[str, type[Model]] = {} + _model_classes: dict[ModelType, dict[str, type[Model]]] = { + ModelType.TEXT: {}, + ModelType.VISION: {}, + } dir_model: Path ftype: gguf.LlamaFileType @@ -67,11 +75,6 @@ class Model: dir_model_card: Path remote_hf_model_id: str | None - # for vision encoders - mmproj: bool - ignore_vision: bool = False # subclasses may overwrite this - mtmd_model: MultimodalModel | None = None - # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -79,8 +82,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, - mmproj: bool = False): + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") @@ -115,7 +117,6 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - self.mmproj = mmproj # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: @@ -132,35 +133,12 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) - # vision encoder - if mmproj: - vision_hparams = self.hparams.get("vision_config") - if vision_hparams is None: - raise 
ValueError("Vision config not found in model config") - elif self.ignore_vision: - raise ValueError("Vision config found, but mmproj conversion for this model is not supported yet") - else: - self.mtmd_model = MultimodalModel( - hparams=vision_hparams, - ftype=self.ftype, - fname_out=self.fname_out, - endianess=self.endianess, - use_temp_file=self.use_temp_file, - ) - @classmethod def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: stem, suffix = path.stem, path.suffix new_name = f"{prefix}{stem}{suffix}" return path.with_name(new_name) - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) if key is not None: @@ -169,9 +147,6 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") - def set_vocab(self): - self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() @@ -259,55 +234,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], 
optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - if not self.mmproj: - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - else: - assert self.mtmd_model is not None - self.mtmd_model.set_gguf_parameters(n_embd_text=n_embd) - logger.info(f"mmproj: file type = {self.mtmd_model.ftype}") + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: 
Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -345,10 +272,6 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - # skip adding tensor if we're working with a vision model - if self.mmproj: - continue - # TODO: why do we squeeze here? # data = data_torch.squeeze().numpy() data = data_torch.numpy() @@ -457,27 +380,6 @@ def prepare_metadata(self, vocab_only: bool): if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - if not vocab_only: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) - else: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - self.set_type() logger.info("Set meta model") @@ -486,33 +388,18 @@ def prepare_metadata(self, vocab_only: bool): logger.info("Set model 
parameters") self.set_gguf_parameters() - logger.info("Set model tokenizer") - self.set_vocab() - logger.info("Set model quantization version") self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - def write(self): - if self.mtmd_model is not None: - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - logger.info("Writing vision model") - self.mtmd_model.write() - else: - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - def write_vocab(self): - if len(self.gguf_writer.tensors) != 1: - raise ValueError('Splitting the vocabulary is not supported') + raise NotImplementedError("write_vocab() must be implemented in subclasses") - self.prepare_metadata(vocab_only=True) + def write(self): + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() @staticmethod @@ -539,23 +426,121 @@ def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT for name in names: - cls._model_classes[name] = modelcls + cls._model_classes[model_type][name] = modelcls return modelcls return func @classmethod def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: + def from_model_architecture(cls, arch: str, 
model_type = ModelType.TEXT) -> type[Model]: try: - return cls._model_classes[arch] + return cls._model_classes[model_type][arch] except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + +class TextModel(Model): + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, 
output_type) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + 
logger.info(f"gguf: experts used count = {n_experts_used}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): token_text = token.decode(encoding="utf-8") @@ -1071,30 +1056,33 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -# for converting mmproj file -class MultimodalModel: - hparams: dict - dir_model: Path - ftype: gguf.LlamaFileType - fname_out: Path - tensor_map: gguf.TensorNameMap - gguf_writer: gguf.GGUFWriter +class VisionModel(Model): + model_arch = gguf.MODEL_ARCH.CLIP_VISION + n_text_embd = 0 - def __init__(self, hparams: dict, ftype: gguf.LlamaFileType, fname_out: Path, endianess: gguf.GGUFEndian, use_temp_file: bool): - self.hparams = hparams - self.ftype = ftype - self.fname_out = fname_out + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: + raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") + + # small hack to correct the number of layers self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) - self.gguf_writer = gguf.GGUFWriter(path=None, - arch="clip", - endianess=endianess, - use_temp_file=use_temp_file) + self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"]) + assert self.n_embd_text > 0, 
"n_embd not found in hparams" - def set_gguf_parameters(self, n_embd_text: int): - """Function to be called by Model.set_gguf_parameters()""" + if "vision_config" not in self.hparams: + raise ValueError("vision_config not found in hparams") + # move vision config to the top level + self.hparams = self.hparams["vision_config"] + + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + + def set_gguf_parameters(self): + """Function to be called by Model.set_gguf_parameters()""" self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, n_embd_text) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text) self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) # vision config @@ -1105,69 +1093,12 @@ def set_gguf_parameters(self, n_embd_text: int): self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"])) self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"])) - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in self.hparams), None) - if key is not None: - return self.hparams[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def get_quantization(self, mapped_name: str, data_torch: Tensor) -> gguf.GGMLQuantizationType: - is_1d = len(data_torch.shape) == 1 - is_embd = "_embd" in mapped_name - can_quantize = not is_1d and not is_embd - data_qtype = gguf.GGMLQuantizationType.F32 - if can_quantize: - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = 
gguf.GGMLQuantizationType.Q8_0 - else: - raise ValueError(f"Unsupported file type: {self.ftype}") - return data_qtype - - def add_tensor(self, original_name: str, data_torch: Tensor) -> None: - """Function to be called inside Model.modify_tensors()""" - # name mapping - new_name = self.tensor_map.get_name(key=original_name, try_suffixes=(".weight", ".bias")) - if new_name is None: - raise ValueError(f"Can not map tensor {original_name!r}") - - # process data - # old_dtype = data_torch.dtype - data_qtype = self.get_quantization(new_name, data_torch) - data = data_torch.numpy() - try: - data = gguf.quants.quantize(data, data_qtype) - except Exception as e: - logger.error(f"Error quantizing tensor '{new_name}': {e}, fallback to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - # reverse shape to make it similar to the internal ggml dimension order - # TODO: we don't print old_dtype because it's not correct, to be fixed later - old_dtype = "" - shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" - logger.info(f"{f'%-32s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - # add tensor - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def write(self): - """Function to be called by Model.write()""" - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() + def write_vocab(self): + raise ValueError("VisionModel does not support vocab writing") @Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): +class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX def set_gguf_parameters(self): @@ -1224,7 +1155,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("BloomForCausalLM", "BloomModel") -class BloomModel(Model): +class BloomModel(TextModel): model_arch 
= gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): @@ -1281,7 +1212,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("MPTForCausalLM") -class MPTModel(Model): +class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): @@ -1325,7 +1256,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("OrionForCausalLM") -class OrionModel(Model): +class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION def set_vocab(self): @@ -1360,7 +1291,7 @@ def set_gguf_parameters(self): @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): +class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN def set_vocab(self): @@ -1440,7 +1371,7 @@ def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: @Model.register("XverseForCausalLM") -class XverseModel(Model): +class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE def set_vocab(self): @@ -1547,7 +1478,7 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non @Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): +class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON def set_gguf_parameters(self): @@ -1601,7 +1532,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): +class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER def set_gguf_parameters(self): @@ -1618,7 +1549,7 @@ def set_gguf_parameters(self): @Model.register("GPTRefactForCausalLM") -class RefactModel(Model): +class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT def set_vocab(self): @@ -1682,7 +1613,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") 
-class StableLMModel(Model): +class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM def set_vocab(self): @@ -1772,7 +1703,7 @@ def prepare_tensors(self): @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): +class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1984,7 +1915,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): @Model.register("DeciLMForCausalLM") -class DeciModel(Model): +class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @staticmethod @@ -2156,7 +2087,7 @@ def prepare_tensors(self): @Model.register("BitnetForCausalLM") -class BitnetModel(Model): +class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET def set_vocab(self): @@ -2197,7 +2128,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("GrokForCausalLM") -class GrokModel(Model): +class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): @@ -2250,7 +2181,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("DbrxForCausalLM") -class DbrxModel(Model): +class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): @@ -2319,7 +2250,7 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: @Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): +class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): @@ -2374,7 +2305,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): +class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 def set_gguf_parameters(self): @@ -2427,7 +2358,7 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non @Model.register("QWenLMHeadModel") 
-class QwenModel(Model): +class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @staticmethod @@ -2469,7 +2400,7 @@ def set_gguf_parameters(self): @Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): +class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 def set_vocab(self): @@ -2488,7 +2419,7 @@ def set_gguf_parameters(self): @Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLModel(Model): +class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL def set_gguf_parameters(self): @@ -2511,7 +2442,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: @Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): +class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -2549,7 +2480,7 @@ def set_gguf_parameters(self): @Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): +class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE def set_gguf_parameters(self): @@ -2622,7 +2553,7 @@ class Qwen3MoeModel(Qwen2MoeModel): @Model.register("GPT2LMHeadModel") -class GPT2Model(Model): +class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): @@ -2654,7 +2585,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("PhiForCausalLM") -class Phi2Model(Model): +class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): @@ -2678,7 +2609,7 @@ def set_gguf_parameters(self): @Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): +class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): @@ -2913,7 +2844,7 @@ def prepare_tensors(self): @Model.register("PlamoForCausalLM") -class PlamoModel(Model): +class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO def 
set_vocab(self): @@ -2961,7 +2892,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): +class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL def set_gguf_parameters(self): @@ -3002,7 +2933,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): +class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): @@ -3175,7 +3106,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): +class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): @@ -3235,7 +3166,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): +class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): @@ -3509,7 +3440,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("GemmaForCausalLM") -class GemmaModel(Model): +class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA def set_vocab(self): @@ -3560,7 +3491,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): +class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): @@ -3607,12 +3538,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") -class Gemma3Model(Model): +class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 - def write(self): - super().write() - def set_vocab(self): self._set_vocab_sentencepiece() @@ -3644,16 
+3572,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - if self.mtmd_model is not None: - self.mtmd_model.set_gguf_parameters(n_embd_text=hparams["hidden_size"]) - vgguf = self.mtmd_model.gguf_writer - vgguf.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") - # default values below are taken from HF tranformers code - vgguf.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, self.mtmd_model.hparams.get("layer_norm_eps", 1e-6)) - vgguf.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) - vgguf.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) - vgguf.add_bool (gguf.Keys.ClipVision.USE_GELU, True) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3662,24 +3580,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): - if self.mmproj: - assert self.mtmd_model is not None - # process vision tensors - name = name.replace("_weight", ".weight") - if "fc1" in name: - name = name.replace("fc1", "fc2") - else: - name = name.replace("fc2", "fc1") - - # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector - # the other norm values are part of SigLIP model, and they are already correct - # ref code: Gemma3RMSNorm - if "soft_emb_norm.weight" in name: - logger.info(f"Correcting norm value for '{name}'") - data_torch = data_torch + 1 - - self.mtmd_model.add_tensor(name, data_torch) - return [] # vision tensor already handled + return [] # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -3695,13 +3596,58 @@ def modify_tensors(self, data_torch: 
Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(VisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") + # default values below are taken from HF tranformers code + self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) + self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) + self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + # process vision tensors + name = name.replace("_weight", ".weight") + if "fc1" in name: + name = name.replace("fc1", "fc2") + else: + name = name.replace("fc2", "fc1") + + # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + @Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): +class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 @Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): +class Rwkv6Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): @@ -3828,7 +3774,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(Model): +class Rwkv7Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV7 def set_vocab(self): @@ -3990,7 +3936,7 @@ def set_gguf_parameters(self): @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): +class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): @@ -4068,7 +4014,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("CohereForCausalLM") -class CommandR2Model(Model): +class CommandR2Model(TextModel): model_arch = 
gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): @@ -4086,7 +4032,7 @@ def set_gguf_parameters(self): @Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): +class Cohere2Model(TextModel): model_arch = gguf.MODEL_ARCH.COHERE2 def set_gguf_parameters(self): @@ -4105,7 +4051,7 @@ def set_gguf_parameters(self): @Model.register("OlmoForCausalLM") @Model.register("OLMoForCausalLM") -class OlmoModel(Model): +class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO def set_gguf_parameters(self): @@ -4132,12 +4078,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): +class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 @Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): +class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE def set_gguf_parameters(self): @@ -4244,7 +4190,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): +class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @staticmethod @@ -4319,7 +4265,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("ArcticForCausalLM") -class ArcticModel(Model): +class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): @@ -4470,7 +4416,7 @@ def prepare_tensors(self): @Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): +class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK def set_vocab(self): @@ -4562,7 +4508,7 @@ def prepare_tensors(self): @Model.register("DeepseekV2ForCausalLM") @Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): +class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): @@ -4689,7 +4635,7 @@ def prepare_tensors(self): @Model.register("PLMForCausalLM") -class PLMModel(Model): 
+class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM def set_vocab(self): @@ -4715,7 +4661,7 @@ def prepare_tensors(self): @Model.register("T5ForConditionalGeneration") @Model.register("MT5ForConditionalGeneration") @Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): +class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 def __init__(self, *args, **kwargs): @@ -4855,7 +4801,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("T5EncoderModel") -class T5EncoderModel(Model): +class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER def __init__(self, *args, **kwargs): @@ -4994,7 +4940,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("JAISLMHeadModel") -class JaisModel(Model): +class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS def __init__(self, *args, **kwargs): @@ -5077,7 +5023,7 @@ def prepare_tensors(self): @Model.register("Glm4ForCausalLM") -class Glm4Model(Model): +class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 def set_vocab(self): @@ -5093,7 +5039,7 @@ def set_gguf_parameters(self): @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): +class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM def set_vocab_chatglm3(self): @@ -5248,7 +5194,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("NemotronForCausalLM") -class NemotronModel(Model): +class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): @@ -5289,7 +5235,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): +class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): @@ -5416,7 +5362,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: 
int | None) -> Iter @Model.register("BailingMoeForCausalLM") -class BailingMoeModel(Model): +class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE def set_vocab(self): @@ -5516,7 +5462,7 @@ def prepare_tensors(self): @Model.register("ChameleonForConditionalGeneration") @Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): +class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON def set_gguf_parameters(self): @@ -5791,8 +5737,9 @@ def main() -> None: with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] + model_type = ModelType.VISION if args.mmproj else ModelType.TEXT try: - model_class = Model.from_model_architecture(model_architecture) + model_class = Model.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) @@ -5804,8 +5751,7 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=str(args.model) if args.remote else None, - mmproj=args.mmproj) + remote_hf_model_id=str(args.model) if args.remote else None) if args.vocab_only: logger.info("Exporting model vocab...") From 93b5f71ed041cf49e4e9bdd22a4b0757a5d06740 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 20 Apr 2025 19:02:32 +0200 Subject: [PATCH 05/10] rename Model --> ModelBase --- convert_hf_to_gguf.py | 183 ++++++++++++++++++++-------------------- convert_lora_to_gguf.py | 6 +- 2 files changed, 95 insertions(+), 94 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9326301badb04..a627e99c8875c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -47,11 +47,11 @@ class ModelType(IntEnum): VISION = 2 -AnyModel = TypeVar("AnyModel", bound="type[Model]") +AnyModel = 
TypeVar("AnyModel", bound="type[ModelBase]") -class Model: - _model_classes: dict[ModelType, dict[str, type[Model]]] = { +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { ModelType.TEXT: {}, ModelType.VISION: {}, } @@ -83,7 +83,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): - if type(self) is Model: + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is VisionModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -106,11 +108,11 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.get_tensors = get_remote_tensors else: - self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") + self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams + self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -447,7 +449,7 @@ def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type raise NotImplementedError(f'Architecture {arch!r} not supported!') from None -class TextModel(Model): +class TextModel(ModelBase): 
@classmethod def __init_subclass__(cls): # can't use an abstract property, because overriding it without type errors @@ -1056,7 +1058,7 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -class VisionModel(Model): +class VisionModel(ModelBase): model_arch = gguf.MODEL_ARCH.CLIP_VISION n_text_embd = 0 @@ -1080,7 +1082,6 @@ def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) def set_gguf_parameters(self): - """Function to be called by Model.set_gguf_parameters()""" self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text) self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) @@ -1097,7 +1098,7 @@ def write_vocab(self): raise ValueError("VisionModel does not support vocab writing") -@Model.register("GPTNeoXForCausalLM") +@ModelBase.register("GPTNeoXForCausalLM") class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX @@ -1154,7 +1155,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("BloomForCausalLM", "BloomModel") +@ModelBase.register("BloomForCausalLM", "BloomModel") class BloomModel(TextModel): model_arch = gguf.MODEL_ARCH.BLOOM @@ -1211,7 +1212,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("MPTForCausalLM") +@ModelBase.register("MPTForCausalLM") class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT @@ -1255,7 +1256,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("OrionForCausalLM") +@ModelBase.register("OrionForCausalLM") class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION @@ -1290,7 +1291,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) 
-@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN @@ -1370,7 +1371,7 @@ def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: return weights[r * n_part:r * n_part + r, ...] -@Model.register("XverseForCausalLM") +@ModelBase.register("XverseForCausalLM") class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE @@ -1477,7 +1478,7 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non ) -@Model.register("FalconForCausalLM", "RWForCausalLM") +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON @@ -1531,7 +1532,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("GPTBigCodeForCausalLM") +@ModelBase.register("GPTBigCodeForCausalLM") class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER @@ -1548,7 +1549,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("GPTRefactForCausalLM") +@ModelBase.register("GPTRefactForCausalLM") class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT @@ -1612,7 +1613,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM @@ -1702,7 +1703,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", 
"MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1851,7 +1852,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Llama4ForConditionalGeneration") +@ModelBase.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 undo_permute = False @@ -1895,13 +1896,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("Mistral3ForConditionalGeneration") +@ModelBase.register("Mistral3ForConditionalGeneration") class Mistral3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA # we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) + hparams = kwargs["hparams"] if "hparams" in kwargs else ModelBase.load_hparams(args[0]) if "text_config" in hparams: hparams = {**hparams, **hparams["text_config"]} kwargs["hparams"] = hparams @@ -1914,7 +1915,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("DeciLMForCausalLM") +@ModelBase.register("DeciLMForCausalLM") class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @@ -2086,7 +2087,7 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("BitnetForCausalLM") +@ModelBase.register("BitnetForCausalLM") class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET @@ -2127,7 +2128,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("GrokForCausalLM") +@ModelBase.register("GrokForCausalLM") class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK @@ -2180,7 +2181,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, 
bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("DbrxForCausalLM") +@ModelBase.register("DbrxForCausalLM") class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX @@ -2249,7 +2250,7 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: return n_dims > 1 -@Model.register("MiniCPMForCausalLM") +@ModelBase.register("MiniCPMForCausalLM") class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM @@ -2304,7 +2305,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("MiniCPM3ForCausalLM") +@ModelBase.register("MiniCPM3ForCausalLM") class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 @@ -2357,7 +2358,7 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non ) -@Model.register("QWenLMHeadModel") +@ModelBase.register("QWenLMHeadModel") class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @@ -2399,7 +2400,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("Qwen2ForCausalLM") +@ModelBase.register("Qwen2ForCausalLM") class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 @@ -2418,7 +2419,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL @@ -2441,7 +2442,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data -@Model.register("WavTokenizerDec") +@ModelBase.register("WavTokenizerDec") class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC @@ -2479,7 +2480,7 @@ def set_gguf_parameters(self): 
self.gguf_writer.add_causal_attention(False) -@Model.register("Qwen2MoeForCausalLM") +@ModelBase.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE @@ -2542,17 +2543,17 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Qwen3ForCausalLM") +@ModelBase.register("Qwen3ForCausalLM") class Qwen3Model(Qwen2Model): model_arch = gguf.MODEL_ARCH.QWEN3 -@Model.register("Qwen3MoeForCausalLM") +@ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3MOE -@Model.register("GPT2LMHeadModel") +@ModelBase.register("GPT2LMHeadModel") class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 @@ -2584,7 +2585,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("PhiForCausalLM") +@ModelBase.register("PhiForCausalLM") class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 @@ -2608,7 +2609,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") +@ModelBase.register("Phi3ForCausalLM") class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 @@ -2786,7 +2787,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) -@Model.register("PhiMoEForCausalLM") +@ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): model_arch = gguf.MODEL_ARCH.PHIMOE @@ -2843,7 +2844,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PlamoForCausalLM") +@ModelBase.register("PlamoForCausalLM") class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO @@ -2891,7 +2892,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] 
-@Model.register("CodeShellForCausalLM") +@ModelBase.register("CodeShellForCausalLM") class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL @@ -2932,7 +2933,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("InternLM2ForCausalLM") +@ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 @@ -3105,7 +3106,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") +@ModelBase.register("InternLM3ForCausalLM") class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA @@ -3165,7 +3166,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3253,7 +3254,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("RobertaModel") +@ModelBase.register("RobertaModel") class RobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3298,7 +3299,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("NomicBertModel") +@ModelBase.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -3328,7 +3329,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class 
XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3439,7 +3440,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("GemmaForCausalLM") +@ModelBase.register("GemmaForCausalLM") class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA @@ -3490,7 +3491,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") +@ModelBase.register("Gemma2ForCausalLM") class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 @@ -3537,7 +3538,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 @@ -3596,7 +3597,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma3ForConditionalGeneration") +@ModelBase.register("Gemma3ForConditionalGeneration") class Gemma3VisionModel(VisionModel): def set_gguf_parameters(self): super().set_gguf_parameters() @@ -3641,12 +3642,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors -@Model.register("Starcoder2ForCausalLM") +@ModelBase.register("Starcoder2ForCausalLM") class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("Rwkv6ForCausalLM") +@ModelBase.register("Rwkv6ForCausalLM") class Rwkv6Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV6 @@ -3719,7 +3720,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("RWKV6Qwen2ForCausalLM") 
+@ModelBase.register("RWKV6Qwen2ForCausalLM") class RWKV6Qwen2Model(Rwkv6Model): model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 @@ -3773,7 +3774,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data) -@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") class Rwkv7Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV7 @@ -3892,7 +3893,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("RwkvHybridForCausalLM") +@ModelBase.register("RwkvHybridForCausalLM") class ARwkv7Model(Rwkv7Model): model_arch = gguf.MODEL_ARCH.ARWKV7 @@ -3935,7 +3936,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(0) -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA @@ -4013,7 +4014,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("CohereForCausalLM") +@ModelBase.register("CohereForCausalLM") class CommandR2Model(TextModel): model_arch = gguf.MODEL_ARCH.COMMAND_R @@ -4031,7 +4032,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") +@ModelBase.register("Cohere2ForCausalLM") class Cohere2Model(TextModel): model_arch = gguf.MODEL_ARCH.COHERE2 @@ -4049,8 +4050,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO @@ -4077,12 +4078,12 @@ def modify_tensors(self, data_torch: Tensor, 
name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo2ForCausalLM") class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 -@Model.register("OlmoeForCausalLM") +@ModelBase.register("OlmoeForCausalLM") class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE @@ -4142,7 +4143,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JinaBertModel", "JinaBertForMaskedLM") +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -4189,7 +4190,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("OpenELMForCausalLM") +@ModelBase.register("OpenELMForCausalLM") class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @@ -4264,7 +4265,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (self.map_tensor_name(name), data_torch) -@Model.register("ArcticForCausalLM") +@ModelBase.register("ArcticForCausalLM") class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC @@ -4415,7 +4416,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") +@ModelBase.register("DeepseekForCausalLM") class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK @@ -4506,8 +4507,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") +@ModelBase.register("DeepseekV2ForCausalLM") +@ModelBase.register("DeepseekV3ForCausalLM") class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -4634,7 +4635,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") 
-@Model.register("PLMForCausalLM") +@ModelBase.register("PLMForCausalLM") class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM @@ -4657,10 +4658,10 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 @@ -4800,7 +4801,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") +@ModelBase.register("T5EncoderModel") class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER @@ -4939,7 +4940,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("JAISLMHeadModel") +@ModelBase.register("JAISLMHeadModel") class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS @@ -5022,7 +5023,7 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("Glm4ForCausalLM") +@ModelBase.register("Glm4ForCausalLM") class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 @@ -5038,7 +5039,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM @@ -5193,7 +5194,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter 
return [(self.map_tensor_name(name), data_torch)] -@Model.register("NemotronForCausalLM") +@ModelBase.register("NemotronForCausalLM") class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON @@ -5234,7 +5235,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("ExaoneForCausalLM") +@ModelBase.register("ExaoneForCausalLM") class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE @@ -5303,7 +5304,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) -@Model.register("GraniteForCausalLM") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -5337,7 +5338,7 @@ def set_gguf_parameters(self): logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("GraniteMoeForCausalLM") +@ModelBase.register("GraniteMoeForCausalLM") class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE_MOE @@ -5361,7 +5362,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("BailingMoeForCausalLM") +@ModelBase.register("BailingMoeForCausalLM") class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE @@ -5460,8 +5461,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON @@ -5681,7 +5682,7 @@ def main() -> None: if 
args.print_supported_models: logger.error("Supported models:") - Model.print_registered_models() + ModelBase.print_registered_models() sys.exit(0) if args.verbose: @@ -5728,18 +5729,18 @@ def main() -> None: logger.info(f"Loading model: {dir_model.name}") - hparams = Model.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model) if args.mmproj: if "mmproj" not in fname_out.name: - fname_out = Model.add_prefix_to_filename(fname_out, "mmproj-") + fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] model_type = ModelType.VISION if args.mmproj else ModelType.TEXT try: - model_class = Model.from_model_architecture(model_architecture, model_type=model_type) + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bdc991533b4e0..00a6733cbd360 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -24,7 +24,7 @@ import gguf # reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import LazyTorchTensor, Model +from convert_hf_to_gguf import LazyTorchTensor, ModelBase logger = logging.getLogger("lora-to-gguf") @@ -340,11 +340,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model) with torch.inference_mode(): try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) From 
43c2e7572c569ee85a200709cd6624a04ca2673a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 20 Apr 2025 19:05:43 +0200 Subject: [PATCH 06/10] small fix --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a627e99c8875c..8346f9bc09a11 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -442,7 +442,7 @@ def print_registered_models(cls): logger.error(f" - {name}") @classmethod - def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[Model]: + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: try: return cls._model_classes[model_type][arch] except KeyError: From ad186f43d72bbb91a4805f51769fa934c85127f4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 20 Apr 2025 19:08:52 +0200 Subject: [PATCH 07/10] correct CLIP_VISION arch name (because existing GGUF already use it) --- gguf-py/gguf/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7eb53695ebd17..3f24705201d93 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -500,7 +500,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.CLIP_VISION: "clip_vision", # dummy arch for clip.cpp + MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", From 0822e156034796940b0b4a5da30ed529d93d3e46 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 20 Apr 2025 20:46:12 +0200 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: compilade --- convert_hf_to_gguf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8346f9bc09a11..17292487bd173 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1856,10 +1856,7 @@ def 
prepare_tensors(self): class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 undo_permute = False - ignore_vision = True - # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config" - # same with llama, but we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this From e37dec65fee8aa50e0fccea88fdc54046f65ecaf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 20 Apr 2025 20:49:52 +0200 Subject: [PATCH 09/10] fix Mistral3Model --- convert_hf_to_gguf.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 17292487bd173..e18a821eb7f89 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1897,14 +1897,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): class Mistral3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else ModelBase.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("language_model.", "") if "multi_modal_projector" in name or "vision_tower" in name: From 1f71edb1c73ebef3900af750cc0bb6f02213f404 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 20 Apr 2025 21:37:21 +0200 Subject: [PATCH 10/10] fix typo Co-authored-by: compilade --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e18a821eb7f89..6d34541a3cecc 100755 --- a/convert_hf_to_gguf.py +++ 
b/convert_hf_to_gguf.py @@ -3619,7 +3619,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: name = name.replace("fc2", "fc1") - # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector # the other norm values are part of SigLIP model, and they are already correct # ref code: Gemma3RMSNorm if "soft_emb_norm.weight" in name: