From d6569b74d50bd351c20f13d515b8503709a91f01 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 7 Nov 2024 22:20:51 +0000 Subject: [PATCH 1/8] WIP --- vllm/model_executor/models/llama.py | 19 +++++++++++++++---- vllm/transformers_utils/config.py | 13 ++++++++++--- vllm/transformers_utils/tokenizers/mistral.py | 2 +- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index ad5cfcc4402..52dcc4cf3e9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -483,6 +483,9 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): mistral_mapping = { "layers": "model.layers", "attention": "self_attn", + "qscale_act": "input_scale", + "qscale_weight": "weight_scale", + "kv_fake_quantizer.qscale_act": "kv_scale", "wq": "q_proj", "wk": "k_proj", "wv": "v_proj", @@ -603,15 +606,23 @@ def permute(w: torch.Tensor, n_heads: int): modules = name.split(".") # rotary embeds should be sliced - if "wk" in modules: + if "wk" in modules and modules[-1] == "weight": loaded_weight = permute(loaded_weight, self.config.num_key_value_heads) - elif "wq" in modules: + elif "wq" in modules and modules[-1] == "weight": loaded_weight = permute(loaded_weight, self.config.num_attention_heads) - for item in modules: - if item in mapping and mapping[item] not in name: + num_modules = len(modules) + for i in range(num_modules): + item = modules[i] + next_item = modules[i + 1] if i < num_modules - 1 else None + + combined_item = f"{item}.{next_item}" if next_item is not None else None + + if combined_item in mapping: + name = name.replace(combined_item, mapping[combined_item]) + elif item in mapping and mapping[item] not in name: name = name.replace(item, mapping[item]) return name, loaded_weight diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index b33449c42ec..fa419da6ed5 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -218,12 +218,12 @@ def load_params_config(model, revision) -> PretrainedConfig: "hidden_dim": "intermediate_size", } - def recurse_elems(elem: Any): - if isinstance(elem, dict): + def recurse_elems(elem: Any, wrap_to_hf_config: bool = True): + if isinstance(elem, dict) and wrap_to_hf_config: config_dict = {} for key, value in elem.items(): key = config_mapping.get(key, key) - config_dict[key] = recurse_elems(value) + config_dict[key] = recurse_elems(value, wrap_to_hf_config=False) return PretrainedConfig(**config_dict) else: return elem @@ -236,6 +236,12 @@ def recurse_elems(elem: Any): config_dict["max_position_embeddings"] = config_dict.get( "max_position_embeddings", 128_000) + if config_dict.get("quantization") is not None: + config_dict["quantization_config"] = { + "quant_method": "fp8", + "activation_scheme": "static" + } + if config_dict.get("moe") is not None: config_dict["architectures"] = ["MixtralForCausalLM"] else: @@ -252,6 +258,7 @@ def recurse_elems(elem: Any): config_dict["model_type"] = "pixtral" config = recurse_elems(config_dict) + return config diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index aae10d3ee25..0542ca95f43 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -220,7 +220,7 @@ def convert_ids_to_tokens( tokens = [self.tokenizer.id_to_piece(id) for id in ids] - if any(t.strip() == "�" for t in tokens): + if any(t.strip() == "�" for t in tokens) and isinstance(self.tokenizer, Tekkenizer): # if any stripped decoded token is undefined # because it's invalid unicode then pass bytes # See: https://github.com/vllm-project/vllm/pull/8640 From 3db78edf31e908f22c3915ddd03d6fca0b44f725 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Feb 2025 15:48:56 +0000 Subject: [PATCH 2/8] WIP --- vllm/model_executor/models/pixtral.py | 5 ++-- vllm/transformers_utils/config.py | 27 ++++++++++++------- vllm/transformers_utils/tokenizers/mistral.py | 2 +- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 003e9c84c1c..1ecb8095d2f 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -54,8 +54,9 @@ def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer_mode=ctx.model_config.tokenizer_mode) mm_encoder = tokenizer.instruct.mm_encoder - max_image_size = mm_encoder.mm_config.max_image_size - image_patch_size = mm_encoder.mm_config.image_patch_size + image_config = getattr(mm_encoder, "mm_config", mm_encoder.image_config) + max_image_size = image_config.max_image_size + image_patch_size = image_config.image_patch_size return ((max_image_size // image_patch_size)**2) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 68ceec03539..64a21ab23a7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -4,7 +4,7 @@ import json import os from pathlib import Path -from typing import Any, Dict, Optional, Type, Union +from typing import Any, Dict, Literal, Optional, Type, Union import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, @@ -503,13 +503,14 @@ def load_params_config(model: Union[str, Path], revision: Optional[str], "hidden_dim": "intermediate_size", } - def recurse_elems(elem: Any, wrap_to_hf_config: bool = True): - if isinstance(elem, dict) and wrap_to_hf_config: + def recurse_elems(elem: Any): + if isinstance(elem, dict): config_dict = {} for key, value in elem.items(): key = config_mapping.get(key, key) - config_dict[key] = recurse_elems(value, wrap_to_hf_config=False) - return PretrainedConfig(**config_dict) + config_dict[key] = recurse_elems(value) + + return config_dict else: return elem @@ -522,17 +523,19 @@ def recurse_elems(elem: Any, wrap_to_hf_config: bool = True): "max_position_embeddings", 128_000) if config_dict.get("quantization") is not None: - config_dict["quantization_config"] = { + quantization_config = { "quant_method": "fp8", "activation_scheme": "static" } + config_type: Literal["text", "multimodal"] = "multimodal" if config_dict.get("vision_encoder") is not None else "text" + if config_dict.get("moe") is not None: config_dict["architectures"] = ["MixtralForCausalLM"] else: config_dict["architectures"] = ["MistralForCausalLM"] - if config_dict.get("vision_encoder") is not None: + if config_type == "multimodal": multimodal_config = config_dict.pop("vision_encoder") config_dict = { @@ -544,9 +547,15 @@ def recurse_elems(elem: Any, wrap_to_hf_config: bool = True): config_dict.update(kwargs) - config = recurse_elems(config_dict) + config_dict["quantization_config"] = quantization_config + config_dict = recurse_elems(config_dict) - return config + # transform to HF config format + if config_type == "multimodal": + config_dict["text_config"] = PretrainedConfig(**config_dict["text_config"]) + config_dict["vision_config"] = PretrainedConfig(**config_dict["vision_config"]) + + return PretrainedConfig(**config_dict) def get_hf_image_processor_config( diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 1550f978ed2..1253b2146cb 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -88,7 +88,7 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: def find_tokenizer_file(files: List[str]): - file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$") + file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: From 3fea9af39c48d54c85fe236de28bb7aea13b1c2f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Feb 2025 18:07:09 +0000 Subject: [PATCH 3/8] WIP --- vllm/model_executor/models/pixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 1ecb8095d2f..b224f079539 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -54,7 +54,8 @@ def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer_mode=ctx.model_config.tokenizer_mode) mm_encoder = tokenizer.instruct.mm_encoder - image_config = getattr(mm_encoder, "mm_config", mm_encoder.image_config) + image_config = mm_encoder.mm_config if hasattr(mm_encoder, "mm_config") else mm_encoder.image_config + max_image_size = image_config.max_image_size image_patch_size = image_config.image_patch_size From aca252ccd84de055a4bf15fbdacb12ffeda337d6 Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 7 Feb 2025 00:08:26 +0000 Subject: [PATCH 4/8] Format and error check Signed-off-by: mgoin --- vllm/transformers_utils/config.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 64a21ab23a7..519c2605969 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -523,12 +523,20 @@ def recurse_elems(elem: Any): "max_position_embeddings", 128_000) if config_dict.get("quantization") is not None: - quantization_config = { - "quant_method": "fp8", - "activation_scheme": "static" - } + quantization = config_dict.get("quantization") + if quantization.get("qformat_weight") == "fp8_e4m3": + # This maps to the FP8 static per-tensor quantization scheme + quantization_config = { + "quant_method": "fp8", + "activation_scheme": "static" + } + else: + raise ValueError( + f"Found unknown quantization='{quantization}' in config") - config_type: Literal["text", "multimodal"] = "multimodal" if config_dict.get("vision_encoder") is not None else "text" + config_type: Literal["text", + "multimodal"] = "multimodal" if config_dict.get( + "vision_encoder") is not None else "text" if config_dict.get("moe") is not None: config_dict["architectures"] = ["MixtralForCausalLM"] @@ -552,8 +560,10 @@ def recurse_elems(elem: Any): # transform to HF config format if config_type == "multimodal": - config_dict["text_config"] = PretrainedConfig(**config_dict["text_config"]) - config_dict["vision_config"] = PretrainedConfig(**config_dict["vision_config"]) + config_dict["text_config"] = PretrainedConfig( + **config_dict["text_config"]) + config_dict["vision_config"] = PretrainedConfig( + **config_dict["vision_config"]) return PretrainedConfig(**config_dict) From aa0d0ecf331eb9f08a5fd31e0118b5f964d406ef Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 7 Feb 2025 00:11:00 +0000 Subject: [PATCH 5/8] Format Signed-off-by: mgoin --- vllm/model_executor/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 648808ec2fd..2afa51431e9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -602,7 +602,8 @@ def permute(w: torch.Tensor, n_heads: int): item = modules[i] next_item = modules[i + 1] if i < num_modules - 1 else None - combined_item = f"{item}.{next_item}" if next_item is not None else None + combined_item = (f"{item}.{next_item}" + if next_item is not None else None) if combined_item in mapping: name = name.replace(combined_item, mapping[combined_item]) From e716264a04fddfd2b10da6c4b99e4f6dc93f26e7 Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 7 Feb 2025 03:34:30 +0000 Subject: [PATCH 6/8] Format Signed-off-by: mgoin --- vllm/model_executor/models/pixtral.py | 3 ++- vllm/transformers_utils/tokenizers/mistral.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index b224f079539..e78e8d62cc4 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -54,7 +54,8 @@ def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer_mode=ctx.model_config.tokenizer_mode) mm_encoder = tokenizer.instruct.mm_encoder - image_config = mm_encoder.mm_config if hasattr(mm_encoder, "mm_config") else mm_encoder.image_config + image_config = mm_encoder.mm_config if hasattr( + mm_encoder, "mm_config") else mm_encoder.image_config max_image_size = image_config.max_image_size image_patch_size = image_config.image_patch_size diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 1253b2146cb..aedf6a69919 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -88,7 +88,8 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: def find_tokenizer_file(files: List[str]): - file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") + file_pattern = re.compile( + r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: From 6149ce4b23fabaa25323f713d112b94f471c4889 Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 7 Feb 2025 03:57:37 +0000 Subject: [PATCH 7/8] Default dict Signed-off-by: mgoin --- vllm/transformers_utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 7febe6672b2..6b77e941a87 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -533,7 +533,7 @@ def recurse_elems(elem: Any): "max_position_embeddings", 128_000) if config_dict.get("quantization") is not None: - quantization = config_dict.get("quantization") + quantization = config_dict.get("quantization", {}) if quantization.get("qformat_weight") == "fp8_e4m3": # This maps to the FP8 static per-tensor quantization scheme quantization_config = { From ec4648c4b5011296e10be06eb073a6ba17cdf23e Mon Sep 17 00:00:00 2001 From: mgoin Date: Fri, 7 Feb 2025 14:21:10 +0000 Subject: [PATCH 8/8] Fix quantization_config assignment Signed-off-by: mgoin --- vllm/transformers_utils/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b77e941a87..83b05e88eca 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -544,6 +544,8 @@ def recurse_elems(elem: Any): raise ValueError( f"Found unknown quantization='{quantization}' in config") + config_dict["quantization_config"] = quantization_config + config_type: Literal["text", "multimodal"] = "multimodal" if config_dict.get( "vision_encoder") is not None else "text" @@ -565,7 +567,6 @@ def recurse_elems(elem: Any): config_dict.update(kwargs) - config_dict["quantization_config"] = quantization_config config_dict = recurse_elems(config_dict) # transform to HF config format