
Commit 71dfb52

Add GPTQ support
1 parent ad5f2fe commit 71dfb52

File tree

6 files changed: +170, -15 lines

vllm/config.py

Lines changed: 5 additions & 0 deletions

@@ -2,6 +2,7 @@
 
 import torch
 from transformers import PretrainedConfig
+from auto_gptq import BaseQuantizeConfig
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import get_config
@@ -55,6 +56,10 @@ def __init__(
         self.seed = seed
 
         self.hf_config = get_config(model, trust_remote_code)
+        try:
+            self.quantize_config = BaseQuantizeConfig.from_pretrained(model)
+        except:
+            self.quantize_config = None
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
         self._verify_tokenizer_mode()
 
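
For reference, a minimal standalone sketch of the detection logic this hunk adds, assuming AutoGPTQ is installed; the model directory path is a placeholder. BaseQuantizeConfig.from_pretrained parses the checkpoint's quantize_config.json, and the resulting bits, group_size, and desc_act fields are consumed later by the model loader; any failure leaves quantize_config as None, which keeps unquantized models on the existing code path.

    from auto_gptq import BaseQuantizeConfig

    def detect_gptq_config(model_path: str):
        # Return the parsed quantize_config.json, or None for an unquantized model.
        try:
            return BaseQuantizeConfig.from_pretrained(model_path)
        except Exception:
            return None

    cfg = detect_gptq_config("/path/to/gptq-model")  # placeholder directory
    if cfg is not None:
        print(cfg.bits, cfg.group_size, cfg.desc_act)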

vllm/model_executor/model_loader.py

Lines changed: 24 additions & 1 deletion

@@ -3,11 +3,14 @@
 
 import torch
 import torch.nn as nn
+from accelerate import init_on_device
 from transformers import PretrainedConfig
+from auto_gptq.modeling._utils import autogptq_post_init
 
 from vllm.config import ModelConfig
 from vllm.model_executor.models import *  # pylint: disable=wildcard-import
 from vllm.model_executor.weight_utils import initialize_dummy_weights
+from vllm.model_executor.quantize import make_quant, find_layers
 
 # TODO(woosuk): Lazy-load the model classes.
 _MODEL_REGISTRY = {
@@ -46,7 +49,25 @@ def get_model(model_config: ModelConfig) -> nn.Module:
 
     # Create a model instance.
     # The weights will be initialized as empty tensors.
-    model = model_class(model_config.hf_config)
+    if model_config.quantize_config:
+        with init_on_device(device=torch.device("cpu")):
+            model = model_class(model_config.hf_config)
+        layers = find_layers(model)
+        ignore_layers = [model_class.lm_head_name] + model_class.outside_layer_modules
+        for name in list(layers.keys()):
+            if any([name.startswith(ignore_layer) for ignore_layer in ignore_layers]):
+                del layers[name]
+
+        make_quant(
+            model,
+            layers,
+            model_config.quantize_config.bits,
+            model_config.quantize_config.group_size,
+            desc_act=model_config.quantize_config.desc_act,
+        )
+        model.quantize_config = model_config.quantize_config
+    else:
+        model = model_class(model_config.hf_config)
     if model_config.use_dummy_weights:
         model = model.cuda()
         # NOTE(woosuk): For accurate performance evaluation, we assign
@@ -57,4 +78,6 @@ def get_model(model_config: ModelConfig) -> nn.Module:
         model.load_weights(model_config.model, model_config.download_dir,
                            model_config.use_np_weights)
         model = model.cuda()
+    if model_config.quantize_config:
+        model = autogptq_post_init(model, use_act_order=model_config.quantize_config.desc_act)
     return model.eval()
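
The filtering above drops the lm_head, the embedding, and the final norm from the set of layers handed to make_quant, since GPTQ checkpoints typically leave those modules unquantized. A small illustrative sketch of that filter follows; the layer-name dict is made up for the example and is not output from the commit.

    # Hypothetical output of find_layers() on a one-block Llama-style model.
    layers = {
        "model.embed_tokens": "...",
        "model.layers.0.self_attn.qkv_proj": "...",
        "model.layers.0.mlp.gate_up_proj": "...",
        "model.norm": "...",
        "lm_head": "...",
    }
    ignore_layers = ["lm_head"] + ["model.embed_tokens", "model.norm"]

    for name in list(layers.keys()):
        if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers):
            del layers[name]

    # Only the transformer-block projections remain to be replaced by QuantLinear:
    # ['model.layers.0.mlp.gate_up_proj', 'model.layers.0.self_attn.qkv_proj']
    print(sorted(layers))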

vllm/model_executor/models/llama.py

Lines changed: 37 additions & 11 deletions

@@ -229,6 +229,8 @@ def forward(
 
 
 class LlamaForCausalLM(nn.Module):
+    lm_head_name = "lm_head"
+    outside_layer_modules = ["model.embed_tokens", "model.norm"]
 
     def __init__(self, config):
         super().__init__()
@@ -301,11 +303,23 @@ def load_weights(self,
                 if weight_name not in name:
                     continue
                 param = state_dict[name.replace(weight_name, "qkv_proj")]
-
-                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank:shard_size *
-                    (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[offset:offset + shard_size]
+                if "g_idx" in name:
+                    param.data.copy_(loaded_weight)
+                    is_attention_weight = True
+                    continue
+                if any(key in name for key in ('qweight', 'qzeros', 'scales')):
+                    if 'qzeros' in name:
+                        shard_size = shard_size // 32 * self.quantize_config.bits
+                        offset = offset // 32 * self.quantize_config.bits
+                    loaded_weight = loaded_weight[:,
+                        shard_size * tensor_model_parallel_rank:shard_size *
+                        (tensor_model_parallel_rank + 1)]
+                    param_slice = param.data[:, offset:offset + shard_size]
+                else:
+                    loaded_weight = loaded_weight[
+                        shard_size * tensor_model_parallel_rank:shard_size *
+                        (tensor_model_parallel_rank + 1)]
+                    param_slice = param.data[offset:offset + shard_size]
                 assert param_slice.shape == loaded_weight.shape
 
                 param_slice.copy_(loaded_weight)
@@ -319,12 +333,24 @@ def load_weights(self,
                 if weight_name not in name:
                     continue
                 param = state_dict[name.replace(weight_name, "gate_up_proj")]
-                shard_size = param.shape[0] // 2
-                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank:shard_size *
-                    (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
+                if "g_idx" in name:
+                    param.data.copy_(loaded_weight)
+                    is_gate_up_weight = True
+                    continue
+                if any(key in name for key in ('qweight', 'qzeros', 'scales')):
+                    shard_size = param.shape[1] // 2
+                    loaded_weight = loaded_weight[:,
+                        shard_size * tensor_model_parallel_rank:shard_size *
+                        (tensor_model_parallel_rank + 1)]
+                    param_slice = param.data[:, shard_size * stride_id:shard_size *
+                                             (stride_id + 1)]
+                else:
+                    shard_size = param.shape[0] // 2
+                    loaded_weight = loaded_weight[
+                        shard_size * tensor_model_parallel_rank:shard_size *
+                        (tensor_model_parallel_rank + 1)]
+                    param_slice = param.data[shard_size * stride_id:shard_size *
+                                             (stride_id + 1)]
                 assert param_slice.shape == loaded_weight.shape
                 param_slice.copy_(loaded_weight)
                 is_gate_up_weight = True
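
The qzeros rescaling in the attention hunk reflects how AutoGPTQ packs zero-points: each int32 column of qzeros holds 32 // bits values, so a shard size and offset measured in unpacked output columns must be scaled by bits / 32 before slicing the packed tensor. The qweight, qzeros, and scales tensors are sliced along their second dimension because QuantLinear lays out output features along that dimension. A small worked example of the arithmetic; the sizes are illustrative.

    bits = 4                           # 4-bit GPTQ: 32 // 4 = 8 zero-points per int32
    shard_size, offset = 4096, 8192    # illustrative sizes in unpacked output columns

    packed_shard = shard_size // 32 * bits   # 4096 // 32 * 4 = 512 packed columns
    packed_offset = offset // 32 * bits      # 8192 // 32 * 4 = 1024 packed columns
    assert (packed_shard, packed_offset) == (512, 1024)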

vllm/model_executor/quantize.py

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+# Adapted from
+# https://github.com/PanQiWei/AutoGPTQ/blob/main/auto_gptq/modeling/_utils.py
+"""Utilities for quantizing models."""
+from typing import List, Dict
+
+import torch.nn as nn
+import transformers
+
+from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
+
+from vllm.model_executor.parallel_utils.tensor_parallel.layers import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
+
+
+def find_layers(
+    module: nn.Module,
+    layers: List[nn.Module] = None,
+    name: str = ''
+) -> Dict[str, nn.Module]:
+    if not layers:
+        layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear,
+                  ColumnParallelLinear, RowParallelLinear]
+    for layer in layers:
+        if isinstance(module, layer):
+            return {name: module}
+    res = {}
+    for name1, child in module.named_children():
+        res.update(find_layers(child, layers=layers,
+                               name=name + '.' + name1 if name != '' else name1))
+    return res
+
+
+def make_quant(
+    module: nn.Module,
+    names: List[str],
+    bits: int,
+    group_size: int,
+    name: str = '',
+    use_triton: bool = False,
+    disable_exllama: bool = False,
+    use_cuda_fp16: bool = True,
+    desc_act: bool = False,
+    trainable: bool = False
+) -> None:
+    QuantLinear = dynamically_import_QuantLinear(
+        use_triton=use_triton, desc_act=desc_act, group_size=group_size,
+        bits=bits, disable_exllama=disable_exllama)
+
+    class QuantLinearWrapper(QuantLinear):
+        def forward(self, *args, **kwargs):
+            return super().forward(*args, **kwargs), None
+
+    if isinstance(module, QuantLinear):
+        return
+    for attr in dir(module):
+        tmp = getattr(module, attr)
+        name1 = name + '.' + attr if name != '' else attr
+        if name1 in names:
+            delattr(module, attr)
+            if isinstance(tmp, nn.Linear):
+                in_features = tmp.in_features
+                out_features = tmp.out_features
+            elif isinstance(tmp, nn.Conv2d):
+                in_features = tmp.in_channels
+                out_features = tmp.out_channels
+            elif isinstance(tmp, transformers.pytorch_utils.Conv1D):
+                in_features = tmp.weight.shape[0]
+                out_features = tmp.weight.shape[1]
+            elif isinstance(tmp, ColumnParallelLinear) or isinstance(tmp, RowParallelLinear):
+                in_features = tmp.input_size
+                out_features = tmp.output_size
+            if (not desc_act or group_size == -1) and not use_triton:
+                new_layer = QuantLinearWrapper(
+                    bits, group_size, in_features, out_features, True,
+                    use_cuda_fp16=use_cuda_fp16, trainable=trainable)
+            else:
+                new_layer = QuantLinearWrapper(
+                    bits, group_size, in_features, out_features, True, trainable=trainable)
+            setattr(module, attr, new_layer)
+    for name1, child in module.named_children():
+        make_quant(
+            child,
+            names,
+            bits,
+            group_size,
+            name + '.' + name1 if name != '' else name1,
+            use_triton=use_triton,
+            use_cuda_fp16=use_cuda_fp16,
+            desc_act=desc_act,
+            trainable=trainable,
+            disable_exllama=disable_exllama,
+        )
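
Two details worth noting in this new file: find_layers walks the module tree and returns only Linear-like leaves keyed by their dotted path, and QuantLinearWrapper returns an (output, None) tuple so a swapped-in quantized layer matches the (output, bias) return convention of vLLM's ColumnParallelLinear and RowParallelLinear at existing call sites. A small usage sketch of find_layers, assuming vLLM and AutoGPTQ are importable; the toy module is made up.

    import torch.nn as nn
    from vllm.model_executor.quantize import find_layers

    class ToyBlock(nn.Module):
        def __init__(self):
            super().__init__()
            self.up_proj = nn.Linear(16, 64)
            self.down_proj = nn.Linear(64, 16)
            self.norm = nn.LayerNorm(16)  # not a Linear-like layer, so not returned

    print(find_layers(ToyBlock()))
    # {'up_proj': Linear(in_features=16, out_features=64, ...),
    #  'down_proj': Linear(in_features=64, out_features=16, ...)}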

vllm/model_executor/weight_utils.py

Lines changed: 14 additions & 2 deletions

@@ -9,6 +9,7 @@
 import numpy as np
 import torch
 from tqdm.auto import tqdm
+from safetensors.torch import safe_open
 
 
 class Disabledtqdm(tqdm):
@@ -33,7 +34,7 @@ def hf_model_weights_iterator(
     if not is_local:
         with lock:
             hf_folder = snapshot_download(model_name_or_path,
-                                          allow_patterns="*.bin",
+                                          allow_patterns=["*.bin", "*.safetensors"],
                                           cache_dir=cache_dir,
                                           tqdm_class=Disabledtqdm)
     else:
@@ -43,8 +44,19 @@
         x for x in glob.glob(os.path.join(hf_folder, "*.bin"))
         if not x.endswith("training_args.bin")
     ]
+    safetensor_files = [
+        x for x in glob.glob(os.path.join(hf_folder, "*.safetensors"))
+    ]
 
-    if use_np_cache:
+    # prioritize safetensor files
+    if safetensor_files:
+        for st_file in safetensor_files:
+            with safe_open(st_file, framework="pt") as f:
+                for name in f.keys():
+                    param = f.get_tensor(name)
+                    yield name, param
+        torch.cuda.empty_cache()
+    elif use_np_cache:
         # Convert the model weights from torch tensors to numpy arrays for
         # faster loading.
         np_folder = os.path.join(hf_folder, "np")
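
For reference, a minimal standalone sketch of the safetensors iteration pattern used by the new branch; the shard path is a placeholder.

    from safetensors.torch import safe_open

    def iter_safetensors(st_file: str):
        # Lazily yield (name, tensor) pairs from one .safetensors shard.
        with safe_open(st_file, framework="pt") as f:
            for name in f.keys():
                yield name, f.get_tensor(name)

    for name, tensor in iter_safetensors("/path/to/model.safetensors"):
        print(name, tuple(tensor.shape))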

vllm/transformers_utils/config.py

Lines changed: 1 addition & 1 deletion

@@ -30,4 +30,4 @@ def get_config(model: str, trust_remote_code: bool) -> PretrainedConfig:
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model)
-    return config
+    return config
