
Commit 3127467

Fix rocm compat (#1373)
* fix rocm compat
* format
* fix rocm compat
* fix call
* fix rocm compat
* use svd for rocm
* try ck backend
* try magma backend
* cleanup linalg backend
* disable torch.compile
* fix eora_save
* model_save_dir is optional
* fix eora save path cannot be a .safetensors
* add linalg test

---------

Signed-off-by: Qubitium <[email protected]>
Co-authored-by: CSY-ModelCloud <[email protected]>
1 parent f78de9e commit 3127467
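
The "add linalg test" item refers to a test file that is collapsed in this view along with most of the 117 changed files, so its exact contents are not shown here. A minimal sketch, with assumed names, of what a linalg smoke test for this fix could check:

import torch

def test_eigh_reconstruction():
    # Build a symmetric positive semi-definite matrix in float64, the dtype eora.py uses.
    x = torch.randn(64, 64, dtype=torch.float64)
    sym = x @ x.T

    if torch.cuda.is_available():
        sym = sym.cuda()

    # On ROCm builds, route linalg ops through MAGMA, as the commit does.
    if torch.version.hip is not None:
        torch.backends.cuda.preferred_linalg_library(backend="magma")

    L, Q = torch.linalg.eigh(sym)
    # The eigendecomposition should reconstruct the input within tolerance.
    assert torch.allclose(Q @ torch.diag(L) @ Q.T, sym, atol=1e-6)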

File tree

117 files changed: +260 −458 lines changed


examples/benchmark/generation_speed.py

+1-3
@@ -23,13 +23,11 @@
 
 import torch
 from datasets import Dataset, load_dataset
+from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from logbar import LogBar
 from transformers import AutoTokenizer, GenerationConfig
 from transformers.generation.logits_process import LogitsProcessor
 
-from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
-
-
 logger = LogBar.shared()
 
 random.seed(0)

examples/benchmark/ipex.py

-2
@@ -20,7 +20,6 @@
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
-
 try:
     from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
     bind_cores_for_best_perf()
@@ -30,7 +29,6 @@
 
 import argparse
 
-
 parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
 parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
 parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")

examples/benchmark/perplexity.py

+1-3
@@ -17,10 +17,8 @@
 import argparse
 import os
 
-from transformers import AutoTokenizer
-
 from gptqmodel.utils import Perplexity
-
+from transformers import AutoTokenizer
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 

examples/evaluation/run_language_modeling_task.py

+1-3
@@ -18,12 +18,10 @@
 
 import datasets
 import torch
-from transformers import AutoTokenizer
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from gptqmodel.eval_tasks import LanguageModelingTask
 from gptqmodel.utils.torch import torch_empty_cache
-
+from transformers import AutoTokenizer
 
 DATASET = "tatsu-lab/alpaca"
 WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"

examples/evaluation/run_sequence_classification_task.py

+1-3
@@ -19,12 +19,10 @@
 
 import datasets
 import torch
-from transformers import AutoTokenizer
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from gptqmodel.eval_tasks import SequenceClassificationTask
 from gptqmodel.utils.torch import torch_empty_cache
-
+from transformers import AutoTokenizer
 
 DATASET = "cardiffnlp/tweet_sentiment_multilingual"
 TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"

examples/evaluation/run_text_summarization_task.py

+1-3
@@ -19,12 +19,10 @@
 
 import datasets
 import torch
-from transformers import AutoTokenizer, GenerationConfig
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from gptqmodel.eval_tasks import TextSummarizationTask
 from gptqmodel.utils.torch import torch_empty_cache
-
+from transformers import AutoTokenizer, GenerationConfig
 
 os.system("pip install py7zr")
 
examples/inference/run_transformers.py

-1
@@ -16,7 +16,6 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-
 tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
 quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
 print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))

examples/inference/run_with_different_backends.py

+1-3
@@ -19,10 +19,8 @@
 import sys
 from argparse import ArgumentParser
 
-from transformers import AutoTokenizer
-
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
-
+from transformers import AutoTokenizer
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

examples/quantization/basic_usage.py

+1-3
@@ -16,10 +16,8 @@
 
 import os
 
-from transformers import AutoTokenizer
-
 from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
-
+from transformers import AutoTokenizer
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 

examples/quantization/basic_usage_autoround.py

+1-3
@@ -15,11 +15,9 @@
 # limitations under the License.
 
 import torch
-from transformers import AutoTokenizer
-
 from gptqmodel import GPTQModel
 from gptqmodel.quantization.config import AutoRoundQuantizeConfig  # noqa: E402
-
+from transformers import AutoTokenizer
 
 pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0"  # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g"

examples/quantization/basic_usage_wikitext2.py

+1-3
@@ -16,10 +16,8 @@
 
 import torch
 from datasets import load_dataset
-from transformers import AutoTokenizer
-
 from gptqmodel import GPTQModel, QuantizeConfig
-
+from transformers import AutoTokenizer
 
 pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0"  # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"

examples/quantization/transformers_usage.py

-1
@@ -16,7 +16,6 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
 
-
 model_id = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]

gptqmodel/__init__.py

-1
@@ -22,7 +22,6 @@
 from .utils.exllama import exllama_set_max_input_length
 from .version import __version__
 
-
 if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
     try:
         from modelscope.utils.hf_util.patcher import patch_hub

gptqmodel/eora/eora.py

+12-1
@@ -21,6 +21,7 @@
 
 from ..looper.named_module import NamedModule
 from ..utils.logger import setup_logger
+from ..utils.rocm import IS_ROCM
 
 log = setup_logger()
 
@@ -51,7 +52,13 @@ def eora_compute_lora(
     # save this later for SVD
     raw_scaling_diag_matrix = eigen_scaling_diag_matrix.to(dtype=torch.float64, device=device)
 
+    if IS_ROCM:
+        # hip cannot resolve linalg ops
+        original_backend = torch.backends.cuda.preferred_linalg_library()
+        torch.backends.cuda.preferred_linalg_library(backend="magma")
+
     L, Q = torch.linalg.eigh(raw_scaling_diag_matrix)
+
     if (L < 0).any():
         ## When expanding the calibration data size for EoRA, I suggest maintaining the balance by allocating 50% to general input (C4) and the remaining 50% to downstream task data.
         log.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.")
@@ -88,5 +95,9 @@ def eora_compute_lora(
     del L, Q, U, S, V,
     del w_wq_delta, raw_scaling_diag_matrix, sqrtEigenvalues, scaling_diag_matrix, scaling_matrix_inv, delta_scale
     del truc_s, truc_u, truc_v, truc_sigma, sqrtS
-
+
+    # revert linalg backend
+    if IS_ROCM:
+        torch.backends.cuda.preferred_linalg_library(original_backend)
+
     return A, B
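
For reference outside the diff context, here is a minimal, self-contained sketch of the workaround this hunk applies: on ROCm, torch.linalg.eigh is routed through the MAGMA backend and the previously selected backend is restored afterward. The helper name and the torch.version.hip check are illustrative; the commit itself uses the IS_ROCM flag from gptqmodel.utils.rocm and reverts the backend at the end of eora_compute_lora rather than in a try/finally.

import torch

def eigh_with_magma_on_rocm(matrix: torch.Tensor):
    # torch.version.hip is set only on ROCm (HIP) builds of PyTorch.
    is_rocm = torch.version.hip is not None
    original_backend = None

    if is_rocm:
        # Remember the active linalg backend, then prefer MAGMA; per the commit,
        # the default hip path cannot resolve these linalg ops.
        original_backend = torch.backends.cuda.preferred_linalg_library()
        torch.backends.cuda.preferred_linalg_library(backend="magma")

    try:
        eigenvalues, eigenvectors = torch.linalg.eigh(matrix)
    finally:
        # Restore whatever backend was active before.
        if is_rocm and original_backend is not None:
            torch.backends.cuda.preferred_linalg_library(original_backend)

    return eigenvalues, eigenvectors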

gptqmodel/looper/eora_processor.py

+4-4
@@ -58,11 +58,11 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare
         # needed by eora
         # torch._dynamo.config.capture_scalar_outputs = True
 
-        self.eora_compute_lora = torch_compile(eora_compute_lora)
-        self.eora_process_input = torch_compile(eora_process_input)
+        #self.eora_compute_lora = torch_compile(eora_compute_lora)
+        #self.eora_process_input = torch_compile(eora_process_input)
 
-        # self.eora_compute_lora = eora_compute_lora
-        # self.eora_process_input = eora_process_input
+        self.eora_compute_lora = eora_compute_lora
+        self.eora_process_input = eora_process_input
 
     def log_plotly(self):
         task = self.logger_task
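
This hunk drops the torch.compile wrapping of the EoRA callables (the commit message notes "disable torch.compile") and falls back to the plain Python functions. A hedged sketch of expressing the same thing as an opt-in toggle; maybe_compile is a hypothetical helper, not the repository's torch_compile utility:

import torch

def maybe_compile(fn, enabled: bool = False):
    # Only hand the function to torch.compile when explicitly enabled; otherwise
    # return it untouched, which is the behaviour this commit switches to.
    if enabled and hasattr(torch, "compile"):
        return torch.compile(fn)
    return fn

# Hypothetical usage mirroring EoraProcessor.__init__:
# self.eora_compute_lora = maybe_compile(eora_compute_lora, enabled=False)
# self.eora_process_input = maybe_compile(eora_process_input, enabled=False)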

gptqmodel/models/_const.py

-1
@@ -25,7 +25,6 @@
 from ..utils.rocm import IS_ROCM
 from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU
 
-
 CPU = device("cpu")
 CUDA = device("cuda")
 CUDA_0 = device("cuda:0")

gptqmodel/models/auto.py

-3
@@ -20,7 +20,6 @@
 
 from ..utils.logger import setup_logger
 
-
 log = setup_logger()
 
 if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None):
@@ -33,7 +32,6 @@
 
 import sys  # noqa: E402
 
-
 # TODO: waiting for pytorch implementgation of aten ops for MPS
 if sys.platform == "darwin":
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
@@ -108,7 +106,6 @@
 from .definitions.xverse import XverseGPTQ  # noqa: E402
 from .definitions.yi import YiGPTQ  # noqa: E402
 
-
 # make quants and inference more determinisitc
 torch.manual_seed(787)
 random.seed(787)

gptqmodel/models/base.py

+7-31
@@ -28,14 +28,8 @@
 from packaging import version
 from packaging.version import Version
 from tokenicer import Tokenicer
-from transformers import (
-    AutoModelForCausalLM,
-    AutoProcessor,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-    modeling_utils,
-)
+from transformers import (AutoModelForCausalLM, AutoProcessor, PreTrainedModel,
+                          PreTrainedTokenizerBase, ProcessorMixin, modeling_utils)
 
 from ..adapter.adapter import Adapter
 from ..nn_modules.hooked_linear import replace_linear_with_hooked_linear
@@ -49,31 +43,13 @@
 from ..utils.hf import autofix_hf_model_config
 from ..utils.importer import select_quant_linear
 from ..utils.logger import setup_logger
-from ..utils.model import (
-    MODALITY,
-    check_to_quantized,
-    find_modules,
-    get_device,
-    get_module,
-    get_module_by_name_prefix,
-    get_moe_layer_modules,
-    move_to,
-    nested_move_to,
-    pack_model,
-)
+from ..utils.model import (MODALITY, check_to_quantized, find_modules, get_device, get_module,
+                           get_module_by_name_prefix, get_moe_layer_modules, move_to, nested_move_to, pack_model)
 from ..utils.torch import torch_compile, torch_empty_cache
 from ._const import CALIBRATION_DATASET_CONCAT_CHAR, CPU, DEFAULT_MAX_SHARD_SIZE, DEVICE, SUPPORTS_MODULE_TYPES
 from .loader import ModelLoader
-from .writer import (
-    PROCESS_LOG_FWD_TIME,
-    PROCESS_LOG_LAYER,
-    PROCESS_LOG_MODULE,
-    PROCESS_LOG_TIME,
-    QUANT_LOG_DAMP,
-    QUANT_LOG_LOSS,
-    ModelWriter,
-)
-
+from .writer import (PROCESS_LOG_FWD_TIME, PROCESS_LOG_LAYER, PROCESS_LOG_MODULE,
+                     PROCESS_LOG_TIME, QUANT_LOG_DAMP, QUANT_LOG_LOSS, ModelWriter)
 
 # pytorch 2.6.0 fixes many compilation errors
 TORCH_MIN_VERSION_STR = '2.6.0'
@@ -511,7 +487,7 @@ def _eora_generate(
             auto_gc=auto_gc,
         )
 
-        self.eora_save(eora_path=adapter.path)
+        self.eora_save(save_dir=adapter.path, model_save_dir=self.model_local_path)
         return
 
     @torch.no_grad()

gptqmodel/models/definitions/gemma2.py

-1
@@ -18,7 +18,6 @@
 from ...utils.logger import setup_logger
 from ..base import BaseGPTQModel
 
-
 log = setup_logger()
 
 SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ."

gptqmodel/models/loader.py

+3-15
@@ -24,7 +24,6 @@
 import torch
 import transformers
 
-
 if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
     try:
         from modelscope import snapshot_download
@@ -49,22 +48,11 @@
 from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear
 from ..utils.logger import setup_logger
 from ..utils.marlin import _validate_marlin_compatibility, _validate_marlin_device_support
-from ..utils.model import (
-    auto_dtype,
-    convert_gptq_v1_to_v2_format,
-    find_modules,
-    get_checkpoints,
-    get_moe_layer_modules,
-    gptqmodel_post_init,
-    load_checkpoint_in_model_then_tie_weights,
-    make_quant,
-    simple_dispatch_model,
-    verify_model_hash,
-    verify_sharded_model_hashes,
-)
+from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_modules, get_checkpoints,
+                           get_moe_layer_modules, gptqmodel_post_init, load_checkpoint_in_model_then_tie_weights,
+                           make_quant, simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes)
 from ._const import DEVICE, SUPPORTED_MODELS, normalize_device
 
-
 log = setup_logger()
 
 ATTN_IMPLEMENTATION = "attn_implementation"

gptqmodel/models/writer.py

+7-25
@@ -37,36 +37,18 @@
 
 from ..adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora
 from ..adapter.peft import LoraConfig
-from ..quantization.config import (
-    FORMAT,
-    META_FIELD_DAMP_AUTO_INCREMENT,
-    META_FIELD_DAMP_PERCENT,
-    META_FIELD_MSE,
-    META_FIELD_QUANTIZER,
-    META_FIELD_STATIC_GROUPS,
-    META_FIELD_TRUE_SEQUENTIAL,
-    META_FIELD_URI,
-    META_QUANTIZER_GPTQMODEL,
-    META_VALUE_URI,
-    MIN_VERSION_WITH_V2,
-)
+from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE,
+                                   META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL,
+                                   META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2)
 from ..utils.backend import BACKEND
 from ..utils.logger import setup_logger
-from ..utils.model import (
-    convert_gptq_v2_to_v1_format,
-    copy_py_files,
-    find_modules,
-    get_model_files_size,
-    get_moe_layer_modules,
-    get_state_dict_for_save,
-    load_checkpoint_in_model_then_tie_weights,
-    make_quant,
-)
+from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_modules,
+                           get_model_files_size, get_moe_layer_modules, get_state_dict_for_save,
+                           load_checkpoint_in_model_then_tie_weights, make_quant)
 from ..utils.torch import torch_empty_cache
 from ..version import __version__
 from ._const import CPU, DEFAULT_MAX_SHARD_SIZE
 
-
 log = setup_logger()
 
 PROCESS_LOG_NAME = "process"
@@ -90,7 +72,7 @@ def save_pretrained(
 
     cls.save_pretrained = save_pretrained
 
-    def _eora_save(self, save_dir: str, model_save_dir: str):
+    def _eora_save(self, save_dir: str, model_save_dir: str = None):
        assert isinstance(self.quantize_config.adapter, Lora)
 
        assert hasattr(self, 'lora_results')
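
The new default makes model_save_dir optional, matching the commit notes "model_save_dir is optional" and "fix eora save path cannot be a .safetensors". Below is a hedged sketch, with a hypothetical helper name, of how those two constraints might be handled around an adapter save; it is not the repository's implementation:

import os
from typing import Optional

def resolve_eora_save_dir(save_dir: str, model_save_dir: Optional[str] = None) -> str:
    # If a caller passes a file-like path such as "adapter.safetensors", fall back
    # to its parent directory: the EoRA adapter is written into a directory.
    if save_dir.endswith(".safetensors"):
        save_dir = os.path.dirname(save_dir) or "."

    # model_save_dir is optional; only consult it when the caller provides one,
    # e.g. to copy base-model files next to the adapter.
    if model_save_dir is not None and not os.path.isdir(model_save_dir):
        raise ValueError(f"model_save_dir does not exist: {model_save_dir}")

    os.makedirs(save_dir, exist_ok=True)
    return save_dir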
