
Commit 7a16eb9

Recalc scales from user (vllm-project#774)
Multiply the input scale by a factor of 448/240 on Gaudi2.

Co-authored-by: Michał Kuligowski <[email protected]>
1 parent f6441f3 commit 7a16eb9
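
For reference, the 448/240 factor from the commit message is the ratio of the representable maxima of the two FP8 formats involved; the snippet below reproduces the value that the inline code removed from w8a8_utils.py used to compute (plain PyTorch, no HPU required):

import torch

# Gaudi2 operates on float8_e4m3fnuz (max 240) rather than the OCP
# float8_e4m3fn format (max 448) that the checkpoint scales target,
# so scales are stretched by the ratio of the two maxima.
factor = (torch.finfo(torch.float8_e4m3fn).max /
          torch.finfo(torch.float8_e4m3fnuz).max)
print(factor)  # 1.8666... (= 448 / 240)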

File tree

3 files changed: 9 additions & 7 deletions

requirements-hpu.txt

Lines changed: 1 addition & 1 deletion
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@8087a98
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb47de4

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

Lines changed: 5 additions & 2 deletions
@@ -5,6 +5,7 @@
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
+from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2

 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
@@ -84,8 +85,10 @@ def process_weights_after_loading(self, layer) -> None:

         # INPUT SCALE
         if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
-            layer.input_scale = Parameter(layer.input_scale.max(),
-                                          requires_grad=False)
+            input_scale = layer.input_scale.max()
+            if is_hpu_gaudi2():
+                input_scale = input_scale * get_hpu_gaudi2_scale_factor()
+            layer.input_scale = Parameter(input_scale, requires_grad=False)
         else:
             layer.input_scale = None
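
Taken on its own, the new input-scale branch from the hunk above can be exercised like the sketch below (it assumes the pinned vllm-hpu-extension is installed; the example scale values are made up):

import torch
from torch.nn import Parameter
from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2

# Per-shard static input scales still collapse to a single max, as before.
input_scale = torch.tensor([0.01, 0.02]).max()
# New: on Gaudi2 the scale is additionally stretched by 448/240.
if is_hpu_gaudi2():
    input_scale = input_scale * get_hpu_gaudi2_scale_factor()
layer_input_scale = Parameter(input_scale, requires_grad=False)
print(layer_input_scale)  # 0.02 off Gaudi2, ~0.0373 on Gaudi2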

vllm/model_executor/layers/quantization/utils/w8a8_utils.py

Lines changed: 3 additions & 4 deletions
@@ -3,6 +3,7 @@
 from typing import List, Optional, Tuple, Union

 import torch
+from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
@@ -101,10 +102,8 @@ def requantize_with_max_scale(
         logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
     # Max scale to be used for requanitzation.
     max_w_scale = weight_scale.max()
-    if current_platform.is_hpu() and htexp._get_device_type(
-    ) == htexp.synDeviceType.synDeviceGaudi2:
-        max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max /
-                                     torch.finfo(torch.float8_e4m3fnuz).max)
+    if is_hpu_gaudi2():
+        max_w_scale = max_w_scale * get_hpu_gaudi2_scale_factor()
     # QKV / MLP is fused in the on disk checkpoint if any of the
     # weight scales are still set to the default since we initialize
     # N weight scales for N shards but we only load 1 weight scale
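
For context, a minimal sketch of what the two helpers imported from vllm_hpu_extension.ops presumably encapsulate, reconstructed from the inline code they replace above; the actual implementation lives in HabanaAI/vllm-hpu-extension (pinned to bb47de4 in requirements-hpu.txt) and may differ, and the htexp import path is an assumption:

import torch
import habana_frameworks.torch.utils.experimental as htexp  # assumed import path
from vllm.platforms import current_platform

def is_hpu_gaudi2() -> bool:
    # Same check requantize_with_max_scale used to do inline: running on an
    # HPU whose device type reports itself as Gaudi2.
    return (current_platform.is_hpu()
            and htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2)

def get_hpu_gaudi2_scale_factor() -> float:
    # 448 / 240: ratio between the float8_e4m3fn and float8_e4m3fnuz maxima.
    return (torch.finfo(torch.float8_e4m3fn).max /
            torch.finfo(torch.float8_e4m3fnuz).max)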
