
Commit 4f95575

adrianlizarraga authored and ankitm3k committed
[Quant Tool] Prevent int32 quantized bias from clipping by adjusting the weight's scale (microsoft#22020)
### Description
Fixes a scenario in which a bias input quantized to int32 has a scale that is too small. A bias whose scale is smaller than a certain threshold overflows the range of an `int32` when quantized, which significantly decreases accuracy. Credit to @yihonglyu for finding this issue and the fix.

### Motivation and Context
Consider the following Convolution with very small weights and a constant bias input of `[5, -4.5]`.

![image](https://github.com/user-attachments/assets/4bde2bd9-892f-4ae9-887b-61a6668779a1)

The QDQ quantizer first computes the following quantization scales for `input_0` and `weight`:

- `input_0`: scale=0.5
- `weight`: scale=7.843e-10 **[really small]**

The QDQ quantizer then computes the bias input's scale as follows:

```
bias_scale = input_0_scale * weight_0_scale = 0.5 * 7.843e-10 = 3.9215686274509805e-11
```

This `bias_scale` is too small. Before this PR, the QDQ quantizer would quantize the f32 bias with this `bias_scale`:

```
bias_quant = round(bias_f32 / bias_scale) = round([5.0/bias_scale, -4.5/bias_scale]) = [127500000000, -114750000000]
```

These quantized bias values exceed the range of int32, and so are clipped to [int32.min(), int32.max()], which is very inaccurate.

#### New approach
This PR increases `weight_0_scale` by the amount necessary to ensure that `bias_scale` (which equals `weight_0_scale * input_0_scale`) is appropriate for the int32 quantization type.

The smallest valid bias scale is given by the normal scale formula:

`bias_smallest_valid_scale = (bias_f32_max - bias_f32_min) / (int32_max - int32_min)`

Then, we compute the candidate bias scale:

`bias_scale_candidate = input_0_scale * weight_0_scale`

If the candidate scale is smaller than the smallest valid scale, we increase `weight_0_scale` by the necessary ratio:

```python
if bias_scale_candidate < bias_smallest_valid_scale:
    ratio = bias_smallest_valid_scale / bias_scale_candidate
    weight_0_scale = ratio * weight_0_scale
```

Then, we recompute the final bias scale:

```python
bias_scale = input_0_scale * weight_0_scale
```

#### Impact on accuracy
Here's the above model's quantized output compared to the f32 (ground-truth) output.

- Before PR:
  - f32 model output[0]: **5.0f**
  - qdq model output[0]: **0.075**
  - SNR: 0.1369 (higher is better)
- After PR:
  - f32 model output[0]: **5.0f**
  - qdq model output[0]: **4.992**
  - SNR: 55.656 (higher is better)
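As a quick, self-contained illustration of the failure mode, the following is a minimal numpy sketch (not the quantizer's actual code) that reproduces the clipping using the example bias `[5, -4.5]` and the bias scale computed above:

```python
import numpy as np

bias_f32 = np.array([5.0, -4.5], dtype=np.float32)
bias_scale = 3.9215686274509805e-11  # the too-small bias scale from the example above

int32_min = np.iinfo(np.int32).min
int32_max = np.iinfo(np.int32).max

# Quantized values land near 1.275e11 and -1.1475e11, far outside int32's range ...
bias_quant = np.round(bias_f32.astype(np.float64) / bias_scale)

# ... so clipping pins them to int32_max / int32_min ...
bias_quant_clipped = np.clip(bias_quant, int32_min, int32_max).astype(np.int32)

# ... and the dequantized bias collapses to roughly [0.084, -0.084] instead of [5.0, -4.5].
print(bias_quant_clipped.astype(np.float64) * bias_scale)
```

With the weight-scale adjustment described above, `bias_scale` is raised enough that the rounded bias values stay at or near the int32 range instead of overflowing it by orders of magnitude, which is what restores the accuracy shown in the comparison above.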
1 parent 1759211 commit 4f95575

9 files changed: +822 −186 lines

onnxruntime/python/tools/quantization/base_quantizer.py

+43 −18
```diff
@@ -21,7 +21,6 @@
 from .quant_utils import (
     ONNX_TYPE_TO_NP_TYPE,
     TENSOR_NAME_QUANT_SUFFIX,
-    QuantType,
     find_by_name,
     model_has_infer_metadata,
     normalize_axis,
@@ -40,18 +39,26 @@ def __init__(self, **data: Dict[str, Any]):
         for k, v in data.items():
             if not isinstance(k, str):
                 raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
-            if not isinstance(v, (int, str, np.ndarray)):
+            if k != "axis" and not isinstance(v, (int, str, np.ndarray)):
                 raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
+            if k == "axis" and not isinstance(v, int) and v is not None:
+                raise TypeError(f"Axis value must be an int or None, not {type(v)}.")
             if k == "scale" and v.dtype not in (np.float32, np.float16):
                 raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
             self.data[k] = v

+    def get(self, key, default_value=None):
+        return self.data.get(key, default_value)
+
     def __iter__(self):
         yield from self.data

     def __getitem__(self, key):
         return self.data[key]

+    def __setitem__(self, key, value):
+        self.data[key] = value
+
     def __len__(self):
         return len(self.data)

@@ -88,9 +95,10 @@ def __init__(
         self.force_quantize_no_input_check = (
             "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
         )
-        self.is_weight_symmetric = self.extra_options.get(
-            "WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN)
-        )
+
+        # If user does not explicitly set "WeightSymmetric", then the weight's quantization type determines
+        # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
+        self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
         self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
         self.min_real_range = self.extra_options.get("MinimumRealRange")

@@ -131,6 +139,16 @@ def __init__(

         self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()

+    def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool:
+        if self._is_weight_symmetric is not None:
+            return self._is_weight_symmetric  # Return value explicitly set by user.
+        return weight_quant_type in (
+            onnx.TensorProto.INT4,
+            onnx.TensorProto.INT8,
+            onnx.TensorProto.INT16,
+            onnx.TensorProto.FLOAT8E4M3FN,
+        )
+
     def quantize_model(self):
         raise NotImplementedError

@@ -230,9 +248,19 @@ def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1
         # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
         bias_scale = input_scale * weight_scale * beta

-        quantized_data = (np.asarray(bias_data) / bias_scale).round()
-        quantized_data = np.clip(quantized_data, np.iinfo(np.int32).min, np.iinfo(np.int32).max)
-        quantized_data = quantized_data.astype(np.int32)
+        # Quantize by dividing by bias_scale
+        quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64)
+        quantized_data = quantized_data.round()
+
+        # Clip quantized data to the range of a int32
+        int32_min = np.float64(np.iinfo(np.int32).min)
+        int32_max = np.float64(np.iinfo(np.int32).max)
+        if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max):
+            logging.warning(
+                f"Quantized bias `{bias_name}` exceeds the range of a int32. The bias scale is too small."
+            )
+
+        quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32)

         # update bias initializer
         bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
@@ -282,6 +310,7 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa
         If keep_float_weight is False, quantize the weight, or don't quantize the weight.
         :return: quantized weight name, zero point name, scale name
         """
+        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
         q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
         zp_name = weight.name + "_zero_point"
         scale_name = weight.name + "_scale"
@@ -303,10 +332,11 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa
             assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"

         else:
-            _, _, zero_point, scale, q_weight_data = quantize_data(
+            symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric
+            zero_point, scale, q_weight_data = quantize_data(
                 weight_data.flatten(),
                 qType,
-                quant_overrides.get("symmetric", self.is_weight_symmetric),
+                quant_overrides.get("symmetric", symmetric),
                 reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
                 min_real_range=self.min_real_range,
                 rmin_override=quant_overrides.get("rmin"),
@@ -371,6 +401,7 @@ def quantize_weight_per_channel_impl(
         reduce_range=True,
         keep_float_weight=False,
     ):
+        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
        initializer = find_by_name(weight_name, self.model.initializer())
        if initializer is None:
            raise ValueError("{} is not an initializer", weight_name)
@@ -409,13 +440,7 @@ def quantize_weight_per_channel_impl(
         if "quant_type" in quant_overrides_for_channels[0]:
             weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type  # noqa: N806

-        symmetric = quant_overrides_for_channels[0].get(
-            "symmetric",
-            (
-                self.is_weight_symmetric
-                or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
-            ),
-        )
+        symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType))
         reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
         zero_point_list = []
         scale_list = []
@@ -444,7 +469,7 @@ def quantize_weight_per_channel_impl(
             ), f"Unexpected type {type(quantized_per_channel_data)}"

         else:
-            _, _, zero_point, scale, quantized_per_channel_data = quantize_data(
+            zero_point, scale, quantized_per_channel_data = quantize_data(
                 per_channel_data.flatten(),
                 weight_qType,
                 symmetric,
```
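The behavioral change worth calling out in this file is that weight symmetry is now derived from the weight's quantization type at call time rather than fixed in the constructor. Below is a rough standalone paraphrase of that rule, an illustration only, not the `BaseQuantizer.is_weight_symmetric` method itself; it assumes an onnx release that defines `TensorProto.INT4`:

```python
# Paraphrase of the new default-symmetry rule (illustration, not the class method).
import onnx


def default_weight_symmetry(weight_quant_type: int, user_setting=None) -> bool:
    """An explicit "WeightSymmetric" setting wins; otherwise signed integer types
    and FLOAT8E4M3FN default to symmetric quantization."""
    if user_setting is not None:
        return user_setting
    return weight_quant_type in (
        onnx.TensorProto.INT4,
        onnx.TensorProto.INT8,
        onnx.TensorProto.INT16,
        onnx.TensorProto.FLOAT8E4M3FN,
    )


print(default_weight_symmetry(onnx.TensorProto.INT8))    # True: signed type defaults to symmetric
print(default_weight_symmetry(onnx.TensorProto.UINT16))  # False: unsigned type defaults to asymmetric
print(default_weight_symmetry(onnx.TensorProto.UINT8, user_setting=True))  # True: explicit override wins
```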

onnxruntime/python/tools/quantization/onnx_model.py

+20
```diff
@@ -296,6 +296,26 @@ def get_largest_node_name_suffix(self, node_name_prefix):

         return suffix

+    def get_largest_initializer_name_suffix(self, initializer_name_prefix):
+        """
+        Gets the largest initializer name integer suffix for all initializer names that begin
+        with `initializer_name_prefix`. This can be used to create unique initializer names.
+
+        Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if
+        `initializer_name_prefix` is 'my_weight_'.
+        """
+        suffix = -1
+
+        for initializer in self.model.graph.initializer:
+            if initializer.name.startswith(initializer_name_prefix):
+                try:
+                    index = int(initializer.name[len(initializer_name_prefix) :])
+                    suffix = max(index, suffix)
+                except ValueError:
+                    continue
+
+        return suffix
+
     def find_nodes_by_initializer(self, graph, initializer):
         """
         Find all nodes with given initializer as an input.
```
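A hypothetical usage sketch for the new helper follows. The model path and the `my_weight_` prefix are made-up examples, and it assumes `ONNXModel` from `onnxruntime.quantization.onnx_model` wraps a loaded `ModelProto`, as it does elsewhere in the quantization tools:

```python
import onnx
from onnxruntime.quantization.onnx_model import ONNXModel

# Load a model and wrap it (example path).
onnx_model = ONNXModel(onnx.load("model.onnx"))

# Find the largest integer suffix among initializers named "my_weight_<N>" (example prefix).
prefix = "my_weight_"
largest = onnx_model.get_largest_initializer_name_suffix(prefix)

# The helper returns -1 when no initializer matches, so suffix + 1 always yields an unused name.
unique_name = f"{prefix}{largest + 1}"
```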

0 commit comments
