
Commit: d02c568
Message: Merge fixes
Parent: 5d249fe
File tree: 3 files changed (+14, -9 lines)


csrc/quantization/compressed_tensors/int8_quant_kernels.cu
Lines changed: 4 additions & 2 deletions

@@ -150,9 +150,11 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   }

   // Reduce the max and min values across the block
-  max_val = blockReduceMax(max_val);
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStorage;
+  max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
   __syncthreads();  // Make sure min doesn't mess with max shared memory
-  min_val = blockReduceMin(min_val);
+  min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);

   __shared__ scale_type scale_sh;
   __shared__ azp_type azp_sh;

tests/kernels/test_int8_quant.py
Lines changed: 7 additions & 6 deletions

@@ -39,7 +39,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
     # reference
     ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.int8)
     # kernel
-    ops_out, ops_scales = scaled_int8_quant(x)
+    ops_out, ops_scales, _ = scaled_int8_quant(x)

     torch.testing.assert_close(ops_scales, ref_scales)
     torch.testing.assert_close(
@@ -103,11 +103,10 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,

     out1 = (x / scale).round().clamp(int8_traits.min,
                                      int8_traits.max).to(torch.int8)
-    out2, _ = scaled_int8_quant(x, scale)
+    out2, _, _ = scaled_int8_quant(x, scale)

-    torch.testing.assert_close(
-        out1, out2, atol=1,
-        rtol=0.0)  # big atol to account for rounding errors
+    # big atol to account for rounding errors
+    torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)


 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -135,4 +134,6 @@ def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,

     torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument,
                                           azp_argument)
-    torch.testing.assert_close(out1, out2, atol=1)  # atol for rounding
+
+    # big atol to account for rounding errors
+    torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)
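For context on the test updates above, here is a minimal sketch of the call-site change, inferred from this diff alone: scaled_int8_quant now returns a 3-tuple (output, scales, azp) instead of a 2-tuple, with azp being None on the symmetric paths. The tensor shapes, dtypes, and scale value below are illustrative assumptions, not taken from the test file.

    # Minimal sketch of the new call sites, assuming a CUDA build of vLLM.
    # Shapes/dtypes here are illustrative; the 3-tuple unpacking is the only
    # thing this diff actually changes.
    import torch

    from vllm._custom_ops import scaled_int8_quant

    x = torch.randn(16, 64, dtype=torch.float16, device="cuda")

    # Dynamic quantization: the kernel computes the scales itself.
    ops_out, ops_scales, _ = scaled_int8_quant(x)

    # Static quantization: the caller supplies the scale, which is returned
    # unchanged; the third element is None for symmetric quantization.
    scale = torch.tensor([0.02], dtype=torch.float32, device="cuda")
    out2, _, azp = scaled_int8_quant(x, scale)
    assert azp is None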

vllm/_custom_ops.py
Lines changed: 3 additions & 1 deletion

@@ -438,7 +438,9 @@ def scaled_int8_quant(
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
-        assert symmetric == azp is None, "azp must be only be provided for asymmetric quantization."
+        assert symmetric == (
+            azp is
+            None), "azp must only be provided for asymmetric quantization."
         torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, None
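The assertion rewrite above is a real bug fix, not just a reflow: symmetric == azp is None is a Python chained comparison and parses as (symmetric == azp) and (azp is None), not as symmetric == (azp is None). A standalone illustration, with variable values chosen only for the example:

    # Chained-comparison pitfall fixed in the diff above.
    symmetric, azp = True, None

    # Parses as (symmetric == azp) and (azp is None): True == None is False,
    # so the whole expression is False even though azp really is None.
    print(symmetric == azp is None)    # False

    # Parenthesized form from the fix: compares symmetric against the
    # boolean "azp is None", which is the intended check.
    print(symmetric == (azp is None))  # True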
