
Commit 5e4d50c

Reintroduce has_weight_zeros as a template param

Differential Revision: D71503133
Pull Request resolved: #1991
1 parent 70fc520

Summary: this commit turns has_weight_zeros from a runtime bool argument into a compile-time template parameter on the aarch64 channelwise-8-bit-activation / groupwise-lowbit-weight linear kernels, so the weight-zero-point correction can be compiled in or out with if constexpr. To keep runtime dispatch working, check_format changes from a template into an inline function that takes weight_nbit as an argument, and the universal kernel registration branches on format.has_weight_zeros to pick the matching instantiation.

File tree: 8 files changed, +92 −64 lines


torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h

Lines changed: 4 additions & 4 deletions
@@ -245,7 +245,7 @@ void kernel_1x4x16_f32_neondot(
       has_clamp);
 }
 
-template <int weight_nbit, bool has_lut>
+template <int weight_nbit, bool has_weight_zeros, bool has_lut>
 void kernel_1x8x16_f32_neondot(
     // Outputs
     float32_t* output,
@@ -260,10 +260,11 @@ void kernel_1x8x16_f32_neondot(
     // Ignored if has_clamp = false
     float clamp_min,
     float clamp_max,
-    bool has_weight_zeros,
+    bool has_weight_zeros_,
    bool has_bias,
    bool has_clamp) {
-  kernel::kernel_1x8x16_f32_neondot<weight_nbit, has_lut>(
+  (void)has_weight_zeros_; // unused
+  kernel::kernel_1x8x16_f32_neondot<weight_nbit, has_weight_zeros, has_lut>(
       output,
       output_m_stride,
       m,
@@ -274,7 +275,6 @@ void kernel_1x8x16_f32_neondot(
       packed_activations,
       clamp_min,
       clamp_max,
-      has_weight_zeros,
       has_bias,
       has_clamp);
 }
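The wrapper above keeps the runtime bool in its signature so existing call sites and function-pointer tables still compile, while behavior now comes from the template parameter; the (void) cast silences the unused-parameter warning. A minimal sketch of the pattern, with simplified names that are not the torchao code:

#include <cstdio>

namespace kernel {
// Stand-in for the real kernel: behavior is fixed at compile time.
template <bool has_weight_zeros>
void impl() {
  std::printf("kernel instantiated with has_weight_zeros=%d\n",
              has_weight_zeros);
}
} // namespace kernel

// The wrapper keeps the old runtime parameter (now trailing-underscored)
// so its signature stays unchanged, but ignores it; the template argument
// selects the instantiation.
template <bool has_weight_zeros>
void wrapper(bool has_weight_zeros_) {
  (void)has_weight_zeros_; // unused, kept for signature compatibility
  kernel::impl<has_weight_zeros>();
}

int main() {
  wrapper<true>(true);
  wrapper<false>(false);
  return 0;
}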

torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h

Lines changed: 2 additions & 3 deletions
@@ -58,7 +58,7 @@ vec_clamp(float32x4_t x, float32x4_t vec_min, float32x4_t vec_max) {
 // Roughly inspired by
 // https://gitlab.arm.com/kleidi/kleidiai/-/blob/main/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c?ref_type=heads
 
-template <int weight_nbit, bool has_lut>
+template <int weight_nbit, bool has_weight_zeros, bool has_lut>
 void kernel_1x8x16_f32_neondot(
     // Outputs
     float32_t* output,
@@ -73,7 +73,6 @@ void kernel_1x8x16_f32_neondot(
     // Ignored if has_clamp is false
     float clamp_min,
     float clamp_max,
-    bool has_weight_zeros,
     bool has_bias,
     bool has_clamp) {
   assert(k % group_size == 0);
@@ -267,7 +266,7 @@ void kernel_1x8x16_f32_neondot(
 
   int32x4_t term1_4567 = vmulq_n_s32(weight_qvals_sum, activation_zero);
 
-  if (has_weight_zeros) {
+  if constexpr (has_weight_zeros) {
     // Compute term2 and term3
 
     int32_t activation_qvals_sum = *((int32_t*)activation_ptr);
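Switching if (has_weight_zeros) to if constexpr (has_weight_zeros) is the payoff of the template parameter: in the <false> instantiation the zero-point correction branch is discarded at compile time instead of being tested inside the inner loop. A self-contained sketch of the effect, not the kernel itself:

#include <cstdio>

// term1 mirrors the always-present accumulation; the correction only
// exists in instantiations compiled with has_weight_zeros = true.
template <bool has_weight_zeros>
int accumulate(int term1, int correction) {
  int acc = term1;
  if constexpr (has_weight_zeros) {
    // Dead code in accumulate<false>: the compiler never emits this branch.
    acc -= correction;
  }
  return acc;
}

int main() {
  std::printf("%d\n", accumulate<false>(10, 3)); // prints 10
  std::printf("%d\n", accumulate<true>(10, 3));  // prints 7
  return 0;
}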

torchao/experimental/kernels/cpu/aarch64/linear/linear.h

Lines changed: 4 additions & 3 deletions
@@ -320,7 +320,7 @@ void prepare_weight_data(
       bias);
 }
 
-template <int weight_nbit>
+template <int weight_nbit, bool has_weight_zeros>
 void kernel(
     // Outputs
     float32_t* output,
@@ -335,12 +335,13 @@ void kernel(
     // Ignored if has_clamp = false
     float clamp_min,
     float clamp_max,
-    bool has_weight_zeros,
+    bool has_weight_zeros_,
     bool has_bias,
     bool has_clamp) {
+  (void)has_weight_zeros_; // unused
   torchao::kernels::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::
-          kernel_1x8x16_f32_neondot<weight_nbit, /*has_lut*/ false>(
+          kernel_1x8x16_f32_neondot<weight_nbit, has_weight_zeros, /*has_lut*/ false>(
               output,
               output_m_stride,
               m,

torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp

Lines changed: 24 additions & 20 deletions
@@ -311,7 +311,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot
       bias_ptr);
 
   std::vector<float> output(m * n);
-  kernel<weight_nbit>(
+  kernel<weight_nbit, has_weight_zeros>(
       output.data(),
       /*output_m_stride=*/n,
       m,
@@ -388,13 +388,12 @@ TEST(
   }
 }
 
-template <int weight_nbit>
+template <int weight_nbit, bool has_weight_zeros>
 void test_channelwise_8bit_activation_groupwise_lowbit_weight_lut(
     int m,
     int k,
     int n,
     int group_size,
-    bool has_weight_zeros,
     bool has_bias,
     bool has_clamp) {
   constexpr int mr = 1;
@@ -453,7 +452,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_lut(
       has_bias ? test_case.bias.data() : nullptr);
 
   std::vector<float> output(m * n);
-  kernel_1x8x16_f32_neondot<weight_nbit, /*has_lut*/ true>(
+  kernel_1x8x16_f32_neondot<weight_nbit, has_weight_zeros, /*has_lut*/ true>(
       output.data(),
       /*output_m_stride=*/n,
       m,
@@ -476,85 +475,90 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_lut(
 TEST(test_channelwise_8bit_activation_groupwise_lowbit_weight, LUT) {
   constexpr int weight_nbit = 4;
 
-  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<weight_nbit>(
+  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
+      weight_nbit,
+      /*has_weight_zeros*/ false>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/false,
       /*has_bias=*/false,
       /*has_clamp=*/false);
 
   // has_weight_zeros
-  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<weight_nbit>(
+  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
+      weight_nbit,
+      /*has_weight_zeros*/ true>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/true,
       /*has_bias=*/false,
       /*has_clamp=*/false);
 
   // has_bias
-  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<weight_nbit>(
+  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
+      weight_nbit,
+      /*has_weight_zeros=*/false>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/false,
       /*has_bias=*/true,
       /*has_clamp=*/false);
 
   // has_clamp
-  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<weight_nbit>(
+  test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
+      weight_nbit,
+      /*has_weight_zeros*/ false>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/false,
       /*has_bias=*/false,
       /*has_clamp=*/true);
 
   // n less than 8 (nr)
   for (int n = 1; n < 8; n++) {
-    test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<weight_nbit>(
+    test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
+        weight_nbit,
+        /*has_weight_zeros=*/false>(
        /*m=*/7,
        /*k=*/64,
        /*n=*/n,
        /*group_size=*/16,
-        /*has_weight_zeros=*/false,
        /*has_bias=*/false,
        /*has_clamp=*/false);
   }
 
   // Other bitwidths
   test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
-      /*weight_nbit*/ 1>(
+      /*weight_nbit*/ 1,
+      /*has_weight_zeros=*/false>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/false,
       /*has_bias=*/false,
       /*has_clamp=*/false);
 
   test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
-      /*weight_nbit*/ 2>(
+      /*weight_nbit*/ 2,
+      /*has_weight_zeros=*/false>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/false,
       /*has_bias=*/false,
       /*has_clamp=*/false);
 
   test_channelwise_8bit_activation_groupwise_lowbit_weight_lut<
-      /*weight_nbit*/ 3>(
+      /*weight_nbit*/ 3,
+      /*has_weight_zeros=*/false>(
       /*m=*/7,
       /*k=*/64,
       /*n=*/13,
       /*group_size=*/16,
-      /*has_weight_zeros=*/false,
       /*has_bias=*/false,
       /*has_clamp=*/false);
 }
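Because has_weight_zeros is now a template argument, each test case above names its value at compile time. If a test ever needs to drive the flag from a runtime value instead, it has to dispatch to the two instantiations explicitly; a sketch with a hypothetical helper, not part of this PR:

#include <cstdio>

// Stand-in for one templated test case.
template <bool has_weight_zeros>
void run_case(int m, int k, int n) {
  std::printf("m=%d k=%d n=%d has_weight_zeros=%d\n",
              m, k, n, has_weight_zeros);
}

// Hypothetical bridge from a runtime flag to the two instantiations.
void run_case(bool has_weight_zeros, int m, int k, int n) {
  if (has_weight_zeros) {
    run_case<true>(m, k, n);
  } else {
    run_case<false>(m, k, n);
  }
}

int main() {
  run_case(/*has_weight_zeros=*/true, 7, 64, 13);
  run_case(/*has_weight_zeros=*/false, 7, 64, 13);
  return 0;
}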

torchao/experimental/ops/embedding_xbit/op_embedding_xbit-impl.h

Lines changed: 5 additions & 8 deletions
@@ -253,9 +253,11 @@ Tensor shared_embedding_out_cpu(
       torchao::ops::PackedWeightsHeader::read(packed_weights.const_data_ptr());
   auto format = torchao::ops::linear_8bit_act_xbit_weight::PackedWeightsFormat::
       from_packed_weights_header(header);
-  torchao::ops::linear_8bit_act_xbit_weight::check_format<weight_nbit>(
+
+  torchao::ops::linear_8bit_act_xbit_weight::check_format(
       format,
-      torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal);
+      torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal,
+      weight_nbit);
   constexpr int nr = 8;
   constexpr int kr = 16;
   constexpr int sr = 2;
@@ -316,12 +318,7 @@ Tensor shared_embedding_cpu(
     const Tensor& indices) {
   Tensor output_tensor = torch::empty({}, torch::kFloat32);
   shared_embedding_out_cpu<weight_nbit>(
-      packed_weights,
-      group_size,
-      n,
-      k,
-      indices,
-      output_tensor);
+      packed_weights, group_size, n, k, indices, output_tensor);
   return output_tensor;
 }
 #endif // USE_ATEN

torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h

Lines changed: 49 additions & 22 deletions
@@ -89,35 +89,62 @@ void register_ukernel_config_universal(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
-  check_format<weight_nbit>(
+
+  check_format(
       format,
-      torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal);
+      torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_universal,
+      weight_nbit);
 
   if (format.nr == 8 && format.kr == 16 && format.sr == 2) {
 #if defined(TORCHAO_BUILD_CPU_AARCH64)
     if (cpuinfo_has_arm_neon_dot()) {
       log_registration(format, "universal");
       namespace kernel = torchao::kernels::cpu::aarch64::linear::
           channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot;
-      table.register_ukernel_config(
-          format,
-          uarch,
-          UKernelConfig{
-              /*preferred_alignment*/ 16,
-              /*nr*/ 8,
-              /*weight_packing_config*/
-              {/*weight_data_size_fn*/
-               &kernel::weight_data_size<weight_nbit>,
-               /*prepare_weight_data_fn*/
-               &kernel::prepare_weight_data<weight_nbit>},
-              /*linear_configs*/
-              {{{/*mr*/ 1,
-                 /*activation_data_size_fn*/
-                 &kernel::activation_data_size,
-                 /*prepare_activation_data_fn*/
-                 &kernel::prepare_activation_data,
-                 /*kernel*/
-                 &kernel::kernel<weight_nbit>}}}});
+
+      if (format.has_weight_zeros) {
+        constexpr bool has_weight_zeros = true;
+        table.register_ukernel_config(
+            format,
+            uarch,
+            UKernelConfig{
+                /*preferred_alignment*/ 16,
+                /*nr*/ 8,
+                /*weight_packing_config*/
+                {/*weight_data_size_fn*/
+                 &kernel::weight_data_size<weight_nbit>,
+                 /*prepare_weight_data_fn*/
+                 &kernel::prepare_weight_data<weight_nbit>},
+                /*linear_configs*/
+                {{{/*mr*/ 1,
+                   /*activation_data_size_fn*/
+                   &kernel::activation_data_size,
+                   /*prepare_activation_data_fn*/
+                   &kernel::prepare_activation_data,
+                   /*kernel*/
+                   &kernel::kernel<weight_nbit, has_weight_zeros>}}}});
+      } else {
+        constexpr bool has_weight_zeros = false;
+        table.register_ukernel_config(
+            format,
+            uarch,
+            UKernelConfig{
+                /*preferred_alignment*/ 16,
+                /*nr*/ 8,
+                /*weight_packing_config*/
+                {/*weight_data_size_fn*/
+                 &kernel::weight_data_size<weight_nbit>,
+                 /*prepare_weight_data_fn*/
+                 &kernel::prepare_weight_data<weight_nbit>},
+                /*linear_configs*/
+                {{{/*mr*/ 1,
+                   /*activation_data_size_fn*/
+                   &kernel::activation_data_size,
+                   /*prepare_activation_data_fn*/
+                   &kernel::prepare_activation_data,
+                   /*kernel*/
+                   &kernel::kernel<weight_nbit, has_weight_zeros>}}}});
+      }
       return;
     }
 #endif // TORCHAO_BUILD_CPU_AARCH64
@@ -166,7 +193,7 @@ void register_ukernel_config_kleidi(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
-  check_format<weight_nbit>(format, torchao::ops::PackedWeightsType::kleidi_ai);
+  check_format(format, torchao::ops::PackedWeightsType::kleidi_ai, weight_nbit);
   namespace op = torchao::kernels::cpu::aarch64::kleidi::
       kai_matmul_clamp_f32_qai8dxp_qsi4c32p;
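The two registration branches above differ only in the constexpr bool, so the UKernelConfig body is written twice. One way the duplication could be folded (a sketch with stand-in types, not a change made in this PR) is a templated factory instantiated once per flag value and selected by the runtime if/else:

#include <cstdio>

struct Config {
  void (*kernel)(); // stand-in for the real UKernelConfig fields
};

template <bool has_weight_zeros>
void kernel_fn() {
  std::printf("registered kernel<has_weight_zeros=%d>\n", has_weight_zeros);
}

// The config body lives in one place; the template parameter picks the
// kernel instantiation that goes into the table.
template <bool has_weight_zeros>
Config make_universal_config() {
  return Config{&kernel_fn<has_weight_zeros>};
}

void register_universal(bool format_has_weight_zeros) {
  Config c = format_has_weight_zeros ? make_universal_config<true>()
                                     : make_universal_config<false>();
  c.kernel(); // stand-in for table.register_ukernel_config(format, uarch, c)
}

int main() {
  register_universal(true);
  register_universal(false);
  return 0;
}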

torchao/experimental/ops/linear_8bit_act_xbit_weight/packed_weights_format.h

Lines changed: 3 additions & 3 deletions
@@ -53,10 +53,10 @@ struct PackedWeightsFormat {
   }
 };
 
-template <int weight_nbit>
-void check_format(
+inline void check_format(
     PackedWeightsFormat format,
-    torchao::ops::PackedWeightsType type) {
+    torchao::ops::PackedWeightsType type,
+    int weight_nbit) {
   if (format.type != type) {
     throw std::runtime_error(
         "Kernel expects packed_weights type=" +

torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ UKernelConfig get_ukernel_config() {
                  /*prepare_activation_data_fn*/
                  &kernel::prepare_activation_data,
                  /*kernel*/
-                 &kernel::kernel<weight_nbit>}}}};
+                 &kernel::kernel<weight_nbit, has_weight_zeros>}}}};
 }
 
 template <
