
Commit e0c9312

Dark Knight (facebook-github-bot) authored and committed
Revert D62466496: Multisect successfully blamed "D62466496: [ExecuTorch] Build optimized kernels with bf16 support and gate usage at runtime" for one build failure
Summary:
This diff reverts D62466496 ("[ExecuTorch] Build optimized kernels with bf16 support and gate usage at runtime", by swolchok), which causes the following build failure.

Tests affected:
- [playground_mwa_all_for_perftest](https://www.internalfb.com/intern/test/844425060509872/)

Here's the Multisect link: https://www.internalfb.com/multisect/10105407

Here are the tasks that are relevant to this breakage:
- T191385168: 100+ CI signals unhealthy for mwa_import_android

The backout may land if someone accepts it. If this diff has been generated in error, you can Commandeer and Abandon it.

Reviewed By: sheepsword

Differential Revision: D62678457

fbshipit-source-id: 8a06dc283aa0ecabb1c75114166fa7b7184df989
1 parent 768f5c9 · commit e0c9312

File tree

- kernels/optimized/blas/BlasKernel.cpp
- kernels/optimized/lib_defs.bzl
- kernels/test/op_linear_test.cpp
- shim/xplat/executorch/build/env_interface.bzl

4 files changed (+28, -55 lines)

kernels/optimized/blas/BlasKernel.cpp (+23, -38)

@@ -10,7 +10,6 @@
 
 #ifdef __aarch64__
 #include <arm_neon.h>
-#include <cpuinfo.h>
 #endif
 
 using torch::executor::BFloat16;
@@ -81,37 +80,32 @@ f32_dot_bf16(float32x4_t a, bfloat16x8_t b, bfloat16x8_t c) {
 }
 #endif
 
-template <bool useBfloat16Dot>
 static ET_INLINE void dot_with_fp32_arith_main_inner_loop(
     const BFloat16* vec1,
     const BFloat16* vec2,
     float32x4_t sum[kF32RegistersPerIteration],
     int registerPairIndex) {
 #ifdef __ARM_FEATURE_BF16
-  if (useBfloat16Dot) {
-    const bfloat16x8_t temp_vec1 = vld1q_bf16(reinterpret_cast<const __bf16*>(
-        &vec1[registerPairIndex * 2 * kF32ElementsPerRegister]));
-    const bfloat16x8_t temp_vec2 = vld1q_bf16(reinterpret_cast<const __bf16*>(
-        &vec2[registerPairIndex * 2 * kF32ElementsPerRegister]));
-    sum[registerPairIndex] =
-        f32_dot_bf16(sum[registerPairIndex], temp_vec1, temp_vec2);
-  } else {
-#endif
-    const uint16x8_t temp_vec1 = vld1q_u16(reinterpret_cast<const uint16_t*>(
-        &vec1[registerPairIndex * 2 * kF32ElementsPerRegister]));
-    const uint16x8_t temp_vec2 = vld1q_u16(reinterpret_cast<const uint16_t*>(
-        &vec2[registerPairIndex * 2 * kF32ElementsPerRegister]));
-
-    sum[2 * registerPairIndex] = f32_fma_bf16(
-        sum[2 * registerPairIndex],
-        vget_low_u16(temp_vec1),
-        vget_low_u16(temp_vec2));
-    sum[2 * registerPairIndex + 1] = f32_fma_bf16(
-        sum[2 * registerPairIndex + 1],
-        vget_high_u16(temp_vec1),
-        vget_high_u16(temp_vec2));
-#ifdef __ARM_FEATURE_BF16
-  }
+  const bfloat16x8_t temp_vec1 = vld1q_bf16(reinterpret_cast<const __bf16*>(
+      &vec1[registerPairIndex * 2 * kF32ElementsPerRegister]));
+  const bfloat16x8_t temp_vec2 = vld1q_bf16(reinterpret_cast<const __bf16*>(
+      &vec2[registerPairIndex * 2 * kF32ElementsPerRegister]));
+  sum[registerPairIndex] =
+      f32_dot_bf16(sum[registerPairIndex], temp_vec1, temp_vec2);
+#else
+  const uint16x8_t temp_vec1 = vld1q_u16(reinterpret_cast<const uint16_t*>(
+      &vec1[registerPairIndex * 2 * kF32ElementsPerRegister]));
+  const uint16x8_t temp_vec2 = vld1q_u16(reinterpret_cast<const uint16_t*>(
+      &vec2[registerPairIndex * 2 * kF32ElementsPerRegister]));
+
+  sum[2 * registerPairIndex] = f32_fma_bf16(
+      sum[2 * registerPairIndex],
+      vget_low_u16(temp_vec1),
+      vget_low_u16(temp_vec2));
+  sum[2 * registerPairIndex + 1] = f32_fma_bf16(
+      sum[2 * registerPairIndex + 1],
+      vget_high_u16(temp_vec1),
+      vget_high_u16(temp_vec2));
 #endif
 }
 
@@ -127,7 +121,7 @@ static ET_INLINE void dot_with_fp32_arith_vectorized_tail_inner_loop(
   *tailSum = f32_fma_bf16(*tailSum, temp_vec1, temp_vec2);
 }
 
-template <typename T, bool useBfloat16Dot>
+template <typename T>
 float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) {
   float32x4_t sum[kF32RegistersPerIteration] = {vdupq_n_f32(0)};
   const auto len_aligned = len & ~(kF32ElementsPerIteration - 1);
@@ -136,8 +130,7 @@ float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) {
     const auto* vec2_ = vec2 + j;
     utils::ForcedUnroll<kF32RegisterPairsPerIteration>{}(
         [vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE {
-          dot_with_fp32_arith_main_inner_loop<useBfloat16Dot>(
-              vec1_, vec2_, sum, k);
+          dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k);
         });
   }
   auto reducedSum = reduce(sum);
@@ -164,15 +157,7 @@ float bf16_dot_with_fp32_arith(
     const BFloat16* vec1,
     const BFloat16* vec2,
     int64_t len) {
-#ifdef __ARM_FEATURE_BF16
-  if (cpuinfo_has_arm_bf16()) {
-    return dot_with_fp32_arith<BFloat16, true>(vec1, vec2, len);
-  } else {
-#endif
-    return dot_with_fp32_arith<BFloat16, false>(vec1, vec2, len);
-#ifdef __ARM_FEATURE_BF16
-  }
-#endif
+  return dot_with_fp32_arith(vec1, vec2, len);
 }
 #endif
 } // namespace internal
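
Note: the net effect of this file's hunks is to drop runtime gating of the bf16 fast path. Before the revert, a binary compiled with __ARM_FEATURE_BF16 still asked cpuinfo whether the running CPU actually implements the BF16 extension before taking the vld1q_bf16/f32_dot_bf16 path; after it, the choice is made purely at compile time via the #ifdef/#else above. The standalone sketch below only illustrates that runtime-dispatch pattern; it is not the ExecuTorch code, kernel_bf16 and kernel_fallback are hypothetical stand-ins, and it assumes the cpuinfo library is available to link against.

// Sketch of the runtime-dispatch pattern removed above (hypothetical stand-ins,
// cpuinfo assumed to be linked in).
#include <cstdio>

#ifdef __aarch64__
#include <cpuinfo.h>
#endif

float kernel_bf16() { return 1.0f; }      // stand-in for the vld1q_bf16 / f32_dot_bf16 path
float kernel_fallback() { return 0.0f; }  // stand-in for the uint16 load + f32_fma_bf16 path

float dispatch() {
#if defined(__aarch64__) && defined(__ARM_FEATURE_BF16)
  // Compile-time support alone is not enough in this pattern: the CPU the
  // binary runs on must also report the BF16 extension, hence the cpuinfo query.
  if (cpuinfo_initialize() && cpuinfo_has_arm_bf16()) {
    return kernel_bf16();
  }
#endif
  return kernel_fallback();
}

int main() {
  std::printf("selected kernel returned %.1f\n", dispatch());
  return 0;
}

The trade-off is visible in the rest of this commit: runtime dispatch keeps one binary correct on both bf16 and non-bf16 CPUs, but it brings along the cpuinfo dependency and the +bf16 compile flags that this revert backs out.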

kernels/optimized/lib_defs.bzl (-11)

@@ -129,14 +129,6 @@ def define_libs():
             ] if not runtime.is_oss else [],
             "DEFAULT": [],
         }),
-        fbandroid_platform_compiler_flags = [
-            (
-                "^android-arm64.*$",
-                [
-                    "-march=armv8+bf16",
-                ],
-            ),
-        ],
         fbandroid_platform_preprocessor_flags = [
             (
                 "^android-arm64.*$",
@@ -153,9 +145,6 @@
                 ],
            ),
        ],
-        fbobjc_compiler_flags = [
-            "-march=armv8+bf16",
-        ],
        fbobjc_exported_preprocessor_flags = [
            "-DET_BUILD_WITH_BLAS",
            "-DET_BUILD_FOR_APPLE",

kernels/test/op_linear_test.cpp (+4, -4)

@@ -43,16 +43,16 @@ class OpLinearOutTest : public OperatorTest {
       }
     }
 
-    // matmul gives 32 * 2 * 3 = 192
-    Tensor x = tf.full({3, 32}, 2);
-    Tensor y = tf.full({5, 32}, 3);
+    // matmul gives 4 * 2 * 3 = 24
+    Tensor x = tf.full({3, 4}, 2);
+    Tensor y = tf.full({5, 4}, 3);
 
     // Output shape should be (3, 5)
     Tensor out = tf.zeros({3, 5});
 
    op_linear_out(x, y, out);
 
-    Tensor expected = tf.full({3, 5}, 192);
+    Tensor expected = tf.full({3, 5}, 24);
 
    EXPECT_TENSOR_EQ(out, expected);
  }
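
Note: the updated expectation follows directly from the shapes and fill values: linear multiplies x by the transpose of the weight y, every element of x is 2, every element of y is 3, and the shared dimension shrinks from 32 to 4, so each of the 3x5 outputs becomes 4 * 2 * 3 = 24 instead of 32 * 2 * 3 = 192. A standalone check of that arithmetic with plain loops (not the ExecuTorch tensor API):

// x is 3x4 filled with 2, y is 5x4 filled with 3; linear computes x @ y^T,
// so every output element is the sum over k of 2 * 3, i.e. 4 * 2 * 3 = 24.
#include <cassert>

int main() {
  const int m = 3, n = 5, k = 4;
  float x[3][4], y[5][4], out[3][5];
  for (auto& row : x) for (auto& v : row) v = 2.0f;
  for (auto& row : y) for (auto& v : row) v = 3.0f;
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int kk = 0; kk < k; ++kk) {
        acc += x[i][kk] * y[j][kk];  // row of x dotted with row of y (y transposed)
      }
      out[i][j] = acc;
    }
  }
  assert(out[0][0] == 24.0f && out[2][4] == 24.0f);
  return 0;
}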

shim/xplat/executorch/build/env_interface.bzl (+1, -2)

@@ -118,8 +118,7 @@ def _remove_platform_specific_args(kwargs):
     """
     keys = []
     for key in kwargs:
-        if (key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or
-                key.startswith("fbobjc") or key.endswith("_platform_compiler_flags")):
+        if key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or key.startswith("fbobjc"):
             keys.append(key)
     for key in keys:
         kwargs.pop(key)
