Commit e8a42b6

EikanWang authored and cyyever committed
[Inductor] Enable Inductor to support BF16 atomic_add (#96620)
Pull Request resolved: pytorch/pytorch#96620 Approved by: https://github.com/jansel, https://github.com/jgong5
1 parent 6db9be8 · commit e8a42b6

2 files changed: +54 −1 lines

test/inductor/test_torchinductor.py
Lines changed: 42 additions & 0 deletions

```diff
@@ -6141,6 +6141,48 @@ def test_cpu_vec_cosim(self):
         union = {*cpp_vec_op_list, *diff}
         self.assertTrue(set(cpp_op_list).issubset(union))
 
+    def test_atomic_add_bf16(self):
+        def fn(test_args):
+            res = torch.gather(**test_args)
+            return res
+
+        input_tensor_for_ref = torch.tensor(
+            [[3.0, -5.0]], dtype=torch.bfloat16, requires_grad=True
+        )
+        input_tensor_for_opt = torch.tensor(
+            [[3.0, -5.0]], dtype=torch.bfloat16, requires_grad=True
+        )
+
+        test_args_for_ref = {
+            "input": input_tensor_for_ref,
+            "dim": 1,
+            "index": torch.tensor([[1]]),
+        }
+        test_args_for_opt = {
+            "input": input_tensor_for_opt,
+            "dim": 1,
+            "index": torch.tensor([[1]]),
+        }
+
+        opt_fn = torch.compile(fn)
+
+        ref_fwd = fn(test_args_for_ref)
+        res_fwd = opt_fn(test_args_for_opt)
+        self.assertEqual(res_fwd, ref_fwd)
+
+        torch.manual_seed(1)
+        bwd_tensor_for_ref = torch.randn(ref_fwd.shape, dtype=torch.bfloat16)
+        torch.manual_seed(1)
+        bwd_tensor_for_opt = torch.randn(res_fwd.shape, dtype=torch.bfloat16)
+        self.assertEqual(bwd_tensor_for_ref, bwd_tensor_for_opt)
+
+        ref_fwd.backward(bwd_tensor_for_ref)
+        res_fwd.backward(bwd_tensor_for_opt)
+
+        ref_grad = test_args_for_ref["input"].grad
+        res_grad = test_args_for_opt["input"].grad
+        self.assertEqual(ref_grad, res_grad)
+
     @unittest.skipIf(
         not codecache.valid_vec_isa_list(), "Does not support vectorization"
     )
```
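Why a gather test exercises atomic_add: the backward of torch.gather is a scatter-style index add, and Inductor's C++ backend emits atomic_add for it because distinct output positions can map to the same input slot. The sketch below is illustrative only, not part of the diff: it shows that collision pattern with the same CAS-on-integer-view loop the header already used for float. The helper name atomic_add_f32 and the two-thread driver are inventions of this demo.

```cpp
// Illustrative sketch only: two threads accumulate into the same float slot,
// the collision pattern a scatter-add backward can produce. The CAS-on-
// integer-view trick mirrors what cpp_prefix.h does for float (and, after
// this commit, for bfloat16); assumes std::atomic<uint32_t> is the same size
// as float, as the header does.
#include <atomic>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <thread>

void atomic_add_f32(float *addr, float offset) {
  // View the float's storage as a 32-bit integer, as AsIntegerType<float> does.
  auto *atomic_addr = reinterpret_cast<std::atomic<uint32_t> *>(addr);
  uint32_t expected = atomic_addr->load();
  uint32_t desired;
  do {
    float val;
    std::memcpy(&val, &expected, sizeof(val));     // current value
    const float sum = val + offset;                // plain FP add
    std::memcpy(&desired, &sum, sizeof(desired));  // re-encode as bits
    // compare_exchange_weak refreshes `expected` with the latest bits on failure.
  } while (!atomic_addr->compare_exchange_weak(expected, desired));
}

int main() {
  float out[2] = {0.0f, 0.0f};
  // Both threads target index 1, like the test's index = [[1]].
  auto worker = [&] {
    for (int i = 0; i < 1000; ++i) atomic_add_f32(&out[1], 1.0f);
  };
  std::thread t1(worker), t2(worker);
  t1.join();
  t2.join();
  std::cout << out[1] << "\n";  // 2000: no lost updates
}
```

With a plain non-atomic read-add-write, the two threads could interleave and drop updates; in the test above that would surface as a wrong input.grad.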

torch/_inductor/codegen/cpp_prefix.h
Lines changed: 12 additions & 1 deletion

```diff
@@ -38,6 +38,17 @@ float randn_cpu(uint32_t seed, uint32_t offset) {
 template <typename T> struct AsIntegerType { typedef T type; };
 template <> struct AsIntegerType<float> { typedef uint32_t type; };
 template <> struct AsIntegerType<double> { typedef uint64_t type; };
+template <> struct AsIntegerType<bfloat16> { typedef uint16_t type; };
+
+template <typename T>
+inline T fetch_value(volatile T *addr) {
+  return *addr;
+}
+
+template <>
+inline bfloat16 fetch_value<bfloat16>(volatile bfloat16 *addr) {
+  return bfloat16(addr->x);
+}
 
 template <typename T> void atomic_add(volatile T *addr, T offset) {
   typedef typename AsIntegerType<T>::type alt_type;
@@ -51,7 +62,7 @@ template <typename T> void atomic_add(volatile T *addr, T offset) {
 
   std::atomic<alt_type> *atomic_addr = (std::atomic<alt_type> *)addr;
   do {
-    T val = *addr;
+    T val = fetch_value(addr);
     reinterpret_cast<T *>(&expected)[0] = val;
     reinterpret_cast<T *>(&desired)[0] = val + offset;
   } while (!atomic_addr->compare_exchange_weak(expected, desired,
```
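Put together, the mechanism is small enough to compile on its own. The sketch below reproduces the diff's AsIntegerType/fetch_value approach around a minimal bfloat16 stand-in; the only property it borrows from PyTorch's c10::BFloat16 is that the raw bits live in a .x field. The truncating float conversions, operator+, and main() driver are assumptions of this demo, not part of the commit.

```cpp
// Standalone sketch of the commit's approach, built around a minimal
// bfloat16 stand-in (raw bits in `.x`, as in c10::BFloat16). Not the
// actual PyTorch header.
#include <atomic>
#include <cstdint>
#include <cstring>
#include <iostream>

struct bfloat16 {
  uint16_t x = 0;
  bfloat16() = default;
  explicit bfloat16(uint16_t bits) : x(bits) {}
  // Truncating conversions: enough for exactly-representable demo values.
  static bfloat16 from_float(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return bfloat16(static_cast<uint16_t>(u >> 16));
  }
  float to_float() const {
    const uint32_t u = static_cast<uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
  }
};

inline bfloat16 operator+(bfloat16 a, bfloat16 b) {
  return bfloat16::from_float(a.to_float() + b.to_float());
}

// The commit's mapping: bf16 values are CAS'd through a 16-bit integer view.
template <typename T> struct AsIntegerType { typedef T type; };
template <> struct AsIntegerType<bfloat16> { typedef uint16_t type; };

// Generic path dereferences the volatile pointer; the bf16 specialization
// rebuilds the value from its raw bits instead.
template <typename T> inline T fetch_value(volatile T *addr) { return *addr; }
template <>
inline bfloat16 fetch_value<bfloat16>(volatile bfloat16 *addr) {
  return bfloat16(addr->x);
}

template <typename T> void atomic_add(volatile T *addr, T offset) {
  typedef typename AsIntegerType<T>::type alt_type;
  static_assert(sizeof(std::atomic<alt_type>) == sizeof(T),
                "integer view must match the value's storage");
  alt_type expected;
  alt_type desired;
  auto *atomic_addr = (std::atomic<alt_type> *)addr;
  do {
    const T val = fetch_value(addr);
    std::memcpy(&expected, &val, sizeof(expected));  // header uses reinterpret_cast stores
    const T sum = val + offset;
    std::memcpy(&desired, &sum, sizeof(desired));
  } while (!atomic_addr->compare_exchange_weak(expected, desired));
}

int main() {
  bfloat16 acc = bfloat16::from_float(3.0f);      // the test's 3.0
  atomic_add(&acc, bfloat16::from_float(-5.0f));  // gradient-style add
  std::cout << acc.to_float() << "\n";            // -2
}
```

CAS through the integer view works because compare_exchange compares raw bits, so the loop retries exactly when another thread changed the slot. The fetch_value specialization exists, presumably, because `*addr` compiles for built-in float/double but not for a class type like bfloat16, whose copy constructor cannot bind to a volatile operand; reading the .x bits directly sidesteps that.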
