
[SYCL][CUDA] Joint_matrix elem wise ops inc bfloat16 #5964


Merged · Jun 30, 2022 · 52 commits (changes shown from 47 of the 52 commits)

Commits
025cf7e  Added bfloat16 support for cuda backend. (JackAKirk, Jan 25, 2022)
66b4e33  deleted intel namespace bfloat16. (JackAKirk, Jan 25, 2022)
2d04406  Format. (JackAKirk, Jan 25, 2022)
9418f74  Changed extension macro name. (JackAKirk, Jan 25, 2022)
65fddfa  Merge branch 'sycl' into bf16-cvt-ext (JackAKirk, Feb 17, 2022)
4d99f3f  fixed test. (JackAKirk, Feb 17, 2022)
3982001  Used neg ptx7.0 builtin for unary minus (JackAKirk, Mar 4, 2022)
8d2d11f  Replaced SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc with SYCL_EXT_ONEAPI… (JackAKirk, Mar 7, 2022)
d8bc53f  Merge branch 'sycl' into bf16-cvt-ext (JackAKirk, Mar 8, 2022)
2f9b7d7  Merge branch 'sycl' into bf16-cvt-ext (JackAKirk, Mar 15, 2022)
8a29c44  Renamed extension to cover all bfloat16 funct. (JackAKirk, Mar 15, 2022)
f1fba08  Updated macro name (JackAKirk, Mar 16, 2022)
461ddb8  Removed old extension doc (JackAKirk, Mar 16, 2022)
e433fbc  typo (JackAKirk, Mar 31, 2022)
48ee8ff  Initial bfloat16 function impl. (JackAKirk, Apr 1, 2022)
4a30a27  Added other bfloat16 scalar cases (JackAKirk, Apr 4, 2022)
5cb7b09  added bfloat16 device code test. (JackAKirk, Apr 4, 2022)
603ef6e  Merge branch 'sycl' into bfloat16-joint-matrix (JackAKirk, Apr 5, 2022)
081008b  Clarified error msg (JackAKirk, Apr 5, 2022)
25877c0  format (JackAKirk, Apr 5, 2022)
4b38281  format (JackAKirk, Apr 5, 2022)
7ed380c  removed deleted header from sycl.hpp (JackAKirk, Apr 5, 2022)
a53ce3d  example optimized impl using marray. (JackAKirk, Apr 8, 2022)
b0badd2  Array impls of bfloat16 math fcts. (JackAKirk, Apr 11, 2022)
a8f8041  matrix device tests use sycl::accessor. (JackAKirk, Apr 11, 2022)
fbd9a98  mixed float array impl for volta testing. (JackAKirk, Apr 12, 2022)
d12b6c3  fragments now marray for wi_data use (JackAKirk, Apr 12, 2022)
003ac9e  format (JackAKirk, Apr 12, 2022)
3c62249  format (JackAKirk, Apr 12, 2022)
b18a0c7  if constexpr optimisation (JackAKirk, Apr 12, 2022)
342a83a  commit for demonstrative purposes (JackAKirk, Apr 13, 2022)
863450a  std C++17 to sycl::detail C++17 (JackAKirk, Apr 14, 2022)
9106ddc  format (JackAKirk, Apr 14, 2022)
575da1c  Merge branch 'sycl' into bfloat16-joint-matrix (May 9, 2022)
e608f84  Merge branch 'intel:sycl' into bfloat16-joint-matrix (JackAKirk, May 9, 2022)
1393371  Switched back to c++14. (JackAKirk, May 9, 2022)
43c2bc5  Switched backed to std c++14 (JackAKirk, May 10, 2022)
4081add  format (JackAKirk, May 10, 2022)
48177cc  Impls of joint_matrix_fill, wi_data, get_wi_data(). (JackAKirk, May 27, 2022)
18c71df  Added runtime errors for host. (JackAKirk, May 31, 2022)
a5ebf2a  Removed storage type impls. (JackAKirk, Jun 2, 2022)
e46997b  wi_data constructor made private. (JackAKirk, Jun 6, 2022)
b104b30  Use std::ignore. (JackAKirk, Jun 8, 2022)
3c46f46  Merge branch 'sycl' into bfloat16-joint-matrix (JackAKirk, Jun 9, 2022)
a67d1fd  PI_INVALID_DEVICE -> PI_ERROR_INVALID_DEVICE (JackAKirk, Jun 9, 2022)
0f0215b  format (JackAKirk, Jun 9, 2022)
a9c2901  data -> wi_marray (JackAKirk, Jun 9, 2022)
7141fdc  Replaced type punning with memcpy. (JackAKirk, Jun 22, 2022)
22f8650  format (JackAKirk, Jun 22, 2022)
6ee55f1  Format (JackAKirk, Jun 22, 2022)
49c962d  add back partial_res decl. (JackAKirk, Jun 22, 2022)
81f8ba0  format (JackAKirk, Jun 22, 2022)
sycl/include/CL/sycl.hpp (1 change: 0 additions & 1 deletion)

@@ -60,7 +60,6 @@
#if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#endif
-#include <sycl/ext/oneapi/bf16_storage_builtins.hpp>
#include <sycl/ext/oneapi/device_global/properties.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>
#include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
sycl/include/sycl/ext/oneapi/experimental/builtins.hpp (157 changes: 149 additions & 8 deletions)

@@ -15,6 +15,7 @@
#include <CL/sycl/detail/type_traits.hpp>

#include <CL/__spirv/spirv_ops.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>

// TODO Decide whether to mark functions with this attribute.
#define __NOEXC /*noexcept*/
@@ -26,10 +27,7 @@
#endif

__SYCL_INLINE_NAMESPACE(cl) {
-namespace sycl {
-namespace ext {
-namespace oneapi {
-namespace experimental {
+namespace sycl::ext::oneapi::experimental {

// Provides functionality to print data from kernels in a C way:
// - On non-host devices this function is directly mapped to printf from
@@ -117,11 +115,154 @@ inline __SYCL_ALWAYS_INLINE

} // namespace native

-} // namespace experimental
-} // namespace oneapi
-} // namespace ext
template <typename T>
std::enable_if_t<std::is_same<T, bfloat16>::value, T> fabs(T x) {

[Review thread on this function]
Contributor: Does this need to be a template function, or could it be bfloat16 fabs(bfloat16 x)? The same question applies to the other similar functions.
Contributor (author): If I remove template <typename T> and the use of enable_if_t, the compiler sees multiple definitions of bfloat16 fabs() with the same mangled name, based on uint16_t (the bfloat16 storage type). I'm not completely sure why this is, or why the templating and enable_if_t resolve it, but I guess the compiler confuses it with the other marray definition.
Contributor: Ah! It's probably confused by some ambiguity between bfloat16 and its storage class, thinking this could be called if passed uint16_t through an implicit conversion. Interesting! Thank you for clarifying. 😄
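
To see the distinction from this thread in isolation, here is a minimal standalone sketch (a hypothetical toy type, not the real SYCL headers): a plain bfloat16 overload would also bind to a uint16_t argument through the implicit storage-type conversion, while the enable_if_t-constrained template drops out of overload resolution unless the deduced type is exactly the bfloat16 type.

#include <cstdint>
#include <type_traits>

struct toy_bf16 {
  uint16_t bits;
  toy_bf16(uint16_t b) : bits(b) {} // implicit ctor from the storage type
};

// Plain overload (commented out): toy_fabs(uint16_t{1}) would also compile
// here, via the implicit toy_bf16(uint16_t) conversion.
// toy_bf16 toy_fabs(toy_bf16 x) { return x; }

// Constrained template: for a uint16_t argument, deduction yields
// T = uint16_t, the enable_if_t condition fails, and this overload is
// removed from the candidate set (SFINAE) instead of converting.
template <typename T>
std::enable_if_t<std::is_same<T, toy_bf16>::value, T> toy_fabs(T x) {
  return x;
}

int main() {
  toy_fabs(toy_bf16{uint16_t{0x3f80}}); // OK: T deduced as toy_bf16
  // toy_fabs(uint16_t{1});             // error: no matching overload
}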

#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  return bfloat16::from_bits(__clc_fabs(x.raw()));
#else
  std::ignore = x;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <size_t N>
sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  sycl::marray<bfloat16, N> res;
  auto x_storage = reinterpret_cast<uint32_t const *>(&x);
  auto res_storage = reinterpret_cast<uint32_t *>(&res);

  for (size_t i = 0; i < N / 2; i++)
    res_storage[i] = __clc_fabs(x_storage[i]);

  if constexpr (N % 2) {
    res[N - 1] = bfloat16::from_bits(__clc_fabs(x[N - 1].raw()));
  }
  return res;
#else
  std::ignore = x;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <typename T>
std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmin(T x, T y) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  return bfloat16::from_bits(__clc_fmin(x.raw(), y.raw()));
#else
  std::ignore = x;
  (void)y;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <size_t N>
sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
                               sycl::marray<bfloat16, N> y) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  sycl::marray<bfloat16, N> res;
  auto x_storage = reinterpret_cast<uint32_t const *>(&x);
  auto y_storage = reinterpret_cast<uint32_t const *>(&y);
  auto res_storage = reinterpret_cast<uint32_t *>(&res);

  for (size_t i = 0; i < N / 2; i++)
    res_storage[i] = __clc_fmin(x_storage[i], y_storage[i]);

  if constexpr (N % 2) {
    res[N - 1] =
        bfloat16::from_bits(__clc_fmin(x[N - 1].raw(), y[N - 1].raw()));
  }

  return res;
#else
  std::ignore = x;
  (void)y;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <typename T>
std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmax(T x, T y) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  return bfloat16::from_bits(__clc_fmax(x.raw(), y.raw()));
#else
  std::ignore = x;
  (void)y;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <size_t N>
sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
                               sycl::marray<bfloat16, N> y) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  sycl::marray<bfloat16, N> res;
  auto x_storage = reinterpret_cast<uint32_t const *>(&x);
  auto y_storage = reinterpret_cast<uint32_t const *>(&y);
  auto res_storage = reinterpret_cast<uint32_t *>(&res);

  for (size_t i = 0; i < N / 2; i++)
    res_storage[i] = __clc_fmax(x_storage[i], y_storage[i]);

  if constexpr (N % 2) {
    res[N - 1] =
        bfloat16::from_bits(__clc_fmax(x[N - 1].raw(), y[N - 1].raw()));
  }
  return res;
#else
  std::ignore = x;
  (void)y;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <typename T>
std::enable_if_t<std::is_same<T, bfloat16>::value, T> fma(T x, T y, T z) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  return bfloat16::from_bits(__clc_fma(x.raw(), y.raw(), z.raw()));
#else
  std::ignore = x;
  (void)y;
  (void)z;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

template <size_t N>
sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
                              sycl::marray<bfloat16, N> y,
                              sycl::marray<bfloat16, N> z) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
  sycl::marray<bfloat16, N> res;
  auto x_storage = reinterpret_cast<uint32_t const *>(&x);
  auto y_storage = reinterpret_cast<uint32_t const *>(&y);
  auto z_storage = reinterpret_cast<uint32_t const *>(&z);
  auto res_storage = reinterpret_cast<uint32_t *>(&res);

  for (size_t i = 0; i < N / 2; i++)
    res_storage[i] = __clc_fma(x_storage[i], y_storage[i], z_storage[i]);

  if constexpr (N % 2) {
    res[N - 1] = bfloat16::from_bits(
        __clc_fma(x[N - 1].raw(), y[N - 1].raw(), z[N - 1].raw()));
  }
  return res;
#else
  std::ignore = x;
  (void)y;
  (void)z;
  throw runtime_error("bfloat16 is not currently supported on the host device.",
                      PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}

-} // namespace sycl
+} // namespace sycl::ext::oneapi::experimental
} // __SYCL_INLINE_NAMESPACE(cl)

#undef __SYCL_CONSTANT_AS
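
The marray overloads above walk the array two bfloat16 elements at a time, reinterpreting each pair as one 32-bit lane for the packed __clc_* builtins. One commit message in this PR ("Replaced type punning with memcpy", 7141fdc) points at the safer way to do such reinterpretation; as a standalone sketch with toy stand-in types (not the PR's exact code), the same pair packing done via memcpy looks like this:

#include <cstdint>
#include <cstring>

// Pack two 16-bit bfloat16 bit patterns into one 32-bit lane. memcpy keeps
// the reinterpretation well-defined, unlike casting a bfloat16* to uint32_t*.
uint32_t pack_pair(uint16_t lo, uint16_t hi) {
  uint16_t pair[2] = {lo, hi};
  uint32_t lane;
  static_assert(sizeof(pair) == sizeof(lane), "two bf16 fill one 32-bit lane");
  std::memcpy(&lane, pair, sizeof(lane));
  return lane;
}

// Unpack a 32-bit lane back into its two 16-bit halves.
void unpack_pair(uint32_t lane, uint16_t &lo, uint16_t &hi) {
  uint16_t pair[2];
  std::memcpy(pair, &lane, sizeof(pair));
  lo = pair[0];
  hi = pair[1];
}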
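Finally, a usage sketch of the new element-wise functions. It assumes a CUDA-enabled DPC++ build (sm_80 or newer for the bfloat16 builtins); the queue setup and variable names are illustrative, not taken from the PR's tests:

#include <CL/sycl.hpp>

namespace sycl_exp = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue q;
  // An odd-sized marray: the first two elements take the packed 32-bit
  // path, the trailing element takes the scalar builtin path.
  sycl::marray<sycl_exp::bfloat16, 3> m{
      sycl_exp::bfloat16(-1.0f), sycl_exp::bfloat16(2.0f),
      sycl_exp::bfloat16(-0.5f)};
  {
    sycl::buffer<sycl::marray<sycl_exp::bfloat16, 3>, 1> buf(&m, 1);
    q.submit([&](sycl::handler &cgh) {
      auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
      cgh.single_task([=] { acc[0] = sycl_exp::fabs(acc[0]); });
    });
  } // buffer destruction waits and copies the result back into m
  // m now holds {1.0, 2.0, 0.5} as bfloat16 values.
}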