Commit 0a1d751

[SYCL][CUDA] Joint_matrix elem wise ops inc bfloat16 (#5964)
This PR introduces full support for element-wise operations in the CUDA backend. `wi_data`, `get_matrix_fill`, and `joint_matrix.get_wi_data()` are introduced for portability with the Intel backend. In addition, in the CUDA backend users can call `joint_matrix.wi_marray` to access the marray that stores the WI-owned elements of the matrix and perform optimized element-wise operations using math functions that take marrays.

bfloat16 element-wise operation support is also included: this PR adds bfloat16 scalar/marray implementations that replace the existing uint16_t "storage type" implementations of the fma, fmax, fmin, and fabs math functions. The bfloat16 fma_relu function implementation was added directly in #5749. The existing temporary uint16_t implementations (introduced in #5748, with unmerged tests in intel/llvm-test-suite#897) have been removed, since the bfloat16 implementations replace them.

Signed-off-by: jack.kirk <[email protected]>
1 parent f4dee54 · commit 0a1d751

10 files changed: +763 −229 lines changed
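Before the diffs, a usage sketch may help orient readers. The marray overloads of fabs, fmin, fmax, and fma added to builtins.hpp below are device-only (the host path throws), so they are meant to be called from kernel code; on CUDA, per the commit message, they can also be applied to the WI-owned elements of a joint_matrix through its wi_marray member. The function names and signatures come from the diff; the function name, kernel name, and queue scaffolding here are hypothetical, for illustration only.

// Usage sketch (not part of the commit): calling the new bfloat16 marray
// overloads from a kernel.
#include <CL/sycl.hpp>

namespace exp = sycl::ext::oneapi::experimental;

void elementwise_demo(sycl::queue &q) {
  q.parallel_for<class Bf16ElemwiseDemo>(
      sycl::range<1>{32}, [=](sycl::id<1>) {
        // Broadcast-construct three 8-element bfloat16 marrays.
        sycl::marray<exp::bfloat16, 8> a{exp::bfloat16{1.5f}};
        sycl::marray<exp::bfloat16, 8> b{exp::bfloat16{-2.0f}};
        sycl::marray<exp::bfloat16, 8> c{exp::bfloat16{0.25f}};

        // Each call processes elements two at a time via the packed __clc
        // builtins; with odd N a scalar tail step handles the last element.
        auto r = exp::fma(a, b, c); // r[i] = a[i] * b[i] + c[i]
        auto m = exp::fmax(exp::fabs(r), exp::fmin(a, c));
        (void)m; // result unused in this sketch
      });
}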

sycl/include/CL/sycl.hpp (−1)
@@ -60,7 +60,6 @@
 #if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
-#include <sycl/ext/oneapi/bf16_storage_builtins.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
 #include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>

sycl/include/sycl/ext/oneapi/experimental/builtins.hpp (+157 −8)
@@ -15,6 +15,7 @@
 #include <CL/sycl/detail/type_traits.hpp>
 
 #include <CL/__spirv/spirv_ops.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
 // TODO Decide whether to mark functions with this attribute.
 #define __NOEXC /*noexcept*/
@@ -26,10 +27,15 @@
 #endif
 
 __SYCL_INLINE_NAMESPACE(cl) {
-namespace sycl {
-namespace ext {
-namespace oneapi {
-namespace experimental {
+namespace sycl::ext::oneapi::experimental {
+namespace detail {
+template <size_t N>
+uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
+  uint32_t res;
+  std::memcpy(&res, &x[start], sizeof(uint32_t));
+  return res;
+}
+} // namespace detail
 
 // Provides functionality to print data from kernels in a C way:
 // - On non-host devices this function is directly mapped to printf from
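The detail::to_uint32_t helper added above type-puns two adjacent bfloat16 elements into one uint32_t so that the packed __clc builtins can operate on a pair per call. A standalone sketch of that round trip, with uint16_t standing in for bfloat16's 16-bit storage (the program and its values are illustrative, not from the commit):

// Pack two adjacent 16-bit values into a uint32_t with memcpy (well-defined
// type punning, unlike reinterpret_cast), then unpack and check the round
// trip. uint16_t stands in for bfloat16 storage here.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint16_t x[4] = {0x3FC0, 0x4000, 0x4040, 0x4080}; // bfloat16 bit patterns
  uint32_t packed;
  std::memcpy(&packed, &x[0], sizeof(uint32_t)); // pack x[0] and x[1]

  uint16_t unpacked[2];
  std::memcpy(unpacked, &packed, sizeof(uint32_t)); // unpack the pair
  assert(unpacked[0] == x[0] && unpacked[1] == x[1]);
  return 0;
}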
@@ -117,11 +123,154 @@ inline __SYCL_ALWAYS_INLINE
 
 } // namespace native
 
-} // namespace experimental
-} // namespace oneapi
-} // namespace ext
+template <typename T>
+std::enable_if_t<std::is_same<T, bfloat16>::value, T> fabs(T x) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  return bfloat16::from_bits(__clc_fabs(x.raw()));
+#else
+  std::ignore = x;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <size_t N>
+sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  sycl::marray<bfloat16, N> res;
+
+  for (size_t i = 0; i < N / 2; i++) {
+    auto partial_res = __clc_fabs(detail::to_uint32_t(x, i * 2));
+    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
+  }
+
+  if constexpr (N % 2) {
+    res[N - 1] = bfloat16::from_bits(__clc_fabs(x[N - 1].raw()));
+  }
+  return res;
+#else
+  std::ignore = x;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <typename T>
+std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmin(T x, T y) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  return bfloat16::from_bits(__clc_fmin(x.raw(), y.raw()));
+#else
+  std::ignore = x;
+  (void)y;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <size_t N>
+sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
+                               sycl::marray<bfloat16, N> y) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  sycl::marray<bfloat16, N> res;
+
+  for (size_t i = 0; i < N / 2; i++) {
+    auto partial_res = __clc_fmin(detail::to_uint32_t(x, i * 2),
+                                  detail::to_uint32_t(y, i * 2));
+    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
+  }
+
+  if constexpr (N % 2) {
+    res[N - 1] =
+        bfloat16::from_bits(__clc_fmin(x[N - 1].raw(), y[N - 1].raw()));
+  }
+
+  return res;
+#else
+  std::ignore = x;
+  (void)y;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <typename T>
+std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmax(T x, T y) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  return bfloat16::from_bits(__clc_fmax(x.raw(), y.raw()));
+#else
+  std::ignore = x;
+  (void)y;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <size_t N>
+sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
+                               sycl::marray<bfloat16, N> y) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  sycl::marray<bfloat16, N> res;
+
+  for (size_t i = 0; i < N / 2; i++) {
+    auto partial_res = __clc_fmax(detail::to_uint32_t(x, i * 2),
+                                  detail::to_uint32_t(y, i * 2));
+    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
+  }
+
+  if constexpr (N % 2) {
+    res[N - 1] =
+        bfloat16::from_bits(__clc_fmax(x[N - 1].raw(), y[N - 1].raw()));
+  }
+  return res;
+#else
+  std::ignore = x;
+  (void)y;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <typename T>
+std::enable_if_t<std::is_same<T, bfloat16>::value, T> fma(T x, T y, T z) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  return bfloat16::from_bits(__clc_fma(x.raw(), y.raw(), z.raw()));
+#else
+  std::ignore = x;
+  (void)y;
+  (void)z;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
+
+template <size_t N>
+sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
+                              sycl::marray<bfloat16, N> y,
+                              sycl::marray<bfloat16, N> z) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  sycl::marray<bfloat16, N> res;
+
+  for (size_t i = 0; i < N / 2; i++) {
+    auto partial_res =
+        __clc_fma(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2),
+                  detail::to_uint32_t(z, i * 2));
+    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
+  }
+
+  if constexpr (N % 2) {
+    res[N - 1] = bfloat16::from_bits(
+        __clc_fma(x[N - 1].raw(), y[N - 1].raw(), z[N - 1].raw()));
+  }
+  return res;
+#else
+  std::ignore = x;
+  (void)y;
+  throw runtime_error("bfloat16 is not currently supported on the host device.",
+                      PI_ERROR_INVALID_DEVICE);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+}
 
-} // namespace sycl
+} // namespace sycl::ext::oneapi::experimental
 } // __SYCL_INLINE_NAMESPACE(cl)
 
 #undef __SYCL_CONSTANT_AS
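Every marray overload in this hunk follows the same pair-plus-tail structure: floor(N/2) packed steps, then a scalar step for the last element when N is odd. A generic restatement in plain C++ (the function name negate is hypothetical; floats and negation stand in for bfloat16 and the __clc builtins, for illustration only):

#include <array>
#include <cstddef>

// Process elements two at a time, then handle the odd tail, mirroring the
// structure of the marray overloads above.
template <size_t N>
std::array<float, N> negate(const std::array<float, N> &x) {
  std::array<float, N> res{};
  for (size_t i = 0; i < N / 2; ++i) { // packed pair step
    res[i * 2] = -x[i * 2];
    res[i * 2 + 1] = -x[i * 2 + 1];
  }
  if constexpr (N % 2) { // scalar tail for odd N
    res[N - 1] = -x[N - 1];
  }
  return res;
}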
