
Commit b6e6e37 ("ocp fp8 test")
Parent: 46397f5

17 files changed: +467, -194 lines
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+# If rocm-smi exists locally (it should) use it to find
+# out how many GPUs we have to test with.
+rocm-smi -i
+STATUS=$?
+if [ $STATUS -ne 0 ]; then TF_GPU_COUNT=1; else
+  TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep 'GPU' | wc -l)
+fi
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
+
+echo ""
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+echo ""
+
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+if [[ -n $1 ]]; then
+  ROCM_INSTALL_DIR=$1
+else
+  if [[ -z "${ROCM_PATH}" ]]; then
+    ROCM_INSTALL_DIR=/opt/rocm/
+  else
+    ROCM_INSTALL_DIR=$ROCM_PATH
+  fi
+fi
+
+export PYTHON_BIN_PATH=`which python3`
+PYTHON_VERSION=`python3 -c "import sys;print(f'{sys.version_info.major}.{sys.version_info.minor}')"`
+export TF_PYTHON_VERSION=$PYTHON_VERSION
+export TF_NEED_ROCM=1
+export ROCM_PATH=$ROCM_INSTALL_DIR
+
+if [ -f /usertools/rocm.bazelrc ]; then
+  # Use the bazelrc files in /usertools if available
+  if [ ! -d /tf ]; then
+    # The bazelrc files in /usertools expect /tf to exist
+    mkdir /tf
+  fi
+
+  bazel \
+    --bazelrc=/usertools/rocm.bazelrc \
+    test \
+    --config=sigbuild_local_cache \
+    --config=rocm \
+    --config=xla_cpp_filters \
+    --test_output=errors \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_output=streamed \
+    --test_env=TF_CPP_VMODULE="gemm_rewriter=3" \
+    --test_env=XLA_FLAGS="--xla_dump_to=/tmp/generated --xla_dump_hlo_as_text --xla_dump_hlo_as_html --xla_gpu_enable_cublaslt=true --xla_gpu_autotune_level=4" \
+    --action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
+    -- @local_xla//xla/service/gpu/transforms:gemm_rewriter_test_gpu_amd_any
+fi
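Note on the script above: because it runs under set -e, a failing rocm-smi -i probe aborts the script before the STATUS check can select the single-GPU fallback; guarding the probe (for example, rocm-smi -i || true) would preserve the intended behavior. Also note that the bazel invocation only runs when /usertools/rocm.bazelrc exists, i.e. inside the ROCm CI containers, and that the first positional argument overrides the ROCm install directory, falling back to ROCM_PATH and then /opt/rocm/.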

third_party/xla/xla/debug_options_flags.cc (+1, -1)
@@ -110,7 +110,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_enable_cudnn_frontend(true);
-  opts.set_xla_gpu_enable_cublaslt(false);
+  opts.set_xla_gpu_enable_cublaslt(true);
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::FUSION);
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::CUBLAS);
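With this flip, the cuBLASLt (hipBLASLt on ROCm) GEMM path is on by default rather than opt-in. A minimal sketch of how downstream code observes the flag; the surrounding variables are illustrative, but the getter is proto-generated from the same field the setter above writes:

    // Reading the flipped default from a module's DebugOptions.
    // Assumes `module` is an xla::HloModule*.
    const xla::DebugOptions& opts = module->config().debug_options();
    if (opts.xla_gpu_enable_cublaslt()) {
      // GemmRewriter may lower eligible dots to cublasLt/hipBLASLt
      // custom calls instead of plain BLAS GEMM calls.
    }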

third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker.cc (+13, -1)
@@ -168,9 +168,21 @@ class GemmAutotuner {
     se::DeviceMemoryBase a_scale_buffer, b_scale_buffer, c_scale_buffer,
         d_scale_buffer, d_amax_buffer, bias_buffer, aux_buffer;

+    int input_buffer_idx = 2;  // lhs is at 0, rhs is at 1
     if (has_vector_bias) {
-      bias_buffer = rz_buffers_.input_buffers().at(has_matrix_bias ? 3 : 2);
+      if (has_matrix_bias) {
+        input_buffer_idx++;
+      }
+      bias_buffer = rz_buffers_.input_buffers().at(input_buffer_idx++);
+    }
+    // In the current GemmRewriter design for FP8, the a/b scales remain active
+    // even when they are not used. Consequently, we must inform the autotuner
+    // so it can choose algorithms that properly support a/b scales.
+    if (gemm_config.is_fp8) {
+      a_scale_buffer = rz_buffers_.input_buffers().at(input_buffer_idx++);
+      b_scale_buffer = rz_buffers_.input_buffers().at(input_buffer_idx++);
     }
+
     if (has_aux_output) {
       aux_buffer = rz_buffers_.output_buffers().at(1);
     }
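For reference, the index walk above assumes the redzone input buffers are ordered lhs, rhs, optional matrix bias, optional vector bias, then the FP8 a/b scales. A standalone sketch (illustrative, not XLA code) that reproduces the resulting indices:

    #include <cstdio>

    int main() {
      const bool has_matrix_bias = true, has_vector_bias = true, is_fp8 = true;
      int input_buffer_idx = 2;  // lhs is at 0, rhs is at 1
      if (has_vector_bias) {
        if (has_matrix_bias) input_buffer_idx++;  // matrix bias occupies slot 2
        std::printf("vector bias at %d\n", input_buffer_idx++);  // prints 3
      }
      if (is_fp8) {
        std::printf("a_scale at %d, b_scale at %d\n", input_buffer_idx,
                    input_buffer_idx + 1);  // prints 4 and 5
      }
      return 0;
    }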

third_party/xla/xla/service/gpu/backend_configs.proto (+2)
@@ -105,6 +105,8 @@ message GemmBackendConfig {
   optional bool grad_x = 16;
   optional bool grad_y = 17;
   bool damax_output = 18;
+
+  bool is_fp8 = 19;
 }

 // Backend config for bitcast operation generated from MLIR MHLO dialect.
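Since this is a proto3 field, the addition yields the standard generated accessors on GemmBackendConfig; a minimal sketch (the surrounding code is illustrative):

    xla::gpu::GemmBackendConfig config;
    config.set_is_fp8(true);     // generated setter for field 19
    bool fp8 = config.is_fp8();  // generated getter, read by matmul_utils.cc below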

third_party/xla/xla/service/gpu/buffer_comparator.cc (+2, -2)
@@ -187,7 +187,7 @@ absl::StatusOr<bool> BufferComparator::CompareEqual(
                                    stream, current, expected};

   switch (shape_.element_type()) {
-#if GOOGLE_CUDA  // not available for ROCm yet..
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
     case xla::F8E4M3FN:
       return CompareEqualParameterized<tsl::float8_e4m3fn, float>(
           "fp8_e4m3fn_comparison", buffer_comparator::fp8_e4m3fn_comparison(),
@@ -196,7 +196,7 @@ absl::StatusOr<bool> BufferComparator::CompareEqual(
       return CompareEqualParameterized<tsl::float8_e5m2, float>(
           "fp8_e5m2_comparison", buffer_comparator::fp8_e5m2_comparison(),
           params);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
 #if TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60200
     case xla::F8E4M3FNUZ:
       return CompareEqualParameterized<tsl::float8_e4m3fnuz, float>(
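One readability note on the new guard: && binds tighter than || in preprocessor expressions, so the condition parses as intended, equivalent to:

    #if GOOGLE_CUDA || (TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300)

Spelling the parentheses out explicitly would make the intent self-evident.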

third_party/xla/xla/service/gpu/buffer_comparator.cu.cc (+33, -15)
@@ -54,20 +54,29 @@ __device__ __inline__ float Canonicalize(float input) {
   return isnan(input) ? input : max(-65505.0f, min(input, 65505.0f));
 }

+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
+__global__ void xla_fp8_e4m3fn_comparison(
 #if GOOGLE_CUDA
-__global__ void xla_fp8_e4m3fn_comparison(__nv_fp8_storage_t* buffer_a,
-                                          __nv_fp8_storage_t* buffer_b,
-                                          float rel_error_threshold,
-                                          uint64_t buffer_length,
-                                          int* mismatch_count) {
+    __nv_fp8_storage_t* buffer_a, __nv_fp8_storage_t* buffer_b,
+#else  // TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
+    __hip_fp8_storage_t* buffer_a, __hip_fp8_storage_t* buffer_b,
+#endif
+    float rel_error_threshold, uint64_t buffer_length, int* mismatch_count) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx >= buffer_length) return;
   // TODO(philipphack): Replace with direct conversion to float when this
   // functionality becomes available.
+#if GOOGLE_CUDA
   float elem_a =
       __half2float(__nv_cvt_fp8_to_halfraw(buffer_a[idx], __NV_E4M3));
   float elem_b =
       __half2float(__nv_cvt_fp8_to_halfraw(buffer_b[idx], __NV_E4M3));
+#else  // TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
+  float elem_a =
+      __half2float(__hip_cvt_fp8_to_halfraw(buffer_a[idx], __HIP_E4M3));
+  float elem_b =
+      __half2float(__hip_cvt_fp8_to_halfraw(buffer_b[idx], __HIP_E4M3));
+#endif
   elem_a = Canonicalize(elem_a);
   elem_b = Canonicalize(elem_b);
   if (isnan(elem_a) && isnan(elem_b)) return;
@@ -78,19 +87,28 @@ __global__ void xla_fp8_e4m3fn_comparison(__nv_fp8_storage_t* buffer_a,
     atomicAdd(mismatch_count, 1);
 }

-__global__ void xla_fp8_e5m2_comparison(__nv_fp8_storage_t* buffer_a,
-                                        __nv_fp8_storage_t* buffer_b,
-                                        float rel_error_threshold,
-                                        uint64_t buffer_length,
-                                        int* mismatch_count) {
+__global__ void xla_fp8_e5m2_comparison(
+#if GOOGLE_CUDA
+    __nv_fp8_storage_t* buffer_a, __nv_fp8_storage_t* buffer_b,
+#else  // TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
+    __hip_fp8_storage_t* buffer_a, __hip_fp8_storage_t* buffer_b,
+#endif
+    float rel_error_threshold, uint64_t buffer_length, int* mismatch_count) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx >= buffer_length) return;
-  // TODO(philipphack): Replace with direct conversion to float when this
-  // functionality becomes available.
+  // TODO(philipphack): Replace with direct conversion to float when this
+  // functionality becomes available.
+#if GOOGLE_CUDA
   float elem_a =
       __half2float(__nv_cvt_fp8_to_halfraw(buffer_a[idx], __NV_E5M2));
   float elem_b =
       __half2float(__nv_cvt_fp8_to_halfraw(buffer_b[idx], __NV_E5M2));
+#else  // TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
+  float elem_a =
+      __half2float(__hip_cvt_fp8_to_halfraw(buffer_a[idx], __HIP_E5M2));
+  float elem_b =
+      __half2float(__hip_cvt_fp8_to_halfraw(buffer_b[idx], __HIP_E5M2));
+#endif
   elem_a = Canonicalize(elem_a);
   elem_b = Canonicalize(elem_b);
   if (isnan(elem_a) && isnan(elem_b)) return;
@@ -100,7 +118,7 @@ __global__ void xla_fp8_e5m2_comparison(__nv_fp8_storage_t* buffer_a,
   if (rel_error > rel_error_threshold || isnan(rel_error))
     atomicAdd(mismatch_count, 1);
 }
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300

 #if TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60200

@@ -262,15 +280,15 @@ __global__ void xla_int32_comparison(int* buffer_a, int* buffer_b,

 }  // namespace

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300
 void* fp8_e4m3fn_comparison() {
   return reinterpret_cast<void*>(&xla_fp8_e4m3fn_comparison);
 }

 void* fp8_e5m2_comparison() {
   return reinterpret_cast<void*>(&xla_fp8_e5m2_comparison);
 }
-#endif
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60300

 #if TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 60200
 void* fp8_e4m3fnuz_comparison() {
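Both kernels widen FP8 through half to float and then apply the file's common acceptance test. A host-side C++ sketch of that criterion follows; the (max(|a|,|b|) + 1) denominator is an assumption based on the comparators elsewhere in this file, since the rel_error line is outside the hunks shown:

    #include <algorithm>
    #include <cmath>

    // True when two already-widened values should count as a mismatch.
    bool Fp8Mismatch(float elem_a, float elem_b, float rel_error_threshold) {
      auto canonicalize = [](float x) {
        // Clamp to the fp16 range used by Canonicalize() in the kernels.
        return std::isnan(x) ? x : std::max(-65505.0f, std::min(x, 65505.0f));
      };
      elem_a = canonicalize(elem_a);
      elem_b = canonicalize(elem_b);
      if (std::isnan(elem_a) && std::isnan(elem_b)) return false;  // both NaN
      float rel_error = std::abs(elem_a - elem_b) /
                        (std::max(std::abs(elem_a), std::abs(elem_b)) + 1.0f);
      return rel_error > rel_error_threshold || std::isnan(rel_error);
    }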

third_party/xla/xla/service/gpu/matmul_utils.cc (+6, -5)
@@ -301,13 +301,13 @@ absl::StatusOr<bool> CanFoldTransposeOperandIntoDot(const HloInstruction& dot,
     double alpha_real, double alpha_imag, double beta,
     PrecisionConfig::Algorithm precision_algorithm,
     std::optional<int64_t> algorithm, int64_t compute_precision, bool grad_x,
-    bool grad_y) {
+    bool grad_y, bool is_fp8) {
   return GemmConfig::For(lhs_shape, lhs_batch_dims, lhs_contracting_dims,
                          rhs_shape, rhs_batch_dims, rhs_contracting_dims,
                          /*c_shape=*/output_shape, /*bias_shape_ptr=*/nullptr,
                          output_shape, alpha_real, alpha_imag, beta,
                          precision_algorithm, algorithm, compute_precision,
-                         grad_x, grad_y);
+                         grad_x, grad_y, is_fp8);
 }

 /*static*/ absl::StatusOr<GemmConfig> GemmConfig::For(
@@ -319,7 +319,7 @@ absl::StatusOr<bool> CanFoldTransposeOperandIntoDot(const HloInstruction& dot,
     double alpha_imag, double beta,
     PrecisionConfig::Algorithm precision_algorithm,
     std::optional<int64_t> algorithm, int64_t compute_precision, bool grad_x,
-    bool grad_y) {
+    bool grad_y, bool is_fp8) {
   absl::Span<const int64_t> lhs_col_dims = lhs_contracting_dims;
   TF_ASSIGN_OR_RETURN(
       std::vector<int64_t> lhs_row_dims,
@@ -436,7 +436,8 @@ absl::StatusOr<bool> CanFoldTransposeOperandIntoDot(const HloInstruction& dot,
           precision_algorithm,
           algorithm,
           grad_x,
-          grad_y};
+          grad_y,
+          is_fp8};
 }

 namespace {
@@ -509,7 +510,7 @@ bool IsTf32Allowed(PrecisionConfig::Algorithm algorithm,
       /*bias_shape_ptr=*/
       vector_bias_shape ? &vector_bias_shape.value() : nullptr, output_shape,
       config.alpha_real(), config.alpha_imag(), config.beta(),
-      precision_algorithm, algorithm, precision, grad_x, grad_y);
+      precision_algorithm, algorithm, precision, grad_x, grad_y, config.is_fp8());
 }

 absl::StatusOr<GemmConfig::DescriptorsTuple> GemmConfig::GetMatrixDescriptors(

third_party/xla/xla/service/gpu/matmul_utils.h (+2, -2)
@@ -121,7 +121,7 @@ struct GemmConfig : public se::gpu::GemmConfig {
       double alpha_real, double alpha_imag, double beta,
       PrecisionConfig::Algorithm precision_algorithm,
       std::optional<int64_t> algorithm, int64_t compute_precision, bool grad_x,
-      bool grad_y);
+      bool grad_y, bool is_fp8);

   // As above with additional `c_shape` and `bias_shape_ptr` parameter, both
   // which are only necessarily for F8 gemms.
@@ -134,7 +134,7 @@ struct GemmConfig : public se::gpu::GemmConfig {
       double alpha_imag, double beta,
       PrecisionConfig::Algorithm precision_algorithm,
       std::optional<int64_t> algorithm, int64_t compute_precision, bool grad_x,
-      bool grad_y);
+      bool grad_y, bool is_fp8);

   struct DescriptorsTuple {
     se::gpu::MatrixDescriptor lhs;
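Because is_fp8 is appended without a default value, every existing GemmConfig::For call site must now pass it explicitly; the test updates below do exactly that with /*is_fp8*/ false. Defaulting the parameter (bool is_fp8 = false) would have avoided touching the tests, at the cost of making the FP8 contract implicit at call sites.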

third_party/xla/xla/service/gpu/runtime/command_buffer_thunk_test.cc (+2, -2)
@@ -642,7 +642,7 @@ TEST(CommandBufferThunkTest, GemmCmd) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {2, 3}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Prepare commands sequence for constructing command buffer.
@@ -750,7 +750,7 @@ TEST(CommandBufferThunkTest, CublasLtCmd) {
       /*precision_algorithm*/ PrecisionConfig::ALG_UNSET,
       /*algorithm*/ std::nullopt,
       /*compute_precision*/ se::blas::kDefaultComputePrecision,
-      /*grad_x*/ false, /*grad_y*/ false);
+      /*grad_x*/ false, /*grad_y*/ false, /*is_fp8*/ false);
   ASSERT_TRUE(config.ok());

   // Prepare commands sequence for constructing command buffer.

third_party/xla/xla/service/gpu/runtime/dynamic_slice_thunk_test.cc (+6, -6)
@@ -126,7 +126,7 @@ TEST(DynamicSliceThunkTest, SlicedGemm) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Creating embedded GEMM thunk.
@@ -278,7 +278,7 @@ TEST(DynamicSliceThunkTest, MulipleSlicedOperandsGemm) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Creating embedded GEMM thunk.
@@ -797,7 +797,7 @@ TEST(DynamicSliceThunkTest, SlicedGemmArbitraryArgumentOrder) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Creating embedded GEMM thunk.
@@ -945,7 +945,7 @@ TEST(DynamicSliceThunkTest, SlicedGemmArbitraryNumberOfArguments) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Creating embedded GEMM thunk.
@@ -1086,7 +1086,7 @@ TEST(DynamicSliceThunkTest, SlicedTupledOperandGemm) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Creating embedded GEMM thunk.
@@ -1439,7 +1439,7 @@ TEST(DynamicSliceThunkTest, SlicedOperandsSameBufferGemm) {
       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
       ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
       0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
-      se::blas::kDefaultComputePrecision, false, false);
+      se::blas::kDefaultComputePrecision, false, false, false);
   ASSERT_TRUE(config.ok());

   // Creating embedded GEMM thunk.
