diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 020441c071e0b..9cc74c2aceafc 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -22,11 +22,11 @@ #endif #ifdef __SYCL_DEVICE_ONLY__ -template extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * -__spirv_JointMatrixLoadINTEL(T *Ptr, std::size_t Stride, +__spirv_JointMatrixLoadINTEL(Ts *Ptr, std::size_t Stride, __spv::MatrixLayout Layout = L, __spv::Scope::Flag Sc = S, int MemOperand = 0); @@ -97,16 +97,18 @@ template *); -template -extern SYCL_EXTERNAL T __spirv_VectorExtractDynamic( +extern SYCL_EXTERNAL Ts __spirv_VectorExtractDynamic( __spv::__spirv_JointMatrixINTEL *, size_t i); -template extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, - T val, size_t i); + Ts val, size_t i); #ifndef __SPIRV_BUILTIN_DECLARATIONS__ #error \ diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp index 3368919af9438..bfe215d06790f 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp @@ -72,42 +72,67 @@ struct joint_matrix { } }; -template struct helper_traits { + using element_type = T; + using storage_element_type = T; + using fill_argument_type = T; +}; + +template <> struct helper_traits { + using element_type = precision::tf32; + using storage_element_type = float; + using fill_argument_type = float; +}; + +template inline __SYCL_ALWAYS_INLINE void joint_matrix_load(Group sg, - joint_matrix &res, - multi_ptr src, size_t stride, matrix_layout MemL) { + joint_matrix &res, + multi_ptr src, size_t stride, matrix_layout MemL) { #ifdef __SYCL_DEVICE_ONLY__ - T *Ptr = src.get(); + // For non tf32 case, check that Te is the same that Tm + Tm *Ptr = src.get(); + using Ts = typename helper_traits::storage_element_type; switch (MemL) { default: assert(false && "Invalid Memory Layout!"); case matrix_layout::row_major: res.spvm = - __spirv_JointMatrixLoadINTEL::value>( Ptr, stride, __spv::MatrixLayout::RowMajor, spv_scope_traits::value); break; case matrix_layout::col_major: res.spvm = - __spirv_JointMatrixLoadINTEL::value>( Ptr, stride, __spv::MatrixLayout::ColumnMajor, spv_scope_traits::value); break; case matrix_layout::packed_a: res.spvm = - __spirv_JointMatrixLoadINTEL::value>( Ptr, stride, __spv::MatrixLayout::PackedA, spv_scope_traits::value); break; case matrix_layout::packed_b: res.spvm = - __spirv_JointMatrixLoadINTEL::value>( Ptr, stride, __spv::MatrixLayout::PackedB, spv_scope_traits::value); @@ -235,12 +260,16 @@ class wi_element { std::size_t idx; public: + using storage_element_type = typename helper_traits::storage_element_type; wi_element(joint_matrix &Mat, std::size_t i) : M(Mat), idx(i) {} - operator T() { + operator storage_element_type() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); + // __spirv_VectorExtractDynamic returns storage_element_type + storage_element_type elem = + __spirv_VectorExtractDynamic(M.spvm, idx); + return elem; #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -249,7 +278,10 @@ class wi_element { explicit operator bool() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx) != static_cast(0); + // __spirv_VectorExtractDynamic returns storage_element_type + storage_element_type elems = + __spirv_VectorExtractDynamic(M.spvm, idx); + return elems != static_cast(0); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -258,7 +290,9 @@ class wi_element { template wi_element &operator=(const T2 &rhs) { #ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, static_cast(rhs), idx); + // __spirv_VectorInsertDynamic takes storage_element_type as argument + M.spvm = __spirv_VectorInsertDynamic( + M.spvm, static_cast(rhs), idx); return *this; #else (void)rhs; @@ -283,10 +317,12 @@ class wi_element { #if __SYCL_DEVICE_ONLY__ #define OP(op) \ template wi_element &operator op##=(const T2 &rhs) { \ + storage_element_type elems = \ + __spirv_VectorExtractDynamic(M.spvm, idx); \ M.spvm = __spirv_VectorInsertDynamic( \ M.spvm, \ - static_cast(__spirv_VectorExtractDynamic(M.spvm, idx) \ - op static_cast(rhs)), \ + static_cast( \ + elems op static_cast(rhs)), \ idx); \ return *this; \ } diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp new file mode 100644 index 0000000000000..abc616edb2545 --- /dev/null +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -0,0 +1,175 @@ +// RUN: %clangxx -fsycl -O2 %s -o %t.out + +#include +#if (SYCL_EXT_ONEAPI_MATRIX == 2) +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +auto constexpr SG_SZ = 8; + +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +// this should be replaced with a DPC++ and spirv functions +float round_to_tf32(float a) { + uint32_t tmp_uint = reinterpret_cast(a); + tmp_uint += 0x1000u; // Round up the 13th last bit + tmp_uint &= 0xFFFFE000u; // Zero out the bottom 13 bits + float ret = reinterpret_cast(tmp_uint); + return ret; +} + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + // buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [= + ](nd_item<2> spmd_item)[[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The matrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + joint_matrix sub_b( + sg); + joint_matrix sub_c(sg); + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + for (int k = 0; k < K; k += TK) { + joint_matrix_load(sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * K + k, K, + matrix_layout::row_major); + // Assume we alreay in vnni format. + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k) * (N) + + sg_starty / SG_SZ * TN, + N, matrix_layout::packed_b); + // If no rounding to tf32 function is called, the mad function will + // work on truncated floats. + // TODO: change signature of __spirv_VectorInsertDynamic to have + // two types: matrix type can be different from value type + for (int i = 0; i < sub_a.get_wi_data().length(); i++) { + sub_a.get_wi_data()[i] = round_to_tf32(sub_a.get_wi_data()[i]); + } + for (int i = 0; i < sub_b.get_wi_data().length(); i++) { + sub_b.get_wi_data()[i] = round_to_tf32(sub_b.get_wi_data()[i]); + } + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + float elem = wi_slice_a[i]; + wi_slice_a[i] *= 2; + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }) + .wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +float A[MATRIX_M][MATRIX_K]; +float B[MATRIX_K][MATRIX_N]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + float va = A_mem[m * K + k]; + float vb = B_mem[k * N + n]; + float acc = C_mem[m * N + n]; + C_mem[m * N + n] = va * vb; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = 1.0f * (i + j); + } + } + for (int i = 0; i < MATRIX_K / 2; i++) { + for (int j = 0; j < MATRIX_N * 2; j++) { + B[i][j] = 2.0f * i + 3.0f * j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + big_matrix MB((float *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K / 2); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; +} +#endif // (SYCL_EXT_ONEAPI_MATRIX == 2)