intel · bader · Apr 23, 2021 · Apr 21, 2021
@@ -405,26 +405,27 @@ inline __SYCL_ALWAYS_INLINE typename std::enable_if<
         (LayoutA == matrix_layout::row_major) &&
         (LayoutB == matrix_layout::packed_b) &&
         (LayoutC == matrix_layout::row_major),
-    void>::type
+    joint_matrix<Group, T2, NumRowsC, NumColsC, LayoutC>>::type
 joint_matrix_mad(Group sg,
                  joint_matrix<Group, T1, NumRowsA, NumColsA, LayoutA> &jmA,
                  joint_matrix<Group, T1, NumRowsB, NumColsB, LayoutB> &jmB,
                  joint_matrix<Group, T2, NumRowsC, NumColsC, LayoutC> &jmC) {
+  joint_matrix<Group, T2, NumRowsC, NumColsC, LayoutC> res(jmC);
   constexpr size_t epd = detail::elems_per_dword<T1>::value;
   // If A is large and C is small, in joint_matrix_load, we do memcpy for A, and
   // we do tileload for C whose shape is not tile_size*tile_size*4. In
   // joint_matrix_mad, we do tileload for A and shape is tile_size*tile_size*4.
   // So we need to reshape C before we do dpbssd.
-  bool Cshouldreload = jmC.isSmall && !jmA.isSmall && !jmB.isSmall;
+  bool Cshouldreload = res.isSmall && !jmA.isSmall && !jmB.isSmall;
   bool Ashouldreload = jmA.isSmall && !jmB.isSmall;
   bool Bshouldreload = jmB.isSmall && !jmA.isSmall;
 
-  for (int m = 0; m < jmC.trows; ++m) {
-    for (int n = 0; n < jmC.tcols; ++n) {
+  for (int m = 0; m < res.trows; ++m) {
+    for (int n = 0; n < res.tcols; ++n) {
       detail::submatrix<T2> sub_c;
 
       // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64
-      submatrix_load(sub_c, jmC, m * tile_size, n * tile_size, jmC.stride,
+      submatrix_load(sub_c, res, m * tile_size, n * tile_size, res.stride,
                      matrix_layout::row_major, Cshouldreload);
       for (int k = 0; k < jmA.tcols; ++k) { // K->int8_t
         detail::submatrix<T1> sub_a;
@@ -436,11 +437,11 @@ joint_matrix_mad(Group sg,
                        jmB.stride, matrix_layout::packed_b, Bshouldreload);
         submatrix_mad(sub_a, sub_b, sub_c);
       }
-      submatrix_store(sub_c, jmC, m * tile_size, n * tile_size, jmC.stride,
+      submatrix_store(sub_c, res, m * tile_size, n * tile_size, res.stride,
                       matrix_layout::row_major, Cshouldreload);
     }
   }
-  return;
+  return res;
 }
 
 } // namespace experimental::matrix

@@ -83,7 +83,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C, big_matrix<T2, N
                                accB.get_pointer() +
                                    (k * TK / 2) * (N * 2) + sg_starty * TN * 2,
                                N * 2, matrix_layout::packed_b);
-             joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+             sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(sg, sub_c,
                               accC.get_pointer() + (sg_startx * TM) * N +

@@ -83,7 +83,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C, big_matrix<T2, N
                                accB.get_pointer() +
                                    (k * TK / 4) * (N * 4) + sg_starty * TN * 4,
                                N * 4,  matrix_layout::packed_b);
-             joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+             sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(sg, sub_c,
                               accC.get_pointer() + (sg_startx * TM) * N +