Commit e8b506b

[MLIR][XeGPU] Switch to 1D representation for SIMT code (#135116)
This PR switches to using 1D vectors to represent the per-work-item data fragments in SIMT code, which simplifies the op definitions and the distribution logic.
1 parent 12becff commit e8b506b

6 files changed, +281 -342 lines changed

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 8 additions & 11 deletions
@@ -833,30 +833,27 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
     and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
     also requires A and B to be loaded with the required data layout. Specially,
-
     VNNI layout is required for B operand. It is achieved via adding `packed`
     attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
     can be represented as a 3D vector, with the last dimension representing the VNNI
     factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
     can be represented as `B: vector<8x16x2xf16>`.
 
-    In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
-    which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
-    these data are loaded from.
+    In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
+    which are represented as 1D vectors. Please refer to the [OpenCL Intel extensions]
+    (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
+    for more details about the fragment distribution.
 
     Note: on PVC, the hardware can perform load with VNNI transformation when data
     element type is 16-bit or lower precision, taking 2 or 4 elements from
     the first dimension and inserted into the newly added innermost dimension.
   }];
 
   let arguments = (ins
-    XeGPU_DpasOpType : $lhs,
-    XeGPU_DpasOpType : $rhs,
-    Optional<XeGPU_Vector2DType>: $acc,
-    OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
-    OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
-    OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
-  let results = (outs XeGPU_Vector2DType: $result);
+    XeGPU_DpasOprType : $lhs,
+    XeGPU_DpasOprType : $rhs,
+    Optional<XeGPU_DpasResType>: $acc);
+  let results = (outs XeGPU_DpasResType: $result);
 
   let extraClassDeclaration = [{
     VectorType getLhsType() {
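
A note on the fragment sizes implied by the new description: with the standard PVC shapes quoted above (A: 8x16, B: 16x16, C/D: 8x16) and the distribution rule used elsewhere in this commit (fragment length = tensor_size / subgroup_size), each work-item of a 16-lane subgroup ends up holding 1D vectors of 8, 16, and 8 elements respectively. A minimal standalone C++ sketch of that arithmetic (the 16-lane subgroup size and the helper name are assumptions for illustration, not part of the patch):

// Sketch only: per-work-item 1D fragment length computed as
// tensor_size / subgroup_size, mirroring getDistributedVectorType below.
#include <cassert>
#include <cstdint>
#include <iostream>

static int64_t fragmentLength(int64_t rows, int64_t cols, int64_t subgroupSize) {
  int64_t tensorSize = rows * cols;
  assert(tensorSize % subgroupSize == 0 && "shape is not distributable");
  return tensorSize / subgroupSize;
}

int main() {
  const int64_t subgroupSize = 16; // assumed SIMD width
  std::cout << "A   fragment: " << fragmentLength(8, 16, subgroupSize) << "\n";  // 8 elements of f16
  std::cout << "B   fragment: " << fragmentLength(16, 16, subgroupSize) << "\n"; // 16 elements of f16
  std::cout << "C/D fragment: " << fragmentLength(8, 16, subgroupSize) << "\n";  // 8 elements of f32
  return 0;
}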

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,8 @@ def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64,
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
 def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
 def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>;
-def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
 def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
 def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
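
For readers of the .td change: `XeGPU_DpasOprType` now accepts vectors of rank 1, 2, or 3 over an XeGPU scalar type (rank 1 for the new SIMT form, rank 2 for SIMD, rank 3 for the VNNI-packed B operand), while `XeGPU_DpasResType` accepts rank 1 or 2. A rough C++ sketch of the rank part of that constraint (the helper names are illustrative; the real predicate is generated by TableGen from `VectorOfRankAndType`):

// Illustrative only: the rank constraints encoded by the new type definitions.
#include "mlir/IR/BuiltinTypes.h"

static bool hasDpasOperandRank(mlir::VectorType ty) {
  // Operands may be rank 1 (SIMT), rank 2 (SIMD), or rank 3 (VNNI-packed B).
  return ty && (ty.getRank() == 1 || ty.getRank() == 2 || ty.getRank() == 3);
}

static bool hasDpasResultRank(mlir::VectorType ty) {
  // Accumulator and result may be rank 1 (SIMT) or rank 2 (SIMD).
  return ty && (ty.getRank() == 1 || ty.getRank() == 2);
}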

mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp

Lines changed: 19 additions & 20 deletions
@@ -10,6 +10,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include <numeric>
 
 namespace mlir {
 namespace xegpu {
@@ -319,49 +320,48 @@ LogicalResult TensorDescType::verify(
 // ---------------------------------------------------------------------
 // Case 1: Regular loads/stores.
 // ---------------------------------------------------------------------
-// Distributed vector shape must be:
-//        [chunk_size / lane_data_size, lane_data_size]
-// If the tensor descriptor shape is 1D, first dimension is ignored (set to 1).
-//        [lane_data_size]
+// The following conditions must be met:
+//   * tensor_desc[0] == lane_layout[0]
+// Distributed vector is a 1D vector with shape:
+//        [chunk_size]
 // ---------------------------------------------------------------------
 // Case 2: Block loads/stores
 // ---------------------------------------------------------------------
 // Additional definitions:
 //    tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
 //    n_distribution_units = tensor_size / distribution_unit_size
+//    fragment_size = n_distribution_units * lane_data_size
 // Given above definitions, the following conditions must be met:
 //   * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
 //   * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
-// Distributed vector shape must be:
-//        [n_distribution_units, lane_data_size]
+// Distributed vector is a 1D vector with shape:
+//        [fragment_size]
 FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
-  // If no layout is provided, tensor desc is not used in SIMT mode.
-  if (!layout)
+  // This only works for subgroup-level layouts, which carry only lane_layout
+  // and lane_data and are used to distribute SIMD code into SIMT code.
+  if (!layout || !layout.isSgLayout())
     return failure();
 
   SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
   SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
   auto tdescShape = getShape();
 
-  auto laneDataSize = 1, sgSize = 1;
-  for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
-    laneDataSize *= laneDataDim;
-    sgSize *= laneDim;
-  }
+  // Compute sgSize by multiplying the elements of laneLayout,
+  // e.g. for a 2D layout, sgSize = laneLayout[0] * laneLayout[1];
+  // for a 1D layout, sgSize = laneLayout[0].
+  auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
+                                std::multiplies<int64_t>());
 
   // Case 1: regular loads/stores
   auto scatterAttr = getEncodingAsScatterTensorDescAttr();
   if (scatterAttr) {
     auto chunkSize = scatterAttr.getChunkSize().getInt();
     // Verify if the first dimension of the tensor descriptor shape is
     // distributable.
-    assert(tdescShape[0] % (laneLayout[0]) == 0 &&
+    assert(tdescShape[0] == laneLayout[0] &&
            "tensor descriptor shape is not distributable");
-    if (chunkSize > 1)
-      return VectorType::get({chunkSize / laneDataSize, laneDataSize},
-                             getElementType());
-    return VectorType::get({laneDataSize}, getElementType());
+    return VectorType::get({chunkSize}, getElementType());
   }
 
   // Case 2: block loads/stores
@@ -376,8 +376,7 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   // tensorSize must be adjusted for array_length.
   tensorSize *= getArrayLength();
 
-  return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
-                         getElementType());
+  return VectorType::get({tensorSize / sgSize}, getElementType());
 }
 
 } // namespace xegpu
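
To summarize the new behavior of `getDistributedVectorType`: a scatter tensor descriptor now distributes to a 1D vector of `chunk_size` elements per lane, and a block tensor descriptor to a 1D vector of `tensor_size / sg_size` elements (with `tensor_size` adjusted for `array_length`). The following self-contained sketch mirrors that rule with plain integers; the names and the simplified signature are assumptions for illustration, not the MLIR API:

// Sketch of the 1D distribution rule this commit introduces.
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Number of lanes in the subgroup, i.e. the product of lane_layout.
static int64_t subgroupSize(const std::vector<int64_t> &laneLayout) {
  return std::accumulate(laneLayout.begin(), laneLayout.end(), int64_t{1},
                         std::multiplies<int64_t>());
}

// Length of the 1D vector owned by each work-item.
static int64_t distributedLength(const std::vector<int64_t> &tdescShape,
                                 const std::vector<int64_t> &laneLayout,
                                 bool isScatter, int64_t chunkSize,
                                 int64_t arrayLength) {
  if (isScatter) {
    // Case 1: regular (scattered) loads/stores.
    assert(tdescShape[0] == laneLayout[0] && "shape is not distributable");
    return chunkSize;
  }
  // Case 2: block loads/stores.
  int64_t tensorSize = std::accumulate(tdescShape.begin(), tdescShape.end(),
                                       int64_t{1}, std::multiplies<int64_t>());
  tensorSize *= arrayLength;
  return tensorSize / subgroupSize(laneLayout);
}

For example, an 8x16 block descriptor with lane_layout [1, 16] and array_length 1 yields a per-lane length of 8, i.e. each work-item holds a vector<8xT>.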
