diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 3f27e1541cf38..aa2b4543927a7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -433,6 +433,46 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
   let assemblyFormat = "attr-dict";
 }
 
+def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
+    "The possible options for scheduling barriers",
+    [
+      I32BitEnumAttrCaseNone<"none">,
+      I32BitEnumAttrCaseBit<"non_mem_non_sideffect", 0>,
+      I32BitEnumAttrCaseBit<"valu", 1>,
+      I32BitEnumAttrCaseBit<"salu", 2>,
+      I32BitEnumAttrCaseBit<"mfma_wmma", 3>,
+      I32BitEnumAttrCaseBit<"all_vmem", 4>,
+      I32BitEnumAttrCaseBit<"vmem_read", 5>,
+      I32BitEnumAttrCaseBit<"vmem_write", 6>,
+      I32BitEnumAttrCaseBit<"all_ds", 7>,
+      I32BitEnumAttrCaseBit<"ds_read", 8>,
+      I32BitEnumAttrCaseBit<"ds_write", 9>,
+      I32BitEnumAttrCaseBit<"transcendental", 10>
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierOpOpt,
+                                            "sched_barrier_opt">{
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_SchedBarrierOp :
+    AMDGPU_Op<"sched_barrier">,
+    Arguments<(ins AMDGPU_SchedBarrierOpOptAttr:$opts)>
+    {
+  let summary = "Barrier that limits the backend scheduler's movement of instructions";
+  let description = [{
+    `amdgpu.sched_barrier` serves as a barrier that can be configured to
+    restrict the kinds of instructions the backend scheduler is allowed
+    to move across it, as specified by the `opts` attribute.
+  }];
+  let assemblyFormat = [{
+    `allow` `=` $opts attr-dict
+  }];
+}
+
 def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
     "The possible permutations of the lanes storing B available in an MFMA",
     [
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 033e66c6118f3..b808738804030 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -321,6 +321,22 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
     return success();
   }
 };
+
+struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
+  SchedBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<SchedBarrierOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(SchedBarrierOp op, SchedBarrierOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<ROCDL::SchedBarrier>(op,
+                                                     (uint32_t)op.getOpts());
+    return success();
+  }
+};
+
 } // namespace
 
 /// If `input` is a vector of bytes, concatentate those bytes in little-endian
@@ -879,8 +895,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicUminOp>,
            RawBufferOpLowering<RawBufferAtomicCmpswapOp,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
-           LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
-           ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+           LDSBarrierOpLowering, SchedBarrierOpLowering, MFMAOpLowering,
+           WMMAOpLowering, ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering>(converter, chipset);
 }
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index bb1cedaa276b3..717667c22af80 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -226,3 +226,34 @@ func.func @lds_barrier() {
   amdgpu.lds_barrier
   func.return
 }
+
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+  // CHECK: rocdl.sched.barrier 0
+  amdgpu.sched_barrier allow = <none>
+  // CHECK: rocdl.sched.barrier 1
+  amdgpu.sched_barrier allow = <non_mem_non_sideffect>
+  // CHECK: rocdl.sched.barrier 2
+  amdgpu.sched_barrier allow = <valu>
+  // CHECK: rocdl.sched.barrier 4
+  amdgpu.sched_barrier allow = <salu>
+  // CHECK: rocdl.sched.barrier 8
+  amdgpu.sched_barrier allow = <mfma_wmma>
+  // CHECK: rocdl.sched.barrier 16
+  amdgpu.sched_barrier allow = <all_vmem>
+  // CHECK: rocdl.sched.barrier 32
+  amdgpu.sched_barrier allow = <vmem_read>
+  // CHECK: rocdl.sched.barrier 64
+  amdgpu.sched_barrier allow = <vmem_write>
+  // CHECK: rocdl.sched.barrier 128
+  amdgpu.sched_barrier allow = <all_ds>
+  // CHECK: rocdl.sched.barrier 256
+  amdgpu.sched_barrier allow = <ds_read>
+  // CHECK: rocdl.sched.barrier 512
+  amdgpu.sched_barrier allow = <ds_write>
+  // CHECK: rocdl.sched.barrier 1024
+  amdgpu.sched_barrier allow = <transcendental>
+  // CHECK: rocdl.sched.barrier 18
+  amdgpu.sched_barrier allow = <valu|all_vmem>
+  func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 744a096d757e0..9457a1b9e4498 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -109,6 +109,15 @@ func.func @lds_barrier() {
   func.return
 }
 
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+  // CHECK: amdgpu.sched_barrier allow = <none>
+  amdgpu.sched_barrier allow = <none>
+  // CHECK: amdgpu.sched_barrier allow = <valu|all_vmem>
+  amdgpu.sched_barrier allow = <valu|all_vmem>
+  func.return
+}
+
 // CHECK-LABEL: func @mfma
 func.func @mfma(%arg0 : f32, %arg1 : vector<32xf32>) -> vector<32xf32> {
   // CHECK: amdgpu.mfma
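
Usage note (editorial, not part of the patch): the `allow` mask names the instruction classes the backend scheduler may still move across the barrier, mirroring the mask of the `llvm.amdgcn.sched.barrier` intrinsic that `rocdl.sched.barrier` maps to, and individual cases combine with `|`. The snippet below is a minimal, hypothetical sketch assuming only the op added by this patch; the function name and comments are illustrative.

// A minimal sketch of amdgpu.sched_barrier, assuming the enum bit
// assignments defined above (valu = 1 << 1, mfma_wmma = 1 << 3).
func.func @sched_barrier_example() {
  // No instructions may be moved across this point by the backend
  // scheduler; lowers to `rocdl.sched.barrier 0`.
  amdgpu.sched_barrier allow = <none>
  // Only VALU and MFMA/WMMA instructions may cross this point;
  // lowers to `rocdl.sched.barrier 10` (2 + 8).
  amdgpu.sched_barrier allow = <valu|mfma_wmma>
  func.return
}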