Commit 24fd0e5

[mlir][gpu] Add 'cluster_size' attribute to gpu.subgroup_reduce
This enables performing several reductions in parallel, each covering a subset of the subgroup rather than all of its lanes. One potential application is flash attention with subgroup-wide matrix multiplication and reduction combined in one kernel: the multiplication requires a 2D matrix to be distributed over the lanes of the subgroup, which constrains the shape that the subsequent reduction can take if the data is to stay in registers.
1 parent 57abd4e commit 24fd0e5
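
For a sense of what this enables (a hedged sketch; the subgroup size and value names are hypothetical), a 32-lane subgroup can now perform eight independent 4-lane reductions with a single op:

```mlir
// Hypothetical kernel fragment. With a 32-lane subgroup, cluster_size(4)
// performs eight 4-lane reductions in parallel: lanes {0..3} form cluster 0,
// lanes {4..7} cluster 1, and so on. Each lane receives its own cluster's
// sum, e.g. a per-row sum in an attention-style kernel where each matrix
// row is distributed over 4 contiguous lanes.
%rowsum = gpu.subgroup_reduce add %elem cluster_size(4) : (f32) -> f32
```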

File tree: 8 files changed, +213 −20 lines

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 37 additions & 7 deletions
@@ -1198,21 +1198,31 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]
   let summary = "Reduce values among subgroup.";
   let description = [{
     The `subgroup_reduce` op reduces the value of every lane (work item) across
-    a subgroup. The result is equal for all lanes.
+    a subgroup.
 
     When the reduced value is of a vector type, each vector element is reduced
     independently. Only 1-d vector types are allowed.
 
     Example:
 
     ```mlir
-    %1 = gpu.subgroup_reduce add %a : (f32) -> (f32)
-    %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> (vector<4xf16>)
+    %1 = gpu.subgroup_reduce add %a : (f32) -> f32
+    %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> vector<4xf16>
+    %3 = gpu.subgroup_reduce add %c cluster_size(4) : (f32) -> f32
     ```
 
     If `uniform` flag is set either none or all lanes of a subgroup need to execute
-    this op in convergence. The reduction operation must be one
-    of:
+    this op in convergence.
+
+    If a `cluster_size` is not provided, the reduction covers all lanes of the
+    subgroup and the result is equal for all lanes.
+
+    If a `cluster_size` is provided, the subgroup is divided into clusters of
+    `cluster_size` contiguous lanes each, a reduction is done for all lanes of
+    each cluster (in parallel), and the result is equal for all lanes in a
+    cluster.
+
+    The reduction operation must be one of:
     *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
        `or`, `xor`
     *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
@@ -1222,12 +1232,32 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]
   let arguments = (ins
     AnyIntegerOrFloatOr1DVector:$value,
     GPU_AllReduceOperationAttr:$op,
-    UnitAttr:$uniform
+    UnitAttr:$uniform,
+    OptionalAttr<I32Attr>:$cluster_size
   );
   let results = (outs AnyIntegerOrFloatOr1DVector:$result);
 
+  let builders = [
+    OpBuilder<(ins "Value":$value,
+                   "::mlir::gpu::AllReduceOperation":$op,
+                   "bool":$uniform), [{
+      build($_builder, $_state, value, op, uniform, /*cluster_size=*/ nullptr);
+    }]>,
+    OpBuilder<(ins "Value":$value,
+                   "::mlir::gpu::AllReduceOperation":$op,
+                   "bool":$uniform,
+                   "std::optional<uint32_t>":$cluster_size), [{
+      if (cluster_size)
+        build($_builder, $_state, value, op, uniform, $_builder.getI32IntegerAttr(*cluster_size));
+      else
+        build($_builder, $_state, value, op, uniform, nullptr);
+    }]>
+  ];
+
   let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
-    (`uniform` $uniform^)? attr-dict
+    (`uniform` $uniform^)?
+    (`cluster_size` `(` $cluster_size^ `)`)?
+    attr-dict
     `:` functional-type(operands, results) }];
 
   let hasFolder = 1;
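
To make the new syntax concrete, here is a hedged sketch of the forms the extended assembly format accepts (`%x` and `%v` are placeholder values; the op list comes from the description above):

```mlir
// Subgroup-wide reduce, unchanged syntax:
%r0 = gpu.subgroup_reduce add %x : (f32) -> f32
// `uniform` and `cluster_size` compose; per the declarative assembly
// format, `uniform` prints first, then `cluster_size`:
%r1 = gpu.subgroup_reduce add %x uniform cluster_size(8) : (f32) -> f32
// Vector values are reduced element-wise within each cluster:
%r2 = gpu.subgroup_reduce maxnumf %v cluster_size(4) : (vector<4xf16>) -> vector<4xf16>
```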

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 4 additions & 0 deletions
@@ -102,6 +102,10 @@ struct GPUSubgroupReduceOpLowering
 
   matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    if (op.getClusterSize())
+      return rewriter.notifyMatchFailure(
+          op, "lowering for clustered reduce not implemented");
+
     if (!op.getUniform())
       return rewriter.notifyMatchFailure(
           op, "cannot be lowered to redux as the op must be run "

mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp

Lines changed: 4 additions & 0 deletions
@@ -579,6 +579,10 @@ class GPUSubgroupReduceConversion final
   LogicalResult
   matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    if (op.getClusterSize())
+      return rewriter.notifyMatchFailure(
+          op, "lowering for clustered reduce not implemented");
+
     if (!isa<spirv::ScalarType>(adaptor.getValue().getType()))
       return rewriter.notifyMatchFailure(op, "reduction type is not a scalar");

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 11 additions & 0 deletions
@@ -620,10 +620,21 @@ LogicalResult gpu::SubgroupReduceOp::verify() {
            << "` reduction operation is not compatible with type "
            << getType();
   }
+
+  if (auto clusterSize = getClusterSize()) {
+    uint32_t size = *clusterSize;
+    if (!llvm::isPowerOf2_32(size)) {
+      return emitError() << "cluster size " << size << " is not a power of two";
+    }
+  }
+
   return success();
 }
 
 OpFoldResult gpu::SubgroupReduceOp::fold(FoldAdaptor /*adaptor*/) {
+  if (getClusterSize() == 1)
+    return getValue();
+
   if (!getUniform() && canMakeGroupOpUniform(*this)) {
     setUniform(true);
     return getResult();
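
The new fold makes `cluster_size(1)` a no-op: every cluster is a single lane, so each lane reduces over only its own value. Sketched in IR:

```mlir
// Before canonicalization:
%r = gpu.subgroup_reduce add %x cluster_size(1) : (i32) -> i32
// After folding, uses of %r are replaced by %x and the op goes away;
// the canonicalize.mlir test below exercises exactly this.
```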

mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp

Lines changed: 36 additions & 13 deletions
@@ -50,6 +50,8 @@ struct BreakDownSubgroupReduce final : OpRewritePattern<gpu::SubgroupReduceOp> {
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
+    std::optional<uint32_t> clusterSize = op.getClusterSize();
+
     auto vecTy = dyn_cast<VectorType>(op.getType());
     if (!vecTy || vecTy.getNumElements() < 2)
       return rewriter.notifyMatchFailure(op, "not a multi-element reduction");
@@ -95,7 +97,7 @@ struct BreakDownSubgroupReduce final : OpRewritePattern<gpu::SubgroupReduceOp> {
       }
 
       Value reduce = rewriter.create<gpu::SubgroupReduceOp>(
-          loc, extracted, op.getOp(), op.getUniform());
+          loc, extracted, op.getOp(), op.getUniform(), clusterSize);
       if (numElems == 1) {
         res = rewriter.create<vector::InsertOp>(loc, reduce, res, startIdx);
         continue;
@@ -127,6 +129,8 @@ struct ScalarizeSingleElementReduce final
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
+    std::optional<uint32_t> clusterSize = op.getClusterSize();
+
     auto vecTy = dyn_cast<VectorType>(op.getType());
     if (!vecTy || vecTy.getNumElements() != 1)
       return rewriter.notifyMatchFailure(op, "not a single-element reduction");
@@ -136,7 +140,7 @@ struct ScalarizeSingleElementReduce final
     Location loc = op.getLoc();
     Value extracted = rewriter.create<vector::ExtractOp>(loc, op.getValue(), 0);
     Value reduce = rewriter.create<gpu::SubgroupReduceOp>(
-        loc, extracted, op.getOp(), op.getUniform());
+        loc, extracted, op.getOp(), op.getUniform(), clusterSize);
     rewriter.replaceOpWithNewOp<vector::BroadcastOp>(op, vecTy, reduce);
     return success();
   }
@@ -147,17 +151,20 @@ struct ScalarizeSingleElementReduce final
 /// type, respectively. For example, with `input` of type `f16`, `packFn` could
 /// build ops to cast the value to `i32` to perform shuffles, while `unpackFn`
 /// would cast it back to `f16` to perform arithmetic reduction on. Assumes that
-/// the subgroup is `subgroupSize` lanes wide and reduces across all of them.
+/// the subgroup is `subgroupSize` lanes wide and divides it into clusters of
+/// `clusterSize` lanes, reducing all lanes in each cluster in parallel.
 static Value createSubgroupShuffleReduction(
     OpBuilder &builder, Location loc, Value input, gpu::AllReduceOperation mode,
-    unsigned subgroupSize, function_ref<Value(Value)> packFn,
-    function_ref<Value(Value)> unpackFn) {
+    unsigned clusterSize, unsigned subgroupSize,
+    function_ref<Value(Value)> packFn, function_ref<Value(Value)> unpackFn) {
+  assert(llvm::isPowerOf2_32(clusterSize));
   assert(llvm::isPowerOf2_32(subgroupSize));
+  assert(clusterSize <= subgroupSize);
   // Lane value always stays in the original type. We use it to perform arith
   // reductions.
   Value laneVal = input;
   // Parallel reduction using butterfly shuffles.
-  for (unsigned i = 1; i < subgroupSize; i <<= 1) {
+  for (unsigned i = 1; i < clusterSize; i <<= 1) {
     Value shuffled = builder
                          .create<gpu::ShuffleOp>(loc, packFn(laneVal), i,
                                                  /*width=*/subgroupSize,
@@ -183,6 +190,13 @@ struct ScalarSubgroupReduceToShuffles final
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
+    std::optional<uint32_t> clusterSize = op.getClusterSize();
+    if (clusterSize && *clusterSize > subgroupSize)
+      return op.emitError()
+             << "cluster size " << *clusterSize
+             << " is greater than subgroup size " << subgroupSize;
+    unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
     Type valueTy = op.getType();
     unsigned elemBitwidth =
         getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth();
@@ -196,7 +210,8 @@ struct ScalarSubgroupReduceToShuffles final
     auto identityFn = [](Value v) { return v; };
     rewriter.replaceOp(op, createSubgroupShuffleReduction(
                                rewriter, loc, op.getValue(), op.getOp(),
-                               subgroupSize, identityFn, identityFn));
+                               effectiveClusterSize, subgroupSize, identityFn,
+                               identityFn));
     return success();
   }
 
@@ -215,9 +230,10 @@ struct ScalarSubgroupReduceToShuffles final
       return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
     };
 
-    rewriter.replaceOp(op, createSubgroupShuffleReduction(
-                               rewriter, loc, op.getValue(), op.getOp(),
-                               subgroupSize, packFn, unpackFn));
+    rewriter.replaceOp(
+        op, createSubgroupShuffleReduction(rewriter, loc, op.getValue(),
+                                           op.getOp(), effectiveClusterSize,
+                                           subgroupSize, packFn, unpackFn));
     return success();
   }
 
@@ -237,6 +253,13 @@ struct VectorSubgroupReduceToShuffles final
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
+    std::optional<uint32_t> clusterSize = op.getClusterSize();
+    if (clusterSize && *clusterSize > subgroupSize)
+      return op.emitError()
+             << "cluster size " << *clusterSize
+             << " is greater than subgroup size " << subgroupSize;
+    unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
     auto vecTy = dyn_cast<VectorType>(op.getType());
     if (!vecTy)
       return rewriter.notifyMatchFailure(op, "value type is not a vector");
@@ -285,9 +308,9 @@ struct VectorSubgroupReduceToShuffles final
       return rewriter.create<vector::BitCastOp>(loc, extendedVecTy, asIntVec);
     };
 
-    Value res =
-        createSubgroupShuffleReduction(rewriter, loc, extendedInput, op.getOp(),
-                                       subgroupSize, packFn, unpackFn);
+    Value res = createSubgroupShuffleReduction(rewriter, loc, extendedInput,
+                                               op.getOp(), effectiveClusterSize,
+                                               subgroupSize, packFn, unpackFn);
 
     if (vecBitwidth < shuffleBitwidth) {
       res = rewriter.create<vector::ExtractStridedSliceOp>(
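
Bounding the butterfly loop by `clusterSize` instead of `subgroupSize` keeps every exchange inside a cluster: lane L only ever shuffles with lanes L^1, L^2, ..., which share its cluster. A hedged sketch of the expansion for `cluster_size(4)` on a 32-wide subgroup, assuming the loop's shuffles use xor mode as the "butterfly" comment suggests (SSA names hypothetical):

```mlir
// Input: %r = gpu.subgroup_reduce add %x cluster_size(4) : (f32) -> f32
// The loop runs for offsets 1 and 2 only (i < clusterSize), so each
// xor shuffle stays within a 4-lane cluster.
%c1  = arith.constant 1 : i32
%c2  = arith.constant 2 : i32
%c32 = arith.constant 32 : i32
%s0, %valid0 = gpu.shuffle xor %x, %c1, %c32 : f32
%a0 = arith.addf %x, %s0 : f32
%s1, %valid1 = gpu.shuffle xor %a0, %c2, %c32 : f32
%r  = arith.addf %a0, %s1 : f32
```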

mlir/test/Dialect/GPU/canonicalize.mlir

Lines changed: 18 additions & 0 deletions
@@ -246,6 +246,24 @@ func.func @make_subgroup_reduce_uniform() {
 
 // -----
 
+// CHECK-LABEL: func @subgroup_reduce_cluster_size_1
+// CHECK: gpu.launch blocks
+// CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
+// CHECK: "test.test3"(%[[V1]]) : (i32) -> ()
+func.func @subgroup_reduce_cluster_size_1() {
+  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
+             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
+    %1 = "test.test2"() : () -> i32
+    %2 = gpu.subgroup_reduce add %1 cluster_size(1) : (i32) -> (i32)
+    "test.test3"(%2) : (i32) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// -----
+
 // The GPU kernel does not have any side effecting ops, so the entire
 // gpu.launch op can fold away.

mlir/test/Dialect/GPU/invalid.mlir

Lines changed: 16 additions & 0 deletions
@@ -333,6 +333,22 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) {
 
 // -----
 
+func.func @subgroup_reduce_zero_cluster_size(%arg0 : vector<4xf32>) {
+  // expected-error@+1 {{cluster size 0 is not a power of two}}
+  %res = gpu.subgroup_reduce add %arg0 cluster_size(0) : (vector<4xf32>) -> vector<4xf32>
+  return
+}
+
+// -----
+
+func.func @subgroup_reduce_npot_cluster_size(%arg0 : vector<4xf32>) {
+  // expected-error@+1 {{cluster size 3 is not a power of two}}
+  %res = gpu.subgroup_reduce add %arg0 cluster_size(3) : (vector<4xf32>) -> vector<4xf32>
+  return
+}
+
+// -----
+
 func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) {
   // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or vector of}}
   %res = gpu.subgroup_reduce add %arg0 : (vector<2x2xf32>) -> vector<2x2xf32>
