Skip to content

Commit 899c2be

Browse files
authored
[mlir][TilingInterface] Early return cloned ops if tile sizes are zeros. (#75410)
It is a trivial early-return case. If the cloned ops are not returned, it will generate an `extract_slice` op that extracts the whole slice. However, that op is not folded away. We early-return to avoid this case. E.g., ```mlir func.func @matmul_tensors( %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { %0 = linalg.matmul ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2: tensor<?x?xf32>) -> tensor<?x?xf32> return %0 : tensor<?x?xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.tile_using_for %0 [0, 0, 0] : (!transform.any_op) -> (!transform.any_op) transform.yield } } ``` Applying the transforms and canonicalizing the IR: ``` mlir-opt --transform-interpreter -canonicalize input.mlir ``` we will get ```mlir module { func.func @matmul_tensors(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32> %dim_0 = tensor.dim %arg0, %c1 : tensor<?x?xf32> %dim_1 = tensor.dim %arg1, %c1 : tensor<?x?xf32> %extracted_slice = tensor.extract_slice %arg0[0, 0] [%dim, %dim_0] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> %extracted_slice_2 = tensor.extract_slice %arg1[0, 0] [%dim_0, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> %extracted_slice_3 = tensor.extract_slice %arg2[0, 0] [%dim, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> %0 = linalg.matmul ins(%extracted_slice, %extracted_slice_2 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%extracted_slice_3 : tensor<?x?xf32>) -> tensor<?x?xf32> return %0 : tensor<?x?xf32> } } ``` The revision early-returns in this case, so we instead get: ```mlir func.func @matmul_tensors(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: 
tensor<?x?xf32>) -> tensor<?x?xf32> { %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> return %0 : tensor<?x?xf32> } ```
1 parent ac82c8b commit 899c2be

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,14 +362,22 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op,
362362
auto clonedOp = cast<TilingInterface>(
363363
cloneOpAndUpdateDestinationArgs(rewriter, op, clonedOpDestination));
364364

365-
// 5b. Tile the cloned operation.
365+
// 5b. Early return cloned op if tiling is not happening. We can not return
366+
// the original op because it could lead to
367+
// `rewriter.replaceOp(op, op->getResults())` and user would get crash.
368+
if (llvm::all_of(tileSizeVector, isZeroIndex)) {
369+
return scf::SCFTilingResult{/*tiledOps=*/{clonedOp}, /*loops=*/{},
370+
clonedOp->getResults()};
371+
}
372+
373+
// 5c. Tile the cloned operation.
366374
FailureOr<TilingResult> tiledImplementation =
367375
clonedOp.getTiledImplementation(rewriter, offsets, sizes);
368376
if (failed(tiledImplementation)) {
369377
return rewriter.notifyMatchFailure(op, "failed to tile operation");
370378
}
371379

372-
// 5c. Delete the cloned operation.
380+
// 5d. Delete the cloned operation.
373381
rewriter.eraseOp(clonedOp);
374382

375383
// If loops are empty, the tiled op is used as the replacement for the untiled

mlir/test/Dialect/Linalg/tile-tensors.mlir

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,33 @@ module attributes {transform.with_named_sequence} {
3737

3838
// -----
3939

40+
// CHECK-LABEL: func @matmul_tensors_with_size_zeros(
41+
// CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
42+
// CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
43+
// CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
44+
func.func @matmul_tensors_with_size_zeros(
45+
%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
46+
-> tensor<?x?xf32> {
47+
48+
// CHECK: %[[RES:.*]] = linalg.matmul ins(%[[TA]], %[[TB]] : tensor<?x?xf32>, tensor<?x?xf32>)
49+
// CHECK-SAME: outs(%[[TC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
50+
// CHECK: return %[[RES]]
51+
%0 = linalg.matmul ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
52+
outs(%arg2: tensor<?x?xf32>)
53+
-> tensor<?x?xf32>
54+
return %0 : tensor<?x?xf32>
55+
}
56+
57+
module attributes {transform.with_named_sequence} {
58+
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
59+
%0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
60+
%1 = transform.structured.tile_using_for %0 [0, 0, 0] : (!transform.any_op) -> (!transform.any_op)
61+
transform.yield
62+
}
63+
}
64+
65+
// -----
66+
4067
func.func @generic_op_tensors(
4168
%arg0 : tensor<?x?x?xf32>, %arg1 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
4269
%c0 = arith.constant 0 : index

0 commit comments

Comments (0)