@@ -11,6 +11,8 @@ func.func @fold_wait_op_test1() {
 }
 // CHECK-NOT: gpu.wait

+// -----
+
 // Erase duplicate barriers.
 // CHECK-LABEL: func @erase_barriers
 // CHECK-NEXT: gpu.barrier
@@ -21,6 +23,8 @@ func.func @erase_barriers() {
   return
 }

+// -----
+
 // Replace uses of gpu.wait op with its async dependency.
 // CHECK-LABEL: func @fold_wait_op_test2
 func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
@@ -38,6 +42,8 @@ func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
 // CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
 // CHECK-NEXT: return

+// -----
+
 // CHECK-LABEL: func @fold_memcpy_op
 func.func @fold_memcpy_op(%arg0: i1) {
   %cst = arith.constant 0.000000e+00 : f16
@@ -60,6 +66,8 @@ func.func @fold_memcpy_op(%arg0: i1) {
 }
 // CHECK-NOT: gpu.memcpy

+// -----
+
 // We cannot fold memcpy here as dest is a block argument.
 // CHECK-LABEL: func @do_not_fold_memcpy_op1
 func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
@@ -75,6 +83,8 @@ func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // We cannot fold gpu.memcpy as it is used by an op having read effect on dest.
 // CHECK-LABEL: func @do_not_fold_memcpy_op2
 func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
@@ -92,6 +102,8 @@ func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // We cannot fold gpu.memcpy, as the defining op if dest is not a alloc like op.
 // CHECK-LABEL: func @do_not_fold_memcpy_op3
 func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
@@ -102,6 +114,8 @@ func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // CHECK-LABEL: @memcpy_after_cast
 func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
@@ -112,6 +126,8 @@ func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   return
 }

+// -----
+
 // CHECK-LABEL: @memset_after_cast
 func.func @memset_after_cast(%arg0: memref<10xf32>, %arg1: f32) {
   // CHECK-NOT: memref.cast
@@ -227,3 +243,20 @@ func.func @make_subgroup_reduce_uniform() {
   }
   return
 }
+
+// -----
+
+// The GPU kernel does not have any side effecting ops, so the entire
+// gpu.launch op can fold away.
+
+// CHECK-LABEL: func @gpu_launch_without_side_effects
+// CHECK-NOT: gpu.launch
+func.func @gpu_launch_without_side_effects() {
+  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
+             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
+    %1 = arith.addi %arg0, %arg1 : index
+    gpu.terminator
+  }
+  return
+}
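
For reference, a minimal sketch (not part of the patch, and assuming this file's usual mlir-opt -canonicalize RUN line) of the output the new test expects the canonicalizer to produce: the gpu.launch folds away because its body has no side effecting ops, while the unregistered "test.test1" op remains since its side effects are unknown.

// Hypothetical canonicalized form of @gpu_launch_without_side_effects;
// the launch and its dead arith.addi are gone, only the unknown op stays.
func.func @gpu_launch_without_side_effects() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  return
}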