Skip to content

Commit 2910add

Browse files
AlexeySachkovjzc
andauthored
[SYCL] Enhance device code split call graph analysis (#8589)
This patch introduces significant changes to how device code split detects functions and global variables which should be included into a cloned module. There are two main changes done to that: 1. analysis algorithm now traces uses of global variables to allow adding all globals into every split module 2. analysis algorithm now traces indirect calls, trying to define a list of all functions which are potentially called indirectly to avoid the need to disable device code split completely in presence of indirect calls. Both things are implemented through new `DependencyGraph` entity, which replaces `CallGraph` entity we used. Instead of calls, that new graph is built over _uses_ of functions and variables to understand which functions and global variables are used by which functions and global variables. The most tricky part here is indirect calls: we can't understand which exact function is being called by an indirect call. However, we can compile a list of _potentially_-called functions by comparing function signatures with the signature of an indirect call. On top of that, ESIMD handling is refactored by this patch: - outlined ESIMD-specific handling into a separate function - created new ESIMD-specific device code split helper New ESIMD-specific device code split helper is needed, because we should use different rules for ESIMD and non-ESIMD parts of a module when splitting it into two. For ESIMD part we want to grab all ESIMD-functions even if they were not considered as entry points in the original module. For non-ESIMD part we **don't want** to grab _any_ ESIMD-functions, even if they are referenced/used by non-ESIMD functions. Both of those special rules come from `invoke_simd` feature support: a non-ESIMD kernel can indirectly reference an ESIMD function. Since those different kinds of functions require different processing, we have to completely separate them before the processing step.
Non-ESIMD module could be incomplete as a result of such split, but it will be merged back with ESIMD module after ESIMD lowering. That merge step is required for `invoke_simd` functionality. --------- Co-authored-by: Cai, Justin <[email protected]>
1 parent f448631 commit 2910add

13 files changed

+756
-227
lines changed

llvm/test/tools/sycl-post-link/assert/indirect-with-split-2.ll

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,16 @@
99
; marked as using asserts.
1010

1111
; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table
12-
; RUN: FileCheck %s -input-file=%t_0.prop -check-prefix=PRESENCE-CHECK
13-
; RUN: FileCheck %s -input-file=%t_0.prop -check-prefix=ABSENCE-CHECK
12+
; RUN: FileCheck %s -input-file=%t_0.prop -check-prefixes=CHECK,CHECK0 \
13+
; RUN: --implicit-check-not TU1
14+
; RUN: FileCheck %s -input-file=%t_1.prop -check-prefixes=CHECK,CHECK1 \
15+
; RUN: --implicit-check-not TU0
16+
;
17+
; CHECK: [SYCL/assert used]
18+
; CHECK0-DAG: main_TU1_kernel0
19+
; CHECK0-DAG: main_TU1_kernel1
20+
;
21+
; CHECK1: main_TU0_kernel0
1422

1523
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
1624
target triple = "spir64-unknown-linux"
@@ -40,7 +48,7 @@ entry:
4048
}
4149

4250
; ABSENCE-CHECK-NOT: empty_kernel
43-
define dso_local spir_kernel void @empty_kernel() {
51+
define dso_local spir_kernel void @empty_kernel() #2 {
4452
%1 = ptrtoint void ()* @bar to i64
4553
ret void
4654
}

llvm/test/tools/sycl-post-link/assert/indirect-with-split.ll

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,21 @@
77
; marked as using asserts.
88

99
; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table
10-
; RUN: FileCheck %s -input-file=%t_0.prop
10+
; RUN: FileCheck %s -input-file=%t_0.prop --check-prefixes=CHECK,CHECK1 \
11+
; RUN: --implicit-check-not TU0
12+
; RUN: FileCheck %s -input-file=%t_1.prop --check-prefixes=CHECK,CHECK0 \
13+
; RUN: --implicit-check-not TU1 --implicit-check-not kernel1
14+
;
15+
; With recent improvements to device code split, this file is actually being
16+
; split to two modules and one of them does not contain "indirectly-referenced"
17+
; function, meaning that only direct users of 'assert' will be mentioned in
18+
; device image properties.
19+
;
20+
; CHECK: [SYCL/assert used]
21+
; CHECK0: main_TU0_kernel0
22+
;
23+
; CHECK1-DAG: main_TU1_kernel0
24+
; CHECK1-DAG: main_TU1_kernel1
1125

1226
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
1327
target triple = "spir64-unknown-linux"
@@ -20,9 +34,6 @@ target triple = "spir64-unknown-linux"
2034
@__spirv_BuiltInLocalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
2135
@_ZL10assert_fmt = internal addrspace(2) constant [85 x i8] c"%s:%d: %s: global id: [%lu,%lu,%lu], local id: [%lu,%lu,%lu] Assertion `%s` failed.\0A\00", align 1
2236

23-
; CHECK: [SYCL/assert used]
24-
25-
; CHECK-DAG: main_TU0_kernel0
2637
define dso_local spir_kernel void @main_TU0_kernel0() #0 {
2738
entry:
2839
call spir_func void @_Z3foov()
@@ -40,7 +51,6 @@ entry:
4051
ret void
4152
}
4253

43-
; CHECK-DAG: main_TU0_kernel1
4454
define dso_local spir_kernel void @main_TU0_kernel1() #0 {
4555
entry:
4656
call spir_func void @_Z4foo1v()
@@ -55,14 +65,12 @@ entry:
5565
ret void
5666
}
5767

58-
; CHECK-DAG: main_TU1_kernel0
5968
define dso_local spir_kernel void @main_TU1_kernel0() #2 {
6069
entry:
6170
call spir_func void @_Z3foov()
6271
ret void
6372
}
6473

65-
; CHECK-DAG: main_TU1_kernel1
6674
define dso_local spir_kernel void @main_TU1_kernel1() #2 {
6775
entry:
6876
call spir_func void @_Z4foo2v()
Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,42 @@
11
; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table
2-
; In precense of indirectly callable function auto mode is equal to no split,
3-
; which means that separate LLVM IR file for device is not generated and we only
4-
; need to check generated symbol table
5-
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK
2+
;
3+
; This is the same as auto-module-split-1 test with the only difference being that
4+
; @_Z3foov is marked with "referenced-indirectly" attribute.
5+
; The purpose of this test is to make sure that we can still perform device code
6+
; split as usual, because that function is not a part of any indirect calls
7+
;
8+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK
9+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK
10+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
11+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT
612

713
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
814
target triple = "spir64-unknown-linux"
915

1016
$_Z3barIiET_S0_ = comdat any
1117

18+
; CHECK-TU1-NOT: @{{.*}}GV{{.*}}
19+
; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4
1220
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
1321

14-
; CHECK: {{.*}}TU0_kernel0{{.*}}
22+
; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
23+
; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}}
24+
; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
25+
; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}}
26+
27+
; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}()
1528

1629
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
1730
entry:
1831
call spir_func void @_Z3foov()
1932
ret void
2033
}
2134

35+
; CHECK-TU1: define dso_local spir_func void @{{.*}}foo{{.*}}()
36+
; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo{{.*}}()
37+
38+
; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1)
39+
2240
define dso_local spir_func void @_Z3foov() #2 {
2341
entry:
2442
%a = alloca i32, align 4
@@ -28,6 +46,9 @@ entry:
2846
ret void
2947
}
3048

49+
; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
50+
; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
51+
3152
; Function Attrs: nounwind
3253
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
3354
entry:
@@ -37,33 +58,51 @@ entry:
3758
ret i32 %0
3859
}
3960

40-
; CHECK: {{.*}}TU0_kernel1{{.*}}
61+
; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
62+
; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}}
63+
; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
64+
; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}}
65+
66+
; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}()
4167

4268
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
4369
entry:
4470
call spir_func void @_Z4foo1v()
4571
ret void
4672
}
4773

74+
; CHECK-TU1: define dso_local spir_func void @{{.*}}foo1{{.*}}()
75+
; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo1{{.*}}()
76+
4877
; Function Attrs: nounwind
4978
define dso_local spir_func void @_Z4foo1v() {
5079
entry:
5180
%a = alloca i32, align 4
5281
store i32 2, i32* %a, align 4
5382
ret void
5483
}
55-
; CHECK: {{.*}}TU1_kernel{{.*}}
84+
85+
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
86+
; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}}
87+
; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
88+
; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}}
89+
90+
; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}()
5691

5792
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
5893
entry:
5994
call spir_func void @_Z4foo2v()
6095
ret void
6196
}
6297

98+
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo2{{.*}}()
99+
; CHECK-TU0: define dso_local spir_func void @{{.*}}foo2{{.*}}()
100+
63101
; Function Attrs: nounwind
64102
define dso_local spir_func void @_Z4foo2v() {
65103
entry:
66104
%a = alloca i32, align 4
105+
; CHECK-TU0: %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @{{.*}}GV{{.*}} to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
67106
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
68107
%add = add nsw i32 4, %0
69108
store i32 %add, i32* %a, align 4
@@ -74,8 +113,15 @@ attributes #0 = { "sycl-module-id"="TU1.cpp" }
74113
attributes #1 = { "sycl-module-id"="TU2.cpp" }
75114
attributes #2 = { "referenced-indirectly" }
76115

116+
; Metadata is saved in both modules.
117+
; CHECK: !opencl.spir.version = !{!0, !0}
118+
; CHECK: !spirv.Source = !{!1, !1}
119+
77120
!opencl.spir.version = !{!0, !0}
78121
!spirv.Source = !{!1, !1}
79122

123+
; CHECK: !0 = !{i32 1, i32 2}
124+
; CHECK: !1 = !{i32 4, i32 100000}
125+
80126
!0 = !{i32 1, i32 2}
81127
!1 = !{i32 4, i32 100000}

llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,31 @@
11
; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table
2-
; In precense of indirect calls auto mode is equal to no split,
3-
; which means that separate LLVM IR file for device is not generated and we only
4-
; need to check generated symbol table
5-
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK
2+
;
3+
; In the presence of indirect calls we start matching functions using their
4+
; signatures, i.e. we have an indirect call to i32(i32) function within
5+
; @_Z3foov, which means that all functions with i32(i32) signature should be
6+
; placed in the same module as @_Z3foov.
7+
;
8+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0-IR \
9+
; RUN: --implicit-check-not TU0_kernel --implicit-check-not _Z3foov
10+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1-IR \
11+
; RUN: --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v
12+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM
13+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM
14+
;
15+
; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0
16+
; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1
17+
;
18+
; CHECK-TU1-SYM: _ZTSZ4mainE10TU0_kernel
19+
;
20+
; CHECK-TU0-IR: @_ZL2GV = internal addrspace(1) constant
21+
; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0
22+
; CHECK-TU0-IR: define dso_local spir_func i32 @_Z4foo1v
23+
; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1
24+
; CHECK-TU0-IR: define dso_local spir_func void @_Z4foo2v
25+
;
26+
; CHECK-TU1-IR: define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel
27+
; CHECK-TU1-IR: define dso_local spir_func void @_Z3foov
28+
; CHECK-TU1-IR: define dso_local spir_func i32 @_Z4foo1v
629

730
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
831
target triple = "spir64-unknown-linux"
@@ -11,9 +34,7 @@ $_Z3barIiET_S0_ = comdat any
1134

1235
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
1336

14-
; CHECK: {{.*}}TU0_kernel0{{.*}}
15-
16-
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
37+
define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel() #0 {
1738
entry:
1839
call spir_func void @_Z3foov()
1940
ret void
@@ -38,24 +59,23 @@ entry:
3859
ret i32 %0
3960
}
4061

41-
; CHECK: {{.*}}TU0_kernel1{{.*}}
42-
43-
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
62+
define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0() #1 {
4463
entry:
45-
call spir_func void @_Z4foo1v()
64+
%a = alloca i32, align 4
65+
%arg = load i32, i32* %a, align 4
66+
%call = call spir_func i32 @_Z4foo1v(i32 %arg)
4667
ret void
4768
}
4869

4970
; Function Attrs: nounwind
50-
define dso_local spir_func void @_Z4foo1v() {
71+
define dso_local spir_func i32 @_Z4foo1v(i32 %arg) {
5172
entry:
5273
%a = alloca i32, align 4
53-
store i32 2, i32* %a, align 4
54-
ret void
74+
store i32 %arg, i32* %a, align 4
75+
ret i32 %arg
5576
}
56-
; CHECK: {{.*}}TU1_kernel{{.*}}
5777

58-
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
78+
define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1() #1 {
5979
entry:
6080
call spir_func void @_Z4foo2v()
6181
ret void

llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,20 @@
11
; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table
2-
; RUN: FileCheck %s -input-file=%t_0.sym
3-
4-
; This test checkes that module is not split if function pointer's user is not
5-
; CallInst.
2+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix=CHECK-SYM0
3+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix=CHECK-SYM1
4+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0
5+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1
6+
7+
; This test checks that we can properly perform device code split by tracking
8+
; all uses of functions (not only direct calls)
9+
10+
; CHECK-SYM0: kernel2
11+
; CHECK-SYM1: kernel1
12+
;
13+
; CHECK-IR0: define dso_local spir_kernel void @kernel2
14+
;
15+
; CHECK-IR1: @_Z2f1iTable = weak global [1 x i32 (i32)*] [i32 (i32)* @_Z2f1i]
16+
; CHECK-IR1: define dso_local spir_func i32 @_Z2f1i
17+
; CHECK-IR1: define weak_odr dso_local spir_kernel void @kernel1
618

719
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
820
target triple = "spir64_x86_64-unknown-unknown"

0 commit comments

Comments
 (0)