@@ -72,6 +72,13 @@ def cancel_previous_build() {
72
72
}
73
73
}
74
74
75
/**
 * Report whether the currently running build is the newest build of its job.
 *
 * The job is looked up from env.JOB_NAME and its last build number is
 * compared against env.BUILD_NUMBER.
 *
 * @return true only when the job's most recent build has the same number as
 *         this build; false when the numbers differ or when the job or its
 *         last build cannot be resolved.
 */
def is_last_build() {
  // JOB_NAME may be a nested name such as "folder/branch"; getItem() only
  // resolves top-level items, so look the job up by its full name instead.
  def job = Jenkins.instance.getItemByFullName(env.JOB_NAME)
  def lastBuild = job?.getLastBuild()
  if (lastBuild == null) {
    // Unable to tell: answer "no" rather than throwing, so a caller that is
    // already handling a failure does not lose the original exception.
    return false
  }
  // env.* values are Strings while getNumber() is an int. In Groovy an
  // Integer never equals a String, so compare the textual forms.
  return lastBuild.getNumber().toString() == env.BUILD_NUMBER
}
81
+
75
82
def init_git (submodule = false ) {
76
83
cleanWs()
77
84
// add retry in case checkout timeouts
@@ -95,57 +102,116 @@ def init_git(submodule = false) {
95
102
// }
96
103
// }
97
104
105
/**
 * Run the CPU-only AOT build-and-import check on an agent with the given label.
 *
 * Allocates a node, switches into a per-executor workspace, checks out the
 * repository via init_git(true), prints the directory listing and node info,
 * then runs the AOT build/import test script through docker_run with --no-gpu.
 *
 * @param node_type label passed to node() to select the agent
 */
def run_unittest_CPU_AOT_COMPILE(node_type) {
  echo(' Running CPU AOT Compile Unittest')
  node(node_type) {
    ws(per_exec_ws(' flashinfer-aot')) {
      init_git(true)
      sh script: ' ls -alh', label: ' Show work directory'
      sh script: ' ./scripts/task_show_node_info.sh', label: ' Show node info'
      // docker_run is resolved here, at the point the step is issued.
      sh script: " ${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh",
         label: ' Test AOT Build and Import'
    }
  }
}
116
+
117
/**
 * Run one shard of the JIT unit tests on an agent with the given label.
 *
 * Allocates a node, switches into a per-executor workspace, checks out the
 * repository via init_git(true), prints the directory listing and node info,
 * then runs ./scripts/task_jit_run_tests_part<shard_id>.sh through docker_run.
 *
 * @param node_type label passed to node() to select the agent
 * @param shard_id  shard number; selects the test script and appears in the
 *                  step label
 */
def shard_run_unittest_GPU(node_type, shard_id) {
  echo " Running unittest on ${node_type} , shard ${shard_id} "
  node(node_type) {
    ws(per_exec_ws(' flashinfer-unittest')) {
      init_git(true) // we need cutlass submodule
      sh(script: " ls -alh", label: ' Show work directory')
      sh(script: " ./scripts/task_show_node_info.sh", label: ' Show node info')
      // The script name must be one shell word (no space before ".sh"), and
      // the label is double-quoted so that ${shard_id} is interpolated;
      // a single-quoted Groovy string would show the placeholder literally.
      sh(script: " ${docker_run} ./scripts/task_jit_run_tests_part${shard_id}.sh",
         label: " JIT Unittest Part ${shard_id}")
    }
  }
}
128
+
98
129
stage(' Unittest' ) {
99
130
cancel_previous_build()
100
131
parallel(
101
132
failFast : true ,
102
133
' AOT-Build-Import' : {
103
- node(' CPU-LARGE-SPOT' ) {
104
- ws(per_exec_ws(' flashinfer-aot' )) {
105
- init_git(true )
106
- sh(script : " ls -alh" , label : ' Show work directory' )
107
- sh(script : " ./scripts/task_show_node_info.sh" , label : ' Show node info' )
108
- sh(script : " ${ docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh" , label : ' Test AOT Build and Import' )
134
+ try {
135
+ run_unittest_CPU_AOT_COMPILE(' CPU-LARGE-SPOT' )
136
+ } catch (Throwable ex) {
137
+ if (is_last_build()) {
138
+ // retry if we are currently at last build
139
+ // mark the current stage as success
140
+ // and try again via on demand node
141
+ echo ' Exception during SPOT run ' + ex. toString() + ' retry on-demand'
142
+ currentBuild. result = ' SUCCESS'
143
+ run_unittest_CPU_AOT_COMPILE(' CPU-LARGE' )
144
+ } else {
145
+ echo ' Exception during SPOT run ' + ex. toString() + ' exit since it is not last build'
146
+ throw ex
109
147
}
110
148
}
111
149
},
112
150
' JIT-Unittest-1' : {
113
- node(' GPU-G5-SPOT' ) {
114
- ws(per_exec_ws(' flashinfer-unittest' )) {
115
- init_git(true ) // we need cutlass submodule
116
- sh(script : " ls -alh" , label : ' Show work directory' )
117
- sh(script : " ./scripts/task_show_node_info.sh" , label : ' Show node info' )
118
- sh(script : " ${ docker_run} ./scripts/task_jit_run_tests_part1.sh" , label : ' JIT Unittest Part 1' )
151
+ try {
152
+ shard_run_unittest_GPU(' GPU-G5-SPOT' , 1 )
153
+ } catch (Throwable ex) {
154
+ if (is_last_build()) {
155
+ // retry if we are currently at last build
156
+ // mark the current stage as success
157
+ // and try again via on demand node
158
+ echo ' Exception during SPOT run ' + ex. toString() + ' retry on-demand'
159
+ currentBuild. result = ' SUCCESS'
160
+ shard_run_unittest_GPU(' GPU-G5' , 1 )
161
+ } else {
162
+ echo ' Exception during SPOT run ' + ex. toString() + ' exit since it is not last build'
163
+ throw ex
119
164
}
120
165
}
121
166
},
122
167
' JIT-Unittest-2' : {
123
- node(' GPU-G5-SPOT' ) {
124
- ws(per_exec_ws(' flashinfer-unittest' )) {
125
- init_git(true ) // we need cutlass submodule
126
- sh(script : " ls -alh" , label : ' Show work directory' )
127
- sh(script : " ./scripts/task_show_node_info.sh" , label : ' Show node info' )
128
- sh(script : " ${ docker_run} ./scripts/task_jit_run_tests_part2.sh" , label : ' JIT Unittest Part 2' )
168
+ try {
169
+ shard_run_unittest_GPU(' GPU-G5-SPOT' , 2 )
170
+ } catch (Throwable ex) {
171
+ if (is_last_build()) {
172
+ // retry if we are currently at last build
173
+ // mark the current stage as success
174
+ // and try again via on demand node
175
+ echo ' Exception during SPOT run ' + ex. toString() + ' retry on-demand'
176
+ currentBuild. result = ' SUCCESS'
177
+ shard_run_unittest_GPU(' GPU-G5' , 2 )
178
+ } else {
179
+ echo ' Exception during SPOT run ' + ex. toString() + ' exit since it is not last build'
180
+ throw ex
129
181
}
130
182
}
131
183
},
132
184
' JIT-Unittest-3' : {
133
- node(' GPU-G5-SPOT' ) {
134
- ws(per_exec_ws(' flashinfer-unittest' )) {
135
- init_git(true ) // we need cutlass submodule
136
- sh(script : " ls -alh" , label : ' Show work directory' )
137
- sh(script : " ./scripts/task_show_node_info.sh" , label : ' Show node info' )
138
- sh(script : " ${ docker_run} ./scripts/task_jit_run_tests_part3.sh" , label : ' JIT Unittest Part 3' )
185
+ try {
186
+ shard_run_unittest_GPU(' GPU-G5-SPOT' , 3 )
187
+ } catch (Throwable ex) {
188
+ if (is_last_build()) {
189
+ // retry if we are currently at last build
190
+ // mark the current stage as success
191
+ // and try again via on demand node
192
+ echo ' Exception during SPOT run ' + ex. toString() + ' retry on-demand'
193
+ currentBuild. result = ' SUCCESS'
194
+ shard_run_unittest_GPU(' GPU-G5' , 3 )
195
+ } else {
196
+ echo ' Exception during SPOT run ' + ex. toString() + ' exit since it is not last build'
197
+ throw ex
139
198
}
140
199
}
141
200
},
142
201
' JIT-Unittest-4' : {
143
- node(' GPU-G5-SPOT' ) {
144
- ws(per_exec_ws(' flashinfer-unittest' )) {
145
- init_git(true ) // we need cutlass submodule
146
- sh(script : " ls -alh" , label : ' Show work directory' )
147
- sh(script : " ./scripts/task_show_node_info.sh" , label : ' Show node info' )
148
- sh(script : " ${ docker_run} ./scripts/task_jit_run_tests_part4.sh" , label : ' JIT Unittest Part 4' )
202
+ try {
203
+ shard_run_unittest_GPU(' GPU-G5-SPOT' , 4 )
204
+ } catch (Throwable ex) {
205
+ if (is_last_build()) {
206
+ // retry if we are currently at last build
207
+ // mark the current stage as success
208
+ // and try again via on demand node
209
+ echo ' Exception during SPOT run ' + ex. toString() + ' retry on-demand'
210
+ currentBuild. result = ' SUCCESS'
211
+ shard_run_unittest_GPU(' GPU-G5' , 4 )
212
+ } else {
213
+ echo ' Exception during SPOT run ' + ex. toString() + ' exit since it is not last build'
214
+ throw ex
149
215
}
150
216
}
151
217
}
0 commit comments