Skip to content

Commit afa9332

Browse files
authored
ci: switch to on-demand instances if spot instance is interrupted (#987)
cc @yongwww
1 parent 86da6b8 commit afa9332

File tree

1 file changed

+96
-30
lines changed

1 file changed

+96
-30
lines changed

Jenkinsfile

+96-30
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,13 @@ def cancel_previous_build() {
7272
}
7373
}
7474

75+
def is_last_build() {
  // Report whether the currently running build is the most recent build
  // of its job. Callers use this to decide whether a failed run is worth
  // retrying (only the latest build is retried).
  //
  // Returns true only when the job's last build number equals this
  // build's number; returns false when the job or its last build cannot
  // be resolved, so a lookup failure never masks the caller's original
  // exception with a NullPointerException.

  // JOB_NAME may be a folder-qualified path such as "folder/job";
  // getItemByFullName resolves both top-level and nested names.
  def job = Jenkins.instance.getItemByFullName(env.JOB_NAME)
  def lastBuild = job?.getLastBuild()
  if (lastBuild == null) {
    return false
  }
  // env values are strings while getNumber() is an int; comparing them
  // directly with == is never true in Groovy, so convert before comparing.
  return lastBuild.getNumber() == env.BUILD_NUMBER.toInteger()
}
81+
7582
def init_git(submodule = false) {
7683
cleanWs()
7784
// add retry in case checkout times out
@@ -95,57 +102,116 @@ def init_git(submodule = false) {
95102
// }
96103
// }
97104

105+
def run_unittest_CPU_AOT_COMPILE(node_type) {
  // Run the AOT build-and-import test on an agent matching `node_type`.
  // The label is supplied by the caller so the same steps can be run on
  // different agent pools.
  echo('Running CPU AOT Compile Unittest')
  node(node_type) {
    // Workspace path is computed on the allocated node, before entering it.
    def workspace = per_exec_ws('flashinfer-aot')
    ws(workspace) {
      // Fresh checkout including submodules.
      init_git(true)
      sh label: 'Show work directory', script: 'ls -alh'
      sh label: 'Show node info', script: './scripts/task_show_node_info.sh'
      // The build/import test runs through the docker wrapper without GPU access.
      def aot_cmd = "${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh"
      sh label: 'Test AOT Build and Import', script: aot_cmd
    }
  }
}
116+
117+
def shard_run_unittest_GPU(node_type, shard_id) {
  // Run one shard of the JIT unit tests on an agent matching `node_type`.
  //
  // node_type: agent label to allocate.
  // shard_id:  shard number; selects ./scripts/task_jit_run_tests_part<shard_id>.sh
  //            and is shown in the step label.
  echo "Running unittest on ${node_type}, shard ${shard_id}"
  node(node_type) {
    ws(per_exec_ws('flashinfer-unittest')) {
      init_git(true) // we need cutlass submodule
      sh(script: "ls -alh", label: 'Show work directory')
      sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
      // The label must be a double-quoted GString: a single-quoted Groovy
      // string is not interpolated and would show "${shard_id}" literally.
      sh(script: "${docker_run} ./scripts/task_jit_run_tests_part${shard_id}.sh", label: "JIT Unittest Part ${shard_id}")
    }
  }
}
128+
98129
stage('Unittest') {
99130
cancel_previous_build()
100131
parallel(
101132
failFast: true,
102133
'AOT-Build-Import': {
103-
node('CPU-LARGE-SPOT') {
104-
ws(per_exec_ws('flashinfer-aot')) {
105-
init_git(true)
106-
sh(script: "ls -alh", label: 'Show work directory')
107-
sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
108-
sh(script: "${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh", label: 'Test AOT Build and Import')
134+
try {
135+
run_unittest_CPU_AOT_COMPILE('CPU-LARGE-SPOT')
136+
} catch (Throwable ex) {
137+
if (is_last_build()) {
138+
// retry if we are currently at last build
139+
// mark the current stage as success
140+
// and try again via on demand node
141+
echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
142+
currentBuild.result = 'SUCCESS'
143+
run_unittest_CPU_AOT_COMPILE('CPU-LARGE')
144+
} else {
145+
echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
146+
throw ex
109147
}
110148
}
111149
},
112150
'JIT-Unittest-1': {
113-
node('GPU-G5-SPOT') {
114-
ws(per_exec_ws('flashinfer-unittest')) {
115-
init_git(true) // we need cutlass submodule
116-
sh(script: "ls -alh", label: 'Show work directory')
117-
sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
118-
sh(script: "${docker_run} ./scripts/task_jit_run_tests_part1.sh", label: 'JIT Unittest Part 1')
151+
try {
152+
shard_run_unittest_GPU('GPU-G5-SPOT', 1)
153+
} catch (Throwable ex) {
154+
if (is_last_build()) {
155+
// retry if we are currently at last build
156+
// mark the current stage as success
157+
// and try again via on demand node
158+
echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
159+
currentBuild.result = 'SUCCESS'
160+
shard_run_unittest_GPU('GPU-G5', 1)
161+
} else {
162+
echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
163+
throw ex
119164
}
120165
}
121166
},
122167
'JIT-Unittest-2': {
123-
node('GPU-G5-SPOT') {
124-
ws(per_exec_ws('flashinfer-unittest')) {
125-
init_git(true) // we need cutlass submodule
126-
sh(script: "ls -alh", label: 'Show work directory')
127-
sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
128-
sh(script: "${docker_run} ./scripts/task_jit_run_tests_part2.sh", label: 'JIT Unittest Part 2')
168+
try {
169+
shard_run_unittest_GPU('GPU-G5-SPOT', 2)
170+
} catch (Throwable ex) {
171+
if (is_last_build()) {
172+
// retry if we are currently at last build
173+
// mark the current stage as success
174+
// and try again via on demand node
175+
echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
176+
currentBuild.result = 'SUCCESS'
177+
shard_run_unittest_GPU('GPU-G5', 2)
178+
} else {
179+
echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
180+
throw ex
129181
}
130182
}
131183
},
132184
'JIT-Unittest-3': {
133-
node('GPU-G5-SPOT') {
134-
ws(per_exec_ws('flashinfer-unittest')) {
135-
init_git(true) // we need cutlass submodule
136-
sh(script: "ls -alh", label: 'Show work directory')
137-
sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
138-
sh(script: "${docker_run} ./scripts/task_jit_run_tests_part3.sh", label: 'JIT Unittest Part 3')
185+
try {
186+
shard_run_unittest_GPU('GPU-G5-SPOT', 3)
187+
} catch (Throwable ex) {
188+
if (is_last_build()) {
189+
// retry if we are currently at last build
190+
// mark the current stage as success
191+
// and try again via on demand node
192+
echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
193+
currentBuild.result = 'SUCCESS'
194+
shard_run_unittest_GPU('GPU-G5', 3)
195+
} else {
196+
echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
197+
throw ex
139198
}
140199
}
141200
},
142201
'JIT-Unittest-4': {
143-
node('GPU-G5-SPOT') {
144-
ws(per_exec_ws('flashinfer-unittest')) {
145-
init_git(true) // we need cutlass submodule
146-
sh(script: "ls -alh", label: 'Show work directory')
147-
sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
148-
sh(script: "${docker_run} ./scripts/task_jit_run_tests_part4.sh", label: 'JIT Unittest Part 4')
202+
try {
203+
shard_run_unittest_GPU('GPU-G5-SPOT', 4)
204+
} catch (Throwable ex) {
205+
if (is_last_build()) {
206+
// retry if we are currently at last build
207+
// mark the current stage as success
208+
// and try again via on demand node
209+
echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
210+
currentBuild.result = 'SUCCESS'
211+
shard_run_unittest_GPU('GPU-G5', 4)
212+
} else {
213+
echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
214+
throw ex
149215
}
150216
}
151217
}

0 commit comments

Comments
 (0)