Skip to content

Commit 75044fc

Browse files
mszhanyi and Ted Themistokleous
authored and
Ted Themistokleous
committed
Refactor Python CUDA packaging pipeline to fix random hangs in building (microsoft#19989)
### Description
1. Move the build onto CPU machines.
2. Optimize the pipeline.
3. Since there isn't an official ONNX package for Python 3.12 yet, the Python 3.12 test stage uses the packages built from ONNX source in the build stage.

### Motivation and Context
1. Resolve the random hang during compilation.
2. Save a lot of GPU resources.

---------
1 parent 04708eb commit 75044fc

File tree

4 files changed

+351
-280
lines changed

4 files changed

+351
-280
lines changed

tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml

+12-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,15 @@ parameters:
2121
values:
2222
- 11.8
2323
- 12.2
24+
- name: SpecificArtifact
25+
displayName: Use Specific Artifact
26+
type: boolean
27+
default: false
28+
29+
- name: BuildId
30+
displayName: Specific Artifact's BuildId
31+
type: string
32+
default: '0'
2433

2534
resources:
2635
repositories:
@@ -36,4 +45,6 @@ stages:
3645
enable_linux_gpu: ${{ parameters.enable_linux_gpu }}
3746
enable_windows_gpu: ${{ parameters.enable_windows_gpu }}
3847
cmake_build_type: ${{ parameters.cmake_build_type }}
39-
cuda_version: ${{ parameters.cuda_version }}
48+
cuda_version: ${{ parameters.cuda_version }}
49+
SpecificArtifact: ${{ parameters.SpecificArtifact }}
50+
BuildId: ${{ parameters.BuildId }}

tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml

+33-61
Original file line numberDiff line numberDiff line change
@@ -34,72 +34,40 @@ parameters:
3434
- 11.8
3535
- 12.2
3636

37-
stages:
38-
- stage: Python_Packaging
39-
dependsOn: []
40-
variables:
41-
- name: docker_base_image
42-
${{ if eq(parameters.cuda_version, '11.8') }}:
43-
value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8
44-
${{ if eq(parameters.cuda_version, '12.2') }}:
45-
value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
46-
- name: linux_trt_version
47-
${{ if eq(parameters.cuda_version, '11.8') }}:
48-
value: 8.6.1.6-1.cuda11.8
49-
${{ if eq(parameters.cuda_version, '12.2') }}:
50-
value: 8.6.1.6-1.cuda12.0
51-
- name: win_trt_home
52-
${{ if eq(parameters.cuda_version, '11.8') }}:
53-
value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8
54-
${{ if eq(parameters.cuda_version, '12.2') }}:
55-
value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0
56-
- name: win_cuda_home
57-
${{ if eq(parameters.cuda_version, '11.8') }}:
58-
value: $(Agent.TempDirectory)\v11.8
59-
${{ if eq(parameters.cuda_version, '12.2') }}:
60-
value: $(Agent.TempDirectory)\v12.2
61-
jobs:
62-
- ${{ if eq(parameters.enable_windows_gpu, true) }}:
63-
- template: ../templates/py-win-gpu.yml
64-
parameters:
65-
MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4'
66-
PYTHON_VERSION: '3.8'
67-
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
68-
EP_NAME: gpu
69-
CudaVersion: ${{ parameters.cuda_version }}
70-
71-
- template: ../templates/py-win-gpu.yml
72-
parameters:
73-
MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4'
74-
PYTHON_VERSION: '3.9'
75-
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
76-
EP_NAME: gpu
77-
CudaVersion: ${{ parameters.cuda_version }}
37+
- name: SpecificArtifact
38+
displayName: Use Specific Artifact
39+
type: boolean
40+
default: false
7841

79-
- template: ../templates/py-win-gpu.yml
80-
parameters:
81-
MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4'
82-
PYTHON_VERSION: '3.10'
83-
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
84-
EP_NAME: gpu
85-
CudaVersion: ${{ parameters.cuda_version }}
42+
- name: BuildId
43+
displayName: Specific Artifact's BuildId
44+
type: string
45+
default: '0'
8646

87-
- template: ../templates/py-win-gpu.yml
88-
parameters:
89-
MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4'
90-
PYTHON_VERSION: '3.11'
91-
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
92-
EP_NAME: gpu
93-
CudaVersion: ${{ parameters.cuda_version }}
47+
- name: PythonVersions
48+
type: object
49+
displayName: 'Python versions to build'
50+
default:
51+
- '3.8'
52+
- '3.9'
53+
- '3.10'
54+
- '3.11'
55+
- '3.12'
9456

57+
stages:
58+
- ${{ if eq(parameters.enable_windows_gpu, true) }}:
59+
- ${{ each python_version in parameters.PythonVersions }}:
9560
- template: ../templates/py-win-gpu.yml
9661
parameters:
97-
MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4'
98-
PYTHON_VERSION: '3.12'
99-
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
62+
PYTHON_VERSION: ${{ python_version }}
10063
EP_NAME: gpu
10164
CudaVersion: ${{ parameters.cuda_version }}
102-
65+
SpecificArtifact: ${{ parameters.SpecificArtifact }}
66+
BuildId: ${{ parameters.BuildId }}
67+
${{ if eq(parameters.cuda_version, '11.8') }}:
68+
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
69+
${{ if eq(parameters.cuda_version, '12.2') }}:
70+
EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
10371

10472
- ${{ if eq(parameters.enable_linux_gpu, true) }}:
10573
- template: ../templates/py-linux-gpu.yml
@@ -108,6 +76,10 @@ stages:
10876
machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU'
10977
extra_build_arg: ${{ parameters.build_py_parameters }}
11078
cmake_build_type: ${{ parameters.cmake_build_type }}
111-
docker_base_image: ${{ variables.docker_base_image }}
112-
trt_version: ${{ variables.linux_trt_version }}
11379
cuda_version: ${{ parameters.cuda_version }}
80+
${{ if eq(parameters.cuda_version, '11.8') }}:
81+
docker_base_image: nvidia/cuda:11.8.0-cudnn8-devel-ubi8
82+
trt_version: 8.6.1.6-1.cuda11.8
83+
${{ if eq(parameters.cuda_version, '12.2') }}:
84+
docker_base_image: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
85+
trt_version: 8.6.1.6-1.cuda12.0

tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml

+55-51
Original file line numberDiff line numberDiff line change
@@ -35,62 +35,66 @@ parameters:
3535
values:
3636
- 11.8
3737
- 12.2
38-
jobs:
39-
- job: Linux_py_GPU_Wheels_${{ parameters.arch }}
40-
timeoutInMinutes: 240
41-
workspace:
42-
clean: all
43-
pool: ${{ parameters.machine_pool }}
44-
variables:
45-
# The build machine pool doesn't have dotnet, so it can't run CG.
46-
- name: skipComponentGovernanceDetection
47-
value: true
48-
- name: extra_build_args
49-
${{ if ne(parameters.extra_build_arg, '') }}:
50-
value: -x ${{ parameters.extra_build_arg }}
51-
${{ if eq(parameters.extra_build_arg, '') }}:
52-
value: ''
53-
steps:
54-
- checkout: self
55-
clean: true
56-
submodules: recursive
5738

58-
- template: set-nightly-build-option-variable-step.yml
39+
stages:
40+
- stage: Linux_py_GPU_Wheels_${{ parameters.arch }}
41+
dependsOn: []
42+
jobs:
43+
- job: Linux_py_GPU_Wheels_${{ parameters.arch }}
44+
timeoutInMinutes: 240
45+
workspace:
46+
clean: all
47+
pool: ${{ parameters.machine_pool }}
48+
variables:
49+
# The build machine pool doesn't have dotnet, so it can't run CG.
50+
- name: skipComponentGovernanceDetection
51+
value: true
52+
- name: extra_build_args
53+
${{ if ne(parameters.extra_build_arg, '') }}:
54+
value: -x ${{ parameters.extra_build_arg }}
55+
${{ if eq(parameters.extra_build_arg, '') }}:
56+
value: ''
57+
steps:
58+
- checkout: self
59+
clean: true
60+
submodules: recursive
5961

60-
- template: get-docker-image-steps.yml
61-
parameters:
62-
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
63-
Context: tools/ci_build/github/linux/docker
64-
DockerBuildArgs: "
65-
--network=host
66-
--build-arg BASEIMAGE=${{ parameters.docker_base_image }}
67-
--build-arg TRT_VERSION=${{ parameters.trt_version }}
68-
--build-arg BUILD_UID=$( id -u )
69-
--build-arg PLATFORM=${{ parameters.arch }}
70-
"
71-
Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }}
62+
- template: set-nightly-build-option-variable-step.yml
7263

64+
- template: get-docker-image-steps.yml
65+
parameters:
66+
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
67+
Context: tools/ci_build/github/linux/docker
68+
DockerBuildArgs: "
69+
--network=host
70+
--build-arg BASEIMAGE=${{ parameters.docker_base_image }}
71+
--build-arg TRT_VERSION=${{ parameters.trt_version }}
72+
--build-arg BUILD_UID=$( id -u )
73+
--build-arg PLATFORM=${{ parameters.arch }}
74+
"
75+
Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }}
7376

74-
- task: Bash@3
75-
displayName: 'Build Python Wheel'
76-
inputs:
77-
targetType: filePath
78-
filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh
79-
arguments: -i onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} -d "GPU" -c ${{ parameters.cmake_build_type }} $(extra_build_args)
8077

81-
- task: PublishBuildArtifacts@1
82-
displayName: 'Publish Artifact: ONNXRuntime python wheel'
83-
inputs:
84-
PathtoPublish: '$(Build.BinariesDirectory)/dist'
85-
ArtifactName: onnxruntime_gpu
78+
- task: Bash@3
79+
displayName: 'Build Python Wheel'
80+
inputs:
81+
targetType: filePath
82+
filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh
83+
arguments: -i onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} -d "GPU" -c ${{ parameters.cmake_build_type }} $(extra_build_args)
8684

87-
- task: PublishPipelineArtifact@0
88-
displayName: 'Publish Test Binaries'
89-
inputs:
90-
artifactName: 'drop-linux-gpu-${{ parameters.arch }}'
91-
targetPath: '$(Build.BinariesDirectory)/Release'
85+
- task: PublishBuildArtifacts@1
86+
displayName: 'Publish Artifact: ONNXRuntime python wheel'
87+
inputs:
88+
PathtoPublish: '$(Build.BinariesDirectory)/dist'
89+
ArtifactName: onnxruntime_gpu
9290

91+
- task: PublishPipelineArtifact@0
92+
displayName: 'Publish Test Binaries'
93+
inputs:
94+
artifactName: 'drop-linux-gpu-${{ parameters.arch }}'
95+
targetPath: '$(Build.BinariesDirectory)/Release'
9396

94-
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
95-
displayName: 'Clean Agent Directories'
96-
condition: always()
97+
98+
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
99+
displayName: 'Clean Agent Directories'
100+
condition: always()

0 commit comments

Comments
 (0)