Skip to content

Commit afe4006

Browse files
authored
[MLIR] Use test-lower-to-nvvm for sm_90 Integration Tests on GitHub (llvm#68184)
This PR enables the `test-lower-to-nvvm` pass pipeline for the integration tests for the NVIDIA sm_90 architecture. This PR adjusts the `test-lower-to-nvvm` pass in two ways: 1) It calls `createConvertNVGPUToNVVMPass` before the outlining process. This particular pass is responsible for generating both device and host code. On the host, it calls the CUDA driver to build the TMA descriptor (`cuTensorMap`). 2) It integrates `createConvertNVVMToLLVMPass` to generate PTX for NVVM ops.
1 parent 20fc2ff commit afe4006

File tree

4 files changed

+27
-48
lines changed

4 files changed

+27
-48
lines changed

mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir

Lines changed: 1 addition & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,11 @@
11
// RUN: mlir-opt %s \
2-
// RUN: -convert-nvgpu-to-nvvm \
3-
// RUN: -gpu-kernel-outlining \
4-
// RUN: -convert-vector-to-scf \
5-
// RUN: -convert-scf-to-cf \
6-
// RUN: -convert-nvvm-to-llvm \
7-
// RUN: -convert-vector-to-llvm \
8-
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
9-
// RUN: -convert-arith-to-llvm \
10-
// RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
11-
// RUN: -convert-func-to-llvm \
12-
// RUN: -canonicalize -cse \
13-
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
14-
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
15-
// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
2+
// RUN: -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
163
// RUN: | mlir-cpu-runner \
174
// RUN: --shared-libs=%mlir_cuda_runtime \
185
// RUN: --shared-libs=%mlir_runner_utils \
196
// RUN: --entry-point-result=void \
207
// RUN: | FileCheck %s
218

22-
239
// Test swizzling with TMA load
2410
// 128B Swizzle Each numbered cell is 16 byte
2511
// |-------------------------------|

mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir

Lines changed: 1 addition & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,5 @@
11
// RUN: mlir-opt %s \
2-
// RUN: -convert-nvgpu-to-nvvm \
3-
// RUN: -canonicalize -cse \
4-
// RUN: -gpu-kernel-outlining \
5-
// RUN: -convert-vector-to-scf \
6-
// RUN: -convert-scf-to-cf \
7-
// RUN: -convert-nvvm-to-llvm \
8-
// RUN: -convert-vector-to-llvm \
9-
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
10-
// RUN: -convert-arith-to-llvm \
11-
// RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
12-
// RUN: -convert-func-to-llvm \
13-
// RUN: -canonicalize -cse \
14-
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
15-
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
16-
// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
2+
// RUN: -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
173
// RUN: | mlir-cpu-runner \
184
// RUN: --shared-libs=%mlir_cuda_runtime \
195
// RUN: --shared-libs=%mlir_runner_utils \

mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir

Lines changed: 7 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,10 @@
1-
// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
2-
// RUN: -gpu-kernel-outlining \
3-
// RUN: -convert-nvvm-to-llvm \
4-
// RUN: -convert-scf-to-cf \
5-
// RUN: -convert-vector-to-llvm \
6-
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
7-
// RUN: -convert-arith-to-llvm \
8-
// RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
9-
// RUN: -convert-func-to-llvm \
10-
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
11-
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
12-
// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
13-
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
1+
// RUN: mlir-opt %s \
2+
// RUN: -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
3+
// RUN: | mlir-cpu-runner \
4+
// RUN: --shared-libs=%mlir_cuda_runtime \
5+
// RUN: --shared-libs=%mlir_runner_utils \
6+
// RUN: --entry-point-result=void \
7+
// RUN: | FileCheck %s
148

159
// Basic PTX check to make sure we are generating the right instructions.
1610

mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
2121
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
2222
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
23+
#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
2324
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
2425
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
2526
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
@@ -143,11 +144,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
143144
pm.addNestedPass<gpu::GPUModuleOp>(
144145
createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
145146

146-
// TODO: C++20 designated initializers.
147-
ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
148-
convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
149-
pm.addNestedPass<gpu::GPUModuleOp>(
150-
createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
151147
pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
152148

153149
// Convert vector to LLVM (always needed).
@@ -157,6 +153,9 @@ void buildGpuPassPipeline(OpPassManager &pm,
157153
pm.addNestedPass<gpu::GPUModuleOp>(
158154
createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
159155

156+
// This pass is needed for PTX building
157+
pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
158+
160159
// Sprinkle some cleanups.
161160
pm.addPass(createCanonicalizerPass());
162161
pm.addPass(createCSEPass());
@@ -167,6 +166,20 @@ void buildGpuPassPipeline(OpPassManager &pm,
167166

168167
void buildLowerToNVVMPassPipeline(OpPassManager &pm,
169168
const TestLowerToNVVMOptions &options) {
169+
// Start with a cleanup pass.
170+
pm.addPass(createCanonicalizerPass());
171+
pm.addPass(createCSEPass());
172+
173+
//===----------------------------------------------------------------------===//
174+
// NVGPU lowers device code as well as host code to the driver, so must run
175+
// before outlining.
176+
//===----------------------------------------------------------------------===//
177+
// TODO: C++20 designated initializers.
178+
ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
179+
convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
180+
pm.addNestedPass<func::FuncOp>(
181+
createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
182+
170183
//===----------------------------------------------------------------------===//
171184
// Host-specific stuff.
172185
//===----------------------------------------------------------------------===//

0 commit comments

Comments (0)