[CIR][Lowering] Introduce HoistAllocasPass #887

Merged: 2 commits, Oct 9, 2024
1 change: 1 addition & 0 deletions clang/include/clang/CIR/Dialect/Passes.h
@@ -37,6 +37,7 @@ std::unique_ptr<Pass> createIdiomRecognizerPass(clang::ASTContext *astCtx);
std::unique_ptr<Pass> createLibOptPass();
std::unique_ptr<Pass> createLibOptPass(clang::ASTContext *astCtx);
std::unique_ptr<Pass> createFlattenCFGPass();
std::unique_ptr<Pass> createHoistAllocasPass();
std::unique_ptr<Pass> createGotoSolverPass();

/// Create a pass to lower ABI-independent function definitions/calls.
10 changes: 10 additions & 0 deletions clang/include/clang/CIR/Dialect/Passes.td
@@ -107,6 +107,16 @@ def SCFPrepare : Pass<"cir-mlir-scf-prepare"> {
let dependentDialects = ["cir::CIRDialect"];
}

def HoistAllocas : Pass<"cir-hoist-allocas"> {
let summary = "Hoist allocas to the entry of the function";
let description = [{
This pass hoists all fixed-size (non-dynamic) allocas to the entry block
of the function. This simplifies later code generation.
}];
let constructor = "mlir::createHoistAllocasPass()";
let dependentDialects = ["cir::CIRDialect"];
}

def FlattenCFG : Pass<"cir-flatten-cfg"> {
let summary = "Produces flatten cfg";
let description = [{
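For intuition, here is the kind of motion the new pass performs, sketched in C++ with the CIR-level effect described in comments (illustrative only, not FileCheck-verified output):

int example(int cond) {
  if (cond) {
    int x = 1; // before the pass: the fixed-size cir.alloca backing 'x'
               // is created inside the nested region of this 'if' body
    return x;
  }
  return 0;
}
// After cir-hoist-allocas runs, that alloca sits before the first operation
// of the function's entry block; allocas with a dynamic size operand
// (e.g. for VLAs) are deliberately left where they are.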
1 change: 1 addition & 0 deletions clang/lib/CIR/CodeGen/CIRPasses.cpp
@@ -102,6 +102,7 @@ mlir::LogicalResult runCIRToCIRPasses(
namespace mlir {

void populateCIRPreLoweringPasses(OpPassManager &pm) {
pm.addPass(createHoistAllocasPass()); // hoist fixed-size allocas before the CFG is flattened
pm.addPass(createFlattenCFGPass());
pm.addPass(createGotoSolverPass());
}
1 change: 1 addition & 0 deletions clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(MLIRCIRTransforms
GotoSolver.cpp
SCFPrepare.cpp
CallConvLowering.cpp
HoistAllocas.cpp

DEPENDS
MLIRCIRPassIncGen
65 changes: 65 additions & 0 deletions clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
@@ -0,0 +1,65 @@
//===- HoistAllocas.cpp - Hoist allocas to the function entry block ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/Dialect/Passes.h"

#include "llvm/Support/TimeProfiler.h"

using namespace mlir;
using namespace mlir::cir;

namespace {

struct HoistAllocasPass : public HoistAllocasBase<HoistAllocasPass> {
  HoistAllocasPass() = default;
  void runOnOperation() override;
};

static void process(mlir::cir::FuncOp func) {
  if (func.getRegion().empty())
    return;

  // Collect every fixed-size alloca that is not already in the entry block.
  mlir::Block &entryBlock = func.getRegion().front();
  llvm::SmallVector<mlir::cir::AllocaOp> allocas;
  func.getBody().walk([&](mlir::cir::AllocaOp alloca) {
    if (alloca->getBlock() == &entryBlock)
      return;
    // Don't hoist allocas with a dynamic allocation size (e.g. for VLAs).
    if (alloca.getDynAllocSize())
      return;
    allocas.push_back(alloca);
  });
  if (allocas.empty())
    return;

  // Move the collected allocas before the first operation of the entry block.
  mlir::Operation *insertPoint = &*entryBlock.begin();
  for (auto alloca : allocas)
    alloca->moveBefore(insertPoint);
}

void HoistAllocasPass::runOnOperation() {
  llvm::TimeTraceScope scope("Hoist Allocas");
  getOperation()->walk([&](mlir::cir::FuncOp op) { process(op); });
}

} // namespace

std::unique_ptr<Pass> mlir::createHoistAllocasPass() {
  return std::make_unique<HoistAllocasPass>();
}
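As a usage sketch, the pass can also be exercised on its own through a standard MLIR pass manager. The helper below is an assumption for illustration, not part of this PR; only createHoistAllocasPass itself comes from this change:

#include "clang/CIR/Dialect/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical standalone driver: run only the alloca-hoisting pass.
mlir::LogicalResult runHoistAllocasOnly(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(mlir::createHoistAllocasPass()); // visits every cir::FuncOp
  return pm.run(module);
}

In the real pipeline the pass is added by populateCIRPreLoweringPasses above, ahead of cir-flatten-cfg and cir-goto-solver.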
120 changes: 30 additions & 90 deletions clang/test/CIR/CodeGen/AArch64/neon-ldst.c

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions clang/test/CIR/CodeGen/AArch64/neon-misc.c
@@ -20,6 +20,8 @@ uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s8i x 8>

// LLVM: define dso_local <8 x i8> @test_vset_lane_u8(i8 [[A:%.*]], <8 x i8> [[B:%.*]])
// LLVM: alloca <8 x i8>
// LLVM: alloca i8
// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1
// LLVM: [[B_ADR:%.*]] = alloca <8 x i8>, i64 1, align 8
// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1
@@ -42,6 +44,8 @@ uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s16i x 4>

// LLVM: define dso_local <4 x i16> @test_vset_lane_u16(i16 [[A:%.*]], <4 x i16> [[B:%.*]])
// LLVM: alloca <4 x i16>
// LLVM: alloca i16
// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2
// LLVM: [[B_ADR:%.*]] = alloca <4 x i16>, i64 1, align 8
// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2
@@ -64,6 +68,8 @@ uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s32i x 2>

// LLVM: define dso_local <2 x i32> @test_vset_lane_u32(i32 [[A:%.*]], <2 x i32> [[B:%.*]])
// LLVM: alloca <2 x i32>
// LLVM: alloca i32
// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <2 x i32>, i64 1, align 8
// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4
@@ -87,6 +93,8 @@ int64x1_t test_vset_lane_u64(int64_t a, int64x1_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s64i x 1>

// LLVM: define dso_local <1 x i64> @test_vset_lane_u64(i64 [[A:%.*]], <1 x i64> [[B:%.*]])
// LLVM: alloca <1 x i64>
// LLVM: alloca i64
// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8
// LLVM: [[B_ADR:%.*]] = alloca <1 x i64>, i64 1, align 8
// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8
@@ -109,6 +117,8 @@ float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 2>

// LLVM: define dso_local <2 x float> @test_vset_lane_f32(float [[A:%.*]], <2 x float> [[B:%.*]])
// LLVM: alloca <2 x float>
// LLVM: alloca float
// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <2 x float>, i64 1, align 8
// LLVM: store float [[A]], ptr [[A_ADR]], align 4
@@ -131,6 +141,8 @@ uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s8i x 16>

// LLVM: define dso_local <16 x i8> @test_vsetq_lane_u8(i8 [[A:%.*]], <16 x i8> [[B:%.*]])
// LLVM: alloca <16 x i8>
// LLVM: alloca i8
// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1
// LLVM: [[B_ADR:%.*]] = alloca <16 x i8>, i64 1, align 16
// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1
@@ -153,6 +165,8 @@ uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s16i x 8>

// LLVM: define dso_local <8 x i16> @test_vsetq_lane_u16(i16 [[A:%.*]], <8 x i16> [[B:%.*]])
// LLVM: alloca <8 x i16>
// LLVM: alloca i16
// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2
// LLVM: [[B_ADR:%.*]] = alloca <8 x i16>, i64 1, align 16
// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2
@@ -175,6 +189,8 @@ uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s32i x 4>

// LLVM: define dso_local <4 x i32> @test_vsetq_lane_u32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
// LLVM: alloca <4 x i32>
// LLVM: alloca i32
// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <4 x i32>, i64 1, align 16
// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4
@@ -197,6 +213,8 @@ int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s64i x 2>

// LLVM: define dso_local <2 x i64> @test_vsetq_lane_s64(i64 [[A:%.*]], <2 x i64> [[B:%.*]])
// LLVM: alloca <2 x i64>
// LLVM: alloca i64
// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8
// LLVM: [[B_ADR:%.*]] = alloca <2 x i64>, i64 1, align 16
// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8
@@ -219,6 +237,8 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 4>

// LLVM: define dso_local <4 x float> @test_vsetq_lane_f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// LLVM: alloca <4 x float>
// LLVM: alloca float
// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <4 x float>, i64 1, align 16
// LLVM: store float [[A]], ptr [[A_ADR]], align 4
@@ -241,6 +261,7 @@ uint8_t test_vget_lane_u8(uint8x8_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u8i x 8>

// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]])
// LLVM: alloca <8 x i8>
// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8
// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8
// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8
@@ -258,6 +279,7 @@ uint8_t test_vgetq_lane_u8(uint8x16_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u8i x 16>

// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]])
// LLVM: alloca <16 x i8>
// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16
// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16
// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16
@@ -275,6 +297,7 @@ uint16_t test_vget_lane_u16(uint16x4_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u16i x 4>

// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]])
// LLVM: alloca <4 x i16>
// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 1, align 8
// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8
// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8
@@ -292,6 +315,7 @@ uint16_t test_vgetq_lane_u16(uint16x8_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u16i x 8>

// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]])
// LLVM: alloca <8 x i16>
// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16
// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16
// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16
@@ -309,6 +333,7 @@ uint32_t test_vget_lane_u32(uint32x2_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u32i x 2>

// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]])
// LLVM: alloca <2 x i32>
// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8
// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8
// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8
@@ -326,6 +351,7 @@ uint32_t test_vgetq_lane_u32(uint32x4_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u32i x 4>

// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]])
// LLVM: alloca <4 x i32>
// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16
// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16
// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16
@@ -343,6 +369,7 @@ uint64_t test_vget_lane_u64(uint64x1_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u64i x 1>

// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]])
// LLVM: alloca <1 x i64>
// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8
// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8
// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8
@@ -360,6 +387,7 @@ uint64_t test_vgetq_lane_u64(uint64x2_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!u64i x 2>

// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]])
// LLVM: alloca <2 x i64>
// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16
// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16
// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16
@@ -377,6 +405,7 @@ float32_t test_vget_lane_f32(float32x2_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 2>

// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]])
// LLVM: alloca <2 x float>
// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8
// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8
// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8
@@ -394,6 +423,7 @@ float64_t test_vget_lane_f64(float64x1_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.double x 1>

// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]])
// LLVM: alloca <1 x double>
// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8
// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8
// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8
@@ -411,6 +441,7 @@ float32_t test_vgetq_lane_f32(float32x4_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 4>

// LLVM: define dso_local float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]])
// LLVM: alloca <4 x float>
// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16
// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE]], align 16
// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16
@@ -428,6 +459,7 @@ float64_t test_vgetq_lane_f64(float64x2_t a) {
// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.double x 2>

// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]])
// LLVM: alloca <2 x double>
// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16
// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16
// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16
12 changes: 3 additions & 9 deletions clang/test/CIR/CodeGen/AArch64/neon.c
@@ -6008,9 +6008,7 @@ uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> {{%.*}} to <16 x i8>
// LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// LLVM: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
// LLVM: store <8 x i8> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8
// LLVM: [[RETVAL:%.*]] = load <8 x i8>, ptr [[RET]], align 8
// LLVM: ret <8 x i8> [[RETVAL]]
// LLVM: ret <8 x i8> [[VQRSHRUN_N1]]
}

uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
@@ -6025,9 +6023,7 @@ uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> {{%.*}} to <16 x i8>
// LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// LLVM: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
// LLVM: store <4 x i16> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8
// LLVM: [[RETVAL:%.*]] = load <4 x i16>, ptr [[RET]], align 8
// LLVM: ret <4 x i16> [[RETVAL]]
// LLVM: ret <4 x i16> [[VQRSHRUN_N1]]
}

uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
@@ -6042,9 +6038,7 @@ uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
// LLVM: [[TMP0:%.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
// LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// LLVM: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
// LLVM: store <2 x i32> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8
// LLVM: [[RETVAL:%.*]] = load <2 x i32>, ptr [[RET]], align 8
// LLVM: ret <2 x i32> [[RETVAL]]
// LLVM: ret <2 x i32> [[VQRSHRUN_N1]]
}

// NYI-LABEL: @test_vqrshrun_high_n_s16(