diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index cad6f67f9795..c0f57c6fe438 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2968,8 +2968,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, } } - mlir::cir::VectorType Ty = GetNeonType(this, Type); - if (!Ty) + mlir::cir::VectorType ty = GetNeonType(this, Type); + if (!ty) return nullptr; // Not all intrinsics handled by the common case work for AArch64 yet, so only @@ -2986,7 +2986,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, buildAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch)) return V; - mlir::cir::VectorType VTy = Ty; + mlir::cir::VectorType vTy = ty; llvm::SmallVector args; switch (BuiltinID) { default: @@ -3066,8 +3066,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, // https://developer.arm.com/architectures/instruction-sets/intrinsics/ return buildNeonCall( BuiltinID, *this, - {builder.getExtendedElementVectorType(Ty, true), SInt32Ty}, Ops, - "llvm.aarch64.neon.sqrshrun", Ty, getLoc(E->getExprLoc())); + {builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops, + "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc())); case NEON::BI__builtin_neon_vqshrn_n_v: llvm_unreachable("NYI"); case NEON::BI__builtin_neon_vrshrn_n_v: @@ -3080,7 +3080,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, case NEON::BI__builtin_neon_vrnda_v: case NEON::BI__builtin_neon_vrndaq_v: { assert(!MissingFeatures::buildConstrainedFPCall()); - return buildNeonCall(BuiltinID, *this, {Ty}, Ops, "llvm.round", Ty, + return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty, getLoc(E->getExprLoc())); } case NEON::BI__builtin_neon_vrndih_f16: { @@ -3407,20 +3407,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, } case NEON::BI__builtin_neon_vld1_v: case NEON::BI__builtin_neon_vld1q_v: { - return builder.createAlignedLoad(Ops[0].getLoc(), VTy, Ops[0], + return builder.createAlignedLoad(Ops[0].getLoc(), vTy, Ops[0], PtrOp0.getAlignment()); } case NEON::BI__builtin_neon_vst1_v: case NEON::BI__builtin_neon_vst1q_v: { - Ops[1] = builder.createBitcast(Ops[1], VTy); + Ops[1] = builder.createBitcast(Ops[1], vTy); (void)builder.createAlignedStore(Ops[1].getLoc(), Ops[1], Ops[0], PtrOp0.getAlignment()); return Ops[1]; } case NEON::BI__builtin_neon_vld1_lane_v: case NEON::BI__builtin_neon_vld1q_lane_v: { - Ops[1] = builder.createBitcast(Ops[1], VTy); - Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), VTy.getEltType(), + Ops[1] = builder.createBitcast(Ops[1], vTy); + Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), vTy.getEltType(), Ops[0], PtrOp0.getAlignment()); return builder.create(getLoc(E->getExprLoc()), Ops[1], Ops[0], Ops[2]); @@ -3435,7 +3435,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, } case NEON::BI__builtin_neon_vst1_lane_v: case NEON::BI__builtin_neon_vst1q_lane_v: { - Ops[1] = builder.createBitcast(Ops[1], Ty); + Ops[1] = builder.createBitcast(Ops[1], ty); Ops[1] = builder.create(Ops[1].getLoc(), Ops[1], Ops[2]); (void)builder.createAlignedStore(getLoc(E->getExprLoc()), Ops[1], Ops[0], @@ -3508,7 +3508,41 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, } case NEON::BI__builtin_neon_vtrn_v: case NEON::BI__builtin_neon_vtrnq_v: { - llvm_unreachable("NYI"); + 
// This set of neon intrinsics implements SIMD matrix transpose. + // The matrix transposed is always 2x2, and these intrinsics transpose + // multiple 2x2 matrices in parallel, which is why the result type is + // always a 2-D matrix whose last dimension is 2. + // For example `vtrn_s16` would have: + // input 1: {0, 1, 2, 3} + // input 2: {4, 5, 6, 7} + // This basically represents two 2x2 matrices: + // [ 0, 1 ] and [ 2, 3] + // [ 4, 5 ] [ 6, 7] + // They should be simultaneously and independently transposed. + // Thus, the result is: + // { {0, 4, 2, 6}, + // {1, 5, 3, 7} } + Ops[1] = builder.createBitcast(Ops[1], ty); + Ops[2] = builder.createBitcast(Ops[2], ty); + // Adding a bitcast here as Ops[0] might be a void pointer. + mlir::Value baseAddr = + builder.createBitcast(Ops[0], builder.getPointerTo(ty)); + mlir::Value sv; + mlir::Location loc = getLoc(E->getExprLoc()); + + for (unsigned vi = 0; vi != 2; ++vi) { + llvm::SmallVector indices; + for (unsigned i = 0, e = vTy.getSize(); i != e; i += 2) { + indices.push_back(i + vi); + indices.push_back(i + e + vi); + } + mlir::cir::ConstantOp idx = builder.getConstInt(loc, SInt32Ty, vi); + mlir::Value addr = builder.create( + loc, baseAddr.getType(), baseAddr, idx); + sv = builder.createVecShuffle(loc, Ops[1], Ops[2], indices); + (void)builder.CIRBaseBuilderTy::createStore(loc, sv, addr); + } + return sv; } case NEON::BI__builtin_neon_vuzp_v: case NEON::BI__builtin_neon_vuzpq_v: { diff --git a/clang/test/CIR/CodeGen/AArch64/neon-misc.c b/clang/test/CIR/CodeGen/AArch64/neon-misc.c index 80afd1bf17c6..42465990244e 100644 --- a/clang/test/CIR/CodeGen/AArch64/neon-misc.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-misc.c @@ -1,14 +1,20 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-cir -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.cir + +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \ +// RUN: -fclangir -disable-O0-optnone \ +// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering -emit-cir -o %t.cir %s // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-llvm -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// This test file contains tests of AArch64 NEON intrinsics -// that are not covered by other tests. +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \ +// RUN: -fclangir -disable-O0-optnone \ +// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering -emit-llvm -o - %s \ +// RUN: | opt -S -passes=mem2reg,simplifycfg -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s // REQUIRES: aarch64-registered-target || arm-registered-target + +// This test file contains test cases for the intrinsics that are not covered +// by the other neon test files.
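For reference, the 2x2 transpose described in the CodeGen comment above maps onto the public `vtrn_s16` intrinsic as follows; this is a minimal sketch using the standard `<arm_neon.h>` API, where `show_vtrn_s16` is just an illustrative helper name and not part of the patch:

#include <arm_neon.h>

/* Transpose the example lanes from the CodeGen comment, {0,1,2,3} and
   {4,5,6,7}, viewed as two 2x2 matrices transposed in parallel. */
static int16x4x2_t show_vtrn_s16(void) {
  const int16_t in1[4] = {0, 1, 2, 3};
  const int16_t in2[4] = {4, 5, 6, 7};
  int16x4x2_t r = vtrn_s16(vld1_s16(in1), vld1_s16(in2));
  /* r.val[0] == {0, 4, 2, 6}, r.val[1] == {1, 5, 3, 7} */
  return r;
}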
+ #include uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { @@ -19,21 +25,9 @@ uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i loc(#loc7) // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <8 x i8> @test_vset_lane_u8(i8 [[A:%.*]], <8 x i8> [[B:%.*]]) -// LLVM: alloca <8 x i8> -// LLVM: alloca i8 -// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1 -// LLVM: [[B_ADR:%.*]] = alloca <8 x i8>, i64 1, align 8 -// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1 -// LLVM: store <8 x i8> [[B]], ptr [[B_ADR]], align 8 -// LLVM: [[TMP_A0:%.*]] = load i8, ptr [[A_ADR]], align 1 -// LLVM: store i8 [[TMP_A0]], ptr [[S0:%.*]], align 1 -// LLVM: [[TMP_B0:%.*]] = load <8 x i8>, ptr [[B_ADR]], align 8 -// LLVM: store <8 x i8> [[TMP_B0]], ptr [[S1:%.*]], align 8 -// LLVM: [[INTRN_ARG0:%.*]] = load i8, ptr [[S0]], align 1 -// LLVM: [[INTRN_ARG1:%.*]] = load <8 x i8>, ptr [[S1]], align 8 -// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i8> [[INTRN_ARG1]], i8 [[INTRN_ARG0]], i32 7 -// LLVM: ret <8 x i8> {{%.*}} +// LLVM: {{.*}}test_vset_lane_u8(i8{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7 +// LLVM: ret <8 x i8> [[INTRN_RES]] uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { return vset_lane_u16(a, b, 3); @@ -43,21 +37,9 @@ uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <4 x i16> @test_vset_lane_u16(i16 [[A:%.*]], <4 x i16> [[B:%.*]]) -// LLVM: alloca <4 x i16> -// LLVM: alloca i16 -// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2 -// LLVM: [[B_ADR:%.*]] = alloca <4 x i16>, i64 1, align 8 -// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2 -// LLVM: store <4 x i16> [[B]], ptr [[B_ADR]], align 8 -// LLVM: [[TMP_A0:%.*]] = load i16, ptr [[A_ADR]], align 2 -// LLVM: store i16 [[TMP_A0]], ptr [[S0:%.*]], align 2 -// LLVM: [[TMP_B0:%.*]] = load <4 x i16>, ptr [[B_ADR]], align 8 -// LLVM: store <4 x i16> [[TMP_B0]], ptr [[S1:%.*]], align 8 -// LLVM: [[INTRN_ARG0:%.*]] = load i16, ptr [[S0]], align 2 -// LLVM: [[INTRN_ARG1:%.*]] = load <4 x i16>, ptr [[S1]], align 8 -// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i16> [[INTRN_ARG1]], i16 [[INTRN_ARG0]], i32 3 -// LLVM: ret <4 x i16> {{%.*}} +// LLVM: {{.*}}test_vset_lane_u16(i16{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3 +// LLVM: ret <4 x i16> [[INTRN_RES]] uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { return vset_lane_u32(a, b, 1); @@ -67,24 +49,11 @@ uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <2 x i32> @test_vset_lane_u32(i32 [[A:%.*]], <2 x i32> [[B:%.*]]) -// LLVM: alloca <2 x i32> -// LLVM: alloca i32 -// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4 -// LLVM: [[B_ADR:%.*]] = alloca <2 x i32>, i64 1, align 8 -// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4 -// LLVM: store <2 x i32> [[B]], ptr [[B_ADR]], align 8 -// LLVM: [[TMP_A0:%.*]] = load i32, ptr [[A_ADR]], align 4 -// LLVM: store i32 [[TMP_A0]], ptr [[S0:%.*]], align 4 -// LLVM: [[TMP_B0:%.*]] = load <2 x i32>, ptr [[B_ADR]], align 8 -// LLVM: store <2 x i32> [[TMP_B0]], ptr 
[[S1:%.*]], align 8 -// LLVM: [[INTRN_ARG0:%.*]] = load i32, ptr [[S0]], align 4 -// LLVM: [[INTRN_ARG1:%.*]] = load <2 x i32>, ptr [[S1]], align 8 -// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i32> [[INTRN_ARG1]], i32 [[INTRN_ARG0]], i32 1 -// LLVM: ret <2 x i32> {{%.*}} - - -int64x1_t test_vset_lane_u64(int64_t a, int64x1_t b) { +// LLVM: {{.*}}test_vset_lane_u32(i32{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1 +// LLVM: ret <2 x i32> [[INTRN_RES]] + +uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { return vset_lane_u64(a, b, 0); } @@ -92,21 +61,9 @@ int64x1_t test_vset_lane_u64(int64_t a, int64x1_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <1 x i64> @test_vset_lane_u64(i64 [[A:%.*]], <1 x i64> [[B:%.*]]) -// LLVM: alloca <1 x i64> -// LLVM: alloca i64 -// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8 -// LLVM: [[B_ADR:%.*]] = alloca <1 x i64>, i64 1, align 8 -// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8 -// LLVM: store <1 x i64> [[B]], ptr [[B_ADR]], align 8 -// LLVM: [[TMP_A0:%.*]] = load i64, ptr [[A_ADR]], align 8 -// LLVM: store i64 [[TMP_A0]], ptr [[S0:%.*]], align 8 -// LLVM: [[TMP_B0:%.*]] = load <1 x i64>, ptr [[B_ADR]], align 8 -// LLVM: store <1 x i64> [[TMP_B0]], ptr [[S1:%.*]], align 8 -// LLVM: [[INTRN_ARG0:%.*]] = load i64, ptr [[S0]], align 8 -// LLVM: [[INTRN_ARG1:%.*]] = load <1 x i64>, ptr [[S1]], align 8 -// LLVM: [[INTRN_RES:%.*]] = insertelement <1 x i64> [[INTRN_ARG1]], i64 [[INTRN_ARG0]], i32 0 -// LLVM: ret <1 x i64> {{%.*}} +// LLVM: {{.*}}test_vset_lane_u64(i64{{.*}}[[A:%.*]], <1 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <1 x i64> [[B]], i64 [[A]], i32 0 +// LLVM: ret <1 x i64> [[INTRN_RES]] float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { return vset_lane_f32(a, b, 1); @@ -116,21 +73,9 @@ float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <2 x float> @test_vset_lane_f32(float [[A:%.*]], <2 x float> [[B:%.*]]) -// LLVM: alloca <2 x float> -// LLVM: alloca float -// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4 -// LLVM: [[B_ADR:%.*]] = alloca <2 x float>, i64 1, align 8 -// LLVM: store float [[A]], ptr [[A_ADR]], align 4 -// LLVM: store <2 x float> [[B]], ptr [[B_ADR]], align 8 -// LLVM: [[TMP_A0:%.*]] = load float, ptr [[A_ADR]], align 4 -// LLVM: store float [[TMP_A0]], ptr [[S0:%.*]], align 4 -// LLVM: [[TMP_B0:%.*]] = load <2 x float>, ptr [[B_ADR]], align 8 -// LLVM: store <2 x float> [[TMP_B0]], ptr [[S1:%.*]], align 8 -// LLVM: [[INTRN_ARG0:%.*]] = load float, ptr [[S0]], align 4 -// LLVM: [[INTRN_ARG1:%.*]] = load <2 x float>, ptr [[S1]], align 8 -// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 1 -// LLVM: ret <2 x float> {{%.*}} +// LLVM: {{.*}}test_vset_lane_f32(float{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x float> [[B]], float [[A]], i32 1 +// LLVM: ret <2 x float> [[INTRN_RES]] uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { return vsetq_lane_u8(a, b, 15); @@ -140,21 +85,9 @@ uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i // CIR: {{%.*}} = cir.vec.insert 
{{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <16 x i8> @test_vsetq_lane_u8(i8 [[A:%.*]], <16 x i8> [[B:%.*]]) -// LLVM: alloca <16 x i8> -// LLVM: alloca i8 -// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1 -// LLVM: [[B_ADR:%.*]] = alloca <16 x i8>, i64 1, align 16 -// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1 -// LLVM: store <16 x i8> [[B]], ptr [[B_ADR]], align 16 -// LLVM: [[TMP_A0:%.*]] = load i8, ptr [[A_ADR]], align 1 -// LLVM: store i8 [[TMP_A0]], ptr [[S0:%.*]], align 1 -// LLVM: [[TMP_B0:%.*]] = load <16 x i8>, ptr [[B_ADR]], align 16 -// LLVM: store <16 x i8> [[TMP_B0]], ptr [[S1:%.*]], align 16 -// LLVM: [[INTRN_ARG0:%.*]] = load i8, ptr [[S0]], align 1 -// LLVM: [[INTRN_ARG1:%.*]] = load <16 x i8>, ptr [[S1]], align 16 -// LLVM: [[INTRN_RES:%.*]] = insertelement <16 x i8> [[INTRN_ARG1]], i8 [[INTRN_ARG0]], i32 15 -// LLVM: ret <16 x i8> {{%.*}} +// LLVM: {{.*}}test_vsetq_lane_u8(i8{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15 +// LLVM: ret <16 x i8> [[INTRN_RES]] uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { return vsetq_lane_u16(a, b, 7); @@ -164,21 +97,9 @@ uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <8 x i16> @test_vsetq_lane_u16(i16 [[A:%.*]], <8 x i16> [[B:%.*]]) -// LLVM: alloca <8 x i16> -// LLVM: alloca i16 -// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2 -// LLVM: [[B_ADR:%.*]] = alloca <8 x i16>, i64 1, align 16 -// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2 -// LLVM: store <8 x i16> [[B]], ptr [[B_ADR]], align 16 -// LLVM: [[TMP_A0:%.*]] = load i16, ptr [[A_ADR]], align 2 -// LLVM: store i16 [[TMP_A0]], ptr [[S0:%.*]], align 2 -// LLVM: [[TMP_B0:%.*]] = load <8 x i16>, ptr [[B_ADR]], align 16 -// LLVM: store <8 x i16> [[TMP_B0]], ptr [[S1:%.*]], align 16 -// LLVM: [[INTRN_ARG0:%.*]] = load i16, ptr [[S0]], align 2 -// LLVM: [[INTRN_ARG1:%.*]] = load <8 x i16>, ptr [[S1]], align 16 -// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i16> [[INTRN_ARG1]], i16 [[INTRN_ARG0]], i32 7 -// LLVM: ret <8 x i16> {{%.*}} +// LLVM: {{.*}}test_vsetq_lane_u16(i16{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7 +// LLVM: ret <8 x i16> [[INTRN_RES]] uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { return vsetq_lane_u32(a, b, 3); @@ -188,21 +109,9 @@ uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <4 x i32> @test_vsetq_lane_u32(i32 [[A:%.*]], <4 x i32> [[B:%.*]]) -// LLVM: alloca <4 x i32> -// LLVM: alloca i32 -// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4 -// LLVM: [[B_ADR:%.*]] = alloca <4 x i32>, i64 1, align 16 -// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4 -// LLVM: store <4 x i32> [[B]], ptr [[B_ADR]], align 16 -// LLVM: [[TMP_A0:%.*]] = load i32, ptr [[A_ADR]], align 4 -// LLVM: store i32 [[TMP_A0]], ptr [[S0:%.*]], align 4 -// LLVM: [[TMP_B0:%.*]] = load <4 x i32>, ptr [[B_ADR]], align 16 -// LLVM: store <4 x i32> [[TMP_B0]], ptr [[S1:%.*]], align 16 -// LLVM: [[INTRN_ARG0:%.*]] = load i32, ptr [[S0]], align 4 -// LLVM: [[INTRN_ARG1:%.*]] = load <4 x i32>, ptr [[S1]], align 16 -// LLVM: [[INTRN_RES:%.*]] = insertelement <4 
x i32> [[INTRN_ARG1]], i32 [[INTRN_ARG0]], i32 3 -// LLVM: ret <4 x i32> {{%.*}} +// LLVM: {{.*}}test_vsetq_lane_u32(i32{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3 +// LLVM: ret <4 x i32> [[INTRN_RES]] int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { return vsetq_lane_s64(a, b, 1); @@ -212,21 +121,9 @@ int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <2 x i64> @test_vsetq_lane_s64(i64 [[A:%.*]], <2 x i64> [[B:%.*]]) -// LLVM: alloca <2 x i64> -// LLVM: alloca i64 -// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8 -// LLVM: [[B_ADR:%.*]] = alloca <2 x i64>, i64 1, align 16 -// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8 -// LLVM: store <2 x i64> [[B]], ptr [[B_ADR]], align 16 -// LLVM: [[TMP_A0:%.*]] = load i64, ptr [[A_ADR]], align 8 -// LLVM: store i64 [[TMP_A0]], ptr [[S0:%.*]], align 8 -// LLVM: [[TMP_B0:%.*]] = load <2 x i64>, ptr [[B_ADR]], align 16 -// LLVM: store <2 x i64> [[TMP_B0]], ptr [[S1:%.*]], align 16 -// LLVM: [[INTRN_ARG0:%.*]] = load i64, ptr [[S0]], align 8 -// LLVM: [[INTRN_ARG1:%.*]] = load <2 x i64>, ptr [[S1]], align 16 -// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i64> [[INTRN_ARG1]], i64 [[INTRN_ARG0]], i32 1 -// LLVM: ret <2 x i64> {{%.*}} +// LLVM: {{.*}}test_vsetq_lane_s64(i64{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1 +// LLVM: ret <2 x i64> [[INTRN_RES]] float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { return vsetq_lane_f32(a, b, 3); @@ -236,21 +133,9 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { // CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i // CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local <4 x float> @test_vsetq_lane_f32(float [[A:%.*]], <4 x float> [[B:%.*]]) -// LLVM: alloca <4 x float> -// LLVM: alloca float -// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4 -// LLVM: [[B_ADR:%.*]] = alloca <4 x float>, i64 1, align 16 -// LLVM: store float [[A]], ptr [[A_ADR]], align 4 -// LLVM: store <4 x float> [[B]], ptr [[B_ADR]], align 16 -// LLVM: [[TMP_A0:%.*]] = load float, ptr [[A_ADR]], align 4 -// LLVM: store float [[TMP_A0]], ptr [[S0:%.*]], align 4 -// LLVM: [[TMP_B0:%.*]] = load <4 x float>, ptr [[B_ADR]], align 16 -// LLVM: store <4 x float> [[TMP_B0]], ptr [[S1:%.*]], align 16 -// LLVM: [[INTRN_ARG0:%.*]] = load float, ptr [[S0]], align 4 -// LLVM: [[INTRN_ARG1:%.*]] = load <4 x float>, ptr [[S1]], align 16 -// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 3 -// LLVM: ret <4 x float> {{%.*}} +// LLVM: {{.*}}test_vsetq_lane_f32(float{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x float> [[B]], float [[A]], i32 3 +// LLVM: ret <4 x float> [[INTRN_RES]] uint8_t test_vget_lane_u8(uint8x8_t a) { return vget_lane_u8(a, 7); @@ -260,15 +145,9 @@ uint8_t test_vget_lane_u8(uint8x8_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]]) -// LLVM: alloca <8 x i8> -// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8 -// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] 
= load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7 -// LLVM: ret i8 {{%.*}} +// LLVM: {{.*}}test_vget_lane_u8(<8 x i8>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <8 x i8> [[ARG]], i32 7 +// LLVM: ret i8 [[RES]] uint8_t test_vgetq_lane_u8(uint8x16_t a) { return vgetq_lane_u8(a, 15); @@ -278,15 +157,9 @@ uint8_t test_vgetq_lane_u8(uint8x16_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]]) -// LLVM: alloca <16 x i8> -// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16 -// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15 -// LLVM: ret i8 {{%.*}} +// LLVM: {{.*}}test_vgetq_lane_u8(<16 x i8>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <16 x i8> [[ARG]], i32 15 +// LLVM: ret i8 [[RES]] uint16_t test_vget_lane_u16(uint16x4_t a) { return vget_lane_u16(a, 3); @@ -296,15 +169,9 @@ uint16_t test_vget_lane_u16(uint16x4_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]]) -// LLVM: alloca <4 x i16> -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 1, align 8 -// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3 -// LLVM: ret i16 {{%.*}} +// LLVM: {{.*}}test_vget_lane_u16(<4 x i16>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <4 x i16> [[ARG]], i32 3 +// LLVM: ret i16 [[RES]] uint16_t test_vgetq_lane_u16(uint16x8_t a) { return vgetq_lane_u16(a, 7); @@ -314,15 +181,9 @@ uint16_t test_vgetq_lane_u16(uint16x8_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]]) -// LLVM: alloca <8 x i16> -// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16 -// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7 -// LLVM: ret i16 {{%.*}} +// LLVM: {{.*}}test_vgetq_lane_u16(<8 x i16>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <8 x i16> [[ARG]], i32 7 +// LLVM: ret i16 [[RES]] uint32_t test_vget_lane_u32(uint32x2_t a) { return vget_lane_u32(a, 1); @@ -332,15 +193,9 @@ uint32_t test_vget_lane_u32(uint32x2_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]]) -// LLVM: alloca 
<2 x i32> -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8 -// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1 -// LLVM: ret i32 {{%.*}} +// LLVM: {{.*}}test_vget_lane_u32(<2 x i32>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <2 x i32> [[ARG]], i32 1 +// LLVM: ret i32 [[RES]] uint32_t test_vgetq_lane_u32(uint32x4_t a) { return vgetq_lane_u32(a, 3); @@ -350,15 +205,9 @@ uint32_t test_vgetq_lane_u32(uint32x4_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]]) -// LLVM: alloca <4 x i32> -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16 -// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3 -// LLVM: ret i32 {{%.*}} +// LLVM: {{.*}}test_vgetq_lane_u32(<4 x i32>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <4 x i32> [[ARG]], i32 3 +// LLVM: ret i32 [[RES]] uint64_t test_vget_lane_u64(uint64x1_t a) { return vget_lane_u64(a, 0); @@ -368,15 +217,9 @@ uint64_t test_vget_lane_u64(uint64x1_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]]) -// LLVM: alloca <1 x i64> -// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8 -// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0 -// LLVM: ret i64 {{%.*}} +// LLVM: {{.*}}test_vget_lane_u64(<1 x i64>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <1 x i64> [[ARG]], i32 0 +// LLVM: ret i64 [[RES]] uint64_t test_vgetq_lane_u64(uint64x2_t a) { return vgetq_lane_u64(a, 1); @@ -386,15 +229,9 @@ uint64_t test_vgetq_lane_u64(uint64x2_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]]) -// LLVM: alloca <2 x i64> -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16 -// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <2 x i64> [[INTRN_ARG]], i32 1 -// LLVM: ret i64 {{%.*}} +// LLVM: {{.*}}test_vgetq_lane_u64(<2 x i64>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <2 x i64> [[ARG]], i32 1 +// LLVM: ret i64 [[RES]] float32_t test_vget_lane_f32(float32x2_t a) { return vget_lane_f32(a, 1); @@ -404,15 +241,9 @@ float32_t test_vget_lane_f32(float32x2_t a) { // CIR: [[IDX:%.*]] = cir.const 
#cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]]) -// LLVM: alloca <2 x float> -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8 -// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1 -// LLVM: ret float {{%.*}} +// LLVM: {{.*}}test_vget_lane_f32(<2 x float>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <2 x float> [[ARG]], i32 1 +// LLVM: ret float [[RES]] float64_t test_vget_lane_f64(float64x1_t a) { return vget_lane_f64(a, 0); @@ -422,15 +253,9 @@ float64_t test_vget_lane_f64(float64x1_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]]) -// LLVM: alloca <1 x double> -// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8 -// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0 -// LLVM: ret double {{%.*}} +// LLVM: {{.*}}test_vget_lane_f64(<1 x double>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <1 x double> [[ARG]], i32 0 +// LLVM: ret double [[RES]] float32_t test_vgetq_lane_f32(float32x4_t a) { return vgetq_lane_f32(a, 3); @@ -440,15 +265,9 @@ float32_t test_vgetq_lane_f32(float32x4_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]]) -// LLVM: alloca <4 x float> -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16 -// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3 -// LLVM: ret float {{%.*}} +// LLVM: {{.*}}test_vgetq_lane_f32(<4 x float>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <4 x float> [[ARG]], i32 3 +// LLVM: ret float [[RES]] float64_t test_vgetq_lane_f64(float64x2_t a) { return vgetq_lane_f64(a, 1); @@ -458,12 +277,177 @@ float64_t test_vgetq_lane_f64(float64x2_t a) { // CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i // CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector -// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]]) -// LLVM: alloca <2 x double> -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16 -// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1 -// LLVM: ret double {{%.*}} +// LLVM: 
{{.*}}test_vgetq_lane_f64(<2 x double>{{.*}}[[ARG:%.*]]) +// LLVM: [[RES:%.*]] = extractelement <2 x double> [[ARG]], i32 1 +// LLVM: ret double [[RES]] + +uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { + return vtrn_u8(a, b); + + // CIR-LABEL: vtrn_u8 + // CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr> + // CIR: [[ZERO:%.*]] = cir.const #cir.int<0> : !s32i + // CIR: [[ADDR:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ZERO]] : !s32i), !cir.ptr> + // CIR: [[RES:%.*]] = cir.vec.shuffle([[INP1:%.*]], [[INP2:%.*]] : !cir.vector) + // CIR-SAME: [#cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<2> : !s32i, #cir.int<10> : !s32i, + // CIR-SAME: #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<6> : !s32i, + // CIR-SAME: #cir.int<14> : !s32i] : !cir.vector + // CIR: cir.store [[RES]], [[ADDR]] : !cir.vector, !cir.ptr> + // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i + // CIR: [[ADDR1:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ONE]] : !s32i), !cir.ptr> + // CIR: [[RES1:%.*]] = cir.vec.shuffle([[INP1]], [[INP2]] : !cir.vector) + // CIR-SAME: [#cir.int<1> : !s32i, #cir.int<9> : !s32i, #cir.int<3> : !s32i, #cir.int<11> : !s32i, + // CIR-SAME: #cir.int<5> : !s32i, #cir.int<13> : !s32i, #cir.int<7> : !s32i, #cir.int<15> : !s32i] : + // CIR-SAME: !cir.vector + // CIR: cir.store [[RES1]], [[ADDR1]] : !cir.vector, !cir.ptr> + + // LLVM: {{.*}}test_vtrn_u8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) + // LLVM: [[VTRN:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], + // LLVM-SAME: <8 x i32> + // LLVM: store <8 x i8> [[VTRN]], ptr [[RES:%.*]], align 8 + // LLVM: [[RES1:%.*]] = getelementptr {{.*}}<8 x i8>, ptr [[RES]], i64 1 + // LLVM: [[VTRN1:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> + // LLVM: store <8 x i8> [[VTRN1]], ptr [[RES1]], align 8 + // LLVM: ret %struct.uint8x8x2_t {{.*}} +} + +uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { + return vtrn_u16(a, b); + + // CIR-LABEL: vtrn_u16 + // CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr> + // CIR: [[ZERO:%.*]] = cir.const #cir.int<0> : !s32i + // CIR: [[ADDR:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ZERO]] : !s32i), !cir.ptr> + // CIR: [[RES:%.*]] = cir.vec.shuffle([[INP1:%.*]], [[INP2:%.*]] : !cir.vector) + // CIR-SAME: [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : !cir.vector + // CIR: cir.store [[RES]], [[ADDR]] : !cir.vector, !cir.ptr> + // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i + // CIR: [[ADDR1:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ONE]] : !s32i), !cir.ptr> + // CIR: [[RES1:%.*]] = cir.vec.shuffle([[INP1]], [[INP2]] : !cir.vector) + // CIR-SAME: [#cir.int<1> : !s32i, #cir.int<5> : !s32i, #cir.int<3> : !s32i, #cir.int<7> : !s32i] : + // CIR-SAME: !cir.vector + // CIR: cir.store [[RES1]], [[ADDR1]] : !cir.vector, !cir.ptr> + + // LLVM: {{.*}}test_vtrn_u16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) + // LLVM: [[VTRN:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], + // LLVM-SAME: <4 x i32> + // LLVM: store <4 x i16> [[VTRN]], ptr [[RES:%.*]], align 8 + // LLVM: [[RES1:%.*]] = getelementptr {{.*}}<4 x i16>, ptr [[RES]], i64 1 + // LLVM: [[VTRN1:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], + // LLVM-SAME: <4 x i32> + // LLVM: store <4 x i16> [[VTRN1]], ptr [[RES1]], align 8 + // LLVM: ret %struct.uint16x4x2_t {{.*}} +} + +int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { + return vtrn_s32(a, b); + + // CIR-LABEL: vtrn_s32 + // CIR: [[PTR:%.*]] = cir.cast(bitcast, 
{{%.*}} : !cir.ptr), !cir.ptr> + // CIR: [[ZERO:%.*]] = cir.const #cir.int<0> : !s32i + // CIR: [[ADDR:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ZERO]] : !s32i), !cir.ptr> + // CIR: [[RES:%.*]] = cir.vec.shuffle([[INP1:%.*]], [[INP2:%.*]] : !cir.vector) + // CIR-SAME: [#cir.int<0> : !s32i, #cir.int<2> : !s32i] : !cir.vector + // CIR: cir.store [[RES]], [[ADDR]] : !cir.vector, !cir.ptr> + // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i + // CIR: [[ADDR1:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ONE]] : !s32i), !cir.ptr> + // CIR: [[RES1:%.*]] = cir.vec.shuffle([[INP1]], [[INP2]] : !cir.vector) + // CIR-SAME: [#cir.int<1> : !s32i, #cir.int<3> : !s32i] : + // CIR-SAME: !cir.vector + // CIR: cir.store [[RES1]], [[ADDR1]] : !cir.vector, !cir.ptr> + + // LLVM: {{.*}}test_vtrn_s32(<2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) + // LLVM: [[VTRN:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], + // LLVM-SAME: <2 x i32> + // LLVM: store <2 x i32> [[VTRN]], ptr [[RES:%.*]], align 8 + // LLVM: [[RES1:%.*]] = getelementptr {{.*}}<2 x i32>, ptr [[RES]], i64 1 + // LLVM: [[VTRN1:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], + // LLVM-SAME: <2 x i32> + // LLVM: store <2 x i32> [[VTRN1]], ptr [[RES1]], align 8 + // LLVM: ret %struct.int32x2x2_t {{.*}} +} + +uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { + return vtrnq_u8(a, b); + + // CIR-LABEL: vtrnq_u8 + // CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr> + // CIR: [[ZERO:%.*]] = cir.const #cir.int<0> : !s32i + // CIR: [[ADDR:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ZERO]] : !s32i), !cir.ptr> + // CIR: [[RES:%.*]] = cir.vec.shuffle([[INP1:%.*]], [[INP2:%.*]] : !cir.vector) + // CIR-SAME: [#cir.int<0> : !s32i, #cir.int<16> : !s32i, #cir.int<2> : !s32i, #cir.int<18> : !s32i, + // CIR-SAME: #cir.int<4> : !s32i, #cir.int<20> : !s32i, #cir.int<6> : !s32i, #cir.int<22> : !s32i, + // CIR-SAME: #cir.int<8> : !s32i, #cir.int<24> : !s32i, #cir.int<10> : !s32i, #cir.int<26> : !s32i, + // CIR-SAME: #cir.int<12> : !s32i, #cir.int<28> : !s32i, #cir.int<14> : !s32i, #cir.int<30> : !s32i] : !cir.vector + // CIR: cir.store [[RES]], [[ADDR]] : !cir.vector, !cir.ptr> + // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i + // CIR: [[ADDR1:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ONE]] : !s32i), !cir.ptr> + // CIR: [[RES1:%.*]] = cir.vec.shuffle([[INP1]], [[INP2]] : !cir.vector) + // CIR-SAME: [#cir.int<1> : !s32i, #cir.int<17> : !s32i, #cir.int<3> : !s32i, #cir.int<19> : !s32i, + // CIR-SAME: #cir.int<5> : !s32i, #cir.int<21> : !s32i, #cir.int<7> : !s32i, #cir.int<23> : !s32i, + // CIR-SAME: #cir.int<9> : !s32i, #cir.int<25> : !s32i, #cir.int<11> : !s32i, #cir.int<27> : !s32i, + // CIR-SAME: #cir.int<13> : !s32i, #cir.int<29> : !s32i, #cir.int<15> : !s32i, #cir.int<31> : !s32i] : !cir.vector + // CIR: cir.store [[RES1]], [[ADDR1]] : !cir.vector, !cir.ptr> + + // LLVM: {{.*}}test_vtrnq_u8(<16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) + // LLVM: [[VTRN:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], + // LLVM-SAME: <16 x i32> + // LLVM: store <16 x i8> [[VTRN]], ptr [[RES:%.*]], align 16 + // LLVM: [[RES1:%.*]] = getelementptr {{.*}}<16 x i8>, ptr [[RES]], i64 1 + // LLVM: [[VTRN1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], + // LLVM-SAME: <16 x i32> + // LLVM: store <16 x i8> [[VTRN1]], ptr [[RES1]], align 16 + // LLVM: ret %struct.uint8x16x2_t {{.*}} +} + +int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { + return vtrnq_s16(a, b); + + // CIR-LABEL: vtrnq_s16 + // CIR: 
[[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr> + // CIR: [[ZERO:%.*]] = cir.const #cir.int<0> : !s32i + // CIR: [[ADDR:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ZERO]] : !s32i), !cir.ptr> + // CIR: [[RES:%.*]] = cir.vec.shuffle([[INP1:%.*]], [[INP2:%.*]] : !cir.vector) + // CIR-SAME: [#cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<2> : !s32i, #cir.int<10> : !s32i, + // CIR-SAME: #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<6> : !s32i, + // CIR-SAME: #cir.int<14> : !s32i] : !cir.vector + // CIR: cir.store [[RES]], [[ADDR]] : !cir.vector, !cir.ptr> + // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i + // CIR: [[ADDR1:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ONE]] : !s32i), !cir.ptr> + // CIR: [[RES1:%.*]] = cir.vec.shuffle([[INP1]], [[INP2]] : !cir.vector) + // CIR-SAME: [#cir.int<1> : !s32i, #cir.int<9> : !s32i, #cir.int<3> : !s32i, #cir.int<11> : !s32i, + // CIR-SAME: #cir.int<5> : !s32i, #cir.int<13> : !s32i, #cir.int<7> : !s32i, #cir.int<15> : !s32i] : + // CIR-SAME: !cir.vector + // CIR: cir.store [[RES1]], [[ADDR1]] : !cir.vector, !cir.ptr> + + // LLVM: {{.*}}test_vtrnq_s16(<8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) + // LLVM: [[VTRN:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], + // LLVM-SAME: <8 x i32> + // LLVM: store <8 x i16> [[VTRN]], ptr [[RES:%.*]], align 16 + // LLVM: [[RES1:%.*]] = getelementptr {{.*}}<8 x i16>, ptr [[RES]], i64 1 + // LLVM: [[VTRN1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> + // LLVM: store <8 x i16> [[VTRN1]], ptr [[RES1]], align 16 + // LLVM: ret %struct.int16x8x2_t {{.*}} +} + +uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { + return vtrnq_u32(a, b); + + // CIR-LABEL: vtrnq_u32 + // CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr> + // CIR: [[ZERO:%.*]] = cir.const #cir.int<0> : !s32i + // CIR: [[ADDR:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ZERO]] : !s32i), !cir.ptr> + // CIR: [[RES:%.*]] = cir.vec.shuffle([[INP1:%.*]], [[INP2:%.*]] : !cir.vector) + // CIR-SAME: [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : + // CIR-SAME: !cir.vector + // CIR: cir.store [[RES]], [[ADDR]] : !cir.vector, !cir.ptr> + // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i + // CIR: [[ADDR1:%.*]] = cir.ptr_stride([[PTR]] : !cir.ptr>, [[ONE]] : !s32i), !cir.ptr> + // CIR: [[RES1:%.*]] = cir.vec.shuffle([[INP1]], [[INP2]] : !cir.vector) + // CIR-SAME: [#cir.int<1> : !s32i, #cir.int<5> : !s32i, #cir.int<3> : !s32i, #cir.int<7> : !s32i] : + // CIR-SAME: !cir.vector + // CIR: cir.store [[RES1]], [[ADDR1]] : !cir.vector, !cir.ptr> + // LLVM: ret %struct.uint32x4x2_t {{.*}} +}
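The shuffle masks checked in these tests fall out of the index loop added in CIRGenBuiltinAArch64.cpp. Below is a minimal standalone sketch of that computation, assuming `e` lanes per input operand; `vtrn_shuffle_indices` is an illustrative name, not a helper from the patch:

#include <stdio.h>

/* For result half vi (0 or 1), pair each even/odd lane of the first input
   with the matching lane of the second input, mirroring the lowering loop. */
static void vtrn_shuffle_indices(unsigned e, unsigned vi) {
  for (unsigned i = 0; i != e; i += 2)
    printf("%u %u ", i + vi, i + e + vi);
  printf("\n");
}

int main(void) {
  vtrn_shuffle_indices(8, 0); /* 0 8 2 10 4 12 6 14 -> first vtrn_u8 mask  */
  vtrn_shuffle_indices(8, 1); /* 1 9 3 11 5 13 7 15 -> second vtrn_u8 mask */
  return 0;
}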