From 6ae7f330c49152030e037c59dff3a48ff10b0af5 Mon Sep 17 00:00:00 2001
From: Guojin He
Date: Wed, 25 Sep 2024 12:01:20 -0400
Subject: [PATCH] Generate CIR for neon_vget lane and neon_vdup lane intrinsics

---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  66 ++++--
 .../test/CIR/CodeGen/aarch64-neon-vdup-lane.c | 216 +++++++++++++++++
 clang/test/CIR/CodeGen/aarch64-neon-vget.c    | 219 ++++++++++++++++++
 3 files changed, 485 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/aarch64-neon-vdup-lane.c
 create mode 100644 clang/test/CIR/CodeGen/aarch64-neon-vget.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index b979b41b8bc6..5b74321d36f0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2186,42 +2186,76 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vget_lane_i8:
   case NEON::BI__builtin_neon_vdupb_lane_i8:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt8Ty, 8));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vgetq_lane_i8:
   case NEON::BI__builtin_neon_vdupb_laneq_i8:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt8Ty, 16));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vget_lane_i16:
   case NEON::BI__builtin_neon_vduph_lane_i16:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt16Ty, 4));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vgetq_lane_i16:
   case NEON::BI__builtin_neon_vduph_laneq_i16:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt16Ty, 8));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vget_lane_i32:
   case NEON::BI__builtin_neon_vdups_lane_i32:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt32Ty, 2));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
+
+  case NEON::BI__builtin_neon_vget_lane_f32:
   case NEON::BI__builtin_neon_vdups_lane_f32:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), FloatTy, 2));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vgetq_lane_i32:
   case NEON::BI__builtin_neon_vdups_laneq_i32:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt32Ty, 4));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vget_lane_i64:
   case NEON::BI__builtin_neon_vdupd_lane_i64:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt64Ty, 1));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vdupd_lane_f64:
-    llvm_unreachable("NYI");
+  case NEON::BI__builtin_neon_vget_lane_f64:
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), DoubleTy, 1));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vgetq_lane_i64:
   case NEON::BI__builtin_neon_vdupd_laneq_i64:
-    llvm_unreachable("NYI");
-  case NEON::BI__builtin_neon_vget_lane_f32:
-    llvm_unreachable("NYI");
-  case NEON::BI__builtin_neon_vget_lane_f64:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), UInt64Ty, 2));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vgetq_lane_f32:
   case NEON::BI__builtin_neon_vdups_laneq_f32:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), FloatTy, 4));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vgetq_lane_f64:
   case NEON::BI__builtin_neon_vdupd_laneq_f64:
-    llvm_unreachable("NYI");
+    Ops[0] = builder.createBitcast(
+        Ops[0], mlir::cir::VectorType::get(builder.getContext(), DoubleTy, 2));
+    return builder.create<mlir::cir::VecExtractOp>(
+        getLoc(E->getExprLoc()), Ops[0], buildScalarExpr(E->getArg(1)));
   case NEON::BI__builtin_neon_vaddh_f16:
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vsubh_f16:
diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vdup-lane.c b/clang/test/CIR/CodeGen/aarch64-neon-vdup-lane.c
new file mode 100644
index 000000000000..4799e0931c55
--- /dev/null
+++ b/clang/test/CIR/CodeGen/aarch64-neon-vdup-lane.c
@@ -0,0 +1,216 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
+// RUN:   -emit-cir -target-feature +neon %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
+// RUN:   -emit-llvm -target-feature +neon %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// Testing normal situations of vdup lane intrinsics.
+
+// REQUIRES: aarch64-registered-target || arm-registered-target
+#include <arm_neon.h>
+
+int8_t test_vdupb_lane_s8(int8x8_t src) {
+  return vdupb_lane_s8(src, 7);
+}
+
+// CIR-LABEL: test_vdupb_lane_s8
+// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i8 @test_vdupb_lane_s8(<8 x i8> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8
+// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7
+// LLVM: ret i8 {{%.*}}
+
+int8_t test_vdupb_laneq_s8(int8x16_t a) {
+  return vdupb_laneq_s8(a, 15);
+}
+
+// CIR-LABEL: test_vdupb_laneq_s8
+// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i8 @test_vdupb_laneq_s8(<16 x i8> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16
+// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15
+// LLVM: ret i8 {{%.*}}
+
+int16_t test_vduph_lane_s16(int16x4_t src) {
+  return vduph_lane_s16(src, 3);
+}
+
+// CIR-LABEL: test_vduph_lane_s16
+// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+
+// LLVM: define dso_local i16 @test_vduph_lane_s16(<4 x i16> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 1, align 8
+// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3
+// LLVM: ret i16 {{%.*}}
+
+int16_t test_vduph_laneq_s16(int16x8_t a) {
+  return vduph_laneq_s16(a, 7);
+}
+
+// CIR-LABEL: test_vduph_laneq_s16
+// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i16 @test_vduph_laneq_s16(<8 x i16> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16
+// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7
+// LLVM: ret i16 {{%.*}}
+
+int32_t test_vdups_lane_s32(int32x2_t a) {
+  return vdups_lane_s32(a, 1);
+}
+
+// CIR-LABEL: test_vdups_lane_s32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i32 @test_vdups_lane_s32(<2 x i32> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8
+// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1
+// LLVM: ret i32 {{%.*}}
+
+int32_t test_vdups_laneq_s32(int32x4_t a) {
+  return vdups_laneq_s32(a, 3);
+}
+
+// CIR-LABEL: test_vdups_laneq_s32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i32 @test_vdups_laneq_s32(<4 x i32> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3
+// LLVM: ret i32 {{%.*}}
+
+int64_t test_vdupd_lane_s64(int64x1_t src) {
+  return vdupd_lane_s64(src, 0);
+}
+
+// CIR-LABEL: test_vdupd_lane_s64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i64 @test_vdupd_lane_s64(<1 x i64> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8
+// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0
+// LLVM: ret i64 {{%.*}}
+
+int64_t test_vdupd_laneq_s64(int64x2_t a) {
+  return vdupd_laneq_s64(a, 1);
+}
+
+// CIR-LABEL: test_vdupd_laneq_s64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i64 @test_vdupd_laneq_s64(<2 x i64> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16
+// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <2 x i64> [[INTRN_ARG]], i32 1
+// LLVM: ret i64 {{%.*}}
+
+float32_t test_vdups_lane_f32(float32x2_t src) {
+  return vdups_lane_f32(src, 1);
+}
+
+// CIR-LABEL: test_vdups_lane_f32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local float @test_vdups_lane_f32(<2 x float> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8
+// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1
+// LLVM: ret float {{%.*}}
+
+float64_t test_vdupd_lane_f64(float64x1_t src) {
+  return vdupd_lane_f64(src, 0);
+}
+
+// CIR-LABEL: test_vdupd_lane_f64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local double @test_vdupd_lane_f64(<1 x double> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8
+// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0
+// LLVM: ret double {{%.*}}
+
+float32_t test_vdups_laneq_f32(float32x4_t src) {
+  return vdups_laneq_f32(src, 3);
+}
+
+// CIR-LABEL: test_vdups_laneq_f32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local float @test_vdups_laneq_f32(<4 x float> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16
+// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3
+// LLVM: ret float {{%.*}}
+
+float64_t test_vdupd_laneq_f64(float64x2_t src) {
+  return vdupd_laneq_f64(src, 1);
+}
+
+// CIR-LABEL: test_vdupd_laneq_f64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local double @test_vdupd_laneq_f64(<2 x double> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16
+// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1
+// LLVM: ret double {{%.*}}
diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vget.c b/clang/test/CIR/CodeGen/aarch64-neon-vget.c
new file mode 100644
index 000000000000..b16648691d1b
--- /dev/null
+++ b/clang/test/CIR/CodeGen/aarch64-neon-vget.c
@@ -0,0 +1,219 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
+// RUN:   -emit-cir -target-feature +neon %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
+// RUN:   -emit-llvm -target-feature +neon %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// This test file contains test cases analogous to those of
+// clang/test/CodeGen/aarch64-neon-vget.c.
+// The difference is that this file only tests the use of vget intrinsics; we
+// feel it is proper to have a separate test file, aarch64-neon-vset.c, for the
+// vset intrinsics.
+
+// REQUIRES: aarch64-registered-target || arm-registered-target
+#include <arm_neon.h>
+
+uint8_t test_vget_lane_u8(uint8x8_t a) {
+  return vget_lane_u8(a, 7);
+}
+
+// CIR-LABEL: test_vget_lane_u8
+// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8
+// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7
+// LLVM: ret i8 {{%.*}}
+
+uint8_t test_vgetq_lane_u8(uint8x16_t a) {
+  return vgetq_lane_u8(a, 15);
+}
+
+// CIR-LABEL: test_vgetq_lane_u8
+// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16
+// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15
+// LLVM: ret i8 {{%.*}}
+
+uint16_t test_vget_lane_u16(uint16x4_t a) {
+  return vget_lane_u16(a, 3);
+}
+
+// CIR-LABEL: test_vget_lane_u16
+// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 1, align 8
+// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3
+// LLVM: ret i16 {{%.*}}
+
+uint16_t test_vgetq_lane_u16(uint16x8_t a) {
+  return vgetq_lane_u16(a, 7);
+}
+
+// CIR-LABEL: test_vgetq_lane_u16
+// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16
+// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7
+// LLVM: ret i16 {{%.*}}
+
+uint32_t test_vget_lane_u32(uint32x2_t a) {
+  return vget_lane_u32(a, 1);
+}
+
+// CIR-LABEL: test_vget_lane_u32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8
+// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1
+// LLVM: ret i32 {{%.*}}
+
+uint32_t test_vgetq_lane_u32(uint32x4_t a) {
+  return vgetq_lane_u32(a, 3);
+}
+
+// CIR-LABEL: test_vgetq_lane_u32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3
+// LLVM: ret i32 {{%.*}}
+
+uint64_t test_vget_lane_u64(uint64x1_t a) {
+  return vget_lane_u64(a, 0);
+}
+
+// CIR-LABEL: test_vget_lane_u64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8
+// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0
+// LLVM: ret i64 {{%.*}}
+
+uint64_t test_vgetq_lane_u64(uint64x2_t a) {
+  return vgetq_lane_u64(a, 1);
+}
+
+// CIR-LABEL: test_vgetq_lane_u64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16
+// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <2 x i64> [[INTRN_ARG]], i32 1
+// LLVM: ret i64 {{%.*}}
+
+float32_t test_vget_lane_f32(float32x2_t a) {
+  return vget_lane_f32(a, 1);
+}
+
+// CIR-LABEL: test_vget_lane_f32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8
+// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1
+// LLVM: ret float {{%.*}}
+
+float64_t test_vget_lane_f64(float64x1_t a) {
+  return vget_lane_f64(a, 0);
+}
+
+// CIR-LABEL: test_vget_lane_f64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8
+// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8
+// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8
+// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8
+// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8
+// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0
+// LLVM: ret double {{%.*}}
+
+float32_t test_vgetq_lane_f32(float32x4_t a) {
+  return vgetq_lane_f32(a, 3);
+}
+
+// CIR-LABEL: test_vgetq_lane_f32
+// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16
+// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3
+// LLVM: ret float {{%.*}}
+
+float64_t test_vgetq_lane_f64(float64x2_t a) {
+  return vgetq_lane_f64(a, 1);
+}
+
+// CIR-LABEL: test_vgetq_lane_f64
+// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
+// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector
+
+// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]])
+// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16
+// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16
+// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16
+// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16
+// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16
+// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1
+// LLVM: ret double {{%.*}}
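
A possible follow-up, not part of this patch: every handled case in the switch repeats the same bitcast-then-extract sequence and differs only in the element type and lane count, so the lowering could be factored into one small helper in CIRGenBuiltinAArch64.cpp. The sketch below is illustrative only. The helper name buildNeonLaneExtract, its signature, and the CIRGenBuilderTy parameter type are assumptions made here; the calls it wraps (createBitcast, mlir::cir::VectorType::get, the mlir::cir::VecExtractOp builder, getLoc, buildScalarExpr) are the ones used in the diff above and are assumed to behave as shown there.

// Illustrative sketch only -- not part of the patch above. Assumes it lives in
// CIRGenBuiltinAArch64.cpp next to buildAArch64BuiltinExpr, with the same
// CIRGenFunction facilities (builder, getLoc, buildScalarExpr) that the diff
// uses. Name, signature, and CIRGenBuilderTy are hypothetical.
static mlir::Value buildNeonLaneExtract(CIRGenFunction &cgf,
                                        CIRGenBuilderTy &builder,
                                        const CallExpr *e, mlir::Value vec,
                                        mlir::Type eltTy, unsigned numElts) {
  // Reinterpret the operand as a vector of numElts x eltTy, mirroring the
  // per-case bitcasts in the switch above.
  vec = builder.createBitcast(
      vec, mlir::cir::VectorType::get(builder.getContext(), eltTy, numElts));
  // Extract the requested lane; the lane index is argument 1 of the builtin.
  return builder.create<mlir::cir::VecExtractOp>(
      cgf.getLoc(e->getExprLoc()), vec, cgf.buildScalarExpr(e->getArg(1)));
}

Each pair of case labels would then reduce to a single call in the style of
return buildNeonLaneExtract(*this, builder, E, Ops[0], UInt8Ty, 8);
keeping the bitcast-and-extract lowering in one place.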