From 6c8239682ac4b76f957242e074dd3c29a2cabe26 Mon Sep 17 00:00:00 2001
From: Guojin He
Date: Mon, 30 Sep 2024 22:18:59 -0400
Subject: [PATCH] Lower neon vst1q_lane and vst1_lane

---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp |  14 +-
 clang/test/CIR/CodeGen/AArch64/neon-ldst.c   | 394 +++++++++++++-----
 2 files changed, 300 insertions(+), 108 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 4f4151efcb3f..40edc0efc16d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1949,7 +1949,9 @@ static mlir::cir::VectorType GetNeonType(CIRGenFunction *CGF,
                                          CGF->getCIRGenModule().FloatTy,
                                          V1Ty ? 1 : (2 << IsQuad));
   case NeonTypeFlags::Float64:
-    llvm_unreachable("NYI");
+    return mlir::cir::VectorType::get(CGF->getBuilder().getContext(),
+                                      CGF->getCIRGenModule().DoubleTy,
+                                      V1Ty ? 1 : (1 << IsQuad));
   }
   llvm_unreachable("Unknown vector element type!");
 }
@@ -3414,8 +3416,14 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     llvm_unreachable("NYI");
   }
   case NEON::BI__builtin_neon_vst1_lane_v:
-  case NEON::BI__builtin_neon_vst1q_lane_v:
-    llvm_unreachable("NYI");
+  case NEON::BI__builtin_neon_vst1q_lane_v: {
+    Ops[1] = builder.createBitcast(Ops[1], Ty);
+    Ops[1] = builder.create<mlir::cir::VecExtractOp>(Ops[1].getLoc(), Ops[1],
+                                                     Ops[2]);
+    (void)builder.createAlignedStore(getLoc(E->getExprLoc()), Ops[1], Ops[0],
+                                     PtrOp0.getAlignment());
+    return Ops[1];
+  }
   case NEON::BI__builtin_neon_vstl1_lane_s64:
   case NEON::BI__builtin_neon_vstl1q_lane_s64: {
     llvm_unreachable("NYI");
diff --git a/clang/test/CIR/CodeGen/AArch64/neon-ldst.c b/clang/test/CIR/CodeGen/AArch64/neon-ldst.c
index d112f3a81808..6b6d46cbf03d 100644
--- a/clang/test/CIR/CodeGen/AArch64/neon-ldst.c
+++ b/clang/test/CIR/CodeGen/AArch64/neon-ldst.c
@@ -1,8 +1,12 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
-// RUN:   -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
+// RUN:   -fclangir -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-cir -o %t.cir %s
 // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
-// RUN:   -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
+// RUN:   -fclangir -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s \
+// RUN:   | opt -S -passes=mem2reg,simplifycfg -o %t.ll
 // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
@@ -22,13 +26,9 @@ int8x8_t test_vld1_lane_s8(int8_t const * ptr, int8x8_t src) {
 // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector
 
 // LLVM: {{.*}}test_vld1_lane_s8(ptr{{.*}}[[PTR:%.*]], <8 x i8>{{.*}}[[SRC:%.*]])
-// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8
 // LLVM: store <8 x i8> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8
-// LLVM: [[SRC_VAL:%.*]] = load <8 x i8>, ptr [[SRC_ADDR]], align 8
-// LLVM: store <8 x i8> [[SRC_VAL]], ptr [[S1:%.*]], align 8
-// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
-// LLVM: [[INTRN_VEC:%.*]] = load <8 x i8>, ptr [[S1]], align 8
-// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR_VAL]], align 1
+// LLVM: [[INTRN_VEC:%.*]] = load <8 x i8>, ptr [[SRC_ADDR]], align 8
+// LLVM: 
[[INTRN_VAL:%.*]] = load i8, ptr [[PTR]], align 1 // LLVM: {{.*}} = insertelement <8 x i8> [[INTRN_VEC]], i8 [[INTRN_VAL]], i32 7 // LLVM: ret <8 x i8> {{.*}} @@ -43,13 +43,9 @@ int8x16_t test_vld1q_lane_s8(int8_t const * ptr, int8x16_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_s8(ptr{{.*}}[[PTR:%.*]], <16 x i8>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <16 x i8> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <16 x i8>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <16 x i8> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <16 x i8>, ptr [[S1]], align 16 -// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR_VAL]], align 1 +// LLVM: [[INTRN_VEC:%.*]] = load <16 x i8>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR]], align 1 // LLVM: {{.*}} = insertelement <16 x i8> [[INTRN_VEC]], i8 [[INTRN_VAL]], i32 15 // LLVM: ret <16 x i8> {{.*}} @@ -64,17 +60,12 @@ uint8x16_t test_vld1q_lane_u8(uint8_t const * ptr, uint8x16_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_u8(ptr{{.*}}[[PTR:%.*]], <16 x i8>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <16 x i8> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <16 x i8>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <16 x i8> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <16 x i8>, ptr [[S1]], align 16 -// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR_VAL]], align 1 +// LLVM: [[INTRN_VEC:%.*]] = load <16 x i8>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR]], align 1 // LLVM: {{.*}} = insertelement <16 x i8> [[INTRN_VEC]], i8 [[INTRN_VAL]], i32 15 // LLVM: ret <16 x i8> {{.*}} - uint8x8_t test_vld1_lane_u8(uint8_t const * ptr, uint8x8_t src) { return vld1_lane_u8(ptr, src, 7); } @@ -86,17 +77,12 @@ uint8x8_t test_vld1_lane_u8(uint8_t const * ptr, uint8x8_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_u8(ptr{{.*}}[[PTR:%.*]], <8 x i8>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <8 x i8> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <8 x i8>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <8 x i8> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <8 x i8>, ptr [[S1]], align 8 -// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR_VAL]], align 1 +// LLVM: [[INTRN_VEC:%.*]] = load <8 x i8>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[INTRN_VAL:%.*]] = load i8, ptr [[PTR]], align 1 // LLVM: {{.*}} = insertelement <8 x i8> [[INTRN_VEC]], i8 [[INTRN_VAL]], i32 7 // LLVM: ret <8 x i8> {{.*}} - int16x4_t test_vld1_lane_s16(int16_t const * ptr, int16x4_t src) { return vld1_lane_s16(ptr, src, 3); } @@ -108,15 +94,11 @@ int16x4_t test_vld1_lane_s16(int16_t const * ptr, int16x4_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_s16(ptr{{.*}}[[PTR:%.*]], <4 x i16>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <4 x i16> [[SRC]], 
ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <4 x i16>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <4 x i16> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <4 x i16>, ptr [[S1]], align 8 +// LLVM: [[INTRN_VEC:%.*]] = load <4 x i16>, ptr [[SRC_ADDR]], align 8 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <4 x i16> [[INTRN_VEC]] to <8 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <8 x i8> [[INTRN_VEC_CAST0]] to <4 x i16> -// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR_VAL]], align 2 +// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR]], align 2 // LLVM: {{.*}} = insertelement <4 x i16> [[INTRN_VEC_CAST1]], i16 [[INTRN_VAL]], i32 3 // LLVM: ret <4 x i16> {{.*}} @@ -131,15 +113,11 @@ uint16x4_t test_vld1_lane_u16(uint16_t const * ptr, uint16x4_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_u16(ptr{{.*}}[[PTR:%.*]], <4 x i16>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <4 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <4 x i16>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <4 x i16> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <4 x i16>, ptr [[S1]], align 8 +// LLVM: [[INTRN_VEC:%.*]] = load <4 x i16>, ptr [[SRC_ADDR]], align 8 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <4 x i16> [[INTRN_VEC]] to <8 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <8 x i8> [[INTRN_VEC_CAST0]] to <4 x i16> -// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR_VAL]], align 2 +// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR]], align 2 // LLVM: {{.*}} = insertelement <4 x i16> [[INTRN_VEC_CAST1]], i16 [[INTRN_VAL]], i32 3 // LLVM: ret <4 x i16> {{.*}} @@ -154,15 +132,11 @@ int16x8_t test_vld1q_lane_s16(int16_t const * ptr, int16x8_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_s16(ptr{{.*}}[[PTR:%.*]], <8 x i16>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <8 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <8 x i16>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <8 x i16> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <8 x i16>, ptr [[S1]], align 16 +// LLVM: [[INTRN_VEC:%.*]] = load <8 x i16>, ptr [[SRC_ADDR]], align 16 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <8 x i16> [[INTRN_VEC]] to <16 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <16 x i8> [[INTRN_VEC_CAST0]] to <8 x i16> -// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR_VAL]], align 2 +// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR]], align 2 // LLVM: {{.*}} = insertelement <8 x i16> [[INTRN_VEC_CAST1]], i16 [[INTRN_VAL]], i32 7 // LLVM: ret <8 x i16> {{.*}} @@ -177,21 +151,14 @@ uint16x8_t test_vld1q_lane_u16(uint16_t const * ptr, uint16x8_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_u16(ptr{{.*}}[[PTR:%.*]], <8 x i16>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <8 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <8 x i16>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <8 x i16> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: 
[[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <8 x i16>, ptr [[S1]], align 16 +// LLVM: [[INTRN_VEC:%.*]] = load <8 x i16>, ptr [[SRC_ADDR]], align 16 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <8 x i16> [[INTRN_VEC]] to <16 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <16 x i8> [[INTRN_VEC_CAST0]] to <8 x i16> -// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR_VAL]], align 2 +// LLVM: [[INTRN_VAL:%.*]] = load i16, ptr [[PTR]], align 2 // LLVM: {{.*}} = insertelement <8 x i16> [[INTRN_VEC_CAST1]], i16 [[INTRN_VAL]], i32 7 // LLVM: ret <8 x i16> {{.*}} - - - int32x2_t test_vld1_lane_s32(int32_t const * ptr, int32x2_t src) { return vld1_lane_s32(ptr, src, 1); } @@ -203,15 +170,11 @@ int32x2_t test_vld1_lane_s32(int32_t const * ptr, int32x2_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_s32(ptr{{.*}}[[PTR:%.*]], <2 x i32>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <2 x i32> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <2 x i32>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <2 x i32> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <2 x i32>, ptr [[S1]], align 8 +// LLVM: [[INTRN_VEC:%.*]] = load <2 x i32>, ptr [[SRC_ADDR]], align 8 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <2 x i32> [[INTRN_VEC]] to <8 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <8 x i8> [[INTRN_VEC_CAST0]] to <2 x i32> -// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR_VAL]], align 4 +// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR]], align 4 // LLVM: {{.*}} = insertelement <2 x i32> [[INTRN_VEC_CAST1]], i32 [[INTRN_VAL]], i32 1 // LLVM: ret <2 x i32> {{.*}} @@ -226,15 +189,11 @@ uint32x2_t test_vld1_lane_u32(uint32_t const * ptr, uint32x2_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_u32(ptr{{.*}}[[PTR:%.*]], <2 x i32>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <2 x i32> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <2 x i32>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <2 x i32> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <2 x i32>, ptr [[S1]], align 8 +// LLVM: [[INTRN_VEC:%.*]] = load <2 x i32>, ptr [[SRC_ADDR]], align 8 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <2 x i32> [[INTRN_VEC]] to <8 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <8 x i8> [[INTRN_VEC_CAST0]] to <2 x i32> -// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR_VAL]], align 4 +// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR]], align 4 // LLVM: {{.*}} = insertelement <2 x i32> [[INTRN_VEC_CAST1]], i32 [[INTRN_VAL]], i32 1 // LLVM: ret <2 x i32> {{.*}} @@ -250,15 +209,11 @@ int32x4_t test_vld1q_lane_s32(int32_t const * ptr, int32x4_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_s32(ptr{{.*}}[[PTR:%.*]], <4 x i32>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <4 x i32> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <4 x i32> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: 
[[INTRN_VEC:%.*]] = load <4 x i32>, ptr [[S1]], align 16 +// LLVM: [[INTRN_VEC:%.*]] = load <4 x i32>, ptr [[SRC_ADDR]], align 16 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <4 x i32> [[INTRN_VEC]] to <16 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <16 x i8> [[INTRN_VEC_CAST0]] to <4 x i32> -// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR_VAL]], align 4 +// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR]], align 4 // LLVM: {{.*}} = insertelement <4 x i32> [[INTRN_VEC_CAST1]], i32 [[INTRN_VAL]], i32 3 // LLVM: ret <4 x i32> {{.*}} @@ -274,15 +229,11 @@ uint32x4_t test_vld1q_lane_u32(uint32_t const * ptr, uint32x4_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_u32(ptr{{.*}}[[PTR:%.*]], <4 x i32>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <4 x i32> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <4 x i32> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <4 x i32>, ptr [[S1]], align 16 +// LLVM: [[INTRN_VEC:%.*]] = load <4 x i32>, ptr [[SRC_ADDR]], align 16 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <4 x i32> [[INTRN_VEC]] to <16 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <16 x i8> [[INTRN_VEC_CAST0]] to <4 x i32> -// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR_VAL]], align 4 +// LLVM: [[INTRN_VAL:%.*]] = load i32, ptr [[PTR]], align 4 // LLVM: {{.*}} = insertelement <4 x i32> [[INTRN_VEC_CAST1]], i32 [[INTRN_VAL]], i32 3 // LLVM: ret <4 x i32> {{.*}} @@ -297,15 +248,11 @@ int64x1_t test_vld1_lane_s64(int64_t const * ptr, int64x1_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_s64(ptr{{.*}}[[PTR:%.*]], <1 x i64>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <1 x i64> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <1 x i64>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <1 x i64> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <1 x i64>, ptr [[S1]], align 8 +// LLVM: [[INTRN_VEC:%.*]] = load <1 x i64>, ptr [[SRC_ADDR]], align 8 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <1 x i64> [[INTRN_VEC]] to <8 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <8 x i8> [[INTRN_VEC_CAST0]] to <1 x i64> -// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR_VAL]], align 8 +// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR]], align 8 // LLVM: {{.*}} = insertelement <1 x i64> [[INTRN_VEC_CAST1]], i64 [[INTRN_VAL]], i32 0 // LLVM: ret <1 x i64> {{.*}} @@ -320,15 +267,11 @@ uint64x1_t test_vld1_lane_u64(uint64_t const * ptr, uint64x1_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1_lane_u64(ptr{{.*}}[[PTR:%.*]], <1 x i64>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <1 x i64> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 -// LLVM: [[SRC_VAL:%.*]] = load <1 x i64>, ptr [[SRC_ADDR]], align 8 -// LLVM: store <1 x i64> [[SRC_VAL]], ptr [[S1:%.*]], align 8 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <1 x i64>, ptr [[S1]], align 8 +// LLVM: [[INTRN_VEC:%.*]] = load <1 x i64>, ptr [[SRC_ADDR]], align 8 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <1 x 
i64> [[INTRN_VEC]] to <8 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <8 x i8> [[INTRN_VEC_CAST0]] to <1 x i64> -// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR_VAL]], align 8 +// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR]], align 8 // LLVM: {{.*}} = insertelement <1 x i64> [[INTRN_VEC_CAST1]], i64 [[INTRN_VAL]], i32 0 // LLVM: ret <1 x i64> {{.*}} @@ -343,15 +286,11 @@ int64x2_t test_vld1q_lane_s64(int64_t const * ptr, int64x2_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_s64(ptr{{.*}}[[PTR:%.*]], <2 x i64>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <2 x i64> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <2 x i64>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <2 x i64> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <2 x i64>, ptr [[S1]], align 16 +// LLVM: [[INTRN_VEC:%.*]] = load <2 x i64>, ptr [[SRC_ADDR]], align 16 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <2 x i64> [[INTRN_VEC]] to <16 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <16 x i8> [[INTRN_VEC_CAST0]] to <2 x i64> -// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR_VAL]], align 8 +// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR]], align 8 // LLVM: {{.*}} = insertelement <2 x i64> [[INTRN_VEC_CAST1]], i64 [[INTRN_VAL]], i32 1 // LLVM: ret <2 x i64> {{.*}} @@ -366,14 +305,259 @@ uint64x2_t test_vld1q_lane_u64(uint64_t const * ptr, uint64x2_t src) { // CIR: {{%.*}} = cir.vec.insert [[VAL]], {{%.*}}[[[IDX]] : !s32i] : !cir.vector // LLVM: {{.*}}test_vld1q_lane_u64(ptr{{.*}}[[PTR:%.*]], <2 x i64>{{.*}}[[SRC:%.*]]) -// LLVM: store ptr [[PTR]], ptr [[PTR_ADDR:%.*]], align 8 // LLVM: store <2 x i64> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 -// LLVM: [[SRC_VAL:%.*]] = load <2 x i64>, ptr [[SRC_ADDR]], align 16 -// LLVM: store <2 x i64> [[SRC_VAL]], ptr [[S1:%.*]], align 16 -// LLVM: [[PTR_VAL:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 -// LLVM: [[INTRN_VEC:%.*]] = load <2 x i64>, ptr [[S1]], align 16 +// LLVM: [[INTRN_VEC:%.*]] = load <2 x i64>, ptr [[SRC_ADDR]], align 16 // LLVM: [[INTRN_VEC_CAST0:%.*]] = bitcast <2 x i64> [[INTRN_VEC]] to <16 x i8> // LLVM: [[INTRN_VEC_CAST1:%.*]] = bitcast <16 x i8> [[INTRN_VEC_CAST0]] to <2 x i64> -// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR_VAL]], align 8 +// LLVM: [[INTRN_VAL:%.*]] = load i64, ptr [[PTR]], align 8 // LLVM: {{.*}} = insertelement <2 x i64> [[INTRN_VEC_CAST1]], i64 [[INTRN_VAL]], i32 1 // LLVM: ret <2 x i64> {{.*}} + +void test_vst1_lane_s8(int8_t * ptr, int8x8_t src) { + vst1_lane_s8(ptr, src, 7); +} + +// CIR-LABEL: test_vst1_lane_s8 +// CIR: [[LANE:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(1) [[VAL]], [[PTR]] : !s8i, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_s8(ptr{{.*}}[[PTR:%.*]], <8 x i8>{{.*}}[[SRC:%.*]]) +// LLVM: store <8 x i8> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <8 x i8>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[RES:%.*]] = extractelement <8 x i8> [[VEC]], i32 7 +// LLVM: store i8 [[RES]], ptr [[PTR]], align 1 + +void test_vst1_lane_s16(int16_t * ptr, int16x4_t src) { + vst1_lane_s16(ptr, src, 3); +} + +// CIR-LABEL: test_vst1_lane_s16 +// CIR: [[LANE:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[VAL:%.*]] = 
cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(2) [[VAL]], [[PTR]] : !s16i, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_s16(ptr{{.*}}[[PTR:%.*]], <4 x i16>{{.*}}[[SRC:%.*]]) +// LLVM: store <4 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <4 x i16>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <4 x i16> [[VEC]] to <8 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <8 x i8> [[VEC_CAST0]] to <4 x i16> +// LLVM: [[RES:%.*]] = extractelement <4 x i16> [[VEC_CAST1]], i32 3 +// LLVM: store i16 [[RES]], ptr [[PTR]], align 2 + +void test_vst1_lane_u16(uint16_t * ptr, uint16x4_t src) { + vst1_lane_u16(ptr, src, 3); +} + +// CIR-LABEL: test_vst1_lane_u16 +// CIR: [[LANE:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(2) [[VAL]], [[PTR]] : !u16i, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_u16(ptr{{.*}}[[PTR:%.*]], <4 x i16>{{.*}}[[SRC:%.*]]) +// LLVM: store <4 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <4 x i16>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <4 x i16> [[VEC]] to <8 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <8 x i8> [[VEC_CAST0]] to <4 x i16> +// LLVM: [[RES:%.*]] = extractelement <4 x i16> [[VEC_CAST1]], i32 3 +// LLVM: store i16 [[RES]], ptr [[PTR]], align 2 + +void test_vst1_lane_s32(int32_t * ptr, int32x2_t src) { + vst1_lane_s32(ptr, src, 1); +} + +// CIR-LABEL: test_vst1_lane_s32 +// CIR: [[LANE:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(4) [[VAL]], [[PTR]] : !s32i, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_s32(ptr{{.*}}[[PTR:%.*]], <2 x i32>{{.*}}[[SRC:%.*]]) +// LLVM: store <2 x i32> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <2 x i32>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <2 x i32> [[VEC]] to <8 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <8 x i8> [[VEC_CAST0]] to <2 x i32> +// LLVM: [[RES:%.*]] = extractelement <2 x i32> [[VEC_CAST1]], i32 1 +// LLVM: store i32 [[RES]], ptr [[PTR]], align 4 + +void test_vst1_lane_f32(float32_t * ptr, float32x2_t src) { + vst1_lane_f32(ptr, src, 1); +} + +// CIR-LABEL: test_vst1_lane_f32 +// CIR: [[LANE:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(4) [[VAL]], [[PTR]] : !cir.float, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_f32(ptr{{.*}}[[PTR:%.*]], <2 x float>{{.*}}[[SRC:%.*]]) +// LLVM: store <2 x float> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <2 x float>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <2 x float> [[VEC]] to <8 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <8 x i8> [[VEC_CAST0]] to <2 x float> +// LLVM: [[RES:%.*]] = extractelement <2 x float> [[VEC_CAST1]], i32 1 +// LLVM: store float [[RES]], ptr [[PTR]], align 4 + +void test_vst1_lane_s64(int64_t * ptr, int64x1_t src) { + vst1_lane_s64(ptr, src, 0); +} + +// CIR-LABEL: test_vst1_lane_s64 +// CIR: [[LANE:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract 
{{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(8) [[VAL]], [[PTR]] : !s64i, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_s64(ptr{{.*}}[[PTR:%.*]], <1 x i64>{{.*}}[[SRC:%.*]]) +// LLVM: store <1 x i64> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <1 x i64>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <1 x i64> [[VEC]] to <8 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <8 x i8> [[VEC_CAST0]] to <1 x i64> +// LLVM: [[RES:%.*]] = extractelement <1 x i64> [[VEC_CAST1]], i32 0 +// LLVM: store i64 [[RES]], ptr [[PTR]], align 8 + +void test_vst1_lane_f64(float64_t * ptr, float64x1_t src) { + vst1_lane_f64(ptr, src, 0); +} + +// CIR-LABEL: test_vst1_lane_f64 +// CIR: [[LANE:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(8) [[VAL]], [[PTR]] : !cir.double, !cir.ptr + +// LLVM: {{.*}}test_vst1_lane_f64(ptr{{.*}}[[PTR:%.*]], <1 x double>{{.*}}[[SRC:%.*]]) +// LLVM: store <1 x double> [[SRC]], ptr [[SRC_ADDR:%.*]], align 8 +// LLVM: [[VEC:%.*]] = load <1 x double>, ptr [[SRC_ADDR]], align 8 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <1 x double> [[VEC]] to <8 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <8 x i8> [[VEC_CAST0]] to <1 x double> +// LLVM: [[RES:%.*]] = extractelement <1 x double> [[VEC_CAST1]], i32 0 +// LLVM: store double [[RES]], ptr [[PTR]], align 8 + +void test_vst1q_lane_s8(int8_t * ptr, int8x16_t src) { + vst1q_lane_s8(ptr, src, 15); +} + +// CIR-LABEL: test_vst1q_lane_s8 +// CIR: [[LANE:%.*]] = cir.const #cir.int<15> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(1) [[VAL]], [[PTR]] : !s8i, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_s8(ptr{{.*}}[[PTR:%.*]], <16 x i8>{{.*}}[[SRC:%.*]]) +// LLVM: store <16 x i8> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <16 x i8>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[RES:%.*]] = extractelement <16 x i8> [[VEC]], i32 15 +// LLVM: store i8 [[RES]], ptr [[PTR]], align 1 + + +void test_vst1q_lane_s16(int16_t * ptr, int16x8_t src) { + vst1q_lane_s16(ptr, src, 7); +} + +// CIR-LABEL: test_vst1q_lane_s16 +// CIR: [[LANE:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(2) [[VAL]], [[PTR]] : !s16i, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_s16(ptr{{.*}}[[PTR:%.*]], <8 x i16>{{.*}}[[SRC:%.*]]) +// LLVM: store <8 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <8 x i16>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <8 x i16> [[VEC]] to <16 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <16 x i8> [[VEC_CAST0]] to <8 x i16> +// LLVM: [[RES:%.*]] = extractelement <8 x i16> [[VEC_CAST1]], i32 7 +// LLVM: store i16 [[RES]], ptr [[PTR]], align 2 + +void test_vst1q_lane_u16(uint16_t * ptr, uint16x8_t src) { + vst1q_lane_u16(ptr, src, 7); +} + +// CIR-LABEL: test_vst1q_lane_u16 +// CIR: [[LANE:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(2) [[VAL]], 
[[PTR]] : !u16i, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_u16(ptr{{.*}}[[PTR:%.*]], <8 x i16>{{.*}}[[SRC:%.*]]) +// LLVM: store <8 x i16> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <8 x i16>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <8 x i16> [[VEC]] to <16 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <16 x i8> [[VEC_CAST0]] to <8 x i16> +// LLVM: [[RES:%.*]] = extractelement <8 x i16> [[VEC_CAST1]], i32 7 +// LLVM: store i16 [[RES]], ptr [[PTR]], align 2 + +void test_vst1q_lane_s32(int32_t * ptr, int32x4_t src) { + vst1q_lane_s32(ptr, src, 3); +} + +// CIR-LABEL: test_vst1q_lane_s32 +// CIR: [[LANE:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(4) [[VAL]], [[PTR]] : !s32i, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_s32(ptr{{.*}}[[PTR:%.*]], <4 x i32>{{.*}}[[SRC:%.*]]) +// LLVM: store <4 x i32> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <4 x i32>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <4 x i32> [[VEC]] to <16 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <16 x i8> [[VEC_CAST0]] to <4 x i32> +// LLVM: [[RES:%.*]] = extractelement <4 x i32> [[VEC_CAST1]], i32 3 +// LLVM: store i32 [[RES]], ptr [[PTR]], align 4 + +void test_vst1q_lane_s64(int64_t * ptr, int64x2_t src) { + vst1q_lane_s64(ptr, src, 1); +} + +// CIR-LABEL: test_vst1q_lane_s64 +// CIR: [[LANE:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(8) [[VAL]], [[PTR]] : !s64i, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_s64(ptr{{.*}}[[PTR:%.*]], <2 x i64>{{.*}}[[SRC:%.*]]) +// LLVM: store <2 x i64> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <2 x i64>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <2 x i64> [[VEC]] to <16 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <16 x i8> [[VEC_CAST0]] to <2 x i64> +// LLVM: [[RES:%.*]] = extractelement <2 x i64> [[VEC_CAST1]], i32 1 +// LLVM: store i64 [[RES]], ptr [[PTR]], align 8 + +void test_vst1q_lane_f32(float32_t * ptr, float32x4_t src) { + vst1q_lane_f32(ptr, src, 3); +} + +// CIR-LABEL: test_vst1q_lane_f32 +// CIR: [[LANE:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(4) [[VAL]], [[PTR]] : !cir.float, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_f32(ptr{{.*}}[[PTR:%.*]], <4 x float>{{.*}}[[SRC:%.*]]) +// LLVM: store <4 x float> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <4 x float>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <4 x float> [[VEC]] to <16 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <16 x i8> [[VEC_CAST0]] to <4 x float> +// LLVM: [[RES:%.*]] = extractelement <4 x float> [[VEC_CAST1]], i32 3 +// LLVM: store float [[RES]], ptr [[PTR]], align 4 + +void test_vst1q_lane_f64(float64_t * ptr, float64x2_t src) { + vst1q_lane_f64(ptr, src, 1); +} + +// CIR-LABEL: test_vst1q_lane_f64 +// CIR: [[LANE:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: [[VAL:%.*]] = cir.vec.extract {{%.*}}[[[LANE]] : !s32i] : !cir.vector +// CIR: [[PTR:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.ptr), !cir.ptr +// CIR: cir.store align(8) 
[[VAL]], [[PTR]] : !cir.double, !cir.ptr + +// LLVM: {{.*}}test_vst1q_lane_f64(ptr{{.*}}[[PTR:%.*]], <2 x double>{{.*}}[[SRC:%.*]]) +// LLVM: store <2 x double> [[SRC]], ptr [[SRC_ADDR:%.*]], align 16 +// LLVM: [[VEC:%.*]] = load <2 x double>, ptr [[SRC_ADDR]], align 16 +// LLVM: [[VEC_CAST0:%.*]] = bitcast <2 x double> [[VEC]] to <16 x i8> +// LLVM: [[VEC_CAST1:%.*]] = bitcast <16 x i8> [[VEC_CAST0]] to <2 x double> +// LLVM: [[RES:%.*]] = extractelement <2 x double> [[VEC_CAST1]], i32 1 +// LLVM: store double [[RES]], ptr [[PTR]], align 8