From 4bb39fc9013d98e2b210504c53fa2ea76ec4a1b3 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 16:20:21 -0400 Subject: [PATCH 1/5] Organize AArch64 neon tests --- .../neon-arith.c} | 3 + .../neon-ldst.c} | 3 + .../neon-misc.c} | 213 ++++++++++++++++- .../neon-varith.c} | 62 ++++- .../CIR/CodeGen/aarch64-neon-simd-shift.c | 69 ------ clang/test/CIR/CodeGen/aarch64-neon-vget.c | 219 ------------------ 6 files changed, 273 insertions(+), 296 deletions(-) rename clang/test/CIR/CodeGen/{arm-neon-directed-rounding.c => AArch64/neon-arith.c} (98%) rename clang/test/CIR/CodeGen/{aarch64-neon-ldst.c => AArch64/neon-ldst.c} (99%) rename clang/test/CIR/CodeGen/{aarch64-neon-vset.c => AArch64/neon-misc.c} (55%) rename clang/test/CIR/CodeGen/{aarch64-neon-vqadd.c => AArch64/neon-varith.c} (73%) delete mode 100644 clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c delete mode 100644 clang/test/CIR/CodeGen/aarch64-neon-vget.c diff --git a/clang/test/CIR/CodeGen/arm-neon-directed-rounding.c b/clang/test/CIR/CodeGen/AArch64/neon-arith.c similarity index 98% rename from clang/test/CIR/CodeGen/arm-neon-directed-rounding.c rename to clang/test/CIR/CodeGen/AArch64/neon-arith.c index 92b4a9298eac..7d8636758652 100644 --- a/clang/test/CIR/CodeGen/arm-neon-directed-rounding.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-arith.c @@ -8,6 +8,9 @@ // REQUIRES: aarch64-registered-target || arm-registered-target #include +// This test file contains aarch64 NEON arithmetic intrinsics that are not +// vector type related. 
+ float32_t test_vrndns_f32(float32_t a) { return vrndns_f32(a); } diff --git a/clang/test/CIR/CodeGen/aarch64-neon-ldst.c b/clang/test/CIR/CodeGen/AArch64/neon-ldst.c similarity index 99% rename from clang/test/CIR/CodeGen/aarch64-neon-ldst.c rename to clang/test/CIR/CodeGen/AArch64/neon-ldst.c index 9b6ed9ee479c..d112f3a81808 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-ldst.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-ldst.c @@ -6,6 +6,9 @@ // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s // REQUIRES: aarch64-registered-target || arm-registered-target + +// This test file contains tests for the AArch64 NEON load/store intrinsics. + #include int8x8_t test_vld1_lane_s8(int8_t const * ptr, int8x8_t src) { diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vset.c b/clang/test/CIR/CodeGen/AArch64/neon-misc.c similarity index 55% rename from clang/test/CIR/CodeGen/aarch64-neon-vset.c rename to clang/test/CIR/CodeGen/AArch64/neon-misc.c index 5da779ff69eb..6b0c3a866da5 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-vset.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-misc.c @@ -5,13 +5,8 @@ // RUN: -emit-llvm -target-feature +neon %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// This test file is similar to but not the same as -// clang/test/CodeGen/aarch64-neon-vget.c -// The difference is that this file only tests uses vset intrinsics, as we feel -// it would be proper to have a separate test file testing vget intrinsics -// with the file name aarch64-neon-vget.c -// Also, for each integer type, we only test signed or unsigned, not both. -// This is because integer types of the same size just use same intrinsic. +// This test file contains AArch64 NEON intrinsics that are not covered by +// other tests. 
// REQUIRES: aarch64-registered-target || arm-registered-target #include @@ -236,3 +231,207 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { // LLVM: [[INTRN_ARG1:%.*]] = load <4 x float>, ptr [[S1]], align 16 // LLVM: [[INTRN_RES:%.*]] = insertelement <4 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 3 // LLVM: ret <4 x float> {{%.*}} + +uint8_t test_vget_lane_u8(uint8x8_t a) { + return vget_lane_u8(a, 7); +} + +// CIR-LABEL: test_vget_lane_u8 +// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8 +// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7 +// LLVM: ret i8 {{%.*}} + +uint8_t test_vgetq_lane_u8(uint8x16_t a) { + return vgetq_lane_u8(a, 15); +} + +// CIR-LABEL: test_vgetq_lane_u8 +// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16 +// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15 +// LLVM: ret i8 {{%.*}} + +uint16_t test_vget_lane_u16(uint16x4_t a) { + return vget_lane_u16(a, 3); +} + +// CIR-LABEL: test_vget_lane_u16 +// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: {{%.*}} = 
cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 1, align 8 +// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3 +// LLVM: ret i16 {{%.*}} + +uint16_t test_vgetq_lane_u16(uint16x8_t a) { + return vgetq_lane_u16(a, 7); +} + +// CIR-LABEL: test_vgetq_lane_u16 +// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16 +// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7 +// LLVM: ret i16 {{%.*}} + +uint32_t test_vget_lane_u32(uint32x2_t a) { + return vget_lane_u32(a, 1); +} + +// CIR-LABEL: test_vget_lane_u32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8 +// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1 +// LLVM: ret i32 
{{%.*}} + +uint32_t test_vgetq_lane_u32(uint32x4_t a) { + return vgetq_lane_u32(a, 3); +} + +// CIR-LABEL: test_vgetq_lane_u32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3 +// LLVM: ret i32 {{%.*}} + +uint64_t test_vget_lane_u64(uint64x1_t a) { + return vget_lane_u64(a, 0); +} + +// CIR-LABEL: test_vget_lane_u64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8 +// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0 +// LLVM: ret i64 {{%.*}} + +uint64_t test_vgetq_lane_u64(uint64x2_t a) { + return vgetq_lane_u64(a, 1); +} + +// CIR-LABEL: test_vgetq_lane_u64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16 +// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16 +// 
LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <2 x i64> [[INTRN_ARG]], i32 1 +// LLVM: ret i64 {{%.*}} + +float32_t test_vget_lane_f32(float32x2_t a) { + return vget_lane_f32(a, 1); +} + +// CIR-LABEL: test_vget_lane_f32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8 +// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1 +// LLVM: ret float {{%.*}} + +float64_t test_vget_lane_f64(float64x1_t a) { + return vget_lane_f64(a, 0); +} + +// CIR-LABEL: test_vget_lane_f64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8 +// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0 +// LLVM: ret double {{%.*}} + +float32_t test_vgetq_lane_f32(float32x4_t a) { + return vgetq_lane_f32(a, 3); +} + +// CIR-LABEL: test_vgetq_lane_f32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local 
float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16 +// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3 +// LLVM: ret float {{%.*}} + +float64_t test_vgetq_lane_f64(float64x2_t a) { + return vgetq_lane_f64(a, 1); +} + +// CIR-LABEL: test_vgetq_lane_f64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16 +// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1 +// LLVM: ret double {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vqadd.c b/clang/test/CIR/CodeGen/AArch64/neon-varith.c similarity index 73% rename from clang/test/CIR/CodeGen/aarch64-neon-vqadd.c rename to clang/test/CIR/CodeGen/AArch64/neon-varith.c index 0932d95866c5..9643342f093c 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-vqadd.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-varith.c @@ -5,7 +5,7 @@ // RUN: -emit-llvm -target-feature +neon %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// Tetsting normal situation of vdup lane intrinsics. +// This test file contains tests of aarch64 NEON vector arithmetic intrinsics. 
// REQUIRES: aarch64-registered-target || arm-registered-target #include @@ -177,3 +177,63 @@ int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { // LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8 // LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]]) // LLVM: ret <1 x i64> + +uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { + return vqrshrun_n_s16(a, 3); +} + +// CIR-LABEL: test_vqrshrun_n_s16 +// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : +// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + +// LLVM: {{.*}}test_vqrshrun_n_s16(<8 x i16>{{.*}} [[A:%.*]]) +// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[A_VAL:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 +// LLVM: store <8 x i16> [[A_VAL]], ptr [[S0:%.*]], align 16 +// LLVM: [[S0_VAL:%.*]] = load <8 x i16>, ptr [[S0]], align 16 +// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <8 x i16> [[S0_VAL]] to <16 x i8> +// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <8 x i16> +// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[INTRN_ARG]], i32 3) +// LLVM: ret <8 x i8> {{%.*}} + +uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { + return vqrshrun_n_s32(a, 7); +} + +// CIR-LABEL: test_vqrshrun_n_s32 +// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : +// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + +// LLVM: {{.*}}test_vqrshrun_n_s32(<4 x i32>{{.*}} [[A:%.*]]) +// LLVM: store <4 x i32> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[A_VAL:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 +// LLVM: store <4 x i32> 
[[A_VAL]], ptr [[S0:%.*]], align 16 +// LLVM: [[S0_VAL:%.*]] = load <4 x i32>, ptr [[S0]], align 16 +// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <4 x i32> [[S0_VAL]] to <16 x i8> +// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <4 x i32> +// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[INTRN_ARG]], i32 7) +// LLVM: ret <4 x i16> {{%.*}} + +uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { + return vqrshrun_n_s64(a, 15); +} + +// CIR-LABEL: test_vqrshrun_n_s64 +// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<15> : !s32i +// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : +// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + +// LLVM: {{.*}}test_vqrshrun_n_s64(<2 x i64>{{.*}} [[A:%.*]]) +// LLVM: store <2 x i64> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[A_VAL:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// LLVM: store <2 x i64> [[A_VAL]], ptr [[S0:%.*]], align 16 +// LLVM: [[S0_VAL:%.*]] = load <2 x i64>, ptr [[S0]], align 16 +// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <2 x i64> [[S0_VAL]] to <16 x i8> +// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <2 x i64> +// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[INTRN_ARG]], i32 15) +// LLVM: ret <2 x i32> {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c b/clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c deleted file mode 100644 index 8619ad0c78d6..000000000000 --- a/clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c +++ /dev/null @@ -1,69 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll 
-// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { - return vqrshrun_n_s16(a, 3); -} - -// CIR-LABEL: test_vqrshrun_n_s16 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s16(<8 x i16>{{.*}} [[A:%.*]]) -// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 -// LLVM: store <8 x i16> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <8 x i16> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <8 x i16> -// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[INTRN_ARG]], i32 3) -// LLVM: ret <8 x i8> {{%.*}} - -uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { - return vqrshrun_n_s32(a, 7); -} - -// CIR-LABEL: test_vqrshrun_n_s32 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s32(<4 x i32>{{.*}} [[A:%.*]]) -// LLVM: store <4 x i32> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// LLVM: store <4 x i32> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <4 x i32> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast 
<16 x i8> [[S0_VAL_CAST]] to <4 x i32> -// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[INTRN_ARG]], i32 7) -// LLVM: ret <4 x i16> {{%.*}} - -uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { - return vqrshrun_n_s64(a, 15); -} - -// CIR-LABEL: test_vqrshrun_n_s64 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<15> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s64(<2 x i64>{{.*}} [[A:%.*]]) -// LLVM: store <2 x i64> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 -// LLVM: store <2 x i64> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <2 x i64> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <2 x i64> -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[INTRN_ARG]], i32 15) -// LLVM: ret <2 x i32> {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vget.c b/clang/test/CIR/CodeGen/aarch64-neon-vget.c deleted file mode 100644 index b16648691d1b..000000000000 --- a/clang/test/CIR/CodeGen/aarch64-neon-vget.c +++ /dev/null @@ -1,219 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-llvm -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// This test file contains test cases to those of -// clang/test/CodeGen/aarch64-neon-vget.c -// The difference is that this file only tests uses vget intrinsics, as we feel -// it would be 
proper to have a separate test file testing vset intrinsics -// with the file name aarch64-neon-vset.c - -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8_t test_vget_lane_u8(uint8x8_t a) { - return vget_lane_u8(a, 7); -} - -// CIR-LABEL: test_vget_lane_u8 -// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8 -// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7 -// LLVM: ret i8 {{%.*}} - -uint8_t test_vgetq_lane_u8(uint8x16_t a) { - return vgetq_lane_u8(a, 15); -} - -// CIR-LABEL: test_vgetq_lane_u8 -// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16 -// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15 -// LLVM: ret i8 {{%.*}} - -uint16_t test_vget_lane_u16(uint16x4_t a) { - return vget_lane_u16(a, 3); -} - -// CIR-LABEL: test_vget_lane_u16 -// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 
1, align 8 -// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3 -// LLVM: ret i16 {{%.*}} - -uint16_t test_vgetq_lane_u16(uint16x8_t a) { - return vgetq_lane_u16(a, 7); -} - -// CIR-LABEL: test_vgetq_lane_u16 -// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16 -// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7 -// LLVM: ret i16 {{%.*}} - -uint32_t test_vget_lane_u32(uint32x2_t a) { - return vget_lane_u32(a, 1); -} - -// CIR-LABEL: test_vget_lane_u32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8 -// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1 -// LLVM: ret i32 {{%.*}} - -uint32_t test_vgetq_lane_u32(uint32x4_t a) { - return vgetq_lane_u32(a, 3); -} - -// CIR-LABEL: test_vgetq_lane_u32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i 
-// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16 -// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3 -// LLVM: ret i32 {{%.*}} - -uint64_t test_vget_lane_u64(uint64x1_t a) { - return vget_lane_u64(a, 0); -} - -// CIR-LABEL: test_vget_lane_u64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8 -// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0 -// LLVM: ret i64 {{%.*}} - -uint64_t test_vgetq_lane_u64(uint64x2_t a) { - return vgetq_lane_u64(a, 1); -} - -// CIR-LABEL: test_vgetq_lane_u64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16 -// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <2 x i64> 
[[INTRN_ARG]], i32 1 -// LLVM: ret i64 {{%.*}} - -float32_t test_vget_lane_f32(float32x2_t a) { - return vget_lane_f32(a, 1); -} - -// CIR-LABEL: test_vget_lane_f32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8 -// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1 -// LLVM: ret float {{%.*}} - -float64_t test_vget_lane_f64(float64x1_t a) { - return vget_lane_f64(a, 0); -} - -// CIR-LABEL: test_vget_lane_f64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8 -// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0 -// LLVM: ret double {{%.*}} - -float32_t test_vgetq_lane_f32(float32x4_t a) { - return vgetq_lane_f32(a, 3); -} - -// CIR-LABEL: test_vgetq_lane_f32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16 -// LLVM: store <4 x float> [[ARG]], ptr 
[[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3 -// LLVM: ret float {{%.*}} - -float64_t test_vgetq_lane_f64(float64x2_t a) { - return vgetq_lane_f64(a, 1); -} - -// CIR-LABEL: test_vgetq_lane_f64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16 -// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1 -// LLVM: ret double {{%.*}} From 16615f29a1bfc26b61318cedd482379b11b9b617 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 16:25:19 -0400 Subject: [PATCH 2/5] polish comments --- clang/test/CIR/CodeGen/AArch64/neon-arith.c | 4 ++-- clang/test/CIR/CodeGen/AArch64/neon-misc.c | 4 ++-- clang/test/CIR/CodeGen/neon-tmp.c | 26 +++++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/CodeGen/neon-tmp.c diff --git a/clang/test/CIR/CodeGen/AArch64/neon-arith.c b/clang/test/CIR/CodeGen/AArch64/neon-arith.c index 7d8636758652..192486579143 100644 --- a/clang/test/CIR/CodeGen/AArch64/neon-arith.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-arith.c @@ -8,8 +8,8 @@ // REQUIRES: aarch64-registered-target || arm-registered-target #include -// This test file contains aarch64 NEON arithmetic intrinsics that are not -// vector type related. 
+// This test file contains tests for aarch64 NEON arithmetic intrinsics +// that are not vector type related. float32_t test_vrndns_f32(float32_t a) { return vrndns_f32(a); diff --git a/clang/test/CIR/CodeGen/AArch64/neon-misc.c b/clang/test/CIR/CodeGen/AArch64/neon-misc.c index 6b0c3a866da5..0c20576e62d8 100644 --- a/clang/test/CIR/CodeGen/AArch64/neon-misc.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-misc.c @@ -5,8 +5,8 @@ // RUN: -emit-llvm -target-feature +neon %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// This test file contains AArch64 NEON intrinsics that are not covered by -// other tests. +// This test file contains tests of AArch64 NEON intrinsics +// that are not covered by other tests. // REQUIRES: aarch64-registered-target || arm-registered-target #include diff --git a/clang/test/CIR/CodeGen/neon-tmp.c b/clang/test/CIR/CodeGen/neon-tmp.c new file mode 100644 index 000000000000..f11f1b9454dd --- /dev/null +++ b/clang/test/CIR/CodeGen/neon-tmp.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ +// RUN: -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ +// RUN: -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s + +// REQUIRES: aarch64-registered-target || arm-registered-target +#include + +uint8x8_t test_vmovn_u16(uint16x8_t a) { + return vmovn_u16(a); +} + +// CIR-LABEL: vmovn_u16 +// CIR: [[TMP:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: [[SRC:%.*]] = cir.cast(bitcast, [[TMP]] : !cir.vector), !cir.vector +// CIR: {{.*}} = cir.cast(integral, [[SRC]] : !cir.vector), !cir.vector + +// LLVM: {{.*}}test_vmovn_u16(<8 x i16>{{.*}}[[A:%.*]]) +// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR]], 
align 16 +// LLVM: store <8 x i16> [[TMP0]], ptr [[P0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[P0]], align 16 +// LLVM: {{%.*}} = bitcast <8 x i16> [[INTRN_ARG]] to <16 x i8> +// LLVM: {{%.*}} = trunc <8 x i16> [[INTRN_ARG]] to <8 x i8> From 824ab72dc21b27463da07fc1f4b67dadd7a1d314 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 16:27:54 -0400 Subject: [PATCH 3/5] get rid of local file --- clang/test/CIR/CodeGen/neon-tmp.c | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 clang/test/CIR/CodeGen/neon-tmp.c diff --git a/clang/test/CIR/CodeGen/neon-tmp.c b/clang/test/CIR/CodeGen/neon-tmp.c deleted file mode 100644 index f11f1b9454dd..000000000000 --- a/clang/test/CIR/CodeGen/neon-tmp.c +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8x8_t test_vmovn_u16(uint16x8_t a) { - return vmovn_u16(a); -} - -// CIR-LABEL: vmovn_u16 -// CIR: [[TMP:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: [[SRC:%.*]] = cir.cast(bitcast, [[TMP]] : !cir.vector), !cir.vector -// CIR: {{.*}} = cir.cast(integral, [[SRC]] : !cir.vector), !cir.vector - -// LLVM: {{.*}}test_vmovn_u16(<8 x i16>{{.*}}[[A:%.*]]) -// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 -// LLVM: store <8 x i16> [[TMP0]], ptr [[P0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[P0]], align 16 -// LLVM: {{%.*}} = bitcast <8 x i16> [[INTRN_ARG]] to <16 x i8> -// LLVM: {{%.*}} = 
trunc <8 x i16> [[INTRN_ARG]] to <8 x i8> From 5cac33d3a1ad8edb94dd33efaf5a46d9fbc3769b Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 19:09:48 -0400 Subject: [PATCH 4/5] make sure aarch64-neon-intrinsics.c test cases not duplicated --- clang/test/CIR/CodeGen/AArch64/neon-varith.c | 239 ------------------ .../CIR/CodeGen/aarch64-neon-intrinsics.c | 238 ++++++++++------- 2 files changed, 143 insertions(+), 334 deletions(-) delete mode 100644 clang/test/CIR/CodeGen/AArch64/neon-varith.c diff --git a/clang/test/CIR/CodeGen/AArch64/neon-varith.c b/clang/test/CIR/CodeGen/AArch64/neon-varith.c deleted file mode 100644 index 9643342f093c..000000000000 --- a/clang/test/CIR/CodeGen/AArch64/neon-varith.c +++ /dev/null @@ -1,239 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-llvm -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// This test file contains tests of aarch64 NEON vector arithmetic intrinsics. 
- -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { - return vqadd_u8(a,b); -} - -// CIR-LABEL: vqadd_u8 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u8(<8 x i8>{{.*}} [[A:%.*]], <8 x i8>{{.*}} [[B:%.*]]) -// LLVM: store <8 x i8> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 -// LLVM: store <8 x i8> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <8 x i8>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <8 x i8>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[INTRN_A]], <8 x i8> [[INTRN_B]]) -// LLVM: ret <8 x i8> - -int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { - return vqadd_s8(a,b); -} - -// CIR-LABEL: vqadd_s8 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s8(<8 x i8>{{.*}} [[A:%.*]], <8 x i8>{{.*}} [[B:%.*]]) -// LLVM: store <8 x i8> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 -// LLVM: store <8 x i8> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <8 x i8>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <8 x i8>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <8 x i8> 
@llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[INTRN_A]], <8 x i8> [[INTRN_B]]) -// LLVM: ret <8 x i8> - -uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { - return vqadd_u16(a,b); -} - -// CIR-LABEL: vqadd_u16 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u16(<4 x i16>{{.*}} [[A:%.*]], <4 x i16>{{.*}} [[B:%.*]]) -// LLVM: store <4 x i16> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8 -// LLVM: store <4 x i16> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <4 x i16>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <4 x i16>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[INTRN_A]], <4 x i16> [[INTRN_B]]) -// LLVM: ret <4 x i16> - -int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { - return vqadd_s16(a,b); -} - -// CIR-LABEL: vqadd_u16 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s16(<4 x i16>{{.*}} [[A:%.*]], <4 x i16>{{.*}} [[B:%.*]]) -// LLVM: store <4 x i16> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8 -// LLVM: store <4 x i16> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <4 x i16>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <4 x i16>, ptr [[P1_ADDR]], align 8 
-// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[INTRN_A]], <4 x i16> [[INTRN_B]]) -// LLVM: ret <4 x i16> - -uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { - return vqadd_u32(a,b); -} - -// CIR-LABEL: vqadd_u32 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u32(<2 x i32>{{.*}} [[A:%.*]], <2 x i32>{{.*}} [[B:%.*]]) -// LLVM: store <2 x i32> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8 -// LLVM: store <2 x i32> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <2 x i32>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <2 x i32>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[INTRN_A]], <2 x i32> [[INTRN_B]]) -// LLVM: ret <2 x i32> - -int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { - return vqadd_s32(a,b); -} - -// CIR-LABEL: vqadd_s32 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s32(<2 x i32>{{.*}} [[A:%.*]], <2 x i32>{{.*}} [[B:%.*]]) -// LLVM: store <2 x i32> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8 -// LLVM: store <2 x i32> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <2 x i32>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = 
load <2 x i32>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[INTRN_A]], <2 x i32> [[INTRN_B]]) -// LLVM: ret <2 x i32> - -uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { - return vqadd_u64(a,b); -} - -// CIR-LABEL: vqadd_u64 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u64(<1 x i64>{{.*}} [[A:%.*]], <1 x i64>{{.*}} [[B:%.*]]) -// LLVM: store <1 x i64> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8 -// LLVM: store <1 x i64> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <1 x i64>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]]) -// LLVM: ret <1 x i64> - -int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { - return vqadd_s64(a,b); -} - -// CIR-LABEL: vqadd_s64 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s64(<1 x i64>{{.*}} [[A:%.*]], <1 x i64>{{.*}} [[B:%.*]]) -// LLVM: store <1 x i64> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8 -// LLVM: store <1 x i64> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <1 x i64>, ptr 
[[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]]) -// LLVM: ret <1 x i64> - -uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { - return vqrshrun_n_s16(a, 3); -} - -// CIR-LABEL: test_vqrshrun_n_s16 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s16(<8 x i16>{{.*}} [[A:%.*]]) -// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 -// LLVM: store <8 x i16> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <8 x i16> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <8 x i16> -// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[INTRN_ARG]], i32 3) -// LLVM: ret <8 x i8> {{%.*}} - -uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { - return vqrshrun_n_s32(a, 7); -} - -// CIR-LABEL: test_vqrshrun_n_s32 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s32(<4 x i32>{{.*}} [[A:%.*]]) -// LLVM: store <4 x i32> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// LLVM: store <4 x i32> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: 
[[S0_VAL_CAST:%.*]] = bitcast <4 x i32> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <4 x i32> -// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[INTRN_ARG]], i32 7) -// LLVM: ret <4 x i16> {{%.*}} - -uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { - return vqrshrun_n_s64(a, 15); -} - -// CIR-LABEL: test_vqrshrun_n_s64 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<15> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s64(<2 x i64>{{.*}} [[A:%.*]]) -// LLVM: store <2 x i64> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 -// LLVM: store <2 x i64> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <2 x i64> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <2 x i64> -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[INTRN_ARG]], i32 15) -// LLVM: ret <2 x i32> {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c index 02aa70a4d628..54520e688a59 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c +++ b/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c @@ -2839,79 +2839,103 @@ // return vrhaddq_u32(v1, v2); // } -// NYI-LABEL: @test_vqadd_s8( -// NYI: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// NYI: ret <8 x i8> [[VQADD_V_I]] -// int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { -// return vqadd_s8(a, b); -// } -// NYI-LABEL: @test_vqadd_s16( -// NYI: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <4 x 
i16> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <4 x i16> [[VQADD_V2_I]] -// int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { -// return vqadd_s16(a, b); -// } +int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { + return vqadd_s8(a, b); + // CIR-LABEL: vqadd_s8 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// NYI-LABEL: @test_vqadd_s32( -// NYI: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <2 x i32> [[VQADD_V2_I]] -// int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { -// return vqadd_s32(a, b); -// } - -// NYI-LABEL: @test_vqadd_s64( -// NYI: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <1 x i64> [[VQADD_V2_I]] -// int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { -// return vqadd_s64(a, b); -// } - -// NYI-LABEL: @test_vqadd_u8( -// NYI: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// NYI: ret <8 x i8> [[VQADD_V_I]] -// uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { -// return vqadd_u8(a, b); -// } - -// NYI-LABEL: @test_vqadd_u16( -// NYI: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// NYI: 
[[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <4 x i16> [[VQADD_V2_I]] -// uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { -// return vqadd_u16(a, b); -// } - -// NYI-LABEL: @test_vqadd_u32( -// NYI: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <2 x i32> [[VQADD_V2_I]] -// uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { -// return vqadd_u32(a, b); -// } + // LLVM-LABEL: @test_vqadd_s8( + // LLVM: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %0, <8 x i8> %1) + // LLVM: ret <8 x i8> [[VQADD_V_I]] +} -// NYI-LABEL: @test_vqadd_u64( -// NYI: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <1 x i64> [[VQADD_V2_I]] -// uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { -// return vqadd_u64(a, b); -// } + int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { + return vqadd_s16(a, b); + // CIR-LABEL: vqadd_s16 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_s16( + // LLVM: [[TMP0:%.*]] = bitcast <4 x i16> %0 to <8 x i8> + // LLVM: [[TMP1:%.*]] = bitcast <4 x i16> %1 to <8 x i8> + // LLVM: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %0, <4 x i16> %1) + // LLVM: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> + // LLVM: ret <4 x i16> [[VQADD_V2_I]] + } + + int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { + return vqadd_s32(a, 
b); + // CIR-LABEL: vqadd_s32 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_s32( + // LLVM: [[TMP0:%.*]] = bitcast <2 x i32> %0 to <8 x i8> + // LLVM: [[TMP1:%.*]] = bitcast <2 x i32> %1 to <8 x i8> + // LLVM: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %0, <2 x i32> %1) + // LLVM: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> + // LLVM: ret <2 x i32> [[VQADD_V2_I]] + } + + int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { + return vqadd_s64(a, b); + // CIR-LABEL: vqadd_s64 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_s64( + // LLVM: [[TMP0:%.*]] = bitcast <1 x i64> %0 to <8 x i8> + // LLVM: [[TMP1:%.*]] = bitcast <1 x i64> %1 to <8 x i8> + // LLVM: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %0, <1 x i64> %1) + // LLVM: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> + // LLVM: ret <1 x i64> [[VQADD_V2_I]] + } + + uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { + return vqadd_u8(a, b); + // CIR-LABEL: vqadd_u8 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u8( + // LLVM: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %0, <8 x i8> %1) + // LLVM: ret <8 x i8> [[VQADD_V_I]] + } + + uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { + return vqadd_u16(a, b); + // CIR-LABEL: vqadd_u16 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u16( + // LLVM: [[VQADD_V_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %0, <4 x i16> 
%1) + // LLVM: ret <4 x i16> [[VQADD_V_I]] + } + + uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { + return vqadd_u32(a, b); + // CIR-LABEL: vqadd_u32 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u32( + // LLVM: [[VQADD_V_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %0, <2 x i32> %1) + // LLVM: ret <2 x i32> [[VQADD_V_I]] + } + + uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { + return vqadd_u64(a, b); + // CIR-LABEL: vqadd_u64 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u64( + // LLVM: [[VQADD_V_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %0, <1 x i64> %1) + // LLVM: ret <1 x i64> [[VQADD_V_I]] + } // NYI-LABEL: @test_vqaddq_s8( // NYI: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) @@ -5972,32 +5996,56 @@ // return vrshrn_high_n_u64(a, b, 19); // } -// NYI-LABEL: @test_vqrshrun_n_s16( -// NYI: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// NYI: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// NYI: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) -// NYI: ret <8 x i8> [[VQRSHRUN_N1]] -// uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { -// return vqrshrun_n_s16(a, 3); -// } +uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { + return vqrshrun_n_s16(a, 3); + // CIR-LABEL: test_vqrshrun_n_s16 + // CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i + // CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : + // CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + + // LLVM-LABEL: @test_vqrshrun_n_s16( + // LLVM: 
[[TMP0:%.*]] = bitcast <8 x i16> {{%.*}} to <16 x i8> + // LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> + // LLVM: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) + // LLVM: store <8 x i8> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8 + // LLVM: [[RETVAL:%.*]] = load <8 x i8>, ptr [[RET]], align 8 + // LLVM: ret <8 x i8> [[RETVAL]] +} -// NYI-LABEL: @test_vqrshrun_n_s32( -// NYI: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// NYI: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// NYI: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) -// NYI: ret <4 x i16> [[VQRSHRUN_N1]] -// uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { -// return vqrshrun_n_s32(a, 9); -// } +uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { + return vqrshrun_n_s32(a, 9); + // CIR-LABEL: test_vqrshrun_n_s32 + // CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<9> : !s32i + // CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : + // CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + + // LLVM-LABEL: @test_vqrshrun_n_s32( + // LLVM: [[TMP0:%.*]] = bitcast <4 x i32> {{%.*}} to <16 x i8> + // LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> + // LLVM: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) + // LLVM: store <4 x i16> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8 + // LLVM: [[RETVAL:%.*]] = load <4 x i16>, ptr [[RET]], align 8 + // LLVM: ret <4 x i16> [[RETVAL]] +} -// NYI-LABEL: @test_vqrshrun_n_s64( -// NYI: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// NYI: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// NYI: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) -// NYI: ret <2 x i32> 
[[VQRSHRUN_N1]] -// uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { -// return vqrshrun_n_s64(a, 19); -// } +uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { + return vqrshrun_n_s64(a, 19); + // CIR-LABEL: test_vqrshrun_n_s64 + // CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<19> : !s32i + // CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : + // CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + + // LLVM-LABEL: @test_vqrshrun_n_s64( + // LLVM: [[TMP0:%.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8> + // LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> + // LLVM: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) + // LLVM: store <2 x i32> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8 + // LLVM: [[RETVAL:%.*]] = load <2 x i32>, ptr [[RET]], align 8 + // LLVM: ret <2 x i32> [[RETVAL]] +} // NYI-LABEL: @test_vqrshrun_high_n_s16( // NYI: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -6041,7 +6089,7 @@ // NYI-LABEL: @test_vqshrn_n_s32( // NYI: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // NYI: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// NYI: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// NYI: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) // NYI: ret <4 x i16> [[VQSHRN_N1]] // int16x4_t test_vqshrn_n_s32(int32x4_t a) { // return vqshrn_n_s32(a, 9); From 780c1f3b4b4469ad04537e8015103f154fda0191 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 19:23:40 -0400 Subject: [PATCH 5/5] moving aarch64-neon-intrinsics.c --- .../CIR/CodeGen/{aarch64-neon-intrinsics.c => AArch64/neon.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clang/test/CIR/CodeGen/{aarch64-neon-intrinsics.c => AArch64/neon.c} (100%) diff --git
a/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CIR/CodeGen/AArch64/neon.c similarity index 100% rename from clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c rename to clang/test/CIR/CodeGen/AArch64/neon.c