From 4bb39fc9013d98e2b210504c53fa2ea76ec4a1b3 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 16:20:21 -0400 Subject: [PATCH 1/5] Organize AArch64 neon tests --- .../neon-arith.c} | 3 + .../neon-ldst.c} | 3 + .../neon-misc.c} | 213 ++++++++++++++++- .../neon-varith.c} | 62 ++++- .../CIR/CodeGen/aarch64-neon-simd-shift.c | 69 ------ clang/test/CIR/CodeGen/aarch64-neon-vget.c | 219 ------------------ 6 files changed, 273 insertions(+), 296 deletions(-) rename clang/test/CIR/CodeGen/{arm-neon-directed-rounding.c => AArch64/neon-arith.c} (98%) rename clang/test/CIR/CodeGen/{aarch64-neon-ldst.c => AArch64/neon-ldst.c} (99%) rename clang/test/CIR/CodeGen/{aarch64-neon-vset.c => AArch64/neon-misc.c} (55%) rename clang/test/CIR/CodeGen/{aarch64-neon-vqadd.c => AArch64/neon-varith.c} (73%) delete mode 100644 clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c delete mode 100644 clang/test/CIR/CodeGen/aarch64-neon-vget.c diff --git a/clang/test/CIR/CodeGen/arm-neon-directed-rounding.c b/clang/test/CIR/CodeGen/AArch64/neon-arith.c similarity index 98% rename from clang/test/CIR/CodeGen/arm-neon-directed-rounding.c rename to clang/test/CIR/CodeGen/AArch64/neon-arith.c index 92b4a9298eac..7d8636758652 100644 --- a/clang/test/CIR/CodeGen/arm-neon-directed-rounding.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-arith.c @@ -8,6 +8,9 @@ // REQUIRES: aarch64-registered-target || arm-registered-target #include +// This test file contains aarch64 NEON arithmetic intrinsics that are not +// vector type related. 
+ float32_t test_vrndns_f32(float32_t a) { return vrndns_f32(a); } diff --git a/clang/test/CIR/CodeGen/aarch64-neon-ldst.c b/clang/test/CIR/CodeGen/AArch64/neon-ldst.c similarity index 99% rename from clang/test/CIR/CodeGen/aarch64-neon-ldst.c rename to clang/test/CIR/CodeGen/AArch64/neon-ldst.c index 9b6ed9ee479c..d112f3a81808 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-ldst.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-ldst.c @@ -6,6 +6,9 @@ // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s // REQUIRES: aarch64-registered-target || arm-registered-target + +// This test file contains tests for the AArch64 NEON load/store intrinsics. + #include int8x8_t test_vld1_lane_s8(int8_t const * ptr, int8x8_t src) { diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vset.c b/clang/test/CIR/CodeGen/AArch64/neon-misc.c similarity index 55% rename from clang/test/CIR/CodeGen/aarch64-neon-vset.c rename to clang/test/CIR/CodeGen/AArch64/neon-misc.c index 5da779ff69eb..6b0c3a866da5 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-vset.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-misc.c @@ -5,13 +5,8 @@ // RUN: -emit-llvm -target-feature +neon %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// This test file is similar to but not the same as -// clang/test/CodeGen/aarch64-neon-vget.c -// The difference is that this file only tests uses vset intrinsics, as we feel -// it would be proper to have a separate test file testing vget intrinsics -// with the file name aarch64-neon-vget.c -// Also, for each integer type, we only test signed or unsigned, not both. -// This is because integer types of the same size just use same intrinsic. +// This test file contains AArch64 NEON intrinsics that are not covered by +// other tests. 
// REQUIRES: aarch64-registered-target || arm-registered-target #include @@ -236,3 +231,207 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { // LLVM: [[INTRN_ARG1:%.*]] = load <4 x float>, ptr [[S1]], align 16 // LLVM: [[INTRN_RES:%.*]] = insertelement <4 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 3 // LLVM: ret <4 x float> {{%.*}} + +uint8_t test_vget_lane_u8(uint8x8_t a) { + return vget_lane_u8(a, 7); +} + +// CIR-LABEL: test_vget_lane_u8 +// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8 +// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7 +// LLVM: ret i8 {{%.*}} + +uint8_t test_vgetq_lane_u8(uint8x16_t a) { + return vgetq_lane_u8(a, 15); +} + +// CIR-LABEL: test_vgetq_lane_u8 +// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16 +// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15 +// LLVM: ret i8 {{%.*}} + +uint16_t test_vget_lane_u16(uint16x4_t a) { + return vget_lane_u16(a, 3); +} + +// CIR-LABEL: test_vget_lane_u16 +// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: {{%.*}} = 
cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 1, align 8 +// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3 +// LLVM: ret i16 {{%.*}} + +uint16_t test_vgetq_lane_u16(uint16x8_t a) { + return vgetq_lane_u16(a, 7); +} + +// CIR-LABEL: test_vgetq_lane_u16 +// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16 +// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7 +// LLVM: ret i16 {{%.*}} + +uint32_t test_vget_lane_u32(uint32x2_t a) { + return vget_lane_u32(a, 1); +} + +// CIR-LABEL: test_vget_lane_u32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8 +// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1 +// LLVM: ret i32 
{{%.*}} + +uint32_t test_vgetq_lane_u32(uint32x4_t a) { + return vgetq_lane_u32(a, 3); +} + +// CIR-LABEL: test_vgetq_lane_u32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3 +// LLVM: ret i32 {{%.*}} + +uint64_t test_vget_lane_u64(uint64x1_t a) { + return vget_lane_u64(a, 0); +} + +// CIR-LABEL: test_vget_lane_u64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8 +// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0 +// LLVM: ret i64 {{%.*}} + +uint64_t test_vgetq_lane_u64(uint64x2_t a) { + return vgetq_lane_u64(a, 1); +} + +// CIR-LABEL: test_vgetq_lane_u64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16 +// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16 +// 
LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <2 x i64> [[INTRN_ARG]], i32 1 +// LLVM: ret i64 {{%.*}} + +float32_t test_vget_lane_f32(float32x2_t a) { + return vget_lane_f32(a, 1); +} + +// CIR-LABEL: test_vget_lane_f32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8 +// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1 +// LLVM: ret float {{%.*}} + +float64_t test_vget_lane_f64(float64x1_t a) { + return vget_lane_f64(a, 0); +} + +// CIR-LABEL: test_vget_lane_f64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8 +// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8 +// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8 +// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8 +// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8 +// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0 +// LLVM: ret double {{%.*}} + +float32_t test_vgetq_lane_f32(float32x4_t a) { + return vgetq_lane_f32(a, 3); +} + +// CIR-LABEL: test_vgetq_lane_f32 +// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local 
float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16 +// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3 +// LLVM: ret float {{%.*}} + +float64_t test_vgetq_lane_f64(float64x2_t a) { + return vgetq_lane_f64(a, 1); +} + +// CIR-LABEL: test_vgetq_lane_f64 +// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector + +// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]]) +// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16 +// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16 +// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16 +// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16 +// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1 +// LLVM: ret double {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vqadd.c b/clang/test/CIR/CodeGen/AArch64/neon-varith.c similarity index 73% rename from clang/test/CIR/CodeGen/aarch64-neon-vqadd.c rename to clang/test/CIR/CodeGen/AArch64/neon-varith.c index 0932d95866c5..9643342f093c 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-vqadd.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-varith.c @@ -5,7 +5,7 @@ // RUN: -emit-llvm -target-feature +neon %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// Tetsting normal situation of vdup lane intrinsics. +// This test file contains tests of aarch64 NEON vector arithmetic intrinsics. 
// REQUIRES: aarch64-registered-target || arm-registered-target #include @@ -177,3 +177,63 @@ int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { // LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8 // LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]]) // LLVM: ret <1 x i64> + +uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { + return vqrshrun_n_s16(a, 3); +} + +// CIR-LABEL: test_vqrshrun_n_s16 +// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : +// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + +// LLVM: {{.*}}test_vqrshrun_n_s16(<8 x i16>{{.*}} [[A:%.*]]) +// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[A_VAL:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 +// LLVM: store <8 x i16> [[A_VAL]], ptr [[S0:%.*]], align 16 +// LLVM: [[S0_VAL:%.*]] = load <8 x i16>, ptr [[S0]], align 16 +// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <8 x i16> [[S0_VAL]] to <16 x i8> +// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <8 x i16> +// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[INTRN_ARG]], i32 3) +// LLVM: ret <8 x i8> {{%.*}} + +uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { + return vqrshrun_n_s32(a, 7); +} + +// CIR-LABEL: test_vqrshrun_n_s32 +// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<7> : !s32i +// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : +// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + +// LLVM: {{.*}}test_vqrshrun_n_s32(<4 x i32>{{.*}} [[A:%.*]]) +// LLVM: store <4 x i32> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[A_VAL:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 +// LLVM: store <4 x i32> 
[[A_VAL]], ptr [[S0:%.*]], align 16 +// LLVM: [[S0_VAL:%.*]] = load <4 x i32>, ptr [[S0]], align 16 +// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <4 x i32> [[S0_VAL]] to <16 x i8> +// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <4 x i32> +// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[INTRN_ARG]], i32 7) +// LLVM: ret <4 x i16> {{%.*}} + +uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { + return vqrshrun_n_s64(a, 15); +} + +// CIR-LABEL: test_vqrshrun_n_s64 +// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<15> : !s32i +// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : +// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + +// LLVM: {{.*}}test_vqrshrun_n_s64(<2 x i64>{{.*}} [[A:%.*]]) +// LLVM: store <2 x i64> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[A_VAL:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// LLVM: store <2 x i64> [[A_VAL]], ptr [[S0:%.*]], align 16 +// LLVM: [[S0_VAL:%.*]] = load <2 x i64>, ptr [[S0]], align 16 +// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <2 x i64> [[S0_VAL]] to <16 x i8> +// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <2 x i64> +// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[INTRN_ARG]], i32 15) +// LLVM: ret <2 x i32> {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c b/clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c deleted file mode 100644 index 8619ad0c78d6..000000000000 --- a/clang/test/CIR/CodeGen/aarch64-neon-simd-shift.c +++ /dev/null @@ -1,69 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll 
-// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { - return vqrshrun_n_s16(a, 3); -} - -// CIR-LABEL: test_vqrshrun_n_s16 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s16(<8 x i16>{{.*}} [[A:%.*]]) -// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 -// LLVM: store <8 x i16> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <8 x i16> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <8 x i16> -// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[INTRN_ARG]], i32 3) -// LLVM: ret <8 x i8> {{%.*}} - -uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { - return vqrshrun_n_s32(a, 7); -} - -// CIR-LABEL: test_vqrshrun_n_s32 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s32(<4 x i32>{{.*}} [[A:%.*]]) -// LLVM: store <4 x i32> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// LLVM: store <4 x i32> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <4 x i32> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast 
<16 x i8> [[S0_VAL_CAST]] to <4 x i32> -// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[INTRN_ARG]], i32 7) -// LLVM: ret <4 x i16> {{%.*}} - -uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { - return vqrshrun_n_s64(a, 15); -} - -// CIR-LABEL: test_vqrshrun_n_s64 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<15> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s64(<2 x i64>{{.*}} [[A:%.*]]) -// LLVM: store <2 x i64> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 -// LLVM: store <2 x i64> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <2 x i64> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <2 x i64> -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[INTRN_ARG]], i32 15) -// LLVM: ret <2 x i32> {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-vget.c b/clang/test/CIR/CodeGen/aarch64-neon-vget.c deleted file mode 100644 index b16648691d1b..000000000000 --- a/clang/test/CIR/CodeGen/aarch64-neon-vget.c +++ /dev/null @@ -1,219 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-llvm -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// This test file contains test cases to those of -// clang/test/CodeGen/aarch64-neon-vget.c -// The difference is that this file only tests uses vget intrinsics, as we feel -// it would be 
proper to have a separate test file testing vset intrinsics -// with the file name aarch64-neon-vset.c - -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8_t test_vget_lane_u8(uint8x8_t a) { - return vget_lane_u8(a, 7); -} - -// CIR-LABEL: test_vget_lane_u8 -// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i8 @test_vget_lane_u8(<8 x i8> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i8>, i64 1, align 8 -// LLVM: store <8 x i8> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <8 x i8>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i8>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <8 x i8> [[INTRN_ARG]], i32 7 -// LLVM: ret i8 {{%.*}} - -uint8_t test_vgetq_lane_u8(uint8x16_t a) { - return vgetq_lane_u8(a, 15); -} - -// CIR-LABEL: test_vgetq_lane_u8 -// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i8 @test_vgetq_lane_u8(<16 x i8> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <16 x i8>, i64 1, align 16 -// LLVM: store <16 x i8> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <16 x i8>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <16 x i8> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <16 x i8>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <16 x i8> [[INTRN_ARG]], i32 15 -// LLVM: ret i8 {{%.*}} - -uint16_t test_vget_lane_u16(uint16x4_t a) { - return vget_lane_u16(a, 3); -} - -// CIR-LABEL: test_vget_lane_u16 -// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i16 @test_vget_lane_u16(<4 x i16> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i16>, i64 
1, align 8 -// LLVM: store <4 x i16> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <4 x i16>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x i16>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <4 x i16> [[INTRN_ARG]], i32 3 -// LLVM: ret i16 {{%.*}} - -uint16_t test_vgetq_lane_u16(uint16x8_t a) { - return vgetq_lane_u16(a, 7); -} - -// CIR-LABEL: test_vgetq_lane_u16 -// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i16 @test_vgetq_lane_u16(<8 x i16> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <8 x i16>, i64 1, align 16 -// LLVM: store <8 x i16> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <8 x i16>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <8 x i16> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <8 x i16> [[INTRN_ARG]], i32 7 -// LLVM: ret i16 {{%.*}} - -uint32_t test_vget_lane_u32(uint32x2_t a) { - return vget_lane_u32(a, 1); -} - -// CIR-LABEL: test_vget_lane_u32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i32 @test_vget_lane_u32(<2 x i32> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i32>, i64 1, align 8 -// LLVM: store <2 x i32> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <2 x i32>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x i32>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <2 x i32> [[INTRN_ARG]], i32 1 -// LLVM: ret i32 {{%.*}} - -uint32_t test_vgetq_lane_u32(uint32x4_t a) { - return vgetq_lane_u32(a, 3); -} - -// CIR-LABEL: test_vgetq_lane_u32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i 
-// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i32 @test_vgetq_lane_u32(<4 x i32> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x i32>, i64 1, align 16 -// LLVM: store <4 x i32> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <4 x i32>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <4 x i32> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <4 x i32> [[INTRN_ARG]], i32 3 -// LLVM: ret i32 {{%.*}} - -uint64_t test_vget_lane_u64(uint64x1_t a) { - return vget_lane_u64(a, 0); -} - -// CIR-LABEL: test_vget_lane_u64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i64 @test_vget_lane_u64(<1 x i64> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x i64>, i64 1, align 8 -// LLVM: store <1 x i64> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <1 x i64>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <1 x i64>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <1 x i64> [[INTRN_ARG]], i32 0 -// LLVM: ret i64 {{%.*}} - -uint64_t test_vgetq_lane_u64(uint64x2_t a) { - return vgetq_lane_u64(a, 1); -} - -// CIR-LABEL: test_vgetq_lane_u64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local i64 @test_vgetq_lane_u64(<2 x i64> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x i64>, i64 1, align 16 -// LLVM: store <2 x i64> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <2 x i64>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <2 x i64> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <2 x i64> 
[[INTRN_ARG]], i32 1 -// LLVM: ret i64 {{%.*}} - -float32_t test_vget_lane_f32(float32x2_t a) { - return vget_lane_f32(a, 1); -} - -// CIR-LABEL: test_vget_lane_f32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local float @test_vget_lane_f32(<2 x float> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x float>, i64 1, align 8 -// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <2 x float>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <2 x float> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x float>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <2 x float> [[INTRN_ARG]], i32 1 -// LLVM: ret float {{%.*}} - -float64_t test_vget_lane_f64(float64x1_t a) { - return vget_lane_f64(a, 0); -} - -// CIR-LABEL: test_vget_lane_f64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local double @test_vget_lane_f64(<1 x double> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <1 x double>, i64 1, align 8 -// LLVM: store <1 x double> [[ARG]], ptr [[ARG_SAVE]], align 8 -// LLVM: [[TMP:%.*]] = load <1 x double>, ptr [[ARG_SAVE:%.*]], align 8 -// LLVM: store <1 x double> [[TMP]], ptr [[S0:%.*]], align 8 -// LLVM: [[INTRN_ARG:%.*]] = load <1 x double>, ptr [[S0]], align 8 -// LLVM: {{%.*}} = extractelement <1 x double> [[INTRN_ARG]], i32 0 -// LLVM: ret double {{%.*}} - -float32_t test_vgetq_lane_f32(float32x4_t a) { - return vgetq_lane_f32(a, 3); -} - -// CIR-LABEL: test_vgetq_lane_f32 -// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local float @test_vgetq_lane_f32(<4 x float> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <4 x float>, i64 1, align 16 -// LLVM: store <4 x float> [[ARG]], ptr 
[[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <4 x float>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <4 x float> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <4 x float>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <4 x float> [[INTRN_ARG]], i32 3 -// LLVM: ret float {{%.*}} - -float64_t test_vgetq_lane_f64(float64x2_t a) { - return vgetq_lane_f64(a, 1); -} - -// CIR-LABEL: test_vgetq_lane_f64 -// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: {{%.*}} = cir.vec.extract {{%.*}}[[[IDX]] : !s32i] : !cir.vector - -// LLVM: define dso_local double @test_vgetq_lane_f64(<2 x double> [[ARG:%.*]]) -// LLVM: [[ARG_SAVE:%.*]] = alloca <2 x double>, i64 1, align 16 -// LLVM: store <2 x double> [[ARG]], ptr [[ARG_SAVE]], align 16 -// LLVM: [[TMP:%.*]] = load <2 x double>, ptr [[ARG_SAVE:%.*]], align 16 -// LLVM: store <2 x double> [[TMP]], ptr [[S0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <2 x double>, ptr [[S0]], align 16 -// LLVM: {{%.*}} = extractelement <2 x double> [[INTRN_ARG]], i32 1 -// LLVM: ret double {{%.*}} From 16615f29a1bfc26b61318cedd482379b11b9b617 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 16:25:19 -0400 Subject: [PATCH 2/5] polish comments --- clang/test/CIR/CodeGen/AArch64/neon-arith.c | 4 ++-- clang/test/CIR/CodeGen/AArch64/neon-misc.c | 4 ++-- clang/test/CIR/CodeGen/neon-tmp.c | 26 +++++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/CodeGen/neon-tmp.c diff --git a/clang/test/CIR/CodeGen/AArch64/neon-arith.c b/clang/test/CIR/CodeGen/AArch64/neon-arith.c index 7d8636758652..192486579143 100644 --- a/clang/test/CIR/CodeGen/AArch64/neon-arith.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-arith.c @@ -8,8 +8,8 @@ // REQUIRES: aarch64-registered-target || arm-registered-target #include -// This test file contains aarch64 NEON arithmetic intrinsics that are not -// vector type related. 
+// This test file contains tests for aarch64 NEON arithmetic intrinsics +// that are not vector type related. float32_t test_vrndns_f32(float32_t a) { return vrndns_f32(a); diff --git a/clang/test/CIR/CodeGen/AArch64/neon-misc.c b/clang/test/CIR/CodeGen/AArch64/neon-misc.c index 6b0c3a866da5..0c20576e62d8 100644 --- a/clang/test/CIR/CodeGen/AArch64/neon-misc.c +++ b/clang/test/CIR/CodeGen/AArch64/neon-misc.c @@ -5,8 +5,8 @@ // RUN: -emit-llvm -target-feature +neon %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// This test file contains AArch64 NEON intrinsics that are not covered by -// other tests. +// This test file contains tests of AArch64 NEON intrinsics +// that are not covered by other tests. // REQUIRES: aarch64-registered-target || arm-registered-target #include diff --git a/clang/test/CIR/CodeGen/neon-tmp.c b/clang/test/CIR/CodeGen/neon-tmp.c new file mode 100644 index 000000000000..f11f1b9454dd --- /dev/null +++ b/clang/test/CIR/CodeGen/neon-tmp.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ +// RUN: -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ +// RUN: -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s + +// REQUIRES: aarch64-registered-target || arm-registered-target +#include + +uint8x8_t test_vmovn_u16(uint16x8_t a) { + return vmovn_u16(a); +} + +// CIR-LABEL: vmovn_u16 +// CIR: [[TMP:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector +// CIR: [[SRC:%.*]] = cir.cast(bitcast, [[TMP]] : !cir.vector), !cir.vector +// CIR: {{.*}} = cir.cast(integral, [[SRC]] : !cir.vector), !cir.vector + +// LLVM: {{.*}}test_vmovn_u16(<8 x i16>{{.*}}[[A:%.*]]) +// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 +// LLVM: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR]], 
align 16 +// LLVM: store <8 x i16> [[TMP0]], ptr [[P0:%.*]], align 16 +// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[P0]], align 16 +// LLVM: {{%.*}} = bitcast <8 x i16> [[INTRN_ARG]] to <16 x i8> +// LLVM: {{%.*}} = trunc <8 x i16> [[INTRN_ARG]] to <8 x i8> From 824ab72dc21b27463da07fc1f4b67dadd7a1d314 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 16:27:54 -0400 Subject: [PATCH 3/5] get rid of local file --- clang/test/CIR/CodeGen/neon-tmp.c | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 clang/test/CIR/CodeGen/neon-tmp.c diff --git a/clang/test/CIR/CodeGen/neon-tmp.c b/clang/test/CIR/CodeGen/neon-tmp.c deleted file mode 100644 index f11f1b9454dd..000000000000 --- a/clang/test/CIR/CodeGen/neon-tmp.c +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -ffreestanding -emit-llvm -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8x8_t test_vmovn_u16(uint16x8_t a) { - return vmovn_u16(a); -} - -// CIR-LABEL: vmovn_u16 -// CIR: [[TMP:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: [[SRC:%.*]] = cir.cast(bitcast, [[TMP]] : !cir.vector), !cir.vector -// CIR: {{.*}} = cir.cast(integral, [[SRC]] : !cir.vector), !cir.vector - -// LLVM: {{.*}}test_vmovn_u16(<8 x i16>{{.*}}[[A:%.*]]) -// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 -// LLVM: store <8 x i16> [[TMP0]], ptr [[P0:%.*]], align 16 -// LLVM: [[INTRN_ARG:%.*]] = load <8 x i16>, ptr [[P0]], align 16 -// LLVM: {{%.*}} = bitcast <8 x i16> [[INTRN_ARG]] to <16 x i8> -// LLVM: {{%.*}} = 
trunc <8 x i16> [[INTRN_ARG]] to <8 x i8> From 5cac33d3a1ad8edb94dd33efaf5a46d9fbc3769b Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 19:09:48 -0400 Subject: [PATCH 4/5] make sure aarch64-neon-intrinsics.c test cases not duplicated --- clang/test/CIR/CodeGen/AArch64/neon-varith.c | 239 ------------------ .../CIR/CodeGen/aarch64-neon-intrinsics.c | 238 ++++++++++------- 2 files changed, 143 insertions(+), 334 deletions(-) delete mode 100644 clang/test/CIR/CodeGen/AArch64/neon-varith.c diff --git a/clang/test/CIR/CodeGen/AArch64/neon-varith.c b/clang/test/CIR/CodeGen/AArch64/neon-varith.c deleted file mode 100644 index 9643342f093c..000000000000 --- a/clang/test/CIR/CodeGen/AArch64/neon-varith.c +++ /dev/null @@ -1,239 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-cir -target-feature +neon %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \ -// RUN: -emit-llvm -target-feature +neon %s -o %t.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s - -// This test file contains tests of aarch64 NEON vector arithmetic intrinsics. 
- -// REQUIRES: aarch64-registered-target || arm-registered-target -#include - -uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { - return vqadd_u8(a,b); -} - -// CIR-LABEL: vqadd_u8 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u8(<8 x i8>{{.*}} [[A:%.*]], <8 x i8>{{.*}} [[B:%.*]]) -// LLVM: store <8 x i8> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 -// LLVM: store <8 x i8> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <8 x i8>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <8 x i8>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[INTRN_A]], <8 x i8> [[INTRN_B]]) -// LLVM: ret <8 x i8> - -int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { - return vqadd_s8(a,b); -} - -// CIR-LABEL: vqadd_s8 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s8(<8 x i8>{{.*}} [[A:%.*]], <8 x i8>{{.*}} [[B:%.*]]) -// LLVM: store <8 x i8> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 -// LLVM: store <8 x i8> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <8 x i8> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <8 x i8>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <8 x i8>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <8 x i8> 
@llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[INTRN_A]], <8 x i8> [[INTRN_B]]) -// LLVM: ret <8 x i8> - -uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { - return vqadd_u16(a,b); -} - -// CIR-LABEL: vqadd_u16 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u16(<4 x i16>{{.*}} [[A:%.*]], <4 x i16>{{.*}} [[B:%.*]]) -// LLVM: store <4 x i16> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8 -// LLVM: store <4 x i16> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <4 x i16>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <4 x i16>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[INTRN_A]], <4 x i16> [[INTRN_B]]) -// LLVM: ret <4 x i16> - -int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { - return vqadd_s16(a,b); -} - -// CIR-LABEL: vqadd_u16 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s16(<4 x i16>{{.*}} [[A:%.*]], <4 x i16>{{.*}} [[B:%.*]]) -// LLVM: store <4 x i16> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8 -// LLVM: store <4 x i16> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <4 x i16> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <4 x i16>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <4 x i16>, ptr [[P1_ADDR]], align 8 
-// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[INTRN_A]], <4 x i16> [[INTRN_B]]) -// LLVM: ret <4 x i16> - -uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { - return vqadd_u32(a,b); -} - -// CIR-LABEL: vqadd_u32 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u32(<2 x i32>{{.*}} [[A:%.*]], <2 x i32>{{.*}} [[B:%.*]]) -// LLVM: store <2 x i32> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8 -// LLVM: store <2 x i32> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <2 x i32>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <2 x i32>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[INTRN_A]], <2 x i32> [[INTRN_B]]) -// LLVM: ret <2 x i32> - -int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { - return vqadd_s32(a,b); -} - -// CIR-LABEL: vqadd_s32 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s32(<2 x i32>{{.*}} [[A:%.*]], <2 x i32>{{.*}} [[B:%.*]]) -// LLVM: store <2 x i32> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8 -// LLVM: store <2 x i32> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <2 x i32> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <2 x i32>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = 
load <2 x i32>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[INTRN_A]], <2 x i32> [[INTRN_B]]) -// LLVM: ret <2 x i32> - -uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { - return vqadd_u64(a,b); -} - -// CIR-LABEL: vqadd_u64 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_u64(<1 x i64>{{.*}} [[A:%.*]], <1 x i64>{{.*}} [[B:%.*]]) -// LLVM: store <1 x i64> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8 -// LLVM: store <1 x i64> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <1 x i64>, ptr [[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]]) -// LLVM: ret <1 x i64> - -int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { - return vqadd_s64(a,b); -} - -// CIR-LABEL: vqadd_s64 -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : -// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// CIR: cir.return - -// LLVM: {{.*}}test_vqadd_s64(<1 x i64>{{.*}} [[A:%.*]], <1 x i64>{{.*}} [[B:%.*]]) -// LLVM: store <1 x i64> [[A]], ptr [[A_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[B]], ptr [[B_ADDR:%.*]], align 8 -// LLVM: [[TMP_A:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8 -// LLVM: [[TMP_B:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8 -// LLVM: store <1 x i64> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8 -// LLVM: store <1 x i64> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8 -// LLVM: [[INTRN_A:%.*]] = load <1 x i64>, ptr 
[[P0_ADDR]], align 8 -// LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8 -// LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]]) -// LLVM: ret <1 x i64> - -uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { - return vqrshrun_n_s16(a, 3); -} - -// CIR-LABEL: test_vqrshrun_n_s16 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s16(<8 x i16>{{.*}} [[A:%.*]]) -// LLVM: store <8 x i16> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 -// LLVM: store <8 x i16> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <8 x i16>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <8 x i16> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <8 x i16> -// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[INTRN_ARG]], i32 3) -// LLVM: ret <8 x i8> {{%.*}} - -uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { - return vqrshrun_n_s32(a, 7); -} - -// CIR-LABEL: test_vqrshrun_n_s32 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<7> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s32(<4 x i32>{{.*}} [[A:%.*]]) -// LLVM: store <4 x i32> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// LLVM: store <4 x i32> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <4 x i32>, ptr [[S0]], align 16 -// LLVM: 
[[S0_VAL_CAST:%.*]] = bitcast <4 x i32> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <4 x i32> -// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[INTRN_ARG]], i32 7) -// LLVM: ret <4 x i16> {{%.*}} - -uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { - return vqrshrun_n_s64(a, 15); -} - -// CIR-LABEL: test_vqrshrun_n_s64 -// CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<15> : !s32i -// CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector -// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : -// CIR-SAME: (!cir.vector, !s32i) -> !cir.vector - -// LLVM: {{.*}}test_vqrshrun_n_s64(<2 x i64>{{.*}} [[A:%.*]]) -// LLVM: store <2 x i64> [[A]], ptr [[A_ADDR:%.*]], align 16 -// LLVM: [[A_VAL:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 -// LLVM: store <2 x i64> [[A_VAL]], ptr [[S0:%.*]], align 16 -// LLVM: [[S0_VAL:%.*]] = load <2 x i64>, ptr [[S0]], align 16 -// LLVM: [[S0_VAL_CAST:%.*]] = bitcast <2 x i64> [[S0_VAL]] to <16 x i8> -// LLVM: [[INTRN_ARG:%.*]] = bitcast <16 x i8> [[S0_VAL_CAST]] to <2 x i64> -// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[INTRN_ARG]], i32 15) -// LLVM: ret <2 x i32> {{%.*}} diff --git a/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c index 02aa70a4d628..54520e688a59 100644 --- a/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c +++ b/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c @@ -2839,79 +2839,103 @@ // return vrhaddq_u32(v1, v2); // } -// NYI-LABEL: @test_vqadd_s8( -// NYI: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// NYI: ret <8 x i8> [[VQADD_V_I]] -// int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { -// return vqadd_s8(a, b); -// } -// NYI-LABEL: @test_vqadd_s16( -// NYI: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <4 x 
i16> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <4 x i16> [[VQADD_V2_I]] -// int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { -// return vqadd_s16(a, b); -// } +int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { + return vqadd_s8(a, b); + // CIR-LABEL: vqadd_s8 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector -// NYI-LABEL: @test_vqadd_s32( -// NYI: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <2 x i32> [[VQADD_V2_I]] -// int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { -// return vqadd_s32(a, b); -// } - -// NYI-LABEL: @test_vqadd_s64( -// NYI: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <1 x i64> [[VQADD_V2_I]] -// int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { -// return vqadd_s64(a, b); -// } - -// NYI-LABEL: @test_vqadd_u8( -// NYI: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// NYI: ret <8 x i8> [[VQADD_V_I]] -// uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { -// return vqadd_u8(a, b); -// } - -// NYI-LABEL: @test_vqadd_u16( -// NYI: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// NYI: 
[[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <4 x i16> [[VQADD_V2_I]] -// uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { -// return vqadd_u16(a, b); -// } - -// NYI-LABEL: @test_vqadd_u32( -// NYI: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <2 x i32> [[VQADD_V2_I]] -// uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { -// return vqadd_u32(a, b); -// } + // LLVM-LABEL: @test_vqadd_s8( + // LLVM: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %0, <8 x i8> %1) + // LLVM: ret <8 x i8> [[VQADD_V_I]] +} -// NYI-LABEL: @test_vqadd_u64( -// NYI: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// NYI: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// NYI: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// NYI: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// NYI: ret <1 x i64> [[VQADD_V2_I]] -// uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { -// return vqadd_u64(a, b); -// } + int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { + return vqadd_s16(a, b); + // CIR-LABEL: vqadd_s16 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_s16( + // LLVM: [[TMP0:%.*]] = bitcast <4 x i16> %0 to <8 x i8> + // LLVM: [[TMP1:%.*]] = bitcast <4 x i16> %1 to <8 x i8> + // LLVM: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %0, <4 x i16> %1) + // LLVM: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> + // LLVM: ret <4 x i16> [[VQADD_V2_I]] + } + + int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { + return vqadd_s32(a, 
b); + // CIR-LABEL: vqadd_s32 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_s32( + // LLVM: [[TMP0:%.*]] = bitcast <2 x i32> %0 to <8 x i8> + // LLVM: [[TMP1:%.*]] = bitcast <2 x i32> %1 to <8 x i8> + // LLVM: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %0, <2 x i32> %1) + // LLVM: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> + // LLVM: ret <2 x i32> [[VQADD_V2_I]] + } + + int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { + return vqadd_s64(a, b); + // CIR-LABEL: vqadd_s64 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_s64( + // LLVM: [[TMP0:%.*]] = bitcast <1 x i64> %0 to <8 x i8> + // LLVM: [[TMP1:%.*]] = bitcast <1 x i64> %1 to <8 x i8> + // LLVM: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %0, <1 x i64> %1) + // LLVM: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> + // LLVM: ret <1 x i64> [[VQADD_V2_I]] + } + + uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { + return vqadd_u8(a, b); + // CIR-LABEL: vqadd_u8 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u8( + // LLVM: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %0, <8 x i8> %1) + // LLVM: ret <8 x i8> [[VQADD_V_I]] + } + + uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { + return vqadd_u16(a, b); + // CIR-LABEL: vqadd_u16 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u16( + // LLVM: [[VQADD_V_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %0, <4 x i16> 
%1) + // LLVM: ret <4 x i16> [[VQADD_V_I]] + } + + uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { + return vqadd_u32(a, b); + // CIR-LABEL: vqadd_u32 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u32( + // LLVM: [[VQADD_V_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %0, <2 x i32> %1) + // LLVM: ret <2 x i32> [[VQADD_V_I]] + } + + uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { + return vqadd_u64(a, b); + // CIR-LABEL: vqadd_u64 + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} : + // CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector + + // LLVM-LABEL: @test_vqadd_u64( + // LLVM: [[VQADD_V_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %0, <1 x i64> %1) + // LLVM: ret <1 x i64> [[VQADD_V_I]] + } // NYI-LABEL: @test_vqaddq_s8( // NYI: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) @@ -5972,32 +5996,56 @@ // return vrshrn_high_n_u64(a, b, 19); // } -// NYI-LABEL: @test_vqrshrun_n_s16( -// NYI: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// NYI: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// NYI: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) -// NYI: ret <8 x i8> [[VQRSHRUN_N1]] -// uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { -// return vqrshrun_n_s16(a, 3); -// } +uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { + return vqrshrun_n_s16(a, 3); + // CIR-LABEL: test_vqrshrun_n_s16 + // CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<3> : !s32i + // CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : + // CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + + // LLVM-LABEL: @test_vqrshrun_n_s16( + // LLVM: 
[[TMP0:%.*]] = bitcast <8 x i16> {{%.*}} to <16 x i8> + // LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> + // LLVM: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) + // LLVM: store <8 x i8> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8 + // LLVM: [[RETVAL:%.*]] = load <8 x i8>, ptr [[RET]], align 8 + // LLVM: ret <8 x i8> [[RETVAL]] +} -// NYI-LABEL: @test_vqrshrun_n_s32( -// NYI: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// NYI: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// NYI: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) -// NYI: ret <4 x i16> [[VQRSHRUN_N1]] -// uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { -// return vqrshrun_n_s32(a, 9); -// } +uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { + return vqrshrun_n_s32(a, 9); + // CIR-LABEL: test_vqrshrun_n_s32 + // CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<9> : !s32i + // CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : + // CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + + // LLVM-LABEL: @test_vqrshrun_n_s32( + // LLVM: [[TMP0:%.*]] = bitcast <4 x i32> {{%.*}} to <16 x i8> + // LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> + // LLVM: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) + // LLVM: store <4 x i16> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8 + // LLVM: [[RETVAL:%.*]] = load <4 x i16>, ptr [[RET]], align 8 + // LLVM: ret <4 x i16> [[RETVAL]] +} -// NYI-LABEL: @test_vqrshrun_n_s64( -// NYI: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// NYI: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// NYI: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) -// NYI: ret <2 x i32> 
[[VQRSHRUN_N1]] -// uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { -// return vqrshrun_n_s64(a, 19); -// } +uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { + return vqrshrun_n_s64(a, 19); + // CIR-LABEL: test_vqrshrun_n_s64 + // CIR: [[INTRN_ARG1:%.*]] = cir.const #cir.int<19> : !s32i + // CIR: [[INTRN_ARG0:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector), !cir.vector + // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqrshrun" [[INTRN_ARG0]], [[INTRN_ARG1]] : + // CIR-SAME: (!cir.vector, !s32i) -> !cir.vector + + // LLVM-LABEL: @test_vqrshrun_n_s64( + // LLVM: [[TMP0:%.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8> + // LLVM: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> + // LLVM: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) + // LLVM: store <2 x i32> [[VQRSHRUN_N1]], ptr [[RET:%.*]], align 8 + // LLVM: [[RETVAL:%.*]] = load <2 x i32>, ptr [[RET]], align 8 + // LLVM: ret <2 x i32> [[RETVAL]] +} // NYI-LABEL: @test_vqrshrun_high_n_s16( // NYI: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -6041,7 +6089,7 @@ // NYI-LABEL: @test_vqshrn_n_s32( // NYI: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // NYI: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// NYI: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// NYI: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) // NYI: ret <4 x i16> [[VQSHRN_N1]] // int16x4_t test_vqshrn_n_s32(int32x4_t a) { // return vqshrn_n_s32(a, 9); From 780c1f3b4b4469ad04537e8015103f154fda0191 Mon Sep 17 00:00:00 2001 From: Guojin He Date: Tue, 1 Oct 2024 19:23:40 -0400 Subject: [PATCH 5/5] moving aarch64-neon-intrinsics.c --- .../CIR/CodeGen/{aarch64-neon-intrinsics.c => AArch64/neon.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clang/test/CIR/CodeGen/{aarch64-neon-intrinsics.c => AArch64/neon.c} (100%) diff --git
a/clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CIR/CodeGen/AArch64/neon.c similarity index 100% rename from clang/test/CIR/CodeGen/aarch64-neon-intrinsics.c rename to clang/test/CIR/CodeGen/AArch64/neon.c