// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -emit-cir -target-feature +neon %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -emit-llvm -target-feature +neon %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// Testing the normal situation of the vqadd (saturating add) intrinsics.
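// As an informal illustration of the semantics under test (comment only,
// not checked by FileCheck): vqadd saturates instead of wrapping, clamping
// the result to the lane type's range. For uint8_t lanes, 250 + 10 yields
// 255 (UINT8_MAX; a plain vadd_u8 would wrap to 4), and for int8_t lanes,
// 120 + 10 yields 127 (INT8_MAX).
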
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>

uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
  return vqadd_u8(a, b);
}

// CIR-LABEL: vqadd_u8
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u8i x 8>, !cir.vector<!u8i x 8>) -> !cir.vector<!u8i x 8>
// CIR: cir.return

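// Note on the LLVM checks throughout this file: they match unoptimized
// output, in which each argument is spilled to a stack slot and reloaded,
// then copied into a second pair of slots (presumably the parameters of
// the inlined arm_neon.h wrapper, hence the P0/P1 capture names) and
// reloaded once more before the @llvm.aarch64.neon.* call.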
// LLVM: {{.*}}test_vqadd_u8(<8 x i8>{{.*}} [[A:%.*]], <8 x i8>{{.*}} [[B:%.*]])
// LLVM: store <8 x i8> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <8 x i8> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
// LLVM: store <8 x i8> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <8 x i8> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <8 x i8>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <8 x i8>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[INTRN_A]], <8 x i8> [[INTRN_B]])
// LLVM: ret <8 x i8>

int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
  return vqadd_s8(a, b);
}

// CIR-LABEL: vqadd_s8
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s8i x 8>, !cir.vector<!s8i x 8>) -> !cir.vector<!s8i x 8>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_s8(<8 x i8>{{.*}} [[A:%.*]], <8 x i8>{{.*}} [[B:%.*]])
// LLVM: store <8 x i8> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <8 x i8> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
// LLVM: store <8 x i8> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <8 x i8> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <8 x i8>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <8 x i8>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[INTRN_A]], <8 x i8> [[INTRN_B]])
// LLVM: ret <8 x i8>

uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
  return vqadd_u16(a, b);
}

// CIR-LABEL: vqadd_u16
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u16i x 4>, !cir.vector<!u16i x 4>) -> !cir.vector<!u16i x 4>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_u16(<4 x i16>{{.*}} [[A:%.*]], <4 x i16>{{.*}} [[B:%.*]])
// LLVM: store <4 x i16> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <4 x i16> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
// LLVM: store <4 x i16> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <4 x i16> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <4 x i16>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <4 x i16>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[INTRN_A]], <4 x i16> [[INTRN_B]])
// LLVM: ret <4 x i16>

int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
  return vqadd_s16(a, b);
}

// CIR-LABEL: vqadd_s16
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s16i x 4>, !cir.vector<!s16i x 4>) -> !cir.vector<!s16i x 4>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_s16(<4 x i16>{{.*}} [[A:%.*]], <4 x i16>{{.*}} [[B:%.*]])
// LLVM: store <4 x i16> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <4 x i16> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
// LLVM: store <4 x i16> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <4 x i16> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <4 x i16>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <4 x i16>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[INTRN_A]], <4 x i16> [[INTRN_B]])
// LLVM: ret <4 x i16>

uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
  return vqadd_u32(a, b);
}

// CIR-LABEL: vqadd_u32
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u32i x 2>, !cir.vector<!u32i x 2>) -> !cir.vector<!u32i x 2>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_u32(<2 x i32>{{.*}} [[A:%.*]], <2 x i32>{{.*}} [[B:%.*]])
// LLVM: store <2 x i32> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <2 x i32> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
// LLVM: store <2 x i32> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <2 x i32> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <2 x i32>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <2 x i32>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[INTRN_A]], <2 x i32> [[INTRN_B]])
// LLVM: ret <2 x i32>

int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
  return vqadd_s32(a, b);
}

// CIR-LABEL: vqadd_s32
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s32i x 2>, !cir.vector<!s32i x 2>) -> !cir.vector<!s32i x 2>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_s32(<2 x i32>{{.*}} [[A:%.*]], <2 x i32>{{.*}} [[B:%.*]])
// LLVM: store <2 x i32> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <2 x i32> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
// LLVM: store <2 x i32> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <2 x i32> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <2 x i32>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <2 x i32>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[INTRN_A]], <2 x i32> [[INTRN_B]])
// LLVM: ret <2 x i32>

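// The 64-bit variants below operate on single-lane vectors: as the checks
// show, they are still lowered through the llvm.aarch64.neon.{u,s}qadd
// intrinsics with <1 x i64> operands rather than scalar i64 operations.
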
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
  return vqadd_u64(a, b);
}

// CIR-LABEL: vqadd_u64
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.uqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u64i x 1>, !cir.vector<!u64i x 1>) -> !cir.vector<!u64i x 1>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_u64(<1 x i64>{{.*}} [[A:%.*]], <1 x i64>{{.*}} [[B:%.*]])
// LLVM: store <1 x i64> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <1 x i64> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
// LLVM: store <1 x i64> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <1 x i64> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <1 x i64>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]])
// LLVM: ret <1 x i64>

int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
  return vqadd_s64(a, b);
}

// CIR-LABEL: vqadd_s64
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.aarch64.neon.sqadd" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s64i x 1>, !cir.vector<!s64i x 1>) -> !cir.vector<!s64i x 1>
// CIR: cir.return

// LLVM: {{.*}}test_vqadd_s64(<1 x i64>{{.*}} [[A:%.*]], <1 x i64>{{.*}} [[B:%.*]])
// LLVM: store <1 x i64> [[A]], ptr [[A_ADDR:%.*]], align 8
// LLVM: store <1 x i64> [[B]], ptr [[B_ADDR:%.*]], align 8
// LLVM: [[TMP_A:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8
// LLVM: [[TMP_B:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
// LLVM: store <1 x i64> [[TMP_A]], ptr [[P0_ADDR:%.*]], align 8
// LLVM: store <1 x i64> [[TMP_B]], ptr [[P1_ADDR:%.*]], align 8
// LLVM: [[INTRN_A:%.*]] = load <1 x i64>, ptr [[P0_ADDR]], align 8
// LLVM: [[INTRN_B:%.*]] = load <1 x i64>, ptr [[P1_ADDR]], align 8
// LLVM: {{%.*}} = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[INTRN_A]], <1 x i64> [[INTRN_B]])
// LLVM: ret <1 x i64>