[CIR][ABI][Lowering] Add CCLower support for int128 on x86_64 (#1036)

Lancern · web-flow · commit 734d3443eead · 2024-10-31T14:26:23.000-07:00
This PR adds calling convention lowering support for the int128 type on x86_64. This is a follow-up to #953.
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRToCIRArgMapping.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRToCIRArgMapping.h
@@ -97,7 +97,7 @@ class CIRToCIRArgMapping {
         cir_cconv_assert(AI.getCoerceToType() && "Missing coerced type!!");
         StructType STy = dyn_cast<StructType>(AI.getCoerceToType());
         if (AI.isDirect() && AI.getCanBeFlattened() && STy) {
-          cir_cconv_unreachable("NYI");
+          IRArgs.NumberOfArgs = STy.getNumElements();
         } else {
           IRArgs.NumberOfArgs = 1;
         }
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp
@@ -369,6 +369,12 @@ LowerFunction::buildFunctionProlog(const LowerFunctionInfo &FI, FuncOp Fn,
 
       cir_cconv_assert(!::cir::MissingFeatures::vectorType());
 
+      StructType STy = dyn_cast<StructType>(ArgI.getCoerceToType());
+      if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
+          STy.getNumElements() > 1) {
+        cir_cconv_unreachable("NYI");
+      }
+
       // Allocate original argument to be "uncoerced".
       // FIXME(cir): We should have a alloca op builder that does not required
       // the pointer type to be explicitly passed.
@@ -383,10 +389,45 @@ LowerFunction::buildFunctionProlog(const LowerFunctionInfo &FI, FuncOp Fn,
 
       // Fast-isel and the optimizer generally like scalar values better than
       // FCAs, so we flatten them if this is safe to do for this argument.
-      StructType STy = dyn_cast<StructType>(ArgI.getCoerceToType());
       if (ArgI.isDirect() && ArgI.getCanBeFlattened() && STy &&
           STy.getNumElements() > 1) {
-        cir_cconv_unreachable("NYI");
+        auto ptrType = cast<PointerType>(Ptr.getType());
+        llvm::TypeSize structSize =
+            LM.getTypes().getDataLayout().getTypeAllocSize(STy);
+        llvm::TypeSize ptrElementSize =
+            LM.getTypes().getDataLayout().getTypeAllocSize(
+                ptrType.getPointee());
+        if (structSize.isScalable()) {
+          cir_cconv_unreachable("NYI");
+        } else {
+          uint64_t srcSize = structSize.getFixedValue();
+          uint64_t dstSize = ptrElementSize.getFixedValue();
+
+          Value addrToStoreInto;
+          if (srcSize <= dstSize) {
+            addrToStoreInto = rewriter.create<CastOp>(
+                Ptr.getLoc(), PointerType::get(STy, ptrType.getAddrSpace()),
+                CastKind::bitcast, Ptr);
+          } else {
+            cir_cconv_unreachable("NYI");
+          }
+
+          assert(STy.getNumElements() == NumIRArgs);
+          for (unsigned i = 0, e = STy.getNumElements(); i != e; ++i) {
+            Value ai = Fn.getArgument(FirstIRArg + i);
+            Type elementTy = STy.getMembers()[i];
+            Value eltPtr = rewriter.create<GetMemberOp>(
+                ai.getLoc(),
+                PointerType::get(elementTy, ptrType.getAddrSpace()),
+                addrToStoreInto,
+                /*name=*/"", /*index=*/i);
+            rewriter.create<StoreOp>(ai.getLoc(), ai, eltPtr);
+          }
+
+          if (srcSize > dstSize) {
+            cir_cconv_unreachable("NYI");
+          }
+        }
       } else {
         // Simple case, just do a coerced store of the argument into the alloca.
         cir_cconv_assert(NumIRArgs == 1);
@@ -567,8 +608,13 @@ LogicalResult LowerFunction::generateCode(FuncOp oldFn, FuncOp newFn,
   rewriter.inlineRegionBefore(oldFn.getBody(), newFn.getBody(),
                               newFn.getBody().end());
 
+  // The block arguments of srcBlock are the old function's arguments. At this
+  // point, all old arguments should be replaced with the lowered values.
+  // Thus we could safely remove all the block arguments on srcBlock here.
+  srcBlock->eraseArguments(0, srcBlock->getNumArguments());
+
   // Merge entry blocks to ensure correct branching.
-  rewriter.mergeBlocks(srcBlock, dstBlock, newFn.getArguments());
+  rewriter.mergeBlocks(srcBlock, dstBlock);
 
   // FIXME(cir): What about saving parameters for corotines? Should we do
   // something about it in this pass? If the change with the calling
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
@@ -234,7 +234,12 @@ void X86_64ABIInfo::classify(Type Ty, uint64_t OffsetBase, Class &Lo, Class &Hi,
   if (/*isBuitinType=*/true) {
     if (isa<VoidType>(Ty)) {
       Current = Class::NoClass;
-    } else if (isa<IntType>(Ty)) {
+    } else if (auto IntTy = dyn_cast<IntType>(Ty)) {
+      if (IntTy.getWidth() == 128) {
+        Lo = Class::Integer;
+        Hi = Class::Integer;
+        return;
+      }
 
       // FIXME(cir): Clang's BuiltinType::Kind allow comparisons (GT, LT, etc).
       // We should implement this in CIR to simplify the conditions below.
@@ -456,6 +461,50 @@ Type X86_64ABIInfo::GetINTEGERTypeAtOffset(Type DestTy, unsigned IROffset,
                       std::min(TySizeInBytes - SourceOffset, 8U) * 8, isSigned);
 }
 
+/// GetX86_64ByValArgumentPair - Given a low and a high type that can ideally
+/// be used as the elements of a two-register pair to pass or return, return a
+/// first-class aggregate {lo, hi} representing them.  For example, if the low
+/// part of a by-value argument should be passed as i32* and the high part as
+/// float, return {i32*, float}.  `td` supplies the sizes and alignments used.
+static mlir::Type GetX86_64ByValArgumentPair(mlir::Type lo, mlir::Type hi,
+                                             const ::cir::CIRDataLayout &td) {
+  // In order to correctly satisfy the ABI, we need the high part to start
+  // at offset 8.  If the high and low parts we inferred are both 4-byte types
+  // (e.g. i32 and i32) then the resultant struct type ({i32,i32}) won't have
+  // the second element at offset 8.  Check for this:
+  unsigned loSize = (unsigned)td.getTypeAllocSize(lo);
+  llvm::Align highAlign = td.getABITypeAlign(hi);
+  unsigned highStart = llvm::alignTo(loSize, highAlign);
+  assert(highStart != 0 && highStart <= 8 && "Invalid x86-64 argument pair!");
+
+  // To handle this, we have to increase the size of the low part so that the
+  // second element will start at an 8-byte offset.  We can't increase the size
+  // of the second element because it might make us access off the end of the
+  // struct.
+  if (highStart != 8) {
+    // There are usually two sorts of types the ABI generation code can produce
+    // for the low part of a pair that aren't 8 bytes in size: half, float or
+    // i8/i16/i32.  This can also include pointers when they are 32-bit (X32 and
+    // NaCl).
+    // Promote these to a larger type (double, or a signed 64-bit integer).
+    if (isa<FP16Type, SingleType>(lo))
+      lo = DoubleType::get(lo.getContext());
+    else {
+      assert((isa<IntType, PointerType>(lo)) && "Invalid/unknown lo type");
+      // TODO(cir): does the sign of the int64 type matter here?
+      lo = IntType::get(lo.getContext(), 64, true);
+    }
+  }
+
+  auto result = StructType::get(lo.getContext(), {lo, hi}, /*packed=*/false,
+                                StructType::Struct);
+
+  // Verify that the second element really landed at an 8-byte offset.
+  assert(td.getStructLayout(result)->getElementOffset(1) == 8 &&
+         "Invalid x86-64 argument pair!");
+  return result;
+}
+
 ::cir::ABIArgInfo X86_64ABIInfo::classifyReturnType(Type RetTy) const {
   // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the
   // classification algorithm.
@@ -507,6 +556,12 @@ ::cir::ABIArgInfo X86_64ABIInfo::classifyReturnType(Type RetTy) const {
   case Class::NoClass:
     break;
 
+  case Class::Integer:
+    HighPart = GetINTEGERTypeAtOffset(RetTy, 8, RetTy, 8);
+    if (Lo == Class::NoClass) // Return HighPart at offset 8 in memory.
+      return ABIArgInfo::getDirect(HighPart, 8);
+    break;
+
   default:
     cir_cconv_unreachable("NYI");
   }
@@ -515,7 +570,7 @@ ::cir::ABIArgInfo X86_64ABIInfo::classifyReturnType(Type RetTy) const {
   // known to pass in the high eightbyte of the result.  We do this by forming
   // a first class struct aggregate with the high and low part: {low, high}
   if (HighPart)
-    cir_cconv_unreachable("NYI");
+    resType = GetX86_64ByValArgumentPair(resType, HighPart, getDataLayout());
 
   return ABIArgInfo::getDirect(resType);
 }
@@ -580,12 +635,25 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(Type Ty, unsigned freeIntRegs,
   switch (Hi) {
   case Class::NoClass:
     break;
+
+  case Class::Integer:
+    ++neededInt;
+    // Pick an 8-byte type based on the preferred type.
+    HighPart = GetINTEGERTypeAtOffset(Ty, 8, Ty, 8);
+
+    if (Lo == Class::NoClass) // Pass HighPart at offset 8 in memory.
+      return ABIArgInfo::getDirect(HighPart, 8);
+    break;
+
   default:
     cir_cconv_unreachable("NYI");
   }
 
+  // If a high part was specified, merge it together with the low part.  It is
+  // known to pass in the high eightbyte of the result.  We do this by forming a
+  // first class struct aggregate with the high and low part: {low, high}
   if (HighPart)
-    cir_cconv_unreachable("NYI");
+    ResType = GetX86_64ByValArgumentPair(ResType, HighPart, getDataLayout());
 
   return ABIArgInfo::getDirect(ResType);
 }
diff --git a/clang/test/CIR/CallConvLowering/x86_64/int128.cpp b/clang/test/CIR/CallConvLowering/x86_64/int128.cpp
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fclangir-call-conv-lowering -emit-cir-flat -mmlir --mlir-print-ir-after=cir-call-conv-lowering %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fclangir-call-conv-lowering -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=LLVM
+
+// CHECK: ![[I128_STRUCT:.+]] = !cir.struct<struct  {!s64i, !s64i}>
+// Each __int128 argument is split into two i64 values; the result is returned as a two-i64 struct.
+// CHECK: @_Z5test1nn(%[[ARG0:.+]]: !s64i loc({{.+}}), %[[ARG1:.+]]: !s64i loc({{.+}}), %[[ARG2:.+]]: !s64i loc({{.+}}), %[[ARG3:.+]]: !s64i loc({{.+}})) -> ![[I128_STRUCT]]
+// LLVM: define dso_local { i64, i64 } @_Z5test1nn(i64 %[[#A_LO:]], i64 %[[#A_HI:]], i64 %[[#B_LO:]], i64 %[[#B_HI:]])
+__int128 test1(__int128 a, __int128 b) {
+  //      CHECK: %[[#SLOT_A:]] = cir.alloca !s128i, !cir.ptr<!s128i>
+  // CHECK-NEXT: %[[#SLOT_A2:]] = cir.cast(bitcast, %[[#SLOT_A]] : !cir.ptr<!s128i>), !cir.ptr<![[I128_STRUCT]]>
+  // CHECK-NEXT: %[[#SLOT_A_LO:]] = cir.get_member %[[#SLOT_A2]][0] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
+  // CHECK-NEXT: cir.store %[[ARG0]], %[[#SLOT_A_LO]] : !s64i, !cir.ptr<!s64i>
+  // CHECK-NEXT: %[[#SLOT_A_HI:]] = cir.get_member %[[#SLOT_A2]][1] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
+  // CHECK-NEXT: cir.store %[[ARG1]], %[[#SLOT_A_HI]] : !s64i, !cir.ptr<!s64i>
+  // CHECK-NEXT: %[[#SLOT_B:]] = cir.alloca !s128i, !cir.ptr<!s128i>
+  // CHECK-NEXT: %[[#SLOT_B2:]] = cir.cast(bitcast, %[[#SLOT_B]] : !cir.ptr<!s128i>), !cir.ptr<![[I128_STRUCT]]>
+  // CHECK-NEXT: %[[#SLOT_B_LO:]] = cir.get_member %[[#SLOT_B2]][0] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
+  // CHECK-NEXT: cir.store %[[ARG2]], %[[#SLOT_B_LO]] : !s64i, !cir.ptr<!s64i>
+  // CHECK-NEXT: %[[#SLOT_B_HI:]] = cir.get_member %[[#SLOT_B2]][1] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
+  // CHECK-NEXT: cir.store %[[ARG3]], %[[#SLOT_B_HI]] : !s64i, !cir.ptr<!s64i>
+  // CHECK-NEXT: %[[#SLOT_RET:]] = cir.alloca !s128i, !cir.ptr<!s128i>, ["__retval"]
+
+  //      LLVM: %[[#A_SLOT:]] = alloca i128, i64 1, align 4
+  // LLVM-NEXT: %[[#A_SLOT_LO:]] = getelementptr { i64, i64 }, ptr %[[#A_SLOT]], i32 0, i32 0
+  // LLVM-NEXT: store i64 %[[#A_LO]], ptr %[[#A_SLOT_LO]], align 8
+  // LLVM-NEXT: %[[#A_SLOT_HI:]] = getelementptr { i64, i64 }, ptr %[[#A_SLOT]], i32 0, i32 1
+  // LLVM-NEXT: store i64 %[[#A_HI]], ptr %[[#A_SLOT_HI]], align 8
+  // LLVM-NEXT: %[[#B_SLOT:]] = alloca i128, i64 1, align 4
+  // LLVM-NEXT: %[[#B_SLOT_LO:]] = getelementptr { i64, i64 }, ptr %[[#B_SLOT]], i32 0, i32 0
+  // LLVM-NEXT: store i64 %[[#B_LO]], ptr %[[#B_SLOT_LO]], align 8
+  // LLVM-NEXT: %[[#B_SLOT_HI:]] = getelementptr { i64, i64 }, ptr %[[#B_SLOT]], i32 0, i32 1
+  // LLVM-NEXT: store i64 %[[#B_HI]], ptr %[[#B_SLOT_HI]], align 8
+  // LLVM-NEXT: %[[#RET_SLOT:]] = alloca i128, i64 1, align 16
+
+  return a + b;
+  //      CHECK: %[[#A:]] = cir.load %[[#SLOT_A]] : !cir.ptr<!s128i>, !s128i
+  // CHECK-NEXT: %[[#B:]] = cir.load %[[#SLOT_B]] : !cir.ptr<!s128i>, !s128i
+  // CHECK-NEXT: %[[#SUM:]] = cir.binop(add, %[[#A]], %[[#B]]) nsw : !s128i
+  // CHECK-NEXT: cir.store %[[#SUM]], %[[#SLOT_RET]] : !s128i, !cir.ptr<!s128i>
+
+  //      LLVM: %[[#A:]] = load i128, ptr %[[#A_SLOT]], align 16
+  // LLVM-NEXT: %[[#B:]] = load i128, ptr %[[#B_SLOT]], align 16
+  // LLVM-NEXT: %[[#SUM:]] = add nsw i128 %[[#A]], %[[#B]]
+  // LLVM-NEXT: store i128 %[[#SUM]], ptr %[[#RET_SLOT]], align 16
+
+  //      CHECK: %[[#SLOT_RET2:]] = cir.cast(bitcast, %[[#SLOT_RET]] : !cir.ptr<!s128i>), !cir.ptr<![[I128_STRUCT]]>
+  // CHECK-NEXT: %[[#RET:]] = cir.load %[[#SLOT_RET2]] : !cir.ptr<![[I128_STRUCT]]>, ![[I128_STRUCT]]
+  // CHECK-NEXT: cir.return %[[#RET]] : ![[I128_STRUCT]]
+
+  //      LLVM: %[[#RET:]] = load { i64, i64 }, ptr %[[#RET_SLOT]], align 8
+  // LLVM-NEXT: ret { i64, i64 } %[[#RET]]
+}

Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ class CIRToCIRArgMapping {`
`97`	`97`	`cir_cconv_assert(AI.getCoerceToType() && "Missing coerced type!!");`
`98`	`98`	`StructType STy = dyn_cast<StructType>(AI.getCoerceToType());`
`99`	`99`	`if (AI.isDirect() && AI.getCanBeFlattened() && STy) {`
`100`		`- cir_cconv_unreachable("NYI");`
	`100`	`+ IRArgs.NumberOfArgs = STy.getNumElements();`
`101`	`101`	`} else {`
`102`	`102`	`IRArgs.NumberOfArgs = 1;`
`103`	`103`	`}`