Skip to content

Commit 734d344

Browse files
authored
[CIR][ABI][Lowering] Add CCLower support for int128 on x86_64 (#1036)
This PR adds calling convention lowering support for the int128 type on x86_64. This is a follow-up to #953.
1 parent 2eaa500 commit 734d344

File tree

4 files changed

+175
-7
lines changed

4 files changed

+175
-7
lines changed

clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRToCIRArgMapping.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class CIRToCIRArgMapping {
9797
cir_cconv_assert(AI.getCoerceToType() && "Missing coerced type!!");
9898
StructType STy = dyn_cast<StructType>(AI.getCoerceToType());
9999
if (AI.isDirect() && AI.getCanBeFlattened() && STy) {
100-
cir_cconv_unreachable("NYI");
100+
IRArgs.NumberOfArgs = STy.getNumElements();
101101
} else {
102102
IRArgs.NumberOfArgs = 1;
103103
}

clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp

+49-3
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,12 @@ LowerFunction::buildFunctionProlog(const LowerFunctionInfo &FI, FuncOp Fn,
369369

370370
cir_cconv_assert(!::cir::MissingFeatures::vectorType());
371371

372+
StructType STy = dyn_cast<StructType>(ArgI.getCoerceToType());
373+
if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
374+
STy.getNumElements() > 1) {
375+
cir_cconv_unreachable("NYI");
376+
}
377+
372378
// Allocate original argument to be "uncoerced".
373379
// FIXME(cir): We should have an alloca op builder that does not require
374380
// the pointer type to be explicitly passed.
@@ -383,10 +389,45 @@ LowerFunction::buildFunctionProlog(const LowerFunctionInfo &FI, FuncOp Fn,
383389

384390
// Fast-isel and the optimizer generally like scalar values better than
385391
// FCAs, so we flatten them if this is safe to do for this argument.
386-
StructType STy = dyn_cast<StructType>(ArgI.getCoerceToType());
387392
if (ArgI.isDirect() && ArgI.getCanBeFlattened() && STy &&
388393
STy.getNumElements() > 1) {
389-
cir_cconv_unreachable("NYI");
394+
auto ptrType = cast<PointerType>(Ptr.getType());
395+
llvm::TypeSize structSize =
396+
LM.getTypes().getDataLayout().getTypeAllocSize(STy);
397+
llvm::TypeSize ptrElementSize =
398+
LM.getTypes().getDataLayout().getTypeAllocSize(
399+
ptrType.getPointee());
400+
if (structSize.isScalable()) {
401+
cir_cconv_unreachable("NYI");
402+
} else {
403+
uint64_t srcSize = structSize.getFixedValue();
404+
uint64_t dstSize = ptrElementSize.getFixedValue();
405+
406+
Value addrToStoreInto;
407+
if (srcSize <= dstSize) {
408+
addrToStoreInto = rewriter.create<CastOp>(
409+
Ptr.getLoc(), PointerType::get(STy, ptrType.getAddrSpace()),
410+
CastKind::bitcast, Ptr);
411+
} else {
412+
cir_cconv_unreachable("NYI");
413+
}
414+
415+
assert(STy.getNumElements() == NumIRArgs);
416+
for (unsigned i = 0, e = STy.getNumElements(); i != e; ++i) {
417+
Value ai = Fn.getArgument(FirstIRArg + i);
418+
Type elementTy = STy.getMembers()[i];
419+
Value eltPtr = rewriter.create<GetMemberOp>(
420+
ai.getLoc(),
421+
PointerType::get(elementTy, ptrType.getAddrSpace()),
422+
addrToStoreInto,
423+
/*name=*/"", /*index=*/i);
424+
rewriter.create<StoreOp>(ai.getLoc(), ai, eltPtr);
425+
}
426+
427+
if (srcSize > dstSize) {
428+
cir_cconv_unreachable("NYI");
429+
}
430+
}
390431
} else {
391432
// Simple case, just do a coerced store of the argument into the alloca.
392433
cir_cconv_assert(NumIRArgs == 1);
@@ -567,8 +608,13 @@ LogicalResult LowerFunction::generateCode(FuncOp oldFn, FuncOp newFn,
567608
rewriter.inlineRegionBefore(oldFn.getBody(), newFn.getBody(),
568609
newFn.getBody().end());
569610

611+
// The block arguments of srcBlock are the old function's arguments. At this
612+
// point, all old arguments should be replaced with the lowered values.
613+
// Thus we could safely remove all the block arguments on srcBlock here.
614+
srcBlock->eraseArguments(0, srcBlock->getNumArguments());
615+
570616
// Merge entry blocks to ensure correct branching.
571-
rewriter.mergeBlocks(srcBlock, dstBlock, newFn.getArguments());
617+
rewriter.mergeBlocks(srcBlock, dstBlock);
572618

573619
// FIXME(cir): What about saving parameters for coroutines? Should we do
574620
// something about it in this pass? If the change with the calling

clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp

+71-3
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,12 @@ void X86_64ABIInfo::classify(Type Ty, uint64_t OffsetBase, Class &Lo, Class &Hi,
234234
if (/*isBuitinType=*/true) {
235235
if (isa<VoidType>(Ty)) {
236236
Current = Class::NoClass;
237-
} else if (isa<IntType>(Ty)) {
237+
} else if (auto IntTy = dyn_cast<IntType>(Ty)) {
238+
if (IntTy.getWidth() == 128) {
239+
Lo = Class::Integer;
240+
Hi = Class::Integer;
241+
return;
242+
}
238243

239244
// FIXME(cir): Clang's BuiltinType::Kind allow comparisons (GT, LT, etc).
240245
// We should implement this in CIR to simplify the conditions below.
@@ -456,6 +461,50 @@ Type X86_64ABIInfo::GetINTEGERTypeAtOffset(Type DestTy, unsigned IROffset,
456461
std::min(TySizeInBytes - SourceOffset, 8U) * 8, isSigned);
457462
}
458463

464+
/// GetX86_64ByValArgumentPair - Given a high and low type that can ideally
465+
/// be used as elements of a two register pair to pass or return, return a
466+
/// first class aggregate to represent them. For example, if the low part of
467+
/// a by-value argument should be passed as i32* and the high part as float,
468+
/// return {i32*, float}.
469+
static mlir::Type GetX86_64ByValArgumentPair(mlir::Type lo, mlir::Type hi,
470+
const ::cir::CIRDataLayout &td) {
471+
// In order to correctly satisfy the ABI, we need the high part to start
472+
// at offset 8. If the high and low parts we inferred are both 4-byte types
473+
// (e.g. i32 and i32) then the resultant struct type ({i32,i32}) won't have
474+
// the second element at offset 8. Check for this:
475+
unsigned loSize = (unsigned)td.getTypeAllocSize(lo);
476+
llvm::Align highAlign = td.getABITypeAlign(hi);
477+
unsigned highStart = llvm::alignTo(loSize, highAlign);
478+
assert(highStart != 0 && highStart <= 8 && "Invalid x86-64 argument pair!");
479+
480+
// To handle this, we have to increase the size of the low part so that the
481+
// second element will start at an 8 byte offset. We can't increase the size
482+
// of the second element because it might make us access off the end of the
483+
// struct.
484+
if (highStart != 8) {
485+
// There are usually two sorts of types the ABI generation code can produce
486+
// for the low part of a pair that aren't 8 bytes in size: half, float or
487+
// i8/i16/i32. This can also include pointers when they are 32-bit (X32 and
488+
// NaCl).
489+
// Promote these to a larger type.
490+
if (isa<FP16Type, SingleType>(lo))
491+
lo = DoubleType::get(lo.getContext());
492+
else {
493+
assert((isa<IntType, PointerType>(lo)) && "Invalid/unknown lo type");
494+
// TODO(cir): does the sign of the int64 type matter here?
495+
lo = IntType::get(lo.getContext(), 64, true);
496+
}
497+
}
498+
499+
auto result = StructType::get(lo.getContext(), {lo, hi}, /*packed=*/false,
500+
StructType::Struct);
501+
502+
// Verify that the second element is at an 8-byte offset.
503+
assert(td.getStructLayout(result)->getElementOffset(1) == 8 &&
504+
"Invalid x86-64 argument pair!");
505+
return result;
506+
}
507+
459508
::cir::ABIArgInfo X86_64ABIInfo::classifyReturnType(Type RetTy) const {
460509
// AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the
461510
// classification algorithm.
@@ -507,6 +556,12 @@ ::cir::ABIArgInfo X86_64ABIInfo::classifyReturnType(Type RetTy) const {
507556
case Class::NoClass:
508557
break;
509558

559+
case Class::Integer:
560+
HighPart = GetINTEGERTypeAtOffset(RetTy, 8, RetTy, 8);
561+
if (Lo == Class::NoClass) // Return HighPart at offset 8 in memory.
562+
return ABIArgInfo::getDirect(HighPart, 8);
563+
break;
564+
510565
default:
511566
cir_cconv_unreachable("NYI");
512567
}
@@ -515,7 +570,7 @@ ::cir::ABIArgInfo X86_64ABIInfo::classifyReturnType(Type RetTy) const {
515570
// known to pass in the high eightbyte of the result. We do this by forming
516571
// a first class struct aggregate with the high and low part: {low, high}
517572
if (HighPart)
518-
cir_cconv_unreachable("NYI");
573+
resType = GetX86_64ByValArgumentPair(resType, HighPart, getDataLayout());
519574

520575
return ABIArgInfo::getDirect(resType);
521576
}
@@ -580,12 +635,25 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(Type Ty, unsigned freeIntRegs,
580635
switch (Hi) {
581636
case Class::NoClass:
582637
break;
638+
639+
case Class::Integer:
640+
++neededInt;
641+
// Pick an 8-byte type based on the preferred type.
642+
HighPart = GetINTEGERTypeAtOffset(Ty, 8, Ty, 8);
643+
644+
if (Lo == Class::NoClass) // Pass HighPart at offset 8 in memory.
645+
return ABIArgInfo::getDirect(HighPart, 8);
646+
break;
647+
583648
default:
584649
cir_cconv_unreachable("NYI");
585650
}
586651

652+
// If a high part was specified, merge it together with the low part. It is
653+
// known to pass in the high eightbyte of the result. We do this by forming a
654+
// first class struct aggregate with the high and low part: {low, high}
587655
if (HighPart)
588-
cir_cconv_unreachable("NYI");
656+
ResType = GetX86_64ByValArgumentPair(ResType, HighPart, getDataLayout());
589657

590658
return ABIArgInfo::getDirect(ResType);
591659
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fclangir-call-conv-lowering -emit-cir-flat -mmlir --mlir-print-ir-after=cir-call-conv-lowering %s -o %t.cir
2+
// RUN: FileCheck --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fclangir-call-conv-lowering -emit-llvm %s -o %t.ll
4+
// RUN: FileCheck --input-file=%t.ll %s --check-prefix=LLVM
5+
6+
// CHECK: ![[I128_STRUCT:.+]] = !cir.struct<struct {!s64i, !s64i}>
7+
8+
// CHECK: @_Z5test1nn(%[[ARG0:.+]]: !s64i loc({{.+}}), %[[ARG1:.+]]: !s64i loc({{.+}}), %[[ARG2:.+]]: !s64i loc({{.+}}), %[[ARG3:.+]]: !s64i loc({{.+}})) -> ![[I128_STRUCT]]
9+
// LLVM: define dso_local { i64, i64 } @_Z5test1nn(i64 %[[#A_LO:]], i64 %[[#A_HI:]], i64 %[[#B_LO:]], i64 %[[#B_HI:]])
10+
__int128 test1(__int128 a, __int128 b) {
11+
// CHECK: %[[#SLOT_A:]] = cir.alloca !s128i, !cir.ptr<!s128i>
12+
// CHECK-NEXT: %[[#SLOT_A2:]] = cir.cast(bitcast, %[[#SLOT_A]] : !cir.ptr<!s128i>), !cir.ptr<![[I128_STRUCT]]>
13+
// CHECK-NEXT: %[[#SLOT_A_LO:]] = cir.get_member %[[#SLOT_A2]][0] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
14+
// CHECK-NEXT: cir.store %[[ARG0]], %[[#SLOT_A_LO]] : !s64i, !cir.ptr<!s64i>
15+
// CHECK-NEXT: %[[#SLOT_A_HI:]] = cir.get_member %[[#SLOT_A2]][1] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
16+
// CHECK-NEXT: cir.store %arg1, %[[#SLOT_A_HI]] : !s64i, !cir.ptr<!s64i>
17+
// CHECK-NEXT: %[[#SLOT_B:]] = cir.alloca !s128i, !cir.ptr<!s128i>
18+
// CHECK-NEXT: %[[#SLOT_B2:]] = cir.cast(bitcast, %[[#SLOT_B]] : !cir.ptr<!s128i>), !cir.ptr<![[I128_STRUCT]]>
19+
// CHECK-NEXT: %[[#SLOT_B_LO:]] = cir.get_member %[[#SLOT_B2]][0] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
20+
// CHECK-NEXT: cir.store %arg2, %[[#SLOT_B_LO]] : !s64i, !cir.ptr<!s64i>
21+
// CHECK-NEXT: %[[#SLOT_B_HI:]] = cir.get_member %[[#SLOT_B2]][1] {name = ""} : !cir.ptr<![[I128_STRUCT]]> -> !cir.ptr<!s64i>
22+
// CHECK-NEXT: cir.store %arg3, %[[#SLOT_B_HI]] : !s64i, !cir.ptr<!s64i>
23+
// CHECK-NEXT: %[[#SLOT_RET:]] = cir.alloca !s128i, !cir.ptr<!s128i>, ["__retval"]
24+
25+
// LLVM: %[[#A_SLOT:]] = alloca i128, i64 1, align 4
26+
// LLVM-NEXT: %[[#A_SLOT_LO:]] = getelementptr { i64, i64 }, ptr %[[#A_SLOT]], i32 0, i32 0
27+
// LLVM-NEXT: store i64 %[[#A_LO]], ptr %[[#A_SLOT_LO]], align 8
28+
// LLVM-NEXT: %[[#A_SLOT_HI:]] = getelementptr { i64, i64 }, ptr %[[#A_SLOT]], i32 0, i32 1
29+
// LLVM-NEXT: store i64 %[[#A_HI]], ptr %[[#A_SLOT_HI]], align 8
30+
// LLVM-NEXT: %[[#B_SLOT:]] = alloca i128, i64 1, align 4
31+
// LLVM-NEXT: %[[#B_SLOT_LO:]] = getelementptr { i64, i64 }, ptr %[[#B_SLOT]], i32 0, i32 0
32+
// LLVM-NEXT: store i64 %[[#B_LO]], ptr %[[#B_SLOT_LO]], align 8
33+
// LLVM-NEXT: %[[#B_SLOT_HI:]] = getelementptr { i64, i64 }, ptr %[[#B_SLOT]], i32 0, i32 1
34+
// LLVM-NEXT: store i64 %[[#B_HI]], ptr %[[#B_SLOT_HI]], align 8
35+
// LLVM-NEXT: %[[#RET_SLOT:]] = alloca i128, i64 1, align 16
36+
37+
return a + b;
38+
// CHECK: %[[#A:]] = cir.load %[[#SLOT_A]] : !cir.ptr<!s128i>, !s128i
39+
// CHECK-NEXT: %[[#B:]] = cir.load %[[#SLOT_B]] : !cir.ptr<!s128i>, !s128i
40+
// CHECK-NEXT: %[[#SUM:]] = cir.binop(add, %[[#A]], %[[#B]]) nsw : !s128i
41+
// CHECK-NEXT: cir.store %[[#SUM]], %[[#SLOT_RET]] : !s128i, !cir.ptr<!s128i>
42+
43+
// LLVM: %[[#A:]] = load i128, ptr %5, align 16
44+
// LLVM-NEXT: %[[#B:]] = load i128, ptr %8, align 16
45+
// LLVM-NEXT: %[[#SUM:]] = add nsw i128 %[[#A]], %[[#B]]
46+
// LLVM-NEXT: store i128 %[[#SUM]], ptr %[[#RET_SLOT]], align 16
47+
48+
// CHECK: %[[#SLOT_RET2:]] = cir.cast(bitcast, %[[#SLOT_RET]] : !cir.ptr<!s128i>), !cir.ptr<![[I128_STRUCT]]>
49+
// CHECK-NEXT: %[[#RET:]] = cir.load %[[#SLOT_RET2]] : !cir.ptr<![[I128_STRUCT]]>, ![[I128_STRUCT]]
50+
// CHECK-NEXT: cir.return %[[#RET]] : ![[I128_STRUCT]]
51+
52+
// LLVM: %[[#RET:]] = load { i64, i64 }, ptr %[[#RET_SLOT]], align 8
53+
// LLVM-NEXT: ret { i64, i64 } %[[#RET]]
54+
}

0 commit comments

Comments
 (0)