[AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types #92725

Merged
merged 13 commits on Jun 26, 2024
10 changes: 10 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18467,6 +18467,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16:
return emitBuiltinWithOneOverloadedType<6>(
*this, E,
BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
? Intrinsic::amdgcn_permlane16
: Intrinsic::amdgcn_permlanex16);
case AMDGPU::BI__builtin_amdgcn_permlane64:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_permlane64);
case AMDGPU::BI__builtin_amdgcn_readlane:
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_readlane);
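A rough sketch of what emitBuiltinWithOneOverloadedType<6> amounts to for the permlane16/permlanex16 builtins, assuming the helper forwards the six builtin arguments unchanged and overloads the intrinsic on the type of the first (data) argument; the function name and body below are illustrative, not the actual CGBuiltin.cpp helper:

// Illustrative sketch only, not the real helper.
static llvm::Value *emitPermLaneBuiltin(clang::CodeGen::CodeGenFunction &CGF,
                                        const clang::CallExpr *E,
                                        llvm::Intrinsic::ID IID) {
  llvm::SmallVector<llvm::Value *, 6> Args;
  for (unsigned I = 0; I != 6; ++I)
    Args.push_back(CGF.EmitScalarExpr(E->getArg(I)));
  // Overload on the type of the data operand (argument 0), so the i32 builtin
  // emits llvm.amdgcn.permlane16.i32.
  llvm::Function *F = CGF.CGM.getIntrinsic(IID, {Args[0]->getType()});
  return CGF.Builder.CreateCall(F, Args);
}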
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -8,13 +8,13 @@ typedef unsigned int uint;
typedef unsigned long ulong;

// CHECK-LABEL: @test_permlane16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
}

// CHECK-LABEL: @test_permlanex16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -37,7 +37,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
}

// CHECK-LABEL: @test_permlane64(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
void test_permlane64(global uint* out, uint a) {
*out = __builtin_amdgcn_permlane64(a);
}
20 changes: 20 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1225,6 +1225,26 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.

llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs an arbitrary
gather-style operation within a row (16 contiguous lanes) of the second input operand.
The third and fourth inputs must be scalar values; these are combined into
a single 64-bit value representing the lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
<2 x half>, <2 x bfloat>, i64, double, pointers, and other vectors whose total
size is a multiple of 32 bits.

llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs an arbitrary
gather-style operation across two rows of the second input operand (each row is 16 contiguous
lanes). The third and fourth inputs must be scalar values; these are combined
into a single 64-bit value representing the lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
<2 x bfloat>, i64, double, pointers, and other vectors whose total size is a
multiple of 32 bits.

llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation
across lanes of the input operand in which the high and low halves of a wave64 are swapped.
Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, and other vectors
whose total size is a multiple of 32 bits.

llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
support such instructions. This performs unsigned dot product
with two v2i16 operands, summed with the third i32 operand. The
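A minimal sketch of what the overloaded signatures enable at IR-building call sites, mirroring the CreateIntrinsic updates in AMDGPUAtomicOptimizer.cpp further down; the function and variable names are placeholders, and the overload type is supplied explicitly as V's type:

// Sketch: emit v_permlanex16 as a row-wide swizzle of an arbitrary 32-bit value.
llvm::Value *swizzleRows(llvm::IRBuilderBase &B, llvm::Value *V) {
  // The return/overload type comes from V; the lane-select operands and the
  // fi/bound_ctrl flags keep their fixed i32/i1 types.
  return B.CreateIntrinsic(V->getType(), llvm::Intrinsic::amdgcn_permlanex16,
                           {V, V, B.getInt32(-1), B.getInt32(-1),
                            B.getFalse(), B.getFalse()});
}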
15 changes: 7 additions & 8 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2355,16 +2355,16 @@ def int_amdgcn_pops_exiting_wave_id :
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlane16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlanex16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

@@ -2407,8 +2407,7 @@ def int_amdgcn_image_bvh_intersect_ray :

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
ClangBuiltin<"__builtin_amdgcn_permlane64">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_add_gs_reg_rtn :
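With llvm_any_ty, the intrinsics are name-mangled per overload (for example llvm.amdgcn.permlane16.i32, llvm.amdgcn.permlane16.f32, llvm.amdgcn.permlane64.v2f16), which is what the updated CHECK lines in the clang tests above show. A small sketch, assuming the standard Intrinsic::getDeclaration lookup, of requesting a particular overload:

// Sketch: fetch the permlane16 declaration overloaded on a caller-chosen type.
llvm::Function *getPermlane16Decl(llvm::Module &M, llvm::Type *Ty) {
  return llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::amdgcn_permlane16, {Ty});
}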
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -404,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -416,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce across the upper and lower 32 lanes.
V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlane64Call, AtomicTy));
}
@@ -472,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

Value *UpdateDPPCall =
49 changes: 40 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5438,16 +5438,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();

auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
LLT VT) -> Register {
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;

auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16: {
Register Src3 = MI.getOperand(5).getReg();
int64_t Src4 = MI.getOperand(6).getImm();
int64_t Src5 = MI.getOperand(7).getImm();
return LaneOp.addUse(Src1)
.addUse(Src2)
.addUse(Src3)
.addImm(Src4)
.addImm(Src5)
.getReg(0);
}
default:
llvm_unreachable("unhandled lane op");
}
@@ -5456,9 +5472,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register DstReg = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
}
}
@@ -5473,12 +5490,15 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,

if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
if (Src2.isValid())

if (IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
B.buildTrunc(DstReg, LaneOpDst);

MI.eraseFromParent();
return true;
}
@@ -5505,15 +5525,23 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
SmallVector<Register, 2> PartialRes;
unsigned NumParts = Size / 32;
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src2Parts;
MachineInstrBuilder Src1Parts, Src2Parts;

if (IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);

if (Src2.isValid())
if (IID == Intrinsic::amdgcn_writelane)
Src2Parts = B.buildUnmerge(PartialResTy, Src2);

for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
if (Src2.isValid())

if (IsPermLane16)
Src1 = Src1Parts.getReg(i);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = Src2Parts.getReg(i);

PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}

@@ -7465,6 +7493,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
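For types wider than 32 bits, the GlobalISel path above unmerges the operands into 32-bit pieces, emits one lane op per piece, and merges the partial results back; for permlane16/permlanex16 the "old" operand (Src1) is split alongside the data operand, while the two lane selects and the two immediates are passed through as-is. A condensed, illustrative sketch of that shape for a 64-bit value (the helper name, the fixed two-piece split, and the argument layout are assumptions for illustration, not the actual code):

// Sketch: 64-bit permlane16 split into two 32-bit intrinsic calls.
// Assumes the usual LLVM using-directives are in effect.
void buildPermlane16For64Bit(MachineIRBuilder &B, Register Dst, Register Src0,
                             Register Src1, Register LaneSel0,
                             Register LaneSel1, int64_t FI, int64_t BoundCtrl) {
  const LLT S32 = LLT::scalar(32);
  auto Src0Parts = B.buildUnmerge(S32, Src0); // data value
  auto Src1Parts = B.buildUnmerge(S32, Src1); // "old" value
  SmallVector<Register, 2> Parts;
  for (unsigned I = 0; I != 2; ++I)
    Parts.push_back(B.buildIntrinsic(Intrinsic::amdgcn_permlane16, {S32})
                        .addUse(Src0Parts.getReg(I))
                        .addUse(Src1Parts.getReg(I))
                        .addUse(LaneSel0)
                        .addUse(LaneSel1)
                        .addImm(FI)
                        .addImm(BoundCtrl)
                        .getReg(0));
  B.buildMergeLikeInstr(Dst, Parts);
}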
68 changes: 49 additions & 19 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6103,28 +6103,38 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);

auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
SDValue Src2, MVT ValT) -> SDValue {
SmallVector<SDValue, 8> Operands;
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
Operands.push_back(Src0);
break;
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
Operands.push_back(N->getOperand(6));
Operands.push_back(N->getOperand(5));
Operands.push_back(N->getOperand(4));
[[fallthrough]];
case Intrinsic::amdgcn_writelane:
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
Operands.push_back(Src0);
Operands.push_back(Src1);
break;
case Intrinsic::amdgcn_writelane:
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
Operands.push_back(Src0);
Operands.push_back(Src1);
Operands.push_back(Src2);
break;
default:
llvm_unreachable("unhandled lane op");
}

Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
std::reverse(Operands.begin(), Operands.end());

if (SDNode *GL = N->getGluedNode()) {
assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
GL = GL->getOperand(0).getNode();
@@ -6137,9 +6147,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,

SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane)
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
}

@@ -6152,10 +6163,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
bool IsFloat = VT.isFloatingPoint();
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
if (Src2.getNode()) {

if (IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}

if (IID == Intrinsic::amdgcn_writelane) {
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
SL, MVT::i32);
}

SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
@@ -6217,17 +6235,23 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
case MVT::bf16: {
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
SmallVector<SDValue, 4> Pieces;
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
SDValue Src0SubVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));

SDValue Src2SubVec;
if (Src2)
if (IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));

if (IID == Intrinsic::amdgcn_writelane)
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
DAG.getConstant(EltIdx, SL, MVT::i32));

Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
Pieces.push_back(
IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
}
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
Expand All @@ -6241,7 +6265,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);

if (Src2)
if (IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = DAG.getBitcast(VecVT, Src2);

SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8718,6 +8745,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -732,9 +732,7 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
getVOP1Pat<int_amdgcn_permlane64,
VOP_MOVRELS>.ret,
/*VOP1Only=*/ 1> {
[], /*VOP1Only=*/ 1> {
let IsInvalidSingleUseConsumer = 1;
let IsInvalidSingleUseProducer = 1;
}
@@ -744,6 +742,12 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
} // End SubtargetPredicate = isGFX11Plus

foreach vt = Reg32Types.types in {
def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
>;
}

//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//