[AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types #92725

Merged: 13 commits were merged on Jun 26, 2024. The diff below shows changes from 12 commits.
16 changes: 16 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18467,6 +18467,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16:
return emitBuiltinWithOneOverloadedType<6>(
*this, E,
BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
? Intrinsic::amdgcn_permlane16
: Intrinsic::amdgcn_permlanex16);
case AMDGPU::BI__builtin_amdgcn_permlane64:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_permlane64);
case AMDGPU::BI__builtin_amdgcn_readlane:
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_readlane);
case AMDGPU::BI__builtin_amdgcn_readfirstlane:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_readfirstlane);
case AMDGPU::BI__builtin_amdgcn_div_fixup:
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
case AMDGPU::BI__builtin_amdgcn_div_fixuph:
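For orientation, the following is a minimal sketch of what the overloaded emission amounts to for __builtin_amdgcn_permlane64; emitPermlane64Sketch is an invented helper name and the real code path is emitBuiltinWithOneOverloadedType<1> as shown above. The key point is that the intrinsic declaration is looked up with the operand's own type, so a 64-bit argument lowers to a call to @llvm.amdgcn.permlane64.i64.

```cpp
// Illustrative sketch only (assumes the internal clang/lib/CodeGen headers
// are available); not the helper used by the patch.
static llvm::Value *emitPermlane64Sketch(clang::CodeGen::CodeGenFunction &CGF,
                                         llvm::Value *Src) {
  // The declaration is instantiated for Src's type, e.g.
  // @llvm.amdgcn.permlane64.i64 when Src is an i64.
  llvm::Function *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_permlane64,
                                           {Src->getType()});
  return CGF.Builder.CreateCall(F, {Src});
}
```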
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -8,13 +8,13 @@ typedef unsigned int uint;
typedef unsigned long ulong;

// CHECK-LABEL: @test_permlane16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
}

// CHECK-LABEL: @test_permlanex16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -37,7 +37,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
}

// CHECK-LABEL: @test_permlane64(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
void test_permlane64(global uint* out, uint a) {
*out = __builtin_amdgcn_permlane64(a);
}
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}

// CHECK-LABEL: @test_readfirstlane
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}

// CHECK-LABEL: @test_readlane
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
20 changes: 20 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1208,6 +1208,26 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.

llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs an arbitrary gather-style
operation within a row (16 contiguous lanes) of the second input operand.
The third and fourth inputs must be scalar values. These are combined into
a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
<2 x half>, <2 x bfloat>, i64, double, pointers, and multiples of the 32-bit vectors.

llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs an arbitrary gather-style
operation across two rows of the second input operand (each row is 16 contiguous
lanes). The third and fourth inputs must be scalar values. These are combined
into a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
<2 x bfloat>, i64, double, pointers, and multiples of the 32-bit vectors.

llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
lanes of the input operand where the high half and low half of a wave64 are swapped.
Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, and multiples of the
32-bit vectors.
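To make the new overloading concrete, here is a hypothetical IRBuilder-based usage sketch (swizzleRow and swapWave64Halves are illustrative names, not APIs added by this patch). With double operands, the two calls below are mangled to @llvm.amdgcn.permlane16.f64 and @llvm.amdgcn.permlane64.f64.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// permlane16 operand order: <old> <src0> <src1> <src2> <fi> <bound_control>.
llvm::Value *swizzleRow(llvm::IRBuilder<> &B, llvm::Value *Old,
                        llvm::Value *Src, llvm::Value *Sel0,
                        llvm::Value *Sel1) {
  return B.CreateIntrinsic(Src->getType(), llvm::Intrinsic::amdgcn_permlane16,
                           {Old, Src, Sel0, Sel1, B.getFalse(), B.getFalse()});
}

// permlane64 swaps the low and high 32 lanes of a wave64 value.
llvm::Value *swapWave64Halves(llvm::IRBuilder<> &B, llvm::Value *Src) {
  return B.CreateIntrinsic(Src->getType(), llvm::Intrinsic::amdgcn_permlane64,
                           {Src});
}
```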

llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
support such instructions. This performs unsigned dot product
with two v2i16 operands, summed with the third i32 operand. The
30 changes: 13 additions & 17 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2042,26 +2042,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

def int_amdgcn_readfirstlane :
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
ClangBuiltin<"__builtin_amdgcn_readlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
ClangBuiltin<"__builtin_amdgcn_writelane">,
Intrinsic<[llvm_i32_ty], [
llvm_i32_ty, // uniform value to write: returned by the selected lane
llvm_i32_ty, // uniform lane select
llvm_i32_ty // returned by all lanes other than the selected one
Intrinsic<[llvm_any_ty], [
LLVMMatchType<0>, // uniform value to write: returned by the selected lane
llvm_i32_ty, // uniform lane select
LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
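With llvm_any_ty and LLVMMatchType<0>, these intrinsics become name-mangled overloads that must be instantiated for a concrete type. A small sketch under that assumption (getReadLaneDecl is an illustrative helper, not code from this patch):

```cpp
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

llvm::Function *getReadLaneDecl(llvm::Module &M, llvm::Type *Ty) {
  // For Ty == i64 this returns the declaration of @llvm.amdgcn.readlane.i64
  // with signature (i64, i32) -> i64; the lane index stays a plain i32.
  return llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::amdgcn_readlane,
                                         {Ty});
}
```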
@@ -2358,16 +2355,16 @@ def int_amdgcn_pops_exiting_wave_id :
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlane16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlanex16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

@@ -2410,8 +2407,7 @@ def int_amdgcn_image_bvh_intersect_ray :

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
ClangBuiltin<"__builtin_amdgcn_permlane64">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_add_gs_reg_rtn :
16 changes: 8 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -404,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -416,15 +416,15 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce across the upper and lower 32 lanes.
V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlane64Call, AtomicTy));
}

// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -472,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

Value *UpdateDPPCall =
@@ -514,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
Function *ReadLane =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
Function *WriteLane =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
Function *ReadLane = Intrinsic::getDeclaration(
M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
Function *WriteLane = Intrinsic::getDeclaration(
M, Intrinsic::amdgcn_writelane, B.getInt32Ty());

// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
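The CreateIntrinsic calls above switch from the overload that takes an explicit list of overload types (previously empty) to the overload that takes the desired result type, which then drives the intrinsic name mangling. As a hedged illustration of what this enables, independent of the pass itself (reduceAcrossRowPair is an invented helper, and fadd stands in for whatever binary operation the pass selected):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// Sketch: a cross-row exchange built directly on float values, with no
// bitcast through i32, thanks to the overloaded permlanex16.
llvm::Value *reduceAcrossRowPair(llvm::IRBuilder<> &B, llvm::Value *V) {
  // Each lane reads the corresponding lane of the other row in its row pair.
  llvm::Value *Swapped = B.CreateIntrinsic(
      V->getType(), llvm::Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  return B.CreateFAdd(V, Swapped);
}
```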
126 changes: 126 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5430,6 +5430,125 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
return true;
}

bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstr &MI,
Intrinsic::ID IID) const {

MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();

bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;

auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16: {
Register Src3 = MI.getOperand(5).getReg();
int64_t Src4 = MI.getOperand(6).getImm();
int64_t Src5 = MI.getOperand(7).getImm();
return LaneOp.addUse(Src1)
.addUse(Src2)
.addUse(Src3)
.addImm(Src4)
.addImm(Src5)
.getReg(0);
}
default:
llvm_unreachable("unhandled lane op");
}
};

Register DstReg = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
}
}

LLT Ty = MRI.getType(DstReg);
unsigned Size = Ty.getSizeInBits();

if (Size == 32) {
// Already legal
return true;
}

if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);

if (IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
B.buildTrunc(DstReg, LaneOpDst);
MI.eraseFromParent();
return true;
}

if (Size % 32 != 0)
return false;

LLT PartialResTy = S32;
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
switch (EltTy.getSizeInBits()) {
case 16:
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
break;
case 32:
PartialResTy = EltTy;
break;
default:
// Handle all other cases via S32 pieces.
break;
}
}

SmallVector<Register, 2> PartialRes;
unsigned NumParts = Size / 32;
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;

if (IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);

if (IID == Intrinsic::amdgcn_writelane)
Src2Parts = B.buildUnmerge(PartialResTy, Src2);

for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);

if (IsPermLane16)
Src1 = Src1Parts.getReg(i);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = Src2Parts.getReg(i);

PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}

B.buildMergeLikeInstr(DstReg, PartialRes);
MI.eraseFromParent();
return true;
}
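A hedged, non-generic sketch of the splitting strategy above for one concrete case, a 64-bit readlane (splitReadLane64 is an illustrative free function, not code from the patch):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// The 64-bit source is unmerged into two s32 pieces, each piece is read from
// the same lane via the 32-bit overload, and the results are merged back.
llvm::Register splitReadLane64(llvm::MachineIRBuilder &B, llvm::Register Dst,
                               llvm::Register Src, llvm::Register Lane) {
  const llvm::LLT S32 = llvm::LLT::scalar(32);
  auto Parts = B.buildUnmerge(S32, Src);
  llvm::SmallVector<llvm::Register, 2> Res;
  for (unsigned I = 0; I != 2; ++I)
    Res.push_back(B.buildIntrinsic(llvm::Intrinsic::amdgcn_readlane, {S32})
                      .addUse(Parts.getReg(I))
                      .addUse(Lane)
                      .getReg(0));
  B.buildMergeLikeInstr(Dst, Res);
  return Dst;
}
```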

bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -7370,6 +7489,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -210,6 +210,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;

bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
Intrinsic::ID IID) const;

bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;

bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;