Skip to content

Commit 35f7b60

Browse files
authored
[AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types (#92725)
These are incremental changes over #89217, with the core logic being the same. This patch, along with #89217 and #91190, should get us ready to enable 64-bit optimizations in the atomic optimizer.
1 parent 89d8df1 commit 35f7b60

17 files changed

+10752
-1063
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18479,6 +18479,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1847918479
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
1848018480
return Builder.CreateCall(F, Args);
1848118481
}
18482+
case AMDGPU::BI__builtin_amdgcn_permlane16:
18483+
case AMDGPU::BI__builtin_amdgcn_permlanex16:
18484+
return emitBuiltinWithOneOverloadedType<6>(
18485+
*this, E,
18486+
BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
18487+
? Intrinsic::amdgcn_permlane16
18488+
: Intrinsic::amdgcn_permlanex16);
18489+
case AMDGPU::BI__builtin_amdgcn_permlane64:
18490+
return emitBuiltinWithOneOverloadedType<1>(*this, E,
18491+
Intrinsic::amdgcn_permlane64);
1848218492
case AMDGPU::BI__builtin_amdgcn_readlane:
1848318493
return emitBuiltinWithOneOverloadedType<2>(*this, E,
1848418494
Intrinsic::amdgcn_readlane);

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ typedef unsigned int uint;
88
typedef unsigned long ulong;
99

1010
// CHECK-LABEL: @test_permlane16(
11-
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
11+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
1212
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
1313
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
1414
}
1515

1616
// CHECK-LABEL: @test_permlanex16(
17-
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
17+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
1818
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
1919
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
2020
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
3737
}
3838

3939
// CHECK-LABEL: @test_permlane64(
40-
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
40+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
4141
void test_permlane64(global uint* out, uint a) {
4242
*out = __builtin_amdgcn_permlane64(a);
4343
}

llvm/docs/AMDGPUUsage.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,26 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
12251225
reduction will be performed using default iterative strategy.
12261226
Intrinsic is currently only implemented for i32.
12271227

1228+
llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs arbitrary gather-style
1229+
operation within a row (16 contiguous lanes) of the second input operand.
1230+
The third and fourth inputs must be scalar values. These are combined into
1231+
a single 64-bit value representing lane selects used to swizzle within each
1232+
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
1233+
<2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.
1234+
1235+
llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style
1236+
operation across two rows of the second input operand (each row is 16 contiguous
1237+
lanes). The third and fourth inputs must be scalar values. These are combined
1238+
into a single 64-bit value representing lane selects used to swizzle within each
1239+
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
1240+
<2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.
1241+
1242+
llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
1243+
lanes of the input operand where the high half and low half of a wave64 are swapped.
1244+
Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
1245+
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the
1246+
32-bit vectors.
1247+
12281248
llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
12291249
support such instructions. This performs unsigned dot product
12301250
with two v2i16 operands, summed with the third i32 operand. The

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2355,16 +2355,16 @@ def int_amdgcn_pops_exiting_wave_id :
23552355
//===----------------------------------------------------------------------===//
23562356

23572357
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
2358-
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
2359-
Intrinsic<[llvm_i32_ty],
2360-
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2358+
def int_amdgcn_permlane16 :
2359+
Intrinsic<[llvm_any_ty],
2360+
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
23612361
[IntrNoMem, IntrConvergent, IntrWillReturn,
23622362
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
23632363

23642364
// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
2365-
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
2366-
Intrinsic<[llvm_i32_ty],
2367-
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2365+
def int_amdgcn_permlanex16 :
2366+
Intrinsic<[llvm_any_ty],
2367+
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
23682368
[IntrNoMem, IntrConvergent, IntrWillReturn,
23692369
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
23702370

@@ -2407,8 +2407,7 @@ def int_amdgcn_image_bvh_intersect_ray :
24072407

24082408
// llvm.amdgcn.permlane64 <src0>
24092409
def int_amdgcn_permlane64 :
2410-
ClangBuiltin<"__builtin_amdgcn_permlane64">,
2411-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
2410+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
24122411
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
24132412

24142413
def int_amdgcn_ds_add_gs_reg_rtn :

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
404404
assert(ST->hasPermLaneX16());
405405
V = B.CreateBitCast(V, IntNTy);
406406
Value *Permlanex16Call = B.CreateIntrinsic(
407-
Intrinsic::amdgcn_permlanex16, {},
407+
V->getType(), Intrinsic::amdgcn_permlanex16,
408408
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
409409
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
410410
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -416,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
416416
// Reduce across the upper and lower 32 lanes.
417417
V = B.CreateBitCast(V, IntNTy);
418418
Value *Permlane64Call =
419-
B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
419+
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
420420
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
421421
B.CreateBitCast(Permlane64Call, AtomicTy));
422422
}
@@ -472,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
472472
assert(ST->hasPermLaneX16());
473473
V = B.CreateBitCast(V, IntNTy);
474474
Value *PermX = B.CreateIntrinsic(
475-
Intrinsic::amdgcn_permlanex16, {},
475+
V->getType(), Intrinsic::amdgcn_permlanex16,
476476
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
477477

478478
Value *UpdateDPPCall =

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5438,16 +5438,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54385438
MachineIRBuilder &B = Helper.MIRBuilder;
54395439
MachineRegisterInfo &MRI = *B.getMRI();
54405440

5441-
auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
5442-
LLT VT) -> Register {
5441+
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5442+
IID == Intrinsic::amdgcn_permlanex16;
5443+
5444+
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5445+
Register Src2, LLT VT) -> Register {
54435446
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
54445447
switch (IID) {
54455448
case Intrinsic::amdgcn_readfirstlane:
5449+
case Intrinsic::amdgcn_permlane64:
54465450
return LaneOp.getReg(0);
54475451
case Intrinsic::amdgcn_readlane:
54485452
return LaneOp.addUse(Src1).getReg(0);
54495453
case Intrinsic::amdgcn_writelane:
54505454
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5455+
case Intrinsic::amdgcn_permlane16:
5456+
case Intrinsic::amdgcn_permlanex16: {
5457+
Register Src3 = MI.getOperand(5).getReg();
5458+
Register Src4 = MI.getOperand(6).getImm();
5459+
Register Src5 = MI.getOperand(7).getImm();
5460+
return LaneOp.addUse(Src1)
5461+
.addUse(Src2)
5462+
.addUse(Src3)
5463+
.addImm(Src4)
5464+
.addImm(Src5)
5465+
.getReg(0);
5466+
}
54515467
default:
54525468
llvm_unreachable("unhandled lane op");
54535469
}
@@ -5456,9 +5472,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54565472
Register DstReg = MI.getOperand(0).getReg();
54575473
Register Src0 = MI.getOperand(2).getReg();
54585474
Register Src1, Src2;
5459-
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5475+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5476+
IsPermLane16) {
54605477
Src1 = MI.getOperand(3).getReg();
5461-
if (IID == Intrinsic::amdgcn_writelane) {
5478+
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
54625479
Src2 = MI.getOperand(4).getReg();
54635480
}
54645481
}
@@ -5473,12 +5490,15 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54735490

54745491
if (Size < 32) {
54755492
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5476-
if (Src2.isValid())
5493+
5494+
if (IsPermLane16)
5495+
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5496+
5497+
if (IID == Intrinsic::amdgcn_writelane)
54775498
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
54785499

54795500
Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
54805501
B.buildTrunc(DstReg, LaneOpDst);
5481-
54825502
MI.eraseFromParent();
54835503
return true;
54845504
}
@@ -5505,15 +5525,23 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55055525
SmallVector<Register, 2> PartialRes;
55065526
unsigned NumParts = Size / 32;
55075527
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5508-
MachineInstrBuilder Src2Parts;
5528+
MachineInstrBuilder Src1Parts, Src2Parts;
5529+
5530+
if (IsPermLane16)
5531+
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
55095532

5510-
if (Src2.isValid())
5533+
if (IID == Intrinsic::amdgcn_writelane)
55115534
Src2Parts = B.buildUnmerge(PartialResTy, Src2);
55125535

55135536
for (unsigned i = 0; i < NumParts; ++i) {
55145537
Src0 = Src0Parts.getReg(i);
5515-
if (Src2.isValid())
5538+
5539+
if (IsPermLane16)
5540+
Src1 = Src1Parts.getReg(i);
5541+
5542+
if (IID == Intrinsic::amdgcn_writelane)
55165543
Src2 = Src2Parts.getReg(i);
5544+
55175545
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
55185546
}
55195547

@@ -7465,6 +7493,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
74657493
case Intrinsic::amdgcn_readlane:
74667494
case Intrinsic::amdgcn_writelane:
74677495
case Intrinsic::amdgcn_readfirstlane:
7496+
case Intrinsic::amdgcn_permlane16:
7497+
case Intrinsic::amdgcn_permlanex16:
7498+
case Intrinsic::amdgcn_permlane64:
74687499
return legalizeLaneOp(Helper, MI, IntrID);
74697500
default: {
74707501
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6119,28 +6119,38 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61196119
EVT VT = N->getValueType(0);
61206120
unsigned ValSize = VT.getSizeInBits();
61216121
unsigned IID = N->getConstantOperandVal(0);
6122+
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6123+
IID == Intrinsic::amdgcn_permlanex16;
61226124
SDLoc SL(N);
61236125
MVT IntVT = MVT::getIntegerVT(ValSize);
61246126

61256127
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
61266128
SDValue Src2, MVT ValT) -> SDValue {
61276129
SmallVector<SDValue, 8> Operands;
6128-
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
61296130
switch (IID) {
6130-
case Intrinsic::amdgcn_readfirstlane:
6131-
Operands.push_back(Src0);
6132-
break;
6131+
case Intrinsic::amdgcn_permlane16:
6132+
case Intrinsic::amdgcn_permlanex16:
6133+
Operands.push_back(N->getOperand(6));
6134+
Operands.push_back(N->getOperand(5));
6135+
Operands.push_back(N->getOperand(4));
6136+
[[fallthrough]];
6137+
case Intrinsic::amdgcn_writelane:
6138+
Operands.push_back(Src2);
6139+
[[fallthrough]];
61336140
case Intrinsic::amdgcn_readlane:
6134-
Operands.push_back(Src0);
61356141
Operands.push_back(Src1);
6136-
break;
6137-
case Intrinsic::amdgcn_writelane:
6142+
[[fallthrough]];
6143+
case Intrinsic::amdgcn_readfirstlane:
6144+
case Intrinsic::amdgcn_permlane64:
61386145
Operands.push_back(Src0);
6139-
Operands.push_back(Src1);
6140-
Operands.push_back(Src2);
61416146
break;
6147+
default:
6148+
llvm_unreachable("unhandled lane op");
61426149
}
61436150

6151+
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6152+
std::reverse(Operands.begin(), Operands.end());
6153+
61446154
if (SDNode *GL = N->getGluedNode()) {
61456155
assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
61466156
GL = GL->getOperand(0).getNode();
@@ -6153,9 +6163,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61536163

61546164
SDValue Src0 = N->getOperand(1);
61556165
SDValue Src1, Src2;
6156-
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
6166+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6167+
IsPermLane16) {
61576168
Src1 = N->getOperand(2);
6158-
if (IID == Intrinsic::amdgcn_writelane)
6169+
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
61596170
Src2 = N->getOperand(3);
61606171
}
61616172

@@ -6168,10 +6179,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61686179
bool IsFloat = VT.isFloatingPoint();
61696180
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
61706181
SL, MVT::i32);
6171-
if (Src2.getNode()) {
6182+
6183+
if (IsPermLane16) {
6184+
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6185+
SL, MVT::i32);
6186+
}
6187+
6188+
if (IID == Intrinsic::amdgcn_writelane) {
61726189
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
61736190
SL, MVT::i32);
61746191
}
6192+
61756193
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
61766194
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
61776195
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
@@ -6233,17 +6251,23 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62336251
case MVT::bf16: {
62346252
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
62356253
SmallVector<SDValue, 4> Pieces;
6254+
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
62366255
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6237-
SDValue Src0SubVec =
6238-
DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6239-
DAG.getConstant(EltIdx, SL, MVT::i32));
6256+
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6257+
DAG.getConstant(EltIdx, SL, MVT::i32));
62406258

6241-
SDValue Src2SubVec;
6242-
if (Src2)
6259+
if (IsPermLane16)
6260+
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6261+
DAG.getConstant(EltIdx, SL, MVT::i32));
6262+
6263+
if (IID == Intrinsic::amdgcn_writelane)
62436264
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
62446265
DAG.getConstant(EltIdx, SL, MVT::i32));
62456266

6246-
Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6267+
Pieces.push_back(
6268+
IsPermLane16
6269+
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6270+
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
62476271
EltIdx += 2;
62486272
}
62496273
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
@@ -6257,7 +6281,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62576281
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
62586282
Src0 = DAG.getBitcast(VecVT, Src0);
62596283

6260-
if (Src2)
6284+
if (IsPermLane16)
6285+
Src1 = DAG.getBitcast(VecVT, Src1);
6286+
6287+
if (IID == Intrinsic::amdgcn_writelane)
62616288
Src2 = DAG.getBitcast(VecVT, Src2);
62626289

62636290
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8734,6 +8761,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
87348761
case Intrinsic::amdgcn_readlane:
87358762
case Intrinsic::amdgcn_readfirstlane:
87368763
case Intrinsic::amdgcn_writelane:
8764+
case Intrinsic::amdgcn_permlane16:
8765+
case Intrinsic::amdgcn_permlanex16:
8766+
case Intrinsic::amdgcn_permlane64:
87378767
return lowerLaneOp(*this, Op.getNode(), DAG);
87388768
default:
87398769
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -732,9 +732,7 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
732732
let SubtargetPredicate = isGFX11Plus in {
733733
// Restrict src0 to be VGPR
734734
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
735-
getVOP1Pat<int_amdgcn_permlane64,
736-
VOP_MOVRELS>.ret,
737-
/*VOP1Only=*/ 1> {
735+
[], /*VOP1Only=*/ 1> {
738736
let IsInvalidSingleUseConsumer = 1;
739737
let IsInvalidSingleUseProducer = 1;
740738
}
@@ -744,6 +742,12 @@ let SubtargetPredicate = isGFX11Plus in {
744742
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
745743
} // End SubtargetPredicate = isGFX11Plus
746744

745+
foreach vt = Reg32Types.types in {
746+
def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
747+
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
748+
>;
749+
}
750+
747751
//===----------------------------------------------------------------------===//
748752
// Target-specific instruction encodings.
749753
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)