
Commit 1b65742

[SeparateConstOffsetFromGEP] Reorder trivial GEP chains to separate constants (#73056)
Here, a trivial GEP chain has the form:

```
%ptr = getelementptr sameType, %base, constant
%val = getelementptr sameType, %ptr, %variable
```

That is, a one-index GEP consumes another one-index GEP with the same source and result element types, where the inner GEP uses a constant index and the outer GEP uses a variable index. Chains of this form can be reordered trivially, by simply swapping the two indices. Doing so enables better AddrMode matching for users of the final pointer produced by the GEP chain. Future patches can extend this to support non-trivial GEP chains (e.g., those with different basis types and/or multiple indices).
1 parent 3985eda commit 1b65742
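
For illustration, a hypothetical chain of this shape before and after the reorder (value names invented, not taken from the patch):

```llvm
; Before: the constant index is buried in the inner GEP.
%ptr = getelementptr float, ptr %base, i64 16
%val = getelementptr float, ptr %ptr, i64 %i

; After: swapping the indices makes the constant the outermost offset,
; so it can fold into the addressing mode of loads/stores using %val.
%ptr.new = getelementptr float, ptr %base, i64 %i
%val.new = getelementptr float, ptr %ptr.new, i64 16
```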

8 files changed: +682 −157 lines

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 70 additions & 3 deletions

```diff
@@ -391,6 +391,11 @@ class SeparateConstOffsetFromGEP {
   /// and returns true if the splitting succeeds.
   bool splitGEP(GetElementPtrInst *GEP);
 
+  /// Tries to reorder the given GEP with the GEP that produces the base if
+  /// doing so results in producing a constant offset as the outermost
+  /// index.
+  bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
+
   /// Lower a GEP with multiple indices into multiple GEPs with a single index.
   /// Function splitGEP already split the original GEP into a variadic part and
   /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -964,6 +969,66 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
   Variadic->eraseFromParent();
 }
 
+bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
+                                            TargetTransformInfo &TTI) {
+  Type *GEPType = GEP->getResultElementType();
+  // TODO: support reordering for non-trivial GEP chains
+  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
+    return false;
+
+  auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
+  if (!PtrGEP)
+    return false;
+  Type *PtrGEPType = PtrGEP->getResultElementType();
+  // TODO: support reordering for non-trivial GEP chains
+  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
+    return false;
+
+  // TODO: support reordering for non-trivial GEP chains
+  if (PtrGEPType != GEPType ||
+      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
+    return false;
+
+  bool NestedNeedsExtraction;
+  int64_t NestedByteOffset =
+      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
+  if (!NestedNeedsExtraction)
+    return false;
+
+  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
+  if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
+                                 /*BaseGV=*/nullptr, NestedByteOffset,
+                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
+    return false;
+
+  IRBuilder<> Builder(GEP);
+  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
+  bool GEPInBounds = GEP->isInBounds();
+  bool PtrGEPInBounds = PtrGEP->isInBounds();
+  bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
+  if (IsChainInBounds) {
+    auto GEPIdx = GEP->indices().begin();
+    auto KnownGEPIdx = computeKnownBits(GEPIdx->get(), *DL);
+    IsChainInBounds &= KnownGEPIdx.isNonNegative();
+    if (IsChainInBounds) {
+      auto PtrGEPIdx = PtrGEP->indices().begin();
+      auto KnownPtrGEPIdx = computeKnownBits(PtrGEPIdx->get(), *DL);
+      IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
+    }
+  }
+
+  // For trivial GEP chains, we can swap the indices.
+  auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
+                                  SmallVector<Value *, 4>(GEP->indices()));
+  cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
+  auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
+                                  SmallVector<Value *, 4>(PtrGEP->indices()));
+  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
+  GEP->replaceAllUsesWith(NewGEP);
+  RecursivelyDeleteTriviallyDeadInstructions(GEP);
+  return true;
+}
+
 bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // Skip vector GEPs.
   if (GEP->getType()->isVectorTy())
@@ -979,11 +1044,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   bool NeedsExtraction;
   int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
 
-  if (!NeedsExtraction)
-    return Changed;
-
   TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
 
+  if (!NeedsExtraction) {
+    Changed |= reorderGEP(GEP, TTI);
+    return Changed;
+  }
+
   // If LowerGEP is disabled, before really splitting the GEP, check whether the
   // backend supports the addressing mode we are about to produce. If no, this
   // splitting probably won't be beneficial.
```
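
One subtlety worth illustrating: reorderGEP only keeps `inbounds` on the new GEPs when both indices are provably non-negative (via computeKnownBits), since after the swap the intermediate pointer is formed in a different order and a negative index could step outside the object partway through the chain. A minimal sketch, with invented names:

```llvm
; %masked is provably non-negative (its high bits are known zero),
; so an inbounds chain like this stays inbounds after reordering:
%masked = and i64 %n, 1023
%inner = getelementptr inbounds i32, ptr %base, i64 8
%outer = getelementptr inbounds i32, ptr %inner, i64 %masked

; With a possibly-negative index (e.g. a raw %n), the pass may still
; reorder when the addressing mode is legal, but the new GEPs are
; emitted without inbounds.
```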

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 16 additions & 18 deletions

```diff
@@ -273,11 +273,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT: ds_write_b32 v0, v58
 ; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_16: ; %Flow43
+; CHECK-NEXT: .LBB0_16: ; %Flow45
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
 ; CHECK-NEXT: v_mov_b32_e32 v57, v0
-; CHECK-NEXT: .LBB0_17: ; %Flow44
+; CHECK-NEXT: .LBB0_17: ; %Flow46
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
 ; CHECK-NEXT: s_mov_b32 s49, exec_lo
@@ -323,11 +323,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CHECK-NEXT: ds_write_b32 v0, v57
 ; CHECK-NEXT: s_branch .LBB0_19
-; CHECK-NEXT: .LBB0_22: ; %Flow41
+; CHECK-NEXT: .LBB0_22: ; %Flow43
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_inst_prefetch 0x2
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
-; CHECK-NEXT: .LBB0_23: ; %Flow42
+; CHECK-NEXT: .LBB0_23: ; %Flow44
 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -340,7 +340,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: s_or_b32 s43, s4, s43
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s43
 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5
-; CHECK-NEXT: .LBB0_25: ; %Flow49
+; CHECK-NEXT: .LBB0_25: ; %Flow51
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
@@ -362,12 +362,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41
 ; CHECK-NEXT: s_cbranch_execz .LBB0_33
 ; CHECK-NEXT: ; %bb.26:
-; CHECK-NEXT: s_add_u32 s42, s44, 8
-; CHECK-NEXT: s_addc_u32 s43, s45, 0
-; CHECK-NEXT: s_mov_b32 s44, 0
+; CHECK-NEXT: s_mov_b32 s42, 0
 ; CHECK-NEXT: s_branch .LBB0_28
 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43
 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -383,12 +381,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
-; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42
 ; CHECK-NEXT: s_cbranch_execz .LBB0_33
 ; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
-; CHECK-NEXT: s_mov_b32 s45, exec_lo
+; CHECK-NEXT: s_mov_b32 s43, exec_lo
 ; CHECK-NEXT: ds_read_b32 v0, v0
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
@@ -397,15 +395,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
 ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
-; CHECK-NEXT: v_add_co_u32 v2, s4, s42, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s43, 0, s4
+; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
 ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
 ; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
 ; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
@@ -417,8 +415,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: s_cbranch_execz .LBB0_27
 ; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
+; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
 ; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
 ; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
```
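
The test delta shows the intended AddrMode win: the constant part of the address is no longer pre-added into an SGPR pair (the deleted s_add_u32/s_addc_u32), and instead folds into each load's immediate offset field (offset:8, offset:24). Schematically, as a hypothetical reduction rather than the actual test IR:

```llvm
; After reordering, the trailing constant offset on the global pointer
; folds into the load's immediate field instead of an SGPR add.
%p = getelementptr i8, ptr addrspace(1) %buf, i64 %idx
%q = getelementptr i8, ptr addrspace(1) %p, i64 8
%v = load <4 x i32>, ptr addrspace(1) %q, align 8
; -> global_load_dwordx4 v[...], v[...], off offset:8
```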
