
Commit 5acddf5

[ARM] Lower non-extended small gathers via truncated gathers.
Corollary to 1113e06, this allows us to match gathers that don't produce full-vector-width results. They use an extending gather which is truncated back to the original type.
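For reference, the kind of input this now matches is a masked gather whose result is narrower than a full 128-bit MVE vector and which has no sext/zext user, as in the first test below. A minimal IR sketch (the getelementptr/gather lines are a reconstruction from the test's shape; mask and passthru elided to all-true/undef):

define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) {
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  ret <8 x i8> %gather
}

Previously the pass rejected such gathers ("extend needed but not provided") and they were expanded to scalar loads; now they are lowered to an extending gather plus a trunc back to the original type.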

File tree

3 files changed: +68 −88 lines changed


llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp

Lines changed: 42 additions & 18 deletions
@@ -488,27 +488,44 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
   // The size of the gather was already checked in isLegalTypeAndAlignment;
   // if it was not a full vector width an appropriate extend should follow.
   auto *Extend = Root;
+  bool TruncResult = false;
   if (MemoryTy->getPrimitiveSizeInBits() < 128) {
-    // Only transform gathers with exactly one use
-    if (!I->hasOneUse())
-      return nullptr;
+    if (I->hasOneUse()) {
+      // If the gather has a single extend of the correct type, use an extending
+      // gather and replace the ext. In which case the correct root to replace
+      // is not the CallInst itself, but the instruction which extends it.
+      Instruction* User = cast<Instruction>(*I->users().begin());
+      if (isa<SExtInst>(User) &&
+          User->getType()->getPrimitiveSizeInBits() == 128) {
+        LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: "
+                          << *User << "\n");
+        Extend = User;
+        ResultTy = User->getType();
+        Unsigned = 0;
+      } else if (isa<ZExtInst>(User) &&
+                 User->getType()->getPrimitiveSizeInBits() == 128) {
+        LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: "
+                          << *ResultTy << "\n");
+        Extend = User;
+        ResultTy = User->getType();
+      }
+    }
 
-    // The correct root to replace is not the CallInst itself, but the
-    // instruction which extends it
-    Extend = cast<Instruction>(*I->users().begin());
-    if (isa<SExtInst>(Extend)) {
-      Unsigned = 0;
-    } else if (!isa<ZExtInst>(Extend)) {
-      LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
-                        << "Expanding\n");
-      return nullptr;
+    // If an extend hasn't been found and the type is an integer, create an
+    // extending gather and truncate back to the original type.
+    if (ResultTy->getPrimitiveSizeInBits() < 128 &&
+        ResultTy->isIntOrIntVectorTy()) {
+      ResultTy = ResultTy->getWithNewBitWidth(
+          128 / cast<FixedVectorType>(ResultTy)->getNumElements());
+      TruncResult = true;
+      LLVM_DEBUG(dbgs() << "masked gathers: Small input type, truncing to: "
+                        << *ResultTy << "\n");
     }
-    LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
-    ResultTy = Extend->getType();
+
     // The final size of the gather must be a full vector width
     if (ResultTy->getPrimitiveSizeInBits() != 128) {
-      LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
-                        << "Expanding\n");
+      LLVM_DEBUG(dbgs() << "masked gathers: Extend needed but not provided "
+                        "from the correct type. Expanding\n");
       return nullptr;
     }
   }
@@ -522,18 +539,25 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
 
   Root = Extend;
   Value *Mask = I->getArgOperand(2);
+  Instruction *Load = nullptr;
   if (!match(Mask, m_One()))
-    return Builder.CreateIntrinsic(
+    Load = Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset_predicated,
         {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
         {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
         Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
   else
-    return Builder.CreateIntrinsic(
+    Load = Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset,
         {ResultTy, BasePtr->getType(), Offsets->getType()},
         {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
         Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
+
+  if (TruncResult) {
+    Load = TruncInst::Create(Instruction::Trunc, Load, MemoryTy);
+    Builder.Insert(Load);
+  }
+  return Load;
 }
 
 Instruction *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
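In effect, for a small integer gather like <8 x i8> the new path widens the result type to fill the 128-bit vector (128 / 8 elements = 16 bits per lane, i.e. <8 x i16>), emits the MVE extending gather at that type, and truncates back. Roughly, in IR (a sketch only: the intrinsic name mangling and the exact offset type are approximate, and the last three operands are the 8-bit memory element size, scale 0, and unsigned 1):

  %wide = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(
              i8* %base, <8 x i16> %offs, i32 8, i32 0, i32 1)
  %res = trunc <8 x i16> %wide to <8 x i8>

As the first test below shows, the trunc costs nothing in the final code: <8 x i8> is kept promoted in a 128-bit register, and vldrb.u16 already leaves each loaded byte in the bottom of its 16-bit lane.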

llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll

Lines changed: 3 additions & 27 deletions
@@ -18,33 +18,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, r12, d0
-; CHECK-NEXT:    vmov r3, lr, d1
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r4, r5, d0
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb.w r12, [r12]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    ldrb.w lr, [lr]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov.16 q0[5], r12
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], lr
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vldrb.u16 q1, [r1]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>

llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll

Lines changed: 23 additions & 43 deletions
@@ -314,15 +314,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -658,15 +652,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
@@ -897,33 +885,25 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, r12, d1
-; CHECK-NEXT:    vmov r3, lr, d0
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r4, r5, d0
-; CHECK-NEXT:    ldrb r6, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    ldrb.w r12, [r12]
-; CHECK-NEXT:    ldrb.w r2, [lr]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r1
+; CHECK-NEXT:    vldrb.u16 q0, [r1]
+; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmovlb.s8 q1, q2
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
