Skip to content

Commit 1113e06

Browse files
committed
[ARM] Extend narrow values to allow using truncating scatters
As a minor adjustment to the existing lowering of offset scatters, this extends any smaller-than-legal integer vector (fewer than 128 bits) into a full 128-bit vector using a zext, so that the truncating scatters can be used. Due to the way MVE legalizes vectors, this should be cheap in most situations, and it will prevent the vector from being scalarized.
Differential Revision: https://reviews.llvm.org/D103704
1 parent 9f5f917 commit 1113e06

File tree

4 files changed

+33
-60
lines changed

4 files changed

+33
-60
lines changed

llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,19 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
639639
InputTy = PreTruncTy;
640640
}
641641
}
642+
bool ExtendInput = false;
643+
if (InputTy->getPrimitiveSizeInBits() < 128 &&
644+
InputTy->isIntOrIntVectorTy()) {
645+
// If we can't find a trunc to incorporate into the instruction, create an
646+
// implicit one with a zext, so that we can still create a scatter. We know
647+
// that the input type is 4x/8x/16x and of type i8/i16/i32, so any type
648+
// smaller than 128 bits will divide evenly into a 128bit vector.
649+
InputTy = InputTy->getWithNewBitWidth(
650+
128 / cast<FixedVectorType>(InputTy)->getNumElements());
651+
ExtendInput = true;
652+
LLVM_DEBUG(dbgs() << "masked scatters: Small input type, will extend:\n"
653+
<< *Input << "\n");
654+
}
642655
if (InputTy->getPrimitiveSizeInBits() != 128) {
643656
LLVM_DEBUG(dbgs() << "masked scatters: cannot create scatters for "
644657
"non-standard input types. Expanding.\n");
@@ -652,6 +665,8 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
652665
if (!BasePtr)
653666
return nullptr;
654667

668+
if (ExtendInput)
669+
Input = Builder.CreateZExt(Input, InputTy);
655670
if (!match(Mask, m_One()))
656671
return Builder.CreateIntrinsic(
657672
Intrinsic::arm_mve_vstr_scatter_offset_predicated,

llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -361,19 +361,11 @@ entry:
361361
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
362362
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
363363
; CHECK: @ %bb.0: @ %entry
364-
; CHECK-NEXT: .save {r4, r5, r7, lr}
365-
; CHECK-NEXT: push {r4, r5, r7, lr}
366-
; CHECK-NEXT: vldrb.s32 q1, [r1]
367-
; CHECK-NEXT: vmov r1, r3, d0
368-
; CHECK-NEXT: vmov r4, r5, d1
369-
; CHECK-NEXT: vadd.i32 q1, q1, r0
370-
; CHECK-NEXT: vmov r0, r12, d2
371-
; CHECK-NEXT: vmov r2, lr, d3
372-
; CHECK-NEXT: strb r1, [r0]
373-
; CHECK-NEXT: strb.w r3, [r12]
374-
; CHECK-NEXT: strb r4, [r2]
375-
; CHECK-NEXT: strb.w r5, [lr]
376-
; CHECK-NEXT: pop {r4, r5, r7, pc}
364+
; CHECK-NEXT: vmov.i32 q1, #0xff
365+
; CHECK-NEXT: vldrb.s32 q2, [r1]
366+
; CHECK-NEXT: vand q0, q0, q1
367+
; CHECK-NEXT: vstrb.32 q0, [r0, q2]
368+
; CHECK-NEXT: bx lr
377369
entry:
378370
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
379371
%offs.sext = sext <4 x i8> %offs to <4 x i32>
@@ -386,19 +378,11 @@ entry:
386378
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
387379
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
388380
; CHECK: @ %bb.0: @ %entry
389-
; CHECK-NEXT: .save {r4, r5, r7, lr}
390-
; CHECK-NEXT: push {r4, r5, r7, lr}
391-
; CHECK-NEXT: vldrb.u32 q1, [r1]
392-
; CHECK-NEXT: vmov r1, r3, d0
393-
; CHECK-NEXT: vmov r4, r5, d1
394-
; CHECK-NEXT: vadd.i32 q1, q1, r0
395-
; CHECK-NEXT: vmov r0, r12, d2
396-
; CHECK-NEXT: vmov r2, lr, d3
397-
; CHECK-NEXT: strb r1, [r0]
398-
; CHECK-NEXT: strb.w r3, [r12]
399-
; CHECK-NEXT: strb r4, [r2]
400-
; CHECK-NEXT: strb.w r5, [lr]
401-
; CHECK-NEXT: pop {r4, r5, r7, pc}
381+
; CHECK-NEXT: vmov.i32 q1, #0xff
382+
; CHECK-NEXT: vldrb.u32 q2, [r1]
383+
; CHECK-NEXT: vand q0, q0, q1
384+
; CHECK-NEXT: vstrb.32 q0, [r0, q2]
385+
; CHECK-NEXT: bx lr
402386
entry:
403387
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
404388
%offs.zext = zext <4 x i8> %offs to <4 x i32>

llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,37 +16,13 @@ entry:
1616
ret void
1717
}
1818

19-
; Expanded ?
2019
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
2120
; CHECK-LABEL: unscaled_v8i8_i8:
2221
; CHECK: @ %bb.0: @ %entry
23-
; CHECK-NEXT: .save {r4, r5, r6, lr}
24-
; CHECK-NEXT: push {r4, r5, r6, lr}
25-
; CHECK-NEXT: vldrb.u32 q1, [r1]
26-
; CHECK-NEXT: vmov.u16 r6, q0[0]
27-
; CHECK-NEXT: vadd.i32 q1, q1, r0
28-
; CHECK-NEXT: vmov r2, r3, d2
29-
; CHECK-NEXT: vmov r12, lr, d3
30-
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
31-
; CHECK-NEXT: vadd.i32 q1, q1, r0
32-
; CHECK-NEXT: vmov r0, r1, d2
33-
; CHECK-NEXT: vmov r4, r5, d3
34-
; CHECK-NEXT: strb r6, [r2]
35-
; CHECK-NEXT: vmov.u16 r2, q0[1]
36-
; CHECK-NEXT: strb r2, [r3]
37-
; CHECK-NEXT: vmov.u16 r2, q0[2]
38-
; CHECK-NEXT: strb.w r2, [r12]
39-
; CHECK-NEXT: vmov.u16 r2, q0[3]
40-
; CHECK-NEXT: strb.w r2, [lr]
41-
; CHECK-NEXT: vmov.u16 r2, q0[4]
42-
; CHECK-NEXT: strb r2, [r0]
43-
; CHECK-NEXT: vmov.u16 r0, q0[5]
44-
; CHECK-NEXT: strb r0, [r1]
45-
; CHECK-NEXT: vmov.u16 r0, q0[6]
46-
; CHECK-NEXT: strb r0, [r4]
47-
; CHECK-NEXT: vmov.u16 r0, q0[7]
48-
; CHECK-NEXT: strb r0, [r5]
49-
; CHECK-NEXT: pop {r4, r5, r6, pc}
22+
; CHECK-NEXT: vldrb.u16 q1, [r1]
23+
; CHECK-NEXT: vmovlb.u8 q0, q0
24+
; CHECK-NEXT: vstrb.16 q0, [r0, q1]
25+
; CHECK-NEXT: bx lr
5026
entry:
5127
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
5228
%offs.zext = zext <8 x i8> %offs to <8 x i32>

llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -245,12 +245,10 @@ entry:
245245
define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
246246
; CHECK-LABEL: ptr_v4i16_dup:
247247
; CHECK: @ %bb.0: @ %entry
248-
; CHECK-NEXT: vmov r1, r2, d0
249-
; CHECK-NEXT: vmov r3, r12, d1
250-
; CHECK-NEXT: strh r0, [r1]
251-
; CHECK-NEXT: strh r0, [r2]
252-
; CHECK-NEXT: strh r0, [r3]
253-
; CHECK-NEXT: strh.w r0, [r12]
248+
; CHECK-NEXT: vdup.32 q1, r0
249+
; CHECK-NEXT: movs r1, #0
250+
; CHECK-NEXT: vmovlb.u16 q1, q1
251+
; CHECK-NEXT: vstrh.32 q1, [r1, q0]
254252
; CHECK-NEXT: bx lr
255253
entry:
256254
%ext = trunc i32 %v to i16

0 commit comments

Comments
 (0)