Commit bdc4110

[RISCV] Recurse on first operand of two operand shuffles (#79180)
This is the first step towards an alternate shuffle lowering design for the general two-vector-argument case. The goal is to leverage the existing lowering for single-vector permutes to avoid as many of the vrgathers as possible, even if we still need one for the other operand. This patch handles only the first argument, and is arguably a slightly weird half step. However, the test changes from the full two-argument recursion patch are a lot harder to reason about, so taking this half step gives much more easily reviewable changes and is thus worthwhile. I intend to post the patch for the second argument once this has landed.
1 parent 50d33c6 commit bdc4110

6 files changed, 347 insertions(+), 407 deletions(-)
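To make the intent concrete, here is a worked example assembled from the merge_slidedown test in llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll (the IR is copied from that test; the commented lowering steps are an informal reading of the updated CHECK lines, not generated output):

; A two-source shuffle whose LHS lanes form a contiguous slide.
define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
  ret <8 x i8> %res
}
; With this patch the %v lanes (source elements 3..6) are first re-expressed as
; the single-source shuffle <undef, undef, 3, 4, 5, 6, undef, undef> and handed
; back to the shuffle lowering, which matches it as vslidedown.vi v8, v8, 1
; rather than a vrgather; the %w lanes are then blended in with a single masked
; vrgather.vv, as the updated CHECK lines in the diff below show.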

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 48 additions & 44 deletions
@@ -5033,56 +5033,60 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   MVT IndexContainerVT =
       ContainerVT.changeVectorElementType(IndexVT.getScalarType());
 
-  SDValue Gather;
-  // TODO: This doesn't trigger for i64 vectors on RV32, since there we
-  // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
-  if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
-    Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG,
-                              Subtarget);
-  } else {
+  // Base case for the recursion just below - handle the worst case
+  // single source permutation. Note that all the splat variants
+  // are handled above.
+  if (V2.isUndef()) {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    // If only one index is used, we can use a "splat" vrgather.
-    // TODO: We can splat the most-common index and fix-up any stragglers, if
-    // that's beneficial.
-    if (LHSIndexCounts.size() == 1) {
-      int SplatIndex = LHSIndexCounts.begin()->getFirst();
-      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
-                           DAG.getConstant(SplatIndex, DL, XLenVT),
-                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
-    } else {
-      SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-      LHSIndices =
-          convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
-
-      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
+                                         Subtarget);
+    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+  }
+
+  // Translate the gather index we computed above (and possibly swapped)
+  // back to a shuffle mask. This step should disappear once we complete
+  // the migration to recursive design.
+  SmallVector<int> ShuffleMaskLHS;
+  ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
+  for (SDValue GatherIndex : GatherIndicesLHS) {
+    if (GatherIndex.isUndef()) {
+      ShuffleMaskLHS.push_back(-1);
+      continue;
     }
+    auto *IdxC = cast<ConstantSDNode>(GatherIndex);
+    ShuffleMaskLHS.push_back(IdxC->getZExtValue());
   }
 
-  // If a second vector operand is used by this shuffle, blend it in with an
-  // additional vrgather.
-  if (!V2.isUndef()) {
-    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+  // Recursively invoke lowering for the LHS as if there were no RHS.
+  // This allows us to leverage all of our single source permute tricks.
+  SDValue Gather =
+      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
+  Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
 
-    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-    SelectMask =
-        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+  // Blend in second vector source with an additional vrgather.
+  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
 
-    // If only one index is used, we can use a "splat" vrgather.
-    // TODO: We can splat the most-common index and fix-up any stragglers, if
-    // that's beneficial.
-    if (RHSIndexCounts.size() == 1) {
-      int SplatIndex = RHSIndexCounts.begin()->getFirst();
-      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                           DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
-                           SelectMask, VL);
-    } else {
-      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-      RHSIndices =
-          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
-                           SelectMask, VL);
-    }
+  MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+  SelectMask =
+      convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+
+  // If only one index is used, we can use a "splat" vrgather.
+  // TODO: We can splat the most-common index and fix-up any stragglers, if
+  // that's beneficial.
+  if (RHSIndexCounts.size() == 1) {
+    int SplatIndex = RHSIndexCounts.begin()->getFirst();
+    Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+                         DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
+                         SelectMask, VL);
+  } else {
+    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+    RHSIndices =
+        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+    Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
+                         SelectMask, VL);
   }
 
   return convertFromScalableVector(VT, Gather, DAG, Subtarget);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll

Lines changed: 14 additions & 27 deletions
@@ -238,39 +238,26 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
 define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-LABEL: interleave_v32f32:
 ; V128:       # %bb.0:
-; V128-NEXT:    addi sp, sp, -16
-; V128-NEXT:    .cfi_def_cfa_offset 16
-; V128-NEXT:    csrr a0, vlenb
-; V128-NEXT:    slli a0, a0, 2
-; V128-NEXT:    sub sp, sp, a0
-; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; V128-NEXT:    lui a0, %hi(.LCPI10_0)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI10_0)
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v4, (a0)
-; V128-NEXT:    lui a0, %hi(.LCPI10_1)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; V128-NEXT:    vle16.v v24, (a0)
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs4r.v v24, (a0) # Unknown-size Folded Spill
-; V128-NEXT:    lui a0, 699051
-; V128-NEXT:    addi a0, a0, -1366
-; V128-NEXT:    vmv.s.x v0, a0
-; V128-NEXT:    vrgatherei16.vv v24, v8, v4
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vl4r.v v12, (a0) # Unknown-size Folded Reload
+; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT:    vslidedown.vi v0, v8, 16
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    lui a1, %hi(.LCPI10_0)
+; V128-NEXT:    addi a1, a1, %lo(.LCPI10_0)
+; V128-NEXT:    li a2, 32
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; V128-NEXT:    vle16.v v12, (a1)
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    addi a1, a1, -1366
+; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v0, v8, v16
-; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
-; V128-NEXT:    csrr a0, vlenb
-; V128-NEXT:    slli a0, a0, 2
-; V128-NEXT:    add sp, sp, a0
-; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32f32:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

Lines changed: 28 additions & 35 deletions
@@ -188,24 +188,30 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-LABEL: interleave_v4i32_offset_1:
 ; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v10, v8, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v10, a0, v8
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT:    vid.v v10
-; V128-NEXT:    vsrl.vi v11, v10, 1
-; V128-NEXT:    vrgather.vv v10, v8, v11
+; V128-NEXT:    vid.v v8
+; V128-NEXT:    vsrl.vi v8, v8, 1
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vadd.vi v8, v11, 1
+; V128-NEXT:    vadd.vi v8, v8, 1
 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v4i32_offset_1:
 ; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v10, v8, v8
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v10, a0, v8
 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT:    vid.v v10
-; V512-NEXT:    vsrl.vi v11, v10, 1
-; V512-NEXT:    vrgather.vv v10, v8, v11
+; V512-NEXT:    vid.v v8
+; V512-NEXT:    vsrl.vi v8, v8, 1
 ; V512-NEXT:    vmv.v.i v0, 10
-; V512-NEXT:    vadd.vi v8, v11, 1
+; V512-NEXT:    vadd.vi v8, v8, 1
 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V512-NEXT:    vmv1r.v v8, v10
 ; V512-NEXT:    ret
@@ -397,39 +403,26 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
 define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-LABEL: interleave_v32i32:
 ; V128:       # %bb.0:
-; V128-NEXT:    addi sp, sp, -16
-; V128-NEXT:    .cfi_def_cfa_offset 16
-; V128-NEXT:    csrr a0, vlenb
-; V128-NEXT:    slli a0, a0, 2
-; V128-NEXT:    sub sp, sp, a0
-; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; V128-NEXT:    lui a0, %hi(.LCPI17_0)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v4, (a0)
-; V128-NEXT:    lui a0, %hi(.LCPI17_1)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
-; V128-NEXT:    vle16.v v24, (a0)
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs4r.v v24, (a0) # Unknown-size Folded Spill
-; V128-NEXT:    lui a0, 699051
-; V128-NEXT:    addi a0, a0, -1366
-; V128-NEXT:    vmv.s.x v0, a0
-; V128-NEXT:    vrgatherei16.vv v24, v8, v4
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vl4r.v v12, (a0) # Unknown-size Folded Reload
+; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT:    vslidedown.vi v0, v8, 16
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    lui a1, %hi(.LCPI17_0)
+; V128-NEXT:    addi a1, a1, %lo(.LCPI17_0)
+; V128-NEXT:    li a2, 32
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; V128-NEXT:    vle16.v v12, (a1)
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    addi a1, a1, -1366
+; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v0, v8, v16
-; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
-; V128-NEXT:    csrr a0, vlenb
-; V128-NEXT:    slli a0, a0, 2
-; V128-NEXT:    add sp, sp, a0
-; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32i32:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 16 additions & 27 deletions
@@ -612,13 +612,11 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: concat_4xi8_start_undef_at_start:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 224
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -4
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vadd.vi v10, v10, -4
+; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
   ret <8 x i8> %res
@@ -628,13 +626,11 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_end_non_contiguous:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 144
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -4
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vadd.vi v10, v10, -4
+; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
   ret <8 x i8> %res
@@ -675,13 +671,11 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vadd.vi v12, v11, 1
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    li a0, 195
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
   ret <8 x i8> %res
@@ -692,14 +686,12 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
 ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vadd.vi v12, v11, 2
-; CHECK-NEXT:    vrgather.vv v10, v8, v12
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vadd.vi v10, v10, -1
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -1
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
   ret <8 x i8> %res
@@ -710,16 +702,13 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: unmergable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vi v11, v10, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    vrgather.vv v10, v9, v12, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
   ret <8 x i8> %res
