Skip to content

Commit f2ac265

Browse files
authored
[RISCV] Reduce the LMUL for a vrgather operation if legal (#125768)
If we're lowering a shuffle to a vrgather (or vcompress), and we know that a prefix of the operation can be done while producing the same (defined) lanes, do the operation with a narrower LMUL.
1 parent f5c4f27 commit f2ac265

File tree

3 files changed

+47
-49
lines changed

3 files changed

+47
-49
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5645,6 +5645,30 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
56455645
}
56465646
}
56475647

5648+
// If only a prefix of the source elements influences a prefix of the
5649+
// destination elements, try to see if we can reduce the required LMUL
5650+
unsigned MinVLen = Subtarget.getRealMinVLen();
5651+
unsigned MinVLMAX = MinVLen / VT.getScalarSizeInBits();
5652+
if (NumElts > MinVLMAX) {
5653+
unsigned MaxIdx = 0;
5654+
for (auto [I, M] : enumerate(Mask)) {
5655+
if (M == -1)
5656+
continue;
5657+
MaxIdx = std::max(std::max((unsigned)I, (unsigned)M), MaxIdx);
5658+
}
5659+
unsigned NewNumElts =
5660+
std::max((uint64_t)MinVLMAX, PowerOf2Ceil(MaxIdx + 1));
5661+
if (NewNumElts != NumElts) {
5662+
MVT NewVT = MVT::getVectorVT(VT.getVectorElementType(), NewNumElts);
5663+
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
5664+
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewVT, V1, ZeroIdx);
5665+
SDValue Res = DAG.getVectorShuffle(NewVT, DL, V1, DAG.getUNDEF(NewVT),
5666+
Mask.take_front(NewNumElts));
5667+
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Res,
5668+
ZeroIdx);
5669+
}
5670+
}
5671+
56485672
// Before hitting generic lowering fallbacks, try to widen the mask
56495673
// to a wider SEW.
56505674
if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
@@ -5717,9 +5741,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
57175741
SDValue Gather;
57185742
// If we have a locally repeating mask, then we can reuse the first register
57195743
// in the index register group for all registers within the source register
5720-
// group. TODO: This generalizes to m2, and m4. Also, this is currently
5721-
// picking up cases with a fully undef tail which could be more directly
5722-
// handled with fewer redundant vrgathers
5744+
// group. TODO: This generalizes to m2, and m4.
57235745
const MVT M1VT = getLMUL1VT(ContainerVT);
57245746
auto VLMAX = RISCVTargetLowering::computeVLMAXBounds(M1VT, Subtarget).first;
57255747
if (ContainerVT.bitsGT(M1VT) && isLocalRepeatingShuffle(Mask, VLMAX)) {

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 14 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,36 +1341,16 @@ define void @shuffle_i256_splat(ptr %p) nounwind {
13411341
}
13421342

13431343
define <16 x i32> @shuffle_m1_prefix(<16 x i32> %a) {
1344-
; RV32-LABEL: shuffle_m1_prefix:
1345-
; RV32: # %bb.0:
1346-
; RV32-NEXT: lui a0, %hi(.LCPI84_0)
1347-
; RV32-NEXT: addi a0, a0, %lo(.LCPI84_0)
1348-
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
1349-
; RV32-NEXT: vle16.v v16, (a0)
1350-
; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
1351-
; RV32-NEXT: vrgatherei16.vv v13, v9, v16
1352-
; RV32-NEXT: vrgatherei16.vv v12, v8, v16
1353-
; RV32-NEXT: vrgatherei16.vv v14, v10, v16
1354-
; RV32-NEXT: vrgatherei16.vv v15, v11, v16
1355-
; RV32-NEXT: vmv4r.v v8, v12
1356-
; RV32-NEXT: ret
1357-
;
1358-
; RV64-LABEL: shuffle_m1_prefix:
1359-
; RV64: # %bb.0:
1360-
; RV64-NEXT: lui a0, 131073
1361-
; RV64-NEXT: slli a0, a0, 4
1362-
; RV64-NEXT: addi a0, a0, 3
1363-
; RV64-NEXT: slli a0, a0, 16
1364-
; RV64-NEXT: addi a0, a0, 2
1365-
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1366-
; RV64-NEXT: vmv.v.x v16, a0
1367-
; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
1368-
; RV64-NEXT: vrgatherei16.vv v13, v9, v16
1369-
; RV64-NEXT: vrgatherei16.vv v12, v8, v16
1370-
; RV64-NEXT: vrgatherei16.vv v14, v10, v16
1371-
; RV64-NEXT: vrgatherei16.vv v15, v11, v16
1372-
; RV64-NEXT: vmv4r.v v8, v12
1373-
; RV64-NEXT: ret
1344+
; CHECK-LABEL: shuffle_m1_prefix:
1345+
; CHECK: # %bb.0:
1346+
; CHECK-NEXT: lui a0, 8208
1347+
; CHECK-NEXT: addi a0, a0, 770
1348+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1349+
; CHECK-NEXT: vmv.s.x v9, a0
1350+
; CHECK-NEXT: vsext.vf4 v10, v9
1351+
; CHECK-NEXT: vrgather.vv v12, v8, v10
1352+
; CHECK-NEXT: vmv4r.v v8, v12
1353+
; CHECK-NEXT: ret
13741354
%out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
13751355
ret <16 x i32> %out
13761356
}
@@ -1380,10 +1360,10 @@ define <16 x i32> @shuffle_m2_prefix(<16 x i32> %a) {
13801360
; CHECK: # %bb.0:
13811361
; CHECK-NEXT: lui a0, %hi(.LCPI85_0)
13821362
; CHECK-NEXT: addi a0, a0, %lo(.LCPI85_0)
1383-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1384-
; CHECK-NEXT: vle16.v v16, (a0)
1385-
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
1386-
; CHECK-NEXT: vmv.v.v v8, v12
1363+
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
1364+
; CHECK-NEXT: vle16.v v14, (a0)
1365+
; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
1366+
; CHECK-NEXT: vmv4r.v v8, v12
13871367
; CHECK-NEXT: ret
13881368
%out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 5, i32 2, i32 3, i32 5, i32 7, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
13891369
ret <16 x i32> %out

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -525,29 +525,25 @@ define void @vnsrl_0_i32_single_src_m8(ptr %in, ptr %out) {
525525
; V-NEXT: li a2, 64
526526
; V-NEXT: vsetvli zero, a2, e32, m8, ta, ma
527527
; V-NEXT: vle32.v v8, (a0)
528-
; V-NEXT: lui a0, 341
529-
; V-NEXT: addiw a0, a0, 1365
530-
; V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
531-
; V-NEXT: vmv.s.x v16, a0
528+
; V-NEXT: vsetivli zero, 16, e32, m2, ta, ma
529+
; V-NEXT: vnsrl.wi v16, v8, 0
532530
; V-NEXT: vsetvli zero, a2, e32, m8, ta, ma
533-
; V-NEXT: vcompress.vm v24, v8, v16
534-
; V-NEXT: vse32.v v24, (a1)
531+
; V-NEXT: vse32.v v16, (a1)
535532
; V-NEXT: ret
536533
;
537534
; ZVE32F-LABEL: vnsrl_0_i32_single_src_m8:
538535
; ZVE32F: # %bb.0: # %entry
539536
; ZVE32F-NEXT: li a2, 64
540537
; ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
541538
; ZVE32F-NEXT: vle32.v v8, (a0)
542-
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
543-
; ZVE32F-NEXT: vmv.v.i v16, 0
544539
; ZVE32F-NEXT: lui a0, 341
545540
; ZVE32F-NEXT: addi a0, a0, 1365
546-
; ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma
547-
; ZVE32F-NEXT: vmv.s.x v16, a0
541+
; ZVE32F-NEXT: vmv.s.x v12, a0
542+
; ZVE32F-NEXT: li a0, 32
543+
; ZVE32F-NEXT: vsetvli zero, a0, e32, m4, ta, ma
544+
; ZVE32F-NEXT: vcompress.vm v16, v8, v12
548545
; ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
549-
; ZVE32F-NEXT: vcompress.vm v24, v8, v16
550-
; ZVE32F-NEXT: vse32.v v24, (a1)
546+
; ZVE32F-NEXT: vse32.v v16, (a1)
551547
; ZVE32F-NEXT: ret
552548
entry:
553549
%0 = load <64 x i32>, ptr %in, align 4

0 commit comments

Comments
 (0)