diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b41e2f40dc72f..cdc1cc3b96507 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4650,6 +4650,85 @@ static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
   return DAG.getBitcast(VT, Rotate);
 }
 
+// If compiling with an exactly known VLEN, see if we can split a
+// shuffle on m2 or larger into a small number of m1 sized shuffles
+// which write each destination register exactly once.
+static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
+                                            SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget) {
+  SDLoc DL(SVN);
+  MVT VT = SVN->getSimpleValueType(0);
+  SDValue V1 = SVN->getOperand(0);
+  SDValue V2 = SVN->getOperand(1);
+  ArrayRef<int> Mask = SVN->getMask();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // If we don't know exact data layout, not much we can do. If this
+  // is already m1 or smaller, no point in splitting further.
+  const unsigned MinVLen = Subtarget.getRealMinVLen();
+  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+  if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen)
+    return SDValue();
+
+  MVT ElemVT = VT.getVectorElementType();
+  unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+  unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
+
+  SmallVector<std::pair<int, SmallVector<int>>>
+    OutMasks(VRegsPerSrc, {-1, {}});
+
+  // Check if our mask can be done as a 1-to-1 mapping from source
+  // to destination registers in the group without needing to
+  // write each destination more than once.
+  for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
+    int DstVecIdx = DstIdx / ElemsPerVReg;
+    int DstSubIdx = DstIdx % ElemsPerVReg;
+    int SrcIdx = Mask[DstIdx];
+    if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
+      continue;
+    int SrcVecIdx = SrcIdx / ElemsPerVReg;
+    int SrcSubIdx = SrcIdx % ElemsPerVReg;
+    if (OutMasks[DstVecIdx].first == -1)
+      OutMasks[DstVecIdx].first = SrcVecIdx;
+    if (OutMasks[DstVecIdx].first != SrcVecIdx)
+      // Note: This case could easily be handled by keeping track of a chain
+      // of source values and generating two element shuffles below. This is
+      // less an implementation question, and more a profitability one.
+      return SDValue();
+
+    OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
+    OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
+  }
+
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+  MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
+  MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
+  assert(M1VT == getLMUL1VT(M1VT));
+  unsigned NumOpElts = M1VT.getVectorMinNumElements();
+  SDValue Vec = DAG.getUNDEF(ContainerVT);
+  // The following semantically builds up a fixed length concat_vector
+  // of the component shuffle_vectors. We eagerly lower to scalable here
+  // to avoid DAG combining it back to a large shuffle_vector again.
+  V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+  for (unsigned DstVecIdx = 0; DstVecIdx < OutMasks.size(); DstVecIdx++) {
+    auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
+    if (SrcVecIdx == -1)
+      continue;
+    unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
+    SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+                                 DAG.getVectorIdxConstant(ExtractIdx, DL));
+    SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
+    SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
+    SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
+    unsigned InsertIdx = DstVecIdx * NumOpElts;
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
+                      DAG.getVectorIdxConstant(InsertIdx, DL));
+  }
+  return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
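(Illustrative aside, not part of the patch: the splitting decision above can be modeled as a small standalone program. The splitMask helper and its plain std::vector types below are hypothetical stand-ins for the OutMasks bookkeeping; the sketch assumes VLEN=128, i.e. two i64 elements per vector register, and shows why a mask like <0,0,2,2> splits cleanly while <0,2,1,3> does not.)

// Standalone illustration only: decompose a fixed-length shuffle mask into
// one (source register, sub-mask) pair per destination register, mirroring
// the check performed by lowerShuffleViaVRegSplitting.
#include <cstdio>
#include <optional>
#include <utility>
#include <vector>

using SubShuffle = std::pair<int, std::vector<int>>; // {SrcVecIdx, SrcSubMask}

// Returns one entry per destination register, or std::nullopt if some
// destination register would need elements from two source registers.
static std::optional<std::vector<SubShuffle>>
splitMask(const std::vector<int> &Mask, unsigned ElemsPerVReg) {
  unsigned NumElts = Mask.size();
  unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
  std::vector<SubShuffle> Out(VRegsPerSrc, {-1, {}});
  for (unsigned DstIdx = 0; DstIdx < NumElts; DstIdx++) {
    int SrcIdx = Mask[DstIdx];
    if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
      continue; // undef element, any source register will do
    unsigned DstVecIdx = DstIdx / ElemsPerVReg;
    int SrcVecIdx = SrcIdx / ElemsPerVReg;
    if (Out[DstVecIdx].first == -1)
      Out[DstVecIdx].first = SrcVecIdx;
    if (Out[DstVecIdx].first != SrcVecIdx)
      return std::nullopt; // destination would need two source registers
    Out[DstVecIdx].second.resize(ElemsPerVReg, -1);
    Out[DstVecIdx].second[DstIdx % ElemsPerVReg] = SrcIdx % ElemsPerVReg;
  }
  return Out;
}

int main() {
  // <4 x i64> with VLEN=128 => 2 elements per vector register.
  // Mask <0,0,2,2> (m2_splat_in_chunks): dst reg 0 reads only src reg 0 and
  // dst reg 1 reads only src reg 1, so the shuffle splits cleanly.
  auto Res = splitMask({0, 0, 2, 2}, /*ElemsPerVReg=*/2);
  if (Res)
    for (unsigned I = 0; I < Res->size(); I++)
      printf("dst vreg %u <- src vreg %d\n", I, (*Res)[I].first);

  // Mask <0,2,1,3> mixes both source registers into each destination
  // register, so the helper bails out and the generic lowering is used.
  printf("splittable: %d\n", splitMask({0, 2, 1, 3}, 2).has_value() ? 1 : 0);
  return 0;
}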
@@ -4757,6 +4836,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  // For exact VLEN m2 or greater, try to split to m1 operations if we
+  // can split cleanly.
+  if (SDValue V = lowerShuffleViaVRegSplitting(SVN, DAG, Subtarget))
+    return V;
+
   ArrayRef<int> Mask = SVN->getMask();
 
   if (SDValue V =
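(Illustrative aside, not part of the patch: the rebuild loop's extract/shuffle/insert sequence can likewise be modeled at the element level. applySplit and the plain std::vector "registers" below are hypothetical stand-ins for the scalable EXTRACT_SUBVECTOR / per-register shuffle / INSERT_SUBVECTOR nodes; the example reuses the m2_splat_two_source mask <0,0,7,7> from the test update below, again assuming two elements per register.)

// Standalone model only: for each destination m1 register, take one m1 chunk
// of V1 or V2, apply the per-register sub-mask, and drop the result into the
// destination slot, as the loop in lowerShuffleViaVRegSplitting does.
#include <cstdio>
#include <utility>
#include <vector>

using SubShuffle = std::pair<int, std::vector<int>>; // {SrcVecIdx, SrcSubMask}

static std::vector<long>
applySplit(const std::vector<long> &V1, const std::vector<long> &V2,
           const std::vector<SubShuffle> &OutMasks, unsigned ElemsPerVReg) {
  unsigned VRegsPerSrc = V1.size() / ElemsPerVReg;
  std::vector<long> Result(V1.size(), 0); // 0 stands in for undef
  for (unsigned DstVecIdx = 0; DstVecIdx < OutMasks.size(); DstVecIdx++) {
    const auto &[SrcVecIdx, SubMask] = OutMasks[DstVecIdx];
    if (SrcVecIdx == -1)
      continue; // destination register is entirely undef
    const std::vector<long> &Src =
        (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
    unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * ElemsPerVReg;
    for (unsigned I = 0; I < ElemsPerVReg; I++)
      if (SubMask[I] >= 0)
        Result[DstVecIdx * ElemsPerVReg + I] = Src[ExtractIdx + SubMask[I]];
  }
  return Result;
}

int main() {
  // m2_splat_two_source with VLEN=128: mask <0,0,7,7> splits into
  // dst reg 0 <- splat of element 0 of V1's low register, and
  // dst reg 1 <- splat of element 1 of V2's high register.
  std::vector<long> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  std::vector<SubShuffle> OutMasks = {{0, {0, 0}}, {3, {1, 1}}};
  for (long E : applySplit(V1, V2, OutMasks, /*ElemsPerVReg=*/2))
    printf("%ld ", E); // expect: 10 10 23 23
  printf("\n");
  return 0;
}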
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index b922ecdb8a2c2..f53b51e05c572 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -16,14 +16,10 @@ define <4 x i64> @m2_splat_0(<4 x i64> %v1) vscale_range(2,2) {
 define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_in_chunks:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 8224
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v10
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   ret <4 x i64> %res
@@ -32,12 +28,12 @@ define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) {
 define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m4_splat_in_chunks:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI2_0)
-; CHECK-NEXT:    vl1re16.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v12, v8, 0
+; CHECK-NEXT:    vrgather.vi v13, v9, 0
+; CHECK-NEXT:    vrgather.vi v14, v10, 0
+; CHECK-NEXT:    vrgather.vi v15, v11, 1
+; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 7, i32 7>
   ret <8 x i64> %res
@@ -47,14 +43,10 @@ define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) {
 define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_with_tail:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 12320
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v10
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vmv1r.v v11, v9
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
   ret <4 x i64> %res
@@ -63,15 +55,12 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) {
 define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_pair_swap_vl4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 8240
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v10
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v11, v9, 1
+; CHECK-NEXT:    vslideup.vi v11, v9, 1
+; CHECK-NEXT:    vslidedown.vi v10, v8, 1
+; CHECK-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   ret <4 x i64> %res
@@ -107,14 +96,10 @@ define <8 x i32> @m2_pair_swap_vl8(<8 x i32> %v1) vscale_range(2,2) {
 define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_identity:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 12320
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v10
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vmv1r.v v11, v9
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
   ret <4 x i64> %res
@@ -123,12 +108,7 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) {
 define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_broadcast_i128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv1r.v v9, v8
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   ret <4 x i64> %res
@@ -137,12 +117,9 @@ define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) {
 define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m4_broadcast_i128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vmv1r.v v11, v8
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   ret <8 x i64> %res
@@ -152,13 +129,10 @@ define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) {
 define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_two_source:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vi v12, v8, 0
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; CHECK-NEXT:    vrgather.vi v12, v10, 3, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vrgather.vi v13, v11, 1
+; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 7, i32 7>
   ret <4 x i64> %res
@@ -167,15 +141,9 @@ define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range
 define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_identity_two_source:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 0
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v12, v10, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 6, i32 7>
   ret <4 x i64> %res