Skip to content

Commit 52bc812

Browse files
authored
[X86] combineConcatVectorOps - concat(shuffle(x,y,m1),shuffle(x,y,m2)) -> shuffle(concat(x,x),concat(y,y),m3) on VBMI targets (#130134)
With VBMI we are guaranteed to support cross-lane 256-bit shuffles, so subvector splats should always be cheap. Fixes #116931
1 parent c687d78 commit 52bc812

File tree

4 files changed

+49
-22
lines changed

4 files changed

+49
-22
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+7-2
Original file line number | Diff line number | Diff line change
@@ -57936,9 +57936,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5793657936

5793757937
switch (Op0.getOpcode()) {
5793857938
case ISD::VECTOR_SHUFFLE: {
57939-
if (NumOps == 2 && VT.is256BitVector() &&
57939+
// TODO: Relax VBMI requirement for repeated shuffle ops - currently
57940+
// limited to targets that should always have good cross lane shuffles.
57941+
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
5794057942
(EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
57941-
(IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
57943+
(IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
57944+
(Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
57945+
Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
57946+
Subtarget.hasVBMI()))) {
5794257947
int NumSubElts = Op0.getValueType().getVectorNumElements();
5794357948
SmallVector<int> NewMask;
5794457949
for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {

llvm/test/CodeGen/X86/vector-fshr-128.ll

+6-5
Original file line number | Diff line number | Diff line change
@@ -1566,11 +1566,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
15661566
;
15671567
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
15681568
; AVX512VLVBMI2: # %bb.0:
1569-
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1570-
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1571-
; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1572-
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1573-
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1569+
; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1570+
; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1571+
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
1572+
; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
1573+
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
1574+
; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %ymm3, %ymm0
15741575
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
15751576
; AVX512VLVBMI2-NEXT: vzeroupper
15761577
; AVX512VLVBMI2-NEXT: retq

llvm/test/CodeGen/X86/vector-fshr-rot-128.ll

+3-3
Original file line number | Diff line number | Diff line change
@@ -1218,9 +1218,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
12181218
;
12191219
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
12201220
; AVX512VLVBMI2: # %bb.0:
1221-
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1222-
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1223-
; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
1221+
; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1222+
; AVX512VLVBMI2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1223+
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
12241224
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
12251225
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
12261226
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

+33-12
Original file line number | Diff line number | Diff line change
@@ -5059,18 +5059,39 @@ define void @shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27
50595059
; AVX1-NEXT: vzeroupper
50605060
; AVX1-NEXT: retq
50615061
;
5062-
; AVX2OR512VL-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
5063-
; AVX2OR512VL: # %bb.0:
5064-
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
5065-
; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
5066-
; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5067-
; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5068-
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5069-
; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5070-
; AVX2OR512VL-NEXT: vmovdqa %xmm0, 16(%rdi)
5071-
; AVX2OR512VL-NEXT: vmovdqa %xmm2, (%rdi)
5072-
; AVX2OR512VL-NEXT: vzeroupper
5073-
; AVX2OR512VL-NEXT: retq
5062+
; AVX2-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
5063+
; AVX2: # %bb.0:
5064+
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5065+
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
5066+
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5067+
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5068+
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5069+
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5070+
; AVX2-NEXT: vmovdqa %xmm0, 16(%rdi)
5071+
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
5072+
; AVX2-NEXT: vzeroupper
5073+
; AVX2-NEXT: retq
5074+
;
5075+
; AVX512VLBW-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
5076+
; AVX512VLBW: # %bb.0:
5077+
; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm1
5078+
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
5079+
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5080+
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5081+
; AVX512VLBW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5082+
; AVX512VLBW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5083+
; AVX512VLBW-NEXT: vmovdqa %xmm0, 16(%rdi)
5084+
; AVX512VLBW-NEXT: vmovdqa %xmm2, (%rdi)
5085+
; AVX512VLBW-NEXT: vzeroupper
5086+
; AVX512VLBW-NEXT: retq
5087+
;
5088+
; AVX512VLVBMI-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
5089+
; AVX512VLVBMI: # %bb.0:
5090+
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31]
5091+
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
5092+
; AVX512VLVBMI-NEXT: vmovdqa %ymm0, (%rdi)
5093+
; AVX512VLVBMI-NEXT: vzeroupper
5094+
; AVX512VLVBMI-NEXT: retq
50745095
;
50755096
; XOPAVX1-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
50765097
; XOPAVX1: # %bb.0:

0 commit comments

Comments
 (0)