Commit 6663330

[X86][AVX] canonicalizeLaneShuffleWithRepeatedOps - don't merge VPERMILPD ops with different low/high masks.
Unlike VPERMILPS, VPERMILPD can apply a different mask to each 128-bit subvector; we weren't accounting for this when folding vperm2f128(vpermilpd(x,c),vpermilpd(y,c)) -> vpermilpd(vperm2f128(x,y),c).

I intend to add support for the non-repeating case, but wanted to get a minimal fix in first for merging into 12.xx.

Fixes PR48908
Parent: da8845f
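
The fold named above is only sound when the VPERMILPD immediate encodes the same in-lane permute in both 128-bit halves. Below is a minimal scalar model of the two instructions, plain C++ rather than LLVM code, with illustrative function names, test values, and immediates; the modelled semantics follow the AVX documentation. With a non-repeating immediate such as 0b0110, the folded and unfolded forms produce different results, which is the miscompile the new guard avoids.

#include <array>
#include <cstdint>
#include <cstdio>

using V4 = std::array<double, 4>;

// vpermilpd ymm, imm8: bit I of the immediate picks which element of
// element I's own 128-bit lane lands in dst[I].
static V4 vpermilpd(const V4 &Src, uint8_t Imm) {
  V4 Dst;
  for (int I = 0; I < 4; ++I)
    Dst[I] = Src[(I & ~1) + ((Imm >> I) & 1)];
  return Dst;
}

// vperm2f128 ymm, imm8: each nibble selects one whole 128-bit lane --
// bit 1 picks the source (A or B), bit 0 picks that source's low or high
// lane. The zeroing bits (3 and 7) are ignored in this sketch.
static V4 vperm2f128(const V4 &A, const V4 &B, uint8_t Imm) {
  auto Lane = [&](unsigned Sel) {
    const V4 &Src = (Sel & 2) ? B : A;
    unsigned Base = (Sel & 1) ? 2 : 0;
    return std::array<double, 2>{Src[Base], Src[Base + 1]};
  };
  auto Lo = Lane(Imm & 0x3);
  auto Hi = Lane((Imm >> 4) & 0x3);
  return {Lo[0], Lo[1], Hi[0], Hi[1]};
}

int main() {
  V4 X = {0, 1, 2, 3}, Y = {4, 5, 6, 7};
  // 0b0110 swaps the high 128-bit lane but leaves the low lane alone, so
  // the two 2-bit halves of the immediate differ.
  uint8_t C = 0b0110;
  uint8_t Sel = 0x21; // dst = [ X.high128, Y.low128 ]
  V4 Original = vperm2f128(vpermilpd(X, C), vpermilpd(Y, C), Sel);
  V4 Folded = vpermilpd(vperm2f128(X, Y, Sel), C);
  for (int I = 0; I < 4; ++I)
    std::printf("%d: original=%g folded=%g\n", I, Original[I], Folded[I]);
  return 0;
}

With a lane-repeating immediate such as 0b0000 or 0b1111 the two forms agree; VPERMILPS immediates always repeat per lane, which is why only the VPERMILPD case needed the extra guard.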

2 files changed: +30 -19


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 1 deletion
@@ -36916,11 +36916,18 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
     return DAG.getBitcast(VT, Res);
   }
+  case X86ISD::VPERMILPI:
+    // TODO: Handle v4f64 permutes with different low/high lane masks.
+    if (SrcVT0 == MVT::v4f64) {
+      uint64_t Mask = Src0.getConstantOperandVal(1);
+      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
+        break;
+    }
+    LLVM_FALLTHROUGH;
   case X86ISD::VSHLI:
   case X86ISD::VSRLI:
   case X86ISD::VSRAI:
   case X86ISD::PSHUFD:
-  case X86ISD::VPERMILPI:
     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
       SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
       SDValue RHS =
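
For context on the guard: the v4f64 VPERMILPI immediate carries one selector bit per element, so bits [1:0] describe the low 128-bit lane and bits [3:2] the high lane, and the merge is only safe when those halves match. A standalone restatement of the condition (hypothetical helper, not part of the patch):

#include <cstdint>

// True when a v4f64 VPERMILPI immediate performs the same in-lane permute
// in both 128-bit lanes, i.e. the case the fall-through above still merges.
static bool hasRepeatedLaneMaskV4F64(uint64_t Mask) {
  return (Mask & 0x3) == ((Mask >> 2) & 0x3);
}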

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Lines changed: 22 additions & 18 deletions
@@ -442,16 +442,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
-; X86-AVX1-NEXT:    vmovapd %ymm4, (%edx)
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
+; X86-AVX1-NEXT:    vmovapd %ymm3, (%edx)
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
@@ -513,16 +515,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ;
 ; X64-AVX1-LABEL: PR48908:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
-; X64-AVX1-NEXT:    vmovapd %ymm4, (%rdi)
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
+; X64-AVX1-NEXT:    vmovapd %ymm3, (%rdi)
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X64-AVX1-NEXT:    vmovapd %ymm3, (%rsi)
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
