Commit 31c9bde
[X86] Improve KnownBits for X86ISD::PSADBW nodes
Don't just return the known-zero upper bits; compute the absdiff KnownBits and perform the horizontal sum.
1 parent cf922e5
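
For readers unfamiliar with the instruction, here is a minimal scalar model of one PSADBW i64 lane (a sketch of the documented x86 semantics, not code from this patch; psadbwLane is a hypothetical name): each lane sums the absolute differences of eight byte pairs, so the true result never exceeds 8 * 255 = 2040, far tighter than the "low 16 bits" bound the old code returned.

#include <cstdint>
#include <cstdlib>

// Hypothetical scalar model of one PSADBW i64 lane.
uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (uint64_t)std::abs((int)A[I] - (int)B[I]); // |a - b| per byte pair
  // Each difference is at most 255, so Sum <= 8 * 255 = 2040; when the input
  // bytes have known-zero bits, the bound shrinks further, which is what the
  // new KnownBits computation in this commit tracks.
  return Sum;
}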

3 files changed: +71 −76 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 49 additions & 5 deletions
@@ -36739,6 +36739,26 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
     return TLO.CombineTo(Op, NewOp);
 }
 
+static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
+                                      KnownBits &Known,
+                                      const APInt &DemandedElts,
+                                      const SelectionDAG &DAG, unsigned Depth) {
+  KnownBits Known2;
+  unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
+  APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+  Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
+  Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
+  Known = KnownBits::absdiff(Known, Known2).zext(16);
+  // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
+  Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
+                                      Known, Known);
+  Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
+                                      Known, Known);
+  Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
+                                      Known, Known);
+  Known = Known.zext(64);
+}
+
 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       KnownBits &Known,
                                                       const APInt &DemandedElts,
@@ -36888,12 +36908,13 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     break;
   }
   case X86ISD::PSADBW: {
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
     assert(VT.getScalarType() == MVT::i64 &&
-           Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
+           LHS.getValueType() == RHS.getValueType() &&
+           LHS.getValueType().getScalarType() == MVT::i8 &&
            "Unexpected PSADBW types");
-
-    // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
-    Known.Zero.setBitsFrom(16);
+    computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
     break;
   }
   case X86ISD::PCMPGT:
@@ -37047,6 +37068,23 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (Op->getConstantOperandVal(0)) {
+    case Intrinsic::x86_sse2_psad_bw:
+    case Intrinsic::x86_avx2_psad_bw:
+    case Intrinsic::x86_avx512_psad_bw_512: {
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      assert(VT.getScalarType() == MVT::i64 &&
+             LHS.getValueType() == RHS.getValueType() &&
+             LHS.getValueType().getScalarType() == MVT::i8 &&
+             "Unexpected PSADBW types");
+      computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
+      break;
+    }
+    }
+    break;
+  }
   }
 
   // Handle target shuffles.
@@ -54905,6 +54943,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
   MVT VT = N->getSimpleValueType(0);
   SDLoc DL(N);
@@ -54916,6 +54955,11 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
     return DAG.getConstant(0, DL, VT);
   }
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(
+          SDValue(N, 0), APInt::getAllOnes(VT.getScalarSizeInBits()), DCI))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
@@ -56639,7 +56683,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MGATHER:
   case ISD::MSCATTER:   return combineGatherScatter(N, DAG, DCI);
   case X86ISD::PCMPEQ:
-  case X86ISD::PCMPGT:  return combineVectorCompare(N, DAG, Subtarget);
+  case X86ISD::PCMPGT:  return combineVectorCompare(N, DAG, DCI, Subtarget);
   case X86ISD::PMULDQ:
   case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
   case X86ISD::VPMADDUBSW:
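
A note on the three self-additions in computeKnownBitsForPSADBW above: after the per-byte absdiff KnownBits is zero-extended to 16 bits, all eight lanes share the same conservative KnownBits, so the pairwise tree sum in the comment can be modeled by adding Known to itself three times (2, then 4, then 8 terms), each step marked NUW/NSW because the exact sum of eight byte differences always fits in 16 bits. A rough arithmetic sanity check of that doubling, assuming inputs masked with 0x03 as in the tests below (plain C++, not the LLVM KnownBits API):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t MaxDiff = 3; // max |a - b| when one operand is masked with 0x03
  uint64_t Bound = MaxDiff;
  for (int Step = 0; Step != 3; ++Step)
    Bound += Bound; // tree sum of 2, then 4, then 8 byte differences
  // Bound == 24, so every PSADBW lane fits in 5 bits, which is why the
  // compare in combine_psadbw_cmp_knownbits folds away in the test diff below.
  printf("max PSADBW lane value: %llu\n", (unsigned long long)Bound);
  return 0;
}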

llvm/test/CodeGen/X86/psadbw.ll

Lines changed: 19 additions & 58 deletions
@@ -50,30 +50,20 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
 define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
 ; X64-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_cmp_knownbits:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
@@ -82,96 +72,67 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
   ret <2 x i64> %ext
 }
 
-; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
+; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
 define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
-; X86-SSE-NEXT:    movl %esp, %ebp
-; X86-SSE-NEXT:    andl $-8, %esp
-; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X86-SSE-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl (%esp)
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-SSE-NEXT:    movl %ebp, %esp
-; X86-SSE-NEXT:    popl %ebp
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm1, %xmm1
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm1
-; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
   %cvt = sitofp <2 x i64> %sad to <2 x double>
   ret <2 x double> %cvt
 }
 
-; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
+; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
 define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
-; X86-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    addpd %xmm1, %xmm0
+; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; X64-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE-NEXT:    addpd %xmm1, %xmm0
+; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; AVX2-NEXT:    # xmm1 = mem[0,0]
-; AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
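
The sitofp/uitofp folds above follow from the tighter bound: every PSADBW lane is zero-extended and provably below 2^31, so the <2 x i64> conversion can pack the low dwords (the pshufd) and use the signed cvtdq2pd, and uitofp and sitofp agree. A quick standalone check of that equivalence over the whole PSADBW lane range (an illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t Sad = 0; Sad <= 2040; ++Sad) { // 2040 = 8 * 255, the lane max
    double ViaU64 = (double)Sad;          // what uitofp from i64 computes
    double ViaI32 = (double)(int32_t)Sad; // what sitofp on the low i32 computes
    assert(ViaU64 == ViaI32); // identical for zero-extended values < 2^31
  }
  return 0;
}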

llvm/test/CodeGen/X86/sad.ll

Lines changed: 3 additions & 13 deletions
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
 ; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 bb:
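
The removed instructions in these reductions were folding the second 32-bit element into the first (via paddd or por) before the final movd; with the upper 48 bits of each PSADBW result now known zero, that element is provably zero and the step is a no-op. A scalar illustration of the reasoning (not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t SadLane = 2040;                   // any PSADBW lane value, <= 8 * 255
  uint32_t Elt0 = (uint32_t)SadLane;         // low i32 element of the lane
  uint32_t Elt1 = (uint32_t)(SadLane >> 32); // high i32 element: known zero
  assert((Elt0 | Elt1) == Elt0 && Elt0 + Elt1 == Elt0); // por/paddd are no-ops
  return 0;
}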
