Skip to content

Commit aa4d679

Browse files
committed
[X86] computeKnownBitsForTargetNode - add INTRINSIC_WO_CHAIN handling for PSADBW intrinsics
Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions — at the very least, we need better value-tracking handling. As an initial example, I've added support for the PSADBW intrinsics (which can be expanded along with the ISD node in llvm#81765), as this is a good example of an intrinsic that we need to handle as early as possible.
1 parent 27ce512 commit aa4d679

File tree

3 files changed

+31
-60
lines changed

3 files changed

+31
-60
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36995,6 +36995,18 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3699536995
}
3699636996
break;
3699736997
}
36998+
case ISD::INTRINSIC_WO_CHAIN: {
36999+
switch (Op->getConstantOperandVal(0)) {
37000+
case Intrinsic::x86_sse2_psad_bw:
37001+
case Intrinsic::x86_avx2_psad_bw:
37002+
case Intrinsic::x86_avx512_psad_bw_512:
37003+
// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
37004+
assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types");
37005+
Known.Zero.setBitsFrom(16);
37006+
break;
37007+
}
37008+
break;
37009+
}
3699837010
}
3699937011

3700037012
// Handle target shuffles.

llvm/test/CodeGen/X86/psadbw.ll

Lines changed: 16 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
5454
; X86-SSE-NEXT: pxor %xmm1, %xmm1
5555
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
5656
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
57-
; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
5857
; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
5958
; X86-SSE-NEXT: retl
6059
;
@@ -64,7 +63,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
6463
; X64-SSE-NEXT: pxor %xmm1, %xmm1
6564
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
6665
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
67-
; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6866
; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6967
; X64-SSE-NEXT: retq
7068
;
@@ -82,96 +80,67 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
8280
ret <2 x i64> %ext
8381
}
8482

85-
; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
83+
; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
8684
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
8785
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
8886
; X86-SSE: # %bb.0:
89-
; X86-SSE-NEXT: pushl %ebp
90-
; X86-SSE-NEXT: movl %esp, %ebp
91-
; X86-SSE-NEXT: andl $-8, %esp
92-
; X86-SSE-NEXT: subl $32, %esp
9387
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
9488
; X86-SSE-NEXT: pxor %xmm1, %xmm1
9589
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
96-
; X86-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
97-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
98-
; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
99-
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
100-
; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
101-
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
102-
; X86-SSE-NEXT: fstpl (%esp)
103-
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
104-
; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
105-
; X86-SSE-NEXT: movl %ebp, %esp
106-
; X86-SSE-NEXT: popl %ebp
90+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
91+
; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
10792
; X86-SSE-NEXT: retl
10893
;
10994
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
11095
; X64-SSE: # %bb.0:
11196
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
11297
; X64-SSE-NEXT: pxor %xmm1, %xmm1
11398
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
114-
; X64-SSE-NEXT: movd %xmm1, %eax
115-
; X64-SSE-NEXT: xorps %xmm0, %xmm0
116-
; X64-SSE-NEXT: cvtsi2sd %eax, %xmm0
117-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
118-
; X64-SSE-NEXT: movd %xmm1, %eax
119-
; X64-SSE-NEXT: xorps %xmm1, %xmm1
120-
; X64-SSE-NEXT: cvtsi2sd %eax, %xmm1
121-
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
99+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
100+
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
122101
; X64-SSE-NEXT: retq
123102
;
124103
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
125104
; AVX2: # %bb.0:
126105
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
127106
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
128107
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
129-
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm1
130-
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
131-
; AVX2-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0
132-
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
108+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
109+
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
133110
; AVX2-NEXT: retq
134111
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
135112
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
136113
%cvt = sitofp <2 x i64> %sad to <2 x double>
137114
ret <2 x double> %cvt
138115
}
139116

140-
; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
117+
; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
141118
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
142119
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
143120
; X86-SSE: # %bb.0:
144121
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
145122
; X86-SSE-NEXT: pxor %xmm1, %xmm1
146-
; X86-SSE-NEXT: psadbw %xmm1, %xmm0
147-
; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
148-
; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
149-
; X86-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
150-
; X86-SSE-NEXT: addpd %xmm1, %xmm0
123+
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
124+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
125+
; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
151126
; X86-SSE-NEXT: retl
152127
;
153128
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
154129
; X64-SSE: # %bb.0:
155130
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
156131
; X64-SSE-NEXT: pxor %xmm1, %xmm1
157-
; X64-SSE-NEXT: psadbw %xmm1, %xmm0
158-
; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
159-
; X64-SSE-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
160-
; X64-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
161-
; X64-SSE-NEXT: addpd %xmm1, %xmm0
132+
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
133+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
134+
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
162135
; X64-SSE-NEXT: retq
163136
;
164137
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
165138
; AVX2: # %bb.0:
166139
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
167140
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
168141
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
169-
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
170-
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
171-
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
172-
; AVX2-NEXT: # xmm1 = mem[0,0]
173-
; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
174-
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
142+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
143+
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
175144
; AVX2-NEXT: retq
176145
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
177146
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)

llvm/test/CodeGen/X86/sad.ll

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
989989
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
990990
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
991991
; SSE2-NEXT: paddd %xmm2, %xmm0
992-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
993-
; SSE2-NEXT: paddd %xmm0, %xmm1
994-
; SSE2-NEXT: movd %xmm1, %eax
992+
; SSE2-NEXT: movd %xmm0, %eax
995993
; SSE2-NEXT: retq
996994
;
997995
; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
10531051
; SSE2-NEXT: paddd %xmm1, %xmm2
10541052
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
10551053
; SSE2-NEXT: paddd %xmm2, %xmm0
1056-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1057-
; SSE2-NEXT: por %xmm0, %xmm1
1058-
; SSE2-NEXT: movd %xmm1, %eax
1054+
; SSE2-NEXT: movd %xmm0, %eax
10591055
; SSE2-NEXT: retq
10601056
;
10611057
; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
10671063
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
10681064
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
10691065
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1070-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1071-
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
10721066
; AVX-NEXT: vmovd %xmm0, %eax
10731067
; AVX-NEXT: retq
10741068
bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
11151109
; SSE2-NEXT: paddd %xmm1, %xmm2
11161110
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
11171111
; SSE2-NEXT: paddd %xmm2, %xmm0
1118-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1119-
; SSE2-NEXT: por %xmm0, %xmm1
1120-
; SSE2-NEXT: movd %xmm1, %eax
1112+
; SSE2-NEXT: movd %xmm0, %eax
11211113
; SSE2-NEXT: retq
11221114
;
11231115
; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
11291121
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
11301122
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
11311123
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1132-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1133-
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
11341124
; AVX-NEXT: vmovd %xmm0, %eax
11351125
; AVX-NEXT: retq
11361126
bb:

0 commit comments

Comments
 (0)