Skip to content

Commit 8d8bb35

Browse files
committed
[X86] Add some basic test coverage for #81765
Test cases demonstrating poor value tracking of PSADBW results
1 parent 652081c commit 8d8bb35

File tree

1 file changed

+103
-2
lines changed

1 file changed

+103
-2
lines changed

llvm/test/CodeGen/X86/psadbw.ll

Lines changed: 103 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64
44

55
; Only bottom 16 bits are set - upper 48 bits are zero.
6-
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) {
6+
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) nounwind {
77
; CHECK-LABEL: combine_psadbw_shift:
88
; CHECK: # %bb.0:
99
; CHECK-NEXT: xorps %xmm0, %xmm0
@@ -14,7 +14,7 @@ define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) {
1414
}
1515

1616
; Propagate the demanded result elements to the 8 aliasing source elements.
17-
define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) {
17+
define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
1818
; X86-LABEL: combine_psadbw_demandedelt:
1919
; X86: # %bb.0:
2020
; X86-NEXT: psadbw %xmm1, %xmm0
@@ -34,5 +34,106 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) {
3434
ret i64 %6
3535
}
3636

37+
; TODO: Each PSADBW source element has a maximum value of 3 - so max sum-of-diffs for each <8 x i8> should be 24.
; With that bound the signed compare against 32 in the IR below can never be
; true, yet the checked output for both RUN lines still emits the whole
; pand / psadbw / pshufd / por / pcmpgtd sequence.
38+
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
39+
; X86-LABEL: combine_psadbw_cmp_knownbits:
40+
; X86: # %bb.0:
41+
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
42+
; X86-NEXT: pxor %xmm1, %xmm1
43+
; X86-NEXT: psadbw %xmm0, %xmm1
44+
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
45+
; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
46+
; X86-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
47+
; X86-NEXT: retl
48+
;
49+
; X64-LABEL: combine_psadbw_cmp_knownbits:
50+
; X64: # %bb.0:
51+
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
52+
; X64-NEXT: pxor %xmm1, %xmm1
53+
; X64-NEXT: psadbw %xmm0, %xmm1
54+
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
55+
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
56+
; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
57+
; X64-NEXT: retq
58+
; Keep only the low two bits of every byte, so each element is in [0,3].
%mask = and <16 x i8> %a0, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
59+
; Sum-of-diffs of the masked bytes against an all-zero vector.
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
60+
; Signed greater-than against a splat of 32, then widen the i1 lanes to i64.
%cmp = icmp sgt <2 x i64> %sad, <i64 32, i64 32>
61+
%ext = sext <2 x i1> %cmp to <2 x i64>
62+
ret <2 x i64> %ext
63+
}
64+
65+
; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
; In the checked output the 32-bit run spills both i64 lanes to the stack and
; converts them with fildll/fstpl, while the 64-bit run extracts each lane with
; movd and converts it with a scalar cvtsi2sd before recombining via unpcklpd.
66+
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
67+
; X86-LABEL: combine_psadbw_sitofp_knownbits:
68+
; X86: # %bb.0:
69+
; X86-NEXT: pushl %ebp
70+
; X86-NEXT: movl %esp, %ebp
71+
; X86-NEXT: andl $-8, %esp
72+
; X86-NEXT: subl $32, %esp
73+
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
74+
; X86-NEXT: pxor %xmm1, %xmm1
75+
; X86-NEXT: psadbw %xmm0, %xmm1
76+
; X86-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
77+
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
78+
; X86-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
79+
; X86-NEXT: fildll {{[0-9]+}}(%esp)
80+
; X86-NEXT: fstpl {{[0-9]+}}(%esp)
81+
; X86-NEXT: fildll {{[0-9]+}}(%esp)
82+
; X86-NEXT: fstpl (%esp)
83+
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
84+
; X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
85+
; X86-NEXT: movl %ebp, %esp
86+
; X86-NEXT: popl %ebp
87+
; X86-NEXT: retl
88+
;
89+
; X64-LABEL: combine_psadbw_sitofp_knownbits:
90+
; X64: # %bb.0:
91+
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
92+
; X64-NEXT: pxor %xmm1, %xmm1
93+
; X64-NEXT: psadbw %xmm0, %xmm1
94+
; X64-NEXT: movd %xmm1, %eax
95+
; X64-NEXT: xorps %xmm0, %xmm0
96+
; X64-NEXT: cvtsi2sd %eax, %xmm0
97+
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
98+
; X64-NEXT: movd %xmm1, %eax
99+
; X64-NEXT: xorps %xmm1, %xmm1
100+
; X64-NEXT: cvtsi2sd %eax, %xmm1
101+
; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
102+
; X64-NEXT: retq
103+
; Keep only the lowest bit of every byte, so each element is 0 or 1.
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
104+
; Sum-of-diffs of the masked bytes against an all-zero vector.
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
105+
; Signed i64 -> double conversion of both lanes; this is the operation the TODO targets.
%cvt = sitofp <2 x i64> %sad to <2 x double>
106+
ret <2 x double> %cvt
107+
}
108+
109+
; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
; In the checked output both RUN lines currently lower the unsigned conversion
; with a por / subpd / addpd sequence driven by constant-pool operands.
110+
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
111+
; X86-LABEL: combine_psadbw_uitofp_knownbits:
112+
; X86: # %bb.0:
113+
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
114+
; X86-NEXT: pxor %xmm1, %xmm1
115+
; X86-NEXT: psadbw %xmm1, %xmm0
116+
; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
117+
; X86-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
118+
; X86-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
119+
; X86-NEXT: addpd %xmm1, %xmm0
120+
; X86-NEXT: retl
121+
;
122+
; X64-LABEL: combine_psadbw_uitofp_knownbits:
123+
; X64: # %bb.0:
124+
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
125+
; X64-NEXT: pxor %xmm1, %xmm1
126+
; X64-NEXT: psadbw %xmm1, %xmm0
127+
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
128+
; X64-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
129+
; X64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
130+
; X64-NEXT: addpd %xmm1, %xmm0
131+
; X64-NEXT: retq
132+
; Keep only the lowest bit of every byte, so each element is 0 or 1.
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
133+
; Sum-of-diffs of the masked bytes against an all-zero vector.
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
134+
; Unsigned i64 -> double conversion of both lanes; this is the operation the TODO targets.
%cvt = uitofp <2 x i64> %sad to <2 x double>
135+
ret <2 x double> %cvt
136+
}
137+
37138
; Intrinsic exercised by every test in this file: takes two <16 x i8> operands and returns <2 x i64>.
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)
38139

0 commit comments

Comments
 (0)