3
3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64
4
4
5
5
; Only bottom 16 bits are set - upper 48 bits are zero.
6
- define <2 x i64 > @combine_psadbw_shift (<16 x i8 > %0 , <16 x i8 > %1 ) {
6
+ define <2 x i64 > @combine_psadbw_shift (<16 x i8 > %0 , <16 x i8 > %1 ) nounwind {
7
7
; CHECK-LABEL: combine_psadbw_shift:
8
8
; CHECK: # %bb.0:
9
9
; CHECK-NEXT: xorps %xmm0, %xmm0
@@ -14,7 +14,7 @@ define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) {
14
14
}
15
15
16
16
; Propagate the demanded result elements to the 8 aliasing source elements.
17
- define i64 @combine_psadbw_demandedelt (<16 x i8 > %0 , <16 x i8 > %1 ) {
17
+ define i64 @combine_psadbw_demandedelt (<16 x i8 > %0 , <16 x i8 > %1 ) nounwind {
18
18
; X86-LABEL: combine_psadbw_demandedelt:
19
19
; X86: # %bb.0:
20
20
; X86-NEXT: psadbw %xmm1, %xmm0
@@ -34,5 +34,106 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) {
34
34
ret i64 %6
35
35
}
36
36
37
+ ; TODO: Each PSADBW source element has a maximum value of 3 - so max sum-of-diffs for each <8 x i8> should be 24, which makes the 'icmp sgt ..., 32' below always false.
38
+ define <2 x i64 > @combine_psadbw_cmp_knownbits (<16 x i8 > %a0 ) nounwind {
39
+ ; X86-LABEL: combine_psadbw_cmp_knownbits:
40
+ ; X86: # %bb.0:
41
+ ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
42
+ ; X86-NEXT: pxor %xmm1, %xmm1
43
+ ; X86-NEXT: psadbw %xmm0, %xmm1
44
+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
45
+ ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
46
+ ; X86-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
47
+ ; X86-NEXT: retl
48
+ ;
49
+ ; X64-LABEL: combine_psadbw_cmp_knownbits:
50
+ ; X64: # %bb.0:
51
+ ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
52
+ ; X64-NEXT: pxor %xmm1, %xmm1
53
+ ; X64-NEXT: psadbw %xmm0, %xmm1
54
+ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
55
+ ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
56
+ ; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
57
+ ; X64-NEXT: retq
58
+ %mask = and <16 x i8 > %a0 , <i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 > ; keep only the low 2 bits of every byte, so each byte is in [0, 3]
59
+ %sad = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %mask , <16 x i8 > zeroinitializer ) ; SAD against an all-zero vector: each i64 lane is the sum of its 8 masked bytes
60
+ %cmp = icmp sgt <2 x i64 > %sad , <i64 32 , i64 32 > ; per-lane signed compare of each sum against 32
61
+ %ext = sext <2 x i1 > %cmp to <2 x i64 > ; widen each i1 to an all-ones / all-zeros i64 lane
62
+ ret <2 x i64 > %ext
63
+ }
64
+
65
+ ; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32 (the inputs are masked to 0/1, so each 8-byte sum is at most 8).
66
+ define <2 x double > @combine_psadbw_sitofp_knownbits (<16 x i8 > %a0 ) nounwind {
67
+ ; X86-LABEL: combine_psadbw_sitofp_knownbits:
68
+ ; X86: # %bb.0:
69
+ ; X86-NEXT: pushl %ebp
70
+ ; X86-NEXT: movl %esp, %ebp
71
+ ; X86-NEXT: andl $-8, %esp
72
+ ; X86-NEXT: subl $32, %esp
73
+ ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
74
+ ; X86-NEXT: pxor %xmm1, %xmm1
75
+ ; X86-NEXT: psadbw %xmm0, %xmm1
76
+ ; X86-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
77
+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
78
+ ; X86-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
79
+ ; X86-NEXT: fildll {{[0-9]+}}(%esp)
80
+ ; X86-NEXT: fstpl {{[0-9]+}}(%esp)
81
+ ; X86-NEXT: fildll {{[0-9]+}}(%esp)
82
+ ; X86-NEXT: fstpl (%esp)
83
+ ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
84
+ ; X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
85
+ ; X86-NEXT: movl %ebp, %esp
86
+ ; X86-NEXT: popl %ebp
87
+ ; X86-NEXT: retl
88
+ ;
89
+ ; X64-LABEL: combine_psadbw_sitofp_knownbits:
90
+ ; X64: # %bb.0:
91
+ ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
92
+ ; X64-NEXT: pxor %xmm1, %xmm1
93
+ ; X64-NEXT: psadbw %xmm0, %xmm1
94
+ ; X64-NEXT: movd %xmm1, %eax
95
+ ; X64-NEXT: xorps %xmm0, %xmm0
96
+ ; X64-NEXT: cvtsi2sd %eax, %xmm0
97
+ ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
98
+ ; X64-NEXT: movd %xmm1, %eax
99
+ ; X64-NEXT: xorps %xmm1, %xmm1
100
+ ; X64-NEXT: cvtsi2sd %eax, %xmm1
101
+ ; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
102
+ ; X64-NEXT: retq
103
+ %mask = and <16 x i8 > %a0 , <i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 > ; keep only bit 0 of every byte, so each byte is 0 or 1
104
+ %sad = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %mask , <16 x i8 > zeroinitializer ) ; SAD against an all-zero vector: each i64 lane is the sum of its 8 masked bytes
105
+ %cvt = sitofp <2 x i64 > %sad to <2 x double > ; signed i64 -> double conversion of both lanes (the conversion under test)
106
+ ret <2 x double > %cvt
107
+ }
108
+
109
+ ; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended (the sign bit is known zero, so both conversions produce the same value).
110
+ define <2 x double > @combine_psadbw_uitofp_knownbits (<16 x i8 > %a0 ) nounwind {
111
+ ; X86-LABEL: combine_psadbw_uitofp_knownbits:
112
+ ; X86: # %bb.0:
113
+ ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
114
+ ; X86-NEXT: pxor %xmm1, %xmm1
115
+ ; X86-NEXT: psadbw %xmm1, %xmm0
116
+ ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
117
+ ; X86-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
118
+ ; X86-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
119
+ ; X86-NEXT: addpd %xmm1, %xmm0
120
+ ; X86-NEXT: retl
121
+ ;
122
+ ; X64-LABEL: combine_psadbw_uitofp_knownbits:
123
+ ; X64: # %bb.0:
124
+ ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
125
+ ; X64-NEXT: pxor %xmm1, %xmm1
126
+ ; X64-NEXT: psadbw %xmm1, %xmm0
127
+ ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
128
+ ; X64-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
129
+ ; X64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
130
+ ; X64-NEXT: addpd %xmm1, %xmm0
131
+ ; X64-NEXT: retq
132
+ %mask = and <16 x i8 > %a0 , <i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 > ; keep only bit 0 of every byte, so each byte is 0 or 1
133
+ %sad = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %mask , <16 x i8 > zeroinitializer ) ; SAD against an all-zero vector: each i64 lane is the sum of its 8 masked bytes
134
+ %cvt = uitofp <2 x i64 > %sad to <2 x double > ; unsigned i64 -> double conversion of both lanes (the conversion under test)
135
+ ret <2 x double > %cvt
136
+ }
137
+
37
138
declare <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 >, <16 x i8 >) ; SSE2 PSADBW intrinsic; per the comments in this file, each i64 result lane is a sum-of-diffs over its 8 aliasing source bytes with only the bottom 16 bits set
38
139
0 commit comments