Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit 93b589a

Browse files
committed
[X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if possible if popcnt instruction is not available
On 32-bit targets without popcnt, we currently expand 64-bit popcnt to sequences of arithmetic and logic ops for each 32-bit half and then add the 32-bit halves together. If we have xmm registers we can use those to implement the operation instead. This results in fewer instructions than doing two separate 32-bit popcnt sequences. This mitigates some of PR41151 for the i64 on i686 case when we have SSE2. Differential Revision: https://reviews.llvm.org/D59662 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356808 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 962427e commit 93b589a

File tree

2 files changed

+149
-5
lines changed

2 files changed

+149
-5
lines changed

lib/Target/X86/X86ISelLowering.cpp

+22
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
414414
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
415415
if (Subtarget.is64Bit())
416416
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
417+
else
418+
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
417419
}
418420

419421
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -26715,6 +26717,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
2671526717
switch (N->getOpcode()) {
2671626718
default:
2671726719
llvm_unreachable("Do not know how to custom type legalize this operation!");
26720+
case ISD::CTPOP: {
26721+
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
26722+
// Use a v2i64 if possible.
26723+
bool NoImplicitFloatOps =
26724+
DAG.getMachineFunction().getFunction().hasFnAttribute(
26725+
Attribute::NoImplicitFloat);
26726+
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
26727+
SDValue Wide =
26728+
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
26729+
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
26730+
// Bit count should fit in 32-bits, extract it as that and then zero
26731+
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
26732+
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
26733+
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
26734+
DAG.getIntPtrConstant(0, dl));
26735+
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
26736+
Results.push_back(Wide);
26737+
}
26738+
return;
26739+
}
2671826740
case ISD::MUL: {
2671926741
EVT VT = N->getValueType(0);
2672026742
assert(VT.isVector() && "Unexpected VT");

test/CodeGen/X86/popcnt.ll

+127-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
2+
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE
33
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
44
; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
55
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
6+
; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2
7+
; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3
68

79
define i8 @cnt8(i8 %x) nounwind readnone {
810
; X32-LABEL: cnt8:
@@ -172,7 +174,127 @@ define i32 @cnt32(i32 %x) nounwind readnone {
172174
}
173175

174176
define i64 @cnt64(i64 %x) nounwind readnone {
175-
; X32-LABEL: cnt64:
177+
; X32-NOSSE-LABEL: cnt64:
178+
; X32-NOSSE: # %bb.0:
179+
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
180+
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
181+
; X32-NOSSE-NEXT: movl %ecx, %edx
182+
; X32-NOSSE-NEXT: shrl %edx
183+
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
184+
; X32-NOSSE-NEXT: subl %edx, %ecx
185+
; X32-NOSSE-NEXT: movl %ecx, %edx
186+
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
187+
; X32-NOSSE-NEXT: shrl $2, %ecx
188+
; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
189+
; X32-NOSSE-NEXT: addl %edx, %ecx
190+
; X32-NOSSE-NEXT: movl %ecx, %edx
191+
; X32-NOSSE-NEXT: shrl $4, %edx
192+
; X32-NOSSE-NEXT: addl %ecx, %edx
193+
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
194+
; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
195+
; X32-NOSSE-NEXT: shrl $24, %ecx
196+
; X32-NOSSE-NEXT: movl %eax, %edx
197+
; X32-NOSSE-NEXT: shrl %edx
198+
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
199+
; X32-NOSSE-NEXT: subl %edx, %eax
200+
; X32-NOSSE-NEXT: movl %eax, %edx
201+
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
202+
; X32-NOSSE-NEXT: shrl $2, %eax
203+
; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333
204+
; X32-NOSSE-NEXT: addl %edx, %eax
205+
; X32-NOSSE-NEXT: movl %eax, %edx
206+
; X32-NOSSE-NEXT: shrl $4, %edx
207+
; X32-NOSSE-NEXT: addl %eax, %edx
208+
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
209+
; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
210+
; X32-NOSSE-NEXT: shrl $24, %eax
211+
; X32-NOSSE-NEXT: addl %ecx, %eax
212+
; X32-NOSSE-NEXT: xorl %edx, %edx
213+
; X32-NOSSE-NEXT: retl
214+
;
215+
; X64-LABEL: cnt64:
216+
; X64: # %bb.0:
217+
; X64-NEXT: movq %rdi, %rax
218+
; X64-NEXT: shrq %rax
219+
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
220+
; X64-NEXT: andq %rax, %rcx
221+
; X64-NEXT: subq %rcx, %rdi
222+
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
223+
; X64-NEXT: movq %rdi, %rcx
224+
; X64-NEXT: andq %rax, %rcx
225+
; X64-NEXT: shrq $2, %rdi
226+
; X64-NEXT: andq %rax, %rdi
227+
; X64-NEXT: addq %rcx, %rdi
228+
; X64-NEXT: movq %rdi, %rax
229+
; X64-NEXT: shrq $4, %rax
230+
; X64-NEXT: leaq (%rax,%rdi), %rax
231+
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
232+
; X64-NEXT: andq %rax, %rcx
233+
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
234+
; X64-NEXT: imulq %rcx, %rax
235+
; X64-NEXT: shrq $56, %rax
236+
; X64-NEXT: retq
237+
;
238+
; X32-POPCNT-LABEL: cnt64:
239+
; X32-POPCNT: # %bb.0:
240+
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
241+
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
242+
; X32-POPCNT-NEXT: addl %ecx, %eax
243+
; X32-POPCNT-NEXT: xorl %edx, %edx
244+
; X32-POPCNT-NEXT: retl
245+
;
246+
; X64-POPCNT-LABEL: cnt64:
247+
; X64-POPCNT: # %bb.0:
248+
; X64-POPCNT-NEXT: popcntq %rdi, %rax
249+
; X64-POPCNT-NEXT: retq
250+
;
251+
; X32-SSE2-LABEL: cnt64:
252+
; X32-SSE2: # %bb.0:
253+
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
254+
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
255+
; X32-SSE2-NEXT: psrlw $1, %xmm1
256+
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
257+
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
258+
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
259+
; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
260+
; X32-SSE2-NEXT: pand %xmm1, %xmm2
261+
; X32-SSE2-NEXT: psrlw $2, %xmm0
262+
; X32-SSE2-NEXT: pand %xmm1, %xmm0
263+
; X32-SSE2-NEXT: paddb %xmm2, %xmm0
264+
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
265+
; X32-SSE2-NEXT: psrlw $4, %xmm1
266+
; X32-SSE2-NEXT: paddb %xmm0, %xmm1
267+
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
268+
; X32-SSE2-NEXT: pxor %xmm0, %xmm0
269+
; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
270+
; X32-SSE2-NEXT: movd %xmm0, %eax
271+
; X32-SSE2-NEXT: xorl %edx, %edx
272+
; X32-SSE2-NEXT: retl
273+
;
274+
; X32-SSSE3-LABEL: cnt64:
275+
; X32-SSSE3: # %bb.0:
276+
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
277+
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
278+
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
279+
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
280+
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
281+
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
282+
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
283+
; X32-SSSE3-NEXT: psrlw $4, %xmm1
284+
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
285+
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
286+
; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
287+
; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
288+
; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
289+
; X32-SSSE3-NEXT: movd %xmm0, %eax
290+
; X32-SSSE3-NEXT: xorl %edx, %edx
291+
; X32-SSSE3-NEXT: retl
292+
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
293+
ret i64 %cnt
294+
}
295+
296+
define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
297+
; X32-LABEL: cnt64_noimplicitfloat:
176298
; X32: # %bb.0:
177299
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
178300
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -210,7 +332,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
210332
; X32-NEXT: xorl %edx, %edx
211333
; X32-NEXT: retl
212334
;
213-
; X64-LABEL: cnt64:
335+
; X64-LABEL: cnt64_noimplicitfloat:
214336
; X64: # %bb.0:
215337
; X64-NEXT: movq %rdi, %rax
216338
; X64-NEXT: shrq %rax
@@ -233,15 +355,15 @@ define i64 @cnt64(i64 %x) nounwind readnone {
233355
; X64-NEXT: shrq $56, %rax
234356
; X64-NEXT: retq
235357
;
236-
; X32-POPCNT-LABEL: cnt64:
358+
; X32-POPCNT-LABEL: cnt64_noimplicitfloat:
237359
; X32-POPCNT: # %bb.0:
238360
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
239361
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
240362
; X32-POPCNT-NEXT: addl %ecx, %eax
241363
; X32-POPCNT-NEXT: xorl %edx, %edx
242364
; X32-POPCNT-NEXT: retl
243365
;
244-
; X64-POPCNT-LABEL: cnt64:
366+
; X64-POPCNT-LABEL: cnt64_noimplicitfloat:
245367
; X64-POPCNT: # %bb.0:
246368
; X64-POPCNT-NEXT: popcntq %rdi, %rax
247369
; X64-POPCNT-NEXT: retq

0 commit comments

Comments
 (0)