Skip to content

Commit 17e065f

Browse files
RKSimon authored and memfrob committed
[DAG] Combine fshl/fshr(load1,load0,c) if we have consecutive loads
As noted on D75114, if both arguments of a funnel shift are consecutive loads, we are missing the opportunity to combine them into a single load.

Differential Revision: https://reviews.llvm.org/D75624
1 parent 554b519 commit 17e065f

File tree

3 files changed

+81
-150
lines changed

3 files changed

+81
-150
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+37
Original file line number | Diff line number | Diff line change
@@ -8257,6 +8257,43 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
82578257
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
82588258
DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
82598259
SDLoc(N), ShAmtTy));
8260+
8261+
// fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
8262+
// fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
8263+
// TODO - big-endian support once we have test coverage.
8264+
// TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
8265+
if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
8266+
!DAG.getDataLayout().isBigEndian()) {
8267+
auto *LHS = dyn_cast<LoadSDNode>(N0);
8268+
auto *RHS = dyn_cast<LoadSDNode>(N1);
8269+
if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
8270+
LHS->getAddressSpace() == RHS->getAddressSpace() &&
8271+
(LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) {
8272+
if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
8273+
SDLoc DL(RHS);
8274+
uint64_t PtrOff =
8275+
IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
8276+
unsigned NewAlign = MinAlign(RHS->getAlignment(), PtrOff);
8277+
bool Fast = false;
8278+
if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
8279+
RHS->getAddressSpace(), NewAlign,
8280+
RHS->getMemOperand()->getFlags(), &Fast) &&
8281+
Fast) {
8282+
SDValue NewPtr =
8283+
DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL);
8284+
AddToWorklist(NewPtr.getNode());
8285+
SDValue Load = DAG.getLoad(
8286+
VT, DL, RHS->getChain(), NewPtr,
8287+
RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
8288+
RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
8289+
// Replace the old load's chain with the new load's chain.
8290+
WorklistRemover DeadNodes(*this);
8291+
DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
8292+
return Load;
8293+
}
8294+
}
8295+
}
8296+
}
82608297
}
82618298

82628299
// fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)

llvm/test/CodeGen/X86/fshl.ll

+22-71
Original file line number | Diff line number | Diff line change
@@ -547,39 +547,16 @@ define i8 @combine_fshl_load_i8(i8* %p) nounwind {
547547
}
548548

549549
define i16 @combine_fshl_load_i16(i16* %p) nounwind {
550-
; X86-FAST-LABEL: combine_fshl_load_i16:
551-
; X86-FAST: # %bb.0:
552-
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
553-
; X86-FAST-NEXT: movzwl (%eax), %ecx
554-
; X86-FAST-NEXT: movzwl 2(%eax), %eax
555-
; X86-FAST-NEXT: shldw $8, %cx, %ax
556-
; X86-FAST-NEXT: retl
557-
;
558-
; X86-SLOW-LABEL: combine_fshl_load_i16:
559-
; X86-SLOW: # %bb.0:
560-
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
561-
; X86-SLOW-NEXT: movzwl 2(%ecx), %eax
562-
; X86-SLOW-NEXT: movzbl 1(%ecx), %ecx
563-
; X86-SLOW-NEXT: shll $8, %eax
564-
; X86-SLOW-NEXT: orl %ecx, %eax
565-
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
566-
; X86-SLOW-NEXT: retl
567-
;
568-
; X64-FAST-LABEL: combine_fshl_load_i16:
569-
; X64-FAST: # %bb.0:
570-
; X64-FAST-NEXT: movzwl (%rdi), %ecx
571-
; X64-FAST-NEXT: movzwl 2(%rdi), %eax
572-
; X64-FAST-NEXT: shldw $8, %cx, %ax
573-
; X64-FAST-NEXT: retq
550+
; X86-LABEL: combine_fshl_load_i16:
551+
; X86: # %bb.0:
552+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
553+
; X86-NEXT: movzwl 1(%eax), %eax
554+
; X86-NEXT: retl
574555
;
575-
; X64-SLOW-LABEL: combine_fshl_load_i16:
576-
; X64-SLOW: # %bb.0:
577-
; X64-SLOW-NEXT: movzwl 2(%rdi), %eax
578-
; X64-SLOW-NEXT: movzbl 1(%rdi), %ecx
579-
; X64-SLOW-NEXT: shll $8, %eax
580-
; X64-SLOW-NEXT: orl %ecx, %eax
581-
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
582-
; X64-SLOW-NEXT: retq
556+
; X64-LABEL: combine_fshl_load_i16:
557+
; X64: # %bb.0:
558+
; X64-NEXT: movzwl 1(%rdi), %eax
559+
; X64-NEXT: retq
583560
%p0 = getelementptr i16, i16* %p, i32 0
584561
%p1 = getelementptr i16, i16* %p, i32 1
585562
%ld0 = load i16, i16 *%p0
@@ -589,31 +566,16 @@ define i16 @combine_fshl_load_i16(i16* %p) nounwind {
589566
}
590567

591568
define i32 @combine_fshl_load_i32(i32* %p) nounwind {
592-
; X86-FAST-LABEL: combine_fshl_load_i32:
593-
; X86-FAST: # %bb.0:
594-
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
595-
; X86-FAST-NEXT: movl 8(%eax), %ecx
596-
; X86-FAST-NEXT: movl 12(%eax), %eax
597-
; X86-FAST-NEXT: shldl $8, %ecx, %eax
598-
; X86-FAST-NEXT: retl
599-
;
600-
; X86-SLOW-LABEL: combine_fshl_load_i32:
601-
; X86-SLOW: # %bb.0:
602-
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
603-
; X86-SLOW-NEXT: movl 11(%eax), %eax
604-
; X86-SLOW-NEXT: retl
605-
;
606-
; X64-FAST-LABEL: combine_fshl_load_i32:
607-
; X64-FAST: # %bb.0:
608-
; X64-FAST-NEXT: movl 8(%rdi), %ecx
609-
; X64-FAST-NEXT: movl 12(%rdi), %eax
610-
; X64-FAST-NEXT: shldl $8, %ecx, %eax
611-
; X64-FAST-NEXT: retq
569+
; X86-LABEL: combine_fshl_load_i32:
570+
; X86: # %bb.0:
571+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
572+
; X86-NEXT: movl 11(%eax), %eax
573+
; X86-NEXT: retl
612574
;
613-
; X64-SLOW-LABEL: combine_fshl_load_i32:
614-
; X64-SLOW: # %bb.0:
615-
; X64-SLOW-NEXT: movl 11(%rdi), %eax
616-
; X64-SLOW-NEXT: retq
575+
; X64-LABEL: combine_fshl_load_i32:
576+
; X64: # %bb.0:
577+
; X64-NEXT: movl 11(%rdi), %eax
578+
; X64-NEXT: retq
617579
%p0 = getelementptr i32, i32* %p, i32 2
618580
%p1 = getelementptr i32, i32* %p, i32 3
619581
%ld0 = load i32, i32 *%p0
@@ -652,21 +614,10 @@ define i64 @combine_fshl_load_i64(i64* %p) nounwind {
652614
; X86-SLOW-NEXT: popl %esi
653615
; X86-SLOW-NEXT: retl
654616
;
655-
; X64-FAST-LABEL: combine_fshl_load_i64:
656-
; X64-FAST: # %bb.0:
657-
; X64-FAST-NEXT: movq 8(%rdi), %rcx
658-
; X64-FAST-NEXT: movq 16(%rdi), %rax
659-
; X64-FAST-NEXT: shldq $24, %rcx, %rax
660-
; X64-FAST-NEXT: retq
661-
;
662-
; X64-SLOW-LABEL: combine_fshl_load_i64:
663-
; X64-SLOW: # %bb.0:
664-
; X64-SLOW-NEXT: movq 8(%rdi), %rcx
665-
; X64-SLOW-NEXT: movq 16(%rdi), %rax
666-
; X64-SLOW-NEXT: shrq $40, %rcx
667-
; X64-SLOW-NEXT: shlq $24, %rax
668-
; X64-SLOW-NEXT: orq %rcx, %rax
669-
; X64-SLOW-NEXT: retq
617+
; X64-LABEL: combine_fshl_load_i64:
618+
; X64: # %bb.0:
619+
; X64-NEXT: movq 13(%rdi), %rax
620+
; X64-NEXT: retq
670621
%p0 = getelementptr i64, i64* %p, i64 1
671622
%p1 = getelementptr i64, i64* %p, i64 2
672623
%ld0 = load i64, i64 *%p0

llvm/test/CodeGen/X86/fshr.ll

+22-79
Original file line number | Diff line number | Diff line change
@@ -542,39 +542,16 @@ define i8 @combine_fshr_load_i8(i8* %p) nounwind {
542542
}
543543

544544
define i16 @combine_fshr_load_i16(i16* %p) nounwind {
545-
; X86-FAST-LABEL: combine_fshr_load_i16:
546-
; X86-FAST: # %bb.0:
547-
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
548-
; X86-FAST-NEXT: movzwl (%eax), %ecx
549-
; X86-FAST-NEXT: movzwl 2(%eax), %eax
550-
; X86-FAST-NEXT: shldw $8, %cx, %ax
551-
; X86-FAST-NEXT: retl
552-
;
553-
; X86-SLOW-LABEL: combine_fshr_load_i16:
554-
; X86-SLOW: # %bb.0:
555-
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
556-
; X86-SLOW-NEXT: movzwl 2(%ecx), %eax
557-
; X86-SLOW-NEXT: movzbl 1(%ecx), %ecx
558-
; X86-SLOW-NEXT: shll $8, %eax
559-
; X86-SLOW-NEXT: orl %ecx, %eax
560-
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
561-
; X86-SLOW-NEXT: retl
562-
;
563-
; X64-FAST-LABEL: combine_fshr_load_i16:
564-
; X64-FAST: # %bb.0:
565-
; X64-FAST-NEXT: movzwl (%rdi), %ecx
566-
; X64-FAST-NEXT: movzwl 2(%rdi), %eax
567-
; X64-FAST-NEXT: shldw $8, %cx, %ax
568-
; X64-FAST-NEXT: retq
545+
; X86-LABEL: combine_fshr_load_i16:
546+
; X86: # %bb.0:
547+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
548+
; X86-NEXT: movzwl 1(%eax), %eax
549+
; X86-NEXT: retl
569550
;
570-
; X64-SLOW-LABEL: combine_fshr_load_i16:
571-
; X64-SLOW: # %bb.0:
572-
; X64-SLOW-NEXT: movzwl 2(%rdi), %eax
573-
; X64-SLOW-NEXT: movzbl 1(%rdi), %ecx
574-
; X64-SLOW-NEXT: shll $8, %eax
575-
; X64-SLOW-NEXT: orl %ecx, %eax
576-
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
577-
; X64-SLOW-NEXT: retq
551+
; X64-LABEL: combine_fshr_load_i16:
552+
; X64: # %bb.0:
553+
; X64-NEXT: movzwl 1(%rdi), %eax
554+
; X64-NEXT: retq
578555
%p0 = getelementptr i16, i16* %p, i32 0
579556
%p1 = getelementptr i16, i16* %p, i32 1
580557
%ld0 = load i16, i16 *%p0
@@ -584,39 +561,16 @@ define i16 @combine_fshr_load_i16(i16* %p) nounwind {
584561
}
585562

586563
define i32 @combine_fshr_load_i32(i32* %p) nounwind {
587-
; X86-FAST-LABEL: combine_fshr_load_i32:
588-
; X86-FAST: # %bb.0:
589-
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
590-
; X86-FAST-NEXT: movl 8(%eax), %ecx
591-
; X86-FAST-NEXT: movl 12(%eax), %eax
592-
; X86-FAST-NEXT: shldl $24, %ecx, %eax
593-
; X86-FAST-NEXT: retl
594-
;
595-
; X86-SLOW-LABEL: combine_fshr_load_i32:
596-
; X86-SLOW: # %bb.0:
597-
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
598-
; X86-SLOW-NEXT: movl 8(%eax), %ecx
599-
; X86-SLOW-NEXT: movl 12(%eax), %eax
600-
; X86-SLOW-NEXT: shrl $8, %ecx
601-
; X86-SLOW-NEXT: shll $24, %eax
602-
; X86-SLOW-NEXT: orl %ecx, %eax
603-
; X86-SLOW-NEXT: retl
604-
;
605-
; X64-FAST-LABEL: combine_fshr_load_i32:
606-
; X64-FAST: # %bb.0:
607-
; X64-FAST-NEXT: movl 8(%rdi), %ecx
608-
; X64-FAST-NEXT: movl 12(%rdi), %eax
609-
; X64-FAST-NEXT: shldl $24, %ecx, %eax
610-
; X64-FAST-NEXT: retq
564+
; X86-LABEL: combine_fshr_load_i32:
565+
; X86: # %bb.0:
566+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
567+
; X86-NEXT: movl 9(%eax), %eax
568+
; X86-NEXT: retl
611569
;
612-
; X64-SLOW-LABEL: combine_fshr_load_i32:
613-
; X64-SLOW: # %bb.0:
614-
; X64-SLOW-NEXT: movl 8(%rdi), %ecx
615-
; X64-SLOW-NEXT: movl 12(%rdi), %eax
616-
; X64-SLOW-NEXT: shrl $8, %ecx
617-
; X64-SLOW-NEXT: shll $24, %eax
618-
; X64-SLOW-NEXT: orl %ecx, %eax
619-
; X64-SLOW-NEXT: retq
570+
; X64-LABEL: combine_fshr_load_i32:
571+
; X64: # %bb.0:
572+
; X64-NEXT: movl 9(%rdi), %eax
573+
; X64-NEXT: retq
620574
%p0 = getelementptr i32, i32* %p, i32 2
621575
%p1 = getelementptr i32, i32* %p, i32 3
622576
%ld0 = load i32, i32 *%p0
@@ -656,21 +610,10 @@ define i64 @combine_fshr_load_i64(i64* %p) nounwind {
656610
; X86-SLOW-NEXT: popl %esi
657611
; X86-SLOW-NEXT: retl
658612
;
659-
; X64-FAST-LABEL: combine_fshr_load_i64:
660-
; X64-FAST: # %bb.0:
661-
; X64-FAST-NEXT: movq 8(%rdi), %rcx
662-
; X64-FAST-NEXT: movq 16(%rdi), %rax
663-
; X64-FAST-NEXT: shldq $40, %rcx, %rax
664-
; X64-FAST-NEXT: retq
665-
;
666-
; X64-SLOW-LABEL: combine_fshr_load_i64:
667-
; X64-SLOW: # %bb.0:
668-
; X64-SLOW-NEXT: movq 8(%rdi), %rcx
669-
; X64-SLOW-NEXT: movq 16(%rdi), %rax
670-
; X64-SLOW-NEXT: shrq $24, %rcx
671-
; X64-SLOW-NEXT: shlq $40, %rax
672-
; X64-SLOW-NEXT: orq %rcx, %rax
673-
; X64-SLOW-NEXT: retq
613+
; X64-LABEL: combine_fshr_load_i64:
614+
; X64: # %bb.0:
615+
; X64-NEXT: movq 11(%rdi), %rax
616+
; X64-NEXT: retq
674617
%p0 = getelementptr i64, i64* %p, i64 1
675618
%p1 = getelementptr i64, i64* %p, i64 2
676619
%ld0 = load i64, i64 *%p0

0 commit comments

Comments
 (0)