Commit ad39049

[DAGCombiner] Attempt to fold 'add' nodes to funnel-shift or rotate (#125612)
Almost all of the rotate idioms that are valid for an 'or' are also valid when the halves are combined with an 'add': for in-range shift amounts the two shifted halves occupy disjoint bits, so no carry can occur and the 'add' computes the same value as the 'or'. Further, many of these cases are not handled by common-bits tracking, meaning the 'add' is not converted to a 'disjoint or' and would otherwise be missed.
1 parent: 412f7fa · commit: ad39049
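
As a concrete illustration, a minimal IR sketch of the simplest idiom this unlocks, mirroring the test_rotl_var tests updated below (the function name is illustrative):

define i32 @rotl_via_add(i32 %x, i32 %y) {
  %shl = shl i32 %x, %y
  %sub = sub i32 32, %y
  %shr = lshr i32 %x, %sub
  ; previously only the 'or' form was matched; this now folds to rotl as well
  %ret = add i32 %shl, %shr
  ret i32 %ret
}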

5 files changed: +92, -158 lines

Diff for: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+58, -58)

@@ -649,14 +649,15 @@ namespace {
                                bool DemandHighBits = true);
     SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
     SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
-                              SDValue InnerPos, SDValue InnerNeg, bool HasPos,
-                              unsigned PosOpcode, unsigned NegOpcode,
-                              const SDLoc &DL);
+                              SDValue InnerPos, SDValue InnerNeg, bool FromAdd,
+                              bool HasPos, unsigned PosOpcode,
+                              unsigned NegOpcode, const SDLoc &DL);
     SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
-                              SDValue InnerPos, SDValue InnerNeg, bool HasPos,
-                              unsigned PosOpcode, unsigned NegOpcode,
-                              const SDLoc &DL);
-    SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+                              SDValue InnerPos, SDValue InnerNeg, bool FromAdd,
+                              bool HasPos, unsigned PosOpcode,
+                              unsigned NegOpcode, const SDLoc &DL);
+    SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
+                        bool FromAdd);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue mergeTruncStores(StoreSDNode *N);
     SDValue reduceLoadWidth(SDNode *N);
@@ -2986,6 +2987,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DL, DAG))
     return V;
 
+  if (SDValue V = MatchRotate(N0, N1, SDLoc(N), /*FromAdd=*/true))
+    return V;
+
   // Try to match AVGFLOOR fixedwidth pattern
   if (SDValue V = foldAddToAvg(N, DL))
     return V;
@@ -8175,7 +8179,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     return V;
 
   // See if this is some rotate idiom.
-  if (SDValue Rot = MatchRotate(N0, N1, DL))
+  if (SDValue Rot = MatchRotate(N0, N1, DL, /*FromAdd=*/false))
     return Rot;
 
   if (SDValue Load = MatchLoadCombine(N))
@@ -8364,7 +8368,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
 // The IsRotate flag should be set when the LHS of both shifts is the same.
 // Otherwise if matching a general funnel shift, it should be clear.
 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
-                           SelectionDAG &DAG, bool IsRotate) {
+                           SelectionDAG &DAG, bool IsRotate, bool FromAdd) {
   const auto &TLI = DAG.getTargetLoweringInfo();
   // If EltSize is a power of 2 then:
   //
@@ -8403,7 +8407,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
   // NOTE: We can only do this when matching operations which won't modify the
   // least Log2(EltSize) significant bits and not a general funnel shift.
   unsigned MaskLoBits = 0;
-  if (IsRotate && isPowerOf2_64(EltSize)) {
+  if (IsRotate && !FromAdd && isPowerOf2_64(EltSize)) {
     unsigned Bits = Log2_64(EltSize);
     unsigned NegBits = Neg.getScalarValueSizeInBits();
     if (NegBits >= Bits) {
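
The new !FromAdd bail-out above is the one place where 'add' is strictly weaker than 'or': stripping an explicit mask of the low Log2(EltSize) bits from the negated amount relies on the degenerate case being harmless. For y == 0 both halves collapse to x, and or(x, x) == x is still rotl(x, 0), but add(x, x) == 2*x is not, so the masked form keeps requiring an 'or' root. A hypothetical IR sketch of the masked idiom that stays 'or'-only (illustrative name):

define i32 @rotl_masked(i32 %x, i32 %y) {
  %shl = shl i32 %x, %y
  %neg = sub i32 32, %y
  %amt = and i32 %neg, 31        ; masks to the low log2(32) bits
  %shr = lshr i32 %x, %amt
  ; with 'add' in place of this 'or', FromAdd=true rejects the fold
  %ret = or i32 %shl, %shr
  ret i32 %ret
}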
@@ -8486,22 +8490,21 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
 // Neg with outer conversions stripped away.
 SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
                                        SDValue Neg, SDValue InnerPos,
-                                       SDValue InnerNeg, bool HasPos,
-                                       unsigned PosOpcode, unsigned NegOpcode,
-                                       const SDLoc &DL) {
-  // fold (or (shl x, (*ext y)),
-  //          (srl x, (*ext (sub 32, y)))) ->
+                                       SDValue InnerNeg, bool FromAdd,
+                                       bool HasPos, unsigned PosOpcode,
+                                       unsigned NegOpcode, const SDLoc &DL) {
+  // fold (or/add (shl x, (*ext y)),
+  //              (srl x, (*ext (sub 32, y)))) ->
   //   (rotl x, y) or (rotr x, (sub 32, y))
   //
-  // fold (or (shl x, (*ext (sub 32, y))),
-  //          (srl x, (*ext y))) ->
+  // fold (or/add (shl x, (*ext (sub 32, y))),
+  //              (srl x, (*ext y))) ->
   //   (rotr x, y) or (rotl x, (sub 32, y))
   EVT VT = Shifted.getValueType();
   if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
-                     /*IsRotate*/ true)) {
+                     /*IsRotate*/ true, FromAdd))
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                        HasPos ? Pos : Neg);
-  }
 
   return SDValue();
 }
@@ -8514,30 +8517,30 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
 // TODO: Merge with MatchRotatePosNeg.
 SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
                                        SDValue Neg, SDValue InnerPos,
-                                       SDValue InnerNeg, bool HasPos,
-                                       unsigned PosOpcode, unsigned NegOpcode,
-                                       const SDLoc &DL) {
+                                       SDValue InnerNeg, bool FromAdd,
+                                       bool HasPos, unsigned PosOpcode,
+                                       unsigned NegOpcode, const SDLoc &DL) {
   EVT VT = N0.getValueType();
   unsigned EltBits = VT.getScalarSizeInBits();
 
-  // fold (or (shl x0, (*ext y)),
-  //          (srl x1, (*ext (sub 32, y)))) ->
+  // fold (or/add (shl x0, (*ext y)),
+  //              (srl x1, (*ext (sub 32, y)))) ->
   //   (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
   //
-  // fold (or (shl x0, (*ext (sub 32, y))),
-  //          (srl x1, (*ext y))) ->
+  // fold (or/add (shl x0, (*ext (sub 32, y))),
+  //              (srl x1, (*ext y))) ->
   //   (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
-  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
+  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1,
+                     FromAdd))
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
                        HasPos ? Pos : Neg);
-  }
 
   // Matching the shift+xor cases, we can't easily use the xor'd shift amount
   // so for now just use the PosOpcode case if its legal.
   // TODO: When can we use the NegOpcode case?
   if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
     SDValue X;
-    // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
+    // fold (or/add (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
     // -> (fshl x0, x1, y)
     if (sd_match(N1, m_Srl(m_Value(X), m_One())) &&
         sd_match(InnerNeg,
@@ -8546,7 +8549,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
       return DAG.getNode(ISD::FSHL, DL, VT, N0, X, Pos);
     }
 
-    // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
+    // fold (or/add (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
     // -> (fshr x0, x1, y)
     if (sd_match(N0, m_Shl(m_Value(X), m_One())) &&
         sd_match(InnerPos,
@@ -8555,7 +8558,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
       return DAG.getNode(ISD::FSHR, DL, VT, X, N1, Neg);
     }
 
-    // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
+    // fold (or/add (shl (add x0, x0), (xor y, 31)), (srl x1, y))
     // -> (fshr x0, x1, y)
     // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
     if (sd_match(N0, m_Add(m_Value(X), m_Deferred(X))) &&
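
These xor-based special cases are exercised by test_fshr_special_case in the AMDGPU test below. A sketch of the add variant, with an illustrative function name; the pre-shift by 1 lets (xor y, 31), which equals 31 - y for in-range y, stand in for the full 32 - y amount:

define i32 @fshr_via_add(i32 %x0, i32 %x1, i32 %y) {
  %shr = lshr i32 %x1, %y
  %pre = shl i32 %x0, 1          ; pre-shift so the xor'd amount totals 32 - y
  %inv = xor i32 %y, 31
  %shl = shl i32 %pre, %inv
  ; folds to (fshr x0, x1, y) exactly as the 'or' form does
  %ret = add i32 %shl, %shr
  ret i32 %ret
}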
@@ -8569,11 +8572,12 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
   return SDValue();
 }
 
-// MatchRotate - Handle an 'or' of two operands. If this is one of the many
-// idioms for rotate, and if the target supports rotation instructions, generate
-// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
-// with different shifted sources.
-SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
+// MatchRotate - Handle an 'or' or 'add' of two operands. If this is one of the
+// many idioms for rotate, and if the target supports rotation instructions,
+// generate a rot[lr]. This also matches funnel shift patterns, similar to
+// rotation but with different shifted sources.
+SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
+                                 bool FromAdd) {
   EVT VT = LHS.getValueType();
 
   // The target must have at least one rotate/funnel flavor.
@@ -8600,9 +8604,9 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
   if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
       LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
     assert(LHS.getValueType() == RHS.getValueType());
-    if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+    if (SDValue Rot =
+            MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL, FromAdd))
       return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
-    }
   }
 
   // Match "(X shl/srl V1) & V2" where V2 may not be present.
@@ -8736,10 +8740,10 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
     return SDValue(); // Requires funnel shift support.
   }
 
-  // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
-  // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
-  // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
-  // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
+  // fold (or/add (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+  // fold (or/add (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+  // fold (or/add (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
+  // fold (or/add (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
   // iff C1+C2 == EltSizeInBits
   if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
     SDValue Res;
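
For the constant-amount forms, a minimal sketch under the same assumptions (illustrative name; the two shift amounts must sum to the element width for the fold to fire):

define i32 @fshl_const_via_add(i32 %x, i32 %y) {
  %hi = shl i32 %x, 8
  %lo = lshr i32 %y, 24
  ; C1 + C2 == 32, so this folds to (fshl x, y, 8); with %y == %x it is a rotate
  %ret = add i32 %hi, %lo
  ret i32 %ret
}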
@@ -8782,29 +8786,25 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
   }
 
   if (IsRotate && (HasROTL || HasROTR)) {
-    SDValue TryL =
-        MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
-                          RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL);
-    if (TryL)
+    if (SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+                                         LExtOp0, RExtOp0, FromAdd, HasROTL,
+                                         ISD::ROTL, ISD::ROTR, DL))
       return TryL;
 
-    SDValue TryR =
-        MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
-                          LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL);
-    if (TryR)
+    if (SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+                                         RExtOp0, LExtOp0, FromAdd, HasROTR,
+                                         ISD::ROTR, ISD::ROTL, DL))
      return TryR;
   }
 
-  SDValue TryL =
-      MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
-                        LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL);
-  if (TryL)
+  if (SDValue TryL = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt,
+                                       RHSShiftAmt, LExtOp0, RExtOp0, FromAdd,
+                                       HasFSHL, ISD::FSHL, ISD::FSHR, DL))
     return TryL;
 
-  SDValue TryR =
-      MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
-                        RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL);
-  if (TryR)
+  if (SDValue TryR = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt,
+                                       LHSShiftAmt, RExtOp0, LExtOp0, FromAdd,
+                                       HasFSHR, ISD::FSHR, ISD::FSHL, DL))
     return TryR;
 
   return SDValue();

Diff for: llvm/test/CodeGen/AMDGPU/rotate-add.ll (+6, -24)

@@ -44,19 +44,15 @@ define i32 @test_rotl_var(i32 %x, i32 %y) {
 ; SI-LABEL: test_rotl_var:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT:    v_alignbit_b32 v0, v0, v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_rotl_var:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_alignbit_b32 v0, v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %x, %y
   %sub = sub i32 32, %y
@@ -69,19 +65,13 @@ define i32 @test_rotr_var(i32 %x, i32 %y) {
 ; SI-LABEL: test_rotr_var:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, v1, v0
-; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT:    v_alignbit_b32 v0, v0, v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_rotr_var:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v2, v1, v0
-; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_alignbit_b32 v0, v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %shr = lshr i32 %x, %y
   %sub = sub i32 32, %y
@@ -174,21 +164,13 @@ define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
 ; SI-LABEL: test_fshr_special_case:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_xor_b32_e32 v2, 31, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_fshr_special_case:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; VI-NEXT:    v_xor_b32_e32 v2, 31, v2
-; VI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr i32 %x1, %y
   %srli = shl i32 %x0, 1

Diff for: llvm/test/CodeGen/ARM/rotate-add.ll (+2, -5)

@@ -29,9 +29,8 @@ define i32 @test_simple_rotr(i32 %x) {
 define i32 @test_rotl_var(i32 %x, i32 %y) {
 ; CHECK-LABEL: test_rotl_var:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsl r2, r0, r1
 ; CHECK-NEXT:    rsb r1, r1, #32
-; CHECK-NEXT:    add r0, r2, r0, lsr r1
+; CHECK-NEXT:    ror r0, r0, r1
 ; CHECK-NEXT:    bx lr
   %shl = shl i32 %x, %y
   %sub = sub i32 32, %y
@@ -43,9 +42,7 @@ define i32 @test_rotl_var(i32 %x, i32 %y) {
 define i32 @test_rotr_var(i32 %x, i32 %y) {
 ; CHECK-LABEL: test_rotr_var:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsr r2, r0, r1
-; CHECK-NEXT:    rsb r1, r1, #32
-; CHECK-NEXT:    add r0, r2, r0, lsl r1
+; CHECK-NEXT:    ror r0, r0, r1
 ; CHECK-NEXT:    bx lr
   %shr = lshr i32 %x, %y
   %sub = sub i32 32, %y
