Skip to content

Commit 23595d1

Browse files
[AArch64] Lower __builtin_bswap16 to rev16 if bswap followed by any_extend (llvm#105375)
GCC compiles the built-in function `__builtin_bswap16` to the ARM instruction `rev16`, which reverses the byte order of 16-bit data. On the other hand, Clang compiles the same built-in function to e.g. ```     rev     w8, w0         lsr     w0, w8, #16 ``` i.e. it performs a byte reversal of a 32-bit register (which moves the lower half, which contains the 16-bit data, to the upper half) and then right-shifts the reversed 16-bit data back to the lower half of the register. We can improve Clang codegen by generating `rev16` instead of `rev` and `lsr`, like GCC.
1 parent 69f8923 commit 23595d1

File tree

3 files changed

+97
-5
lines changed

3 files changed

+97
-5
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -22379,6 +22379,25 @@ static SDValue performExtendCombine(SDNode *N,
2237922379
N->getOperand(0)->getOpcode() == ISD::SETCC)
2238022380
return performSignExtendSetCCCombine(N, DCI, DAG);
2238122381

22382+
// If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22383+
// that the top half of the result register must be unused, due to the
22384+
// any_extend. This means that we can replace this pattern with (rev16
22385+
// (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22386+
// ...)), which is what this pattern would otherwise be lowered to.
22387+
// Only apply this optimisation if the original any_extend extends to i32 or
22388+
// i64, because this type will become the input type to REV16 in the new
22389+
// pattern, so must be a legitimate REV16 input type.
22390+
SDValue Bswap = N->getOperand(0);
22391+
if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22392+
Bswap.getValueType() == MVT::i16 &&
22393+
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
22394+
SDLoc DL(N);
22395+
SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
22396+
Bswap->getOperand(0));
22397+
return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
22398+
NewAnyExtend);
22399+
}
22400+
2238222401
return SDValue();
2238322402
}
2238422403

llvm/lib/Target/AArch64/AArch64InstrInfo.td

+5
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
758758
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
759759
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
760760

761+
def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;
762+
761763
def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
762764
def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
763765
def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
@@ -2840,6 +2842,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
28402842
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
28412843
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
28422844

2845+
def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
2846+
def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
2847+
28432848
def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
28442849
(and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
28452850
(REV16Xr GPR64:$Rn)>;

llvm/test/CodeGen/AArch64/bswap.ll

+73-5
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,85 @@
33
; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

55
; ====== Scalar Tests =====
6-
define i16 @bswap_i16(i16 %a){
7-
; CHECK-LABEL: bswap_i16:
6+
7+
; ====== Scalar bswap.i16 Tests =====
8+
define i16 @bswap_i16_to_i16_anyext(i16 %a){
9+
; CHECK-SD-LABEL: bswap_i16_to_i16_anyext:
10+
; CHECK-SD: // %bb.0:
11+
; CHECK-SD-NEXT: rev16 w0, w0
12+
; CHECK-SD-NEXT: ret
13+
;
14+
; CHECK-GI-LABEL: bswap_i16_to_i16_anyext:
15+
; CHECK-GI: // %bb.0:
16+
; CHECK-GI-NEXT: rev w8, w0
17+
; CHECK-GI-NEXT: lsr w0, w8, #16
18+
; CHECK-GI-NEXT: ret
19+
%3 = call i16 @llvm.bswap.i16(i16 %a)
20+
ret i16 %3
21+
}
22+
declare i16 @llvm.bswap.i16(i16)
23+
24+
; The zext here is optimised to an any_extend during isel.
25+
define i64 @bswap_i16_to_i64_anyext(i16 %a) {
26+
; CHECK-SD-LABEL: bswap_i16_to_i64_anyext:
27+
; CHECK-SD: // %bb.0:
28+
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
29+
; CHECK-SD-NEXT: rev16 x8, x0
30+
; CHECK-SD-NEXT: lsl x0, x8, #48
31+
; CHECK-SD-NEXT: ret
32+
;
33+
; CHECK-GI-LABEL: bswap_i16_to_i64_anyext:
34+
; CHECK-GI: // %bb.0:
35+
; CHECK-GI-NEXT: rev w8, w0
36+
; CHECK-GI-NEXT: lsr w8, w8, #16
37+
; CHECK-GI-NEXT: and x8, x8, #0xffff
38+
; CHECK-GI-NEXT: lsl x0, x8, #48
39+
; CHECK-GI-NEXT: ret
40+
%3 = call i16 @llvm.bswap.i16(i16 %a)
41+
%4 = zext i16 %3 to i64
42+
%5 = shl i64 %4, 48
43+
ret i64 %5
44+
}
45+
46+
; The zext here is optimised to an any_extend during isel.
47+
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
48+
; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
49+
; CHECK-SD: // %bb.0:
50+
; CHECK-SD-NEXT: mov w8, w0
51+
; CHECK-SD-NEXT: mov x0, xzr
52+
; CHECK-SD-NEXT: rev w8, w8
53+
; CHECK-SD-NEXT: lsr w8, w8, #16
54+
; CHECK-SD-NEXT: lsl x1, x8, #48
55+
; CHECK-SD-NEXT: ret
56+
;
57+
; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
58+
; CHECK-GI: // %bb.0:
59+
; CHECK-GI-NEXT: mov w8, w0
60+
; CHECK-GI-NEXT: mov x0, xzr
61+
; CHECK-GI-NEXT: rev w8, w8
62+
; CHECK-GI-NEXT: lsr w8, w8, #16
63+
; CHECK-GI-NEXT: bfi x8, x8, #32, #32
64+
; CHECK-GI-NEXT: and x8, x8, #0xffff
65+
; CHECK-GI-NEXT: lsl x1, x8, #48
66+
; CHECK-GI-NEXT: ret
67+
%3 = call i16 @llvm.bswap.i16(i16 %a)
68+
%4 = zext i16 %3 to i128
69+
%5 = shl i128 %4, 112
70+
ret i128 %5
71+
}
72+
73+
define i32 @bswap_i16_to_i32_zext(i16 %a){
74+
; CHECK-LABEL: bswap_i16_to_i32_zext:
875
; CHECK: // %bb.0:
976
; CHECK-NEXT: rev w8, w0
1077
; CHECK-NEXT: lsr w0, w8, #16
1178
; CHECK-NEXT: ret
12-
%3 = call i16 @llvm.bswap.i16(i16 %a)
13-
ret i16 %3
79+
%3 = call i16 @llvm.bswap.i16(i16 %a)
80+
%4 = zext i16 %3 to i32
81+
ret i32 %4
1482
}
15-
declare i16 @llvm.bswap.i16(i16)
1683

84+
; ====== Other scalar bswap tests =====
1785
define i32 @bswap_i32(i32 %a){
1886
; CHECK-LABEL: bswap_i32:
1987
; CHECK: // %bb.0:

0 commit comments

Comments
 (0)