Skip to content

Commit ab811e7

Browse files
authored
[AArch64] Fix BE popcount casts. (#129879)
A bitcast, being defined as a store followed by a load, can change the lane order. We need to use an NVCAST instead to keep the lanes coming out of the UADDV the same in big-endian. Extracting from a v2i64 vector keeps the types of the NVCAST legal, and also allows us to replace a lane mov with a mov of 0. Fixes #129843
1 parent 02f024c commit ab811e7

File tree

4 files changed

+21
-18
lines changed

4 files changed

+21
-18
lines changed

Diff for: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+8-2
Original file line numberDiff line numberDiff line change
@@ -10807,7 +10807,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1080710807
if (VT == MVT::i32)
1080810808
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
1080910809
DAG.getConstant(0, DL, MVT::i64));
10810-
AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10810+
else
10811+
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
10812+
DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
10813+
DAG.getConstant(0, DL, MVT::i64));
1081110814
if (IsParity)
1081210815
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
1081310816
return AddV;
@@ -10816,7 +10819,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1081610819

1081710820
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
1081810821
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10819-
AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10822+
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
10823+
DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
10824+
DAG.getConstant(0, DL, MVT::i64));
10825+
AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
1082010826
if (IsParity)
1082110827
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
1082210828
return AddV;

Diff for: llvm/test/CodeGen/AArch64/arm64-popcnt.ll

+3-5
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
129129
; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
130130
; CHECK-BE-NEXT: cnt v0.8b, v0.8b
131131
; CHECK-BE-NEXT: addv b0, v0.8b
132-
; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
133132
; CHECK-BE-NEXT: fmov x0, d0
134133
; CHECK-BE-NEXT: ret
135134
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
@@ -436,9 +435,9 @@ define i128 @cnt128(i128 %x) nounwind readnone {
436435
; CHECK: // %bb.0:
437436
; CHECK-NEXT: fmov d0, x0
438437
; CHECK-NEXT: mov.d v0[1], x1
438+
; CHECK-NEXT: mov x1, xzr
439439
; CHECK-NEXT: cnt.16b v0, v0
440440
; CHECK-NEXT: addv.16b b0, v0
441-
; CHECK-NEXT: mov.d x1, v0[1]
442441
; CHECK-NEXT: fmov x0, d0
443442
; CHECK-NEXT: ret
444443
;
@@ -481,13 +480,12 @@ define i128 @cnt128(i128 %x) nounwind readnone {
481480
; CHECK-BE-LABEL: cnt128:
482481
; CHECK-BE: // %bb.0:
483482
; CHECK-BE-NEXT: fmov d0, x0
483+
; CHECK-BE-NEXT: mov x0, xzr
484484
; CHECK-BE-NEXT: mov v0.d[1], x1
485485
; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
486486
; CHECK-BE-NEXT: cnt v0.16b, v0.16b
487487
; CHECK-BE-NEXT: addv b0, v0.16b
488-
; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
489-
; CHECK-BE-NEXT: mov x1, v0.d[1]
490-
; CHECK-BE-NEXT: fmov x0, d0
488+
; CHECK-BE-NEXT: fmov x1, d0
491489
; CHECK-BE-NEXT: ret
492490
%cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
493491
ret i128 %cnt

Diff for: llvm/test/CodeGen/AArch64/parity.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ define i32 @parity_64_trunc(i64 %x) {
159159
; CHECK-NEXT: fmov d0, x0
160160
; CHECK-NEXT: cnt v0.8b, v0.8b
161161
; CHECK-NEXT: addv b0, v0.8b
162-
; CHECK-NEXT: fmov x8, d0
162+
; CHECK-NEXT: fmov w8, s0
163163
; CHECK-NEXT: and w0, w8, #0x1
164164
; CHECK-NEXT: ret
165165
;

Diff for: llvm/test/CodeGen/AArch64/popcount.ll

+9-10
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
4141
; BE-NEXT: rev64 v0.16b, v0.16b
4242
; BE-NEXT: cnt v0.16b, v0.16b
4343
; BE-NEXT: addv b0, v0.16b
44-
; BE-NEXT: rev32 v0.16b, v0.16b
45-
; BE-NEXT: mov w0, v0.s[3]
44+
; BE-NEXT: rev64 v0.4s, v0.4s
45+
; BE-NEXT: mov w0, v0.s[1]
4646
; BE-NEXT: ret
4747
;
4848
; GISEL-LABEL: popcount128:
@@ -138,10 +138,10 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
138138
; BE-NEXT: cnt v1.16b, v1.16b
139139
; BE-NEXT: addv b0, v0.16b
140140
; BE-NEXT: addv b1, v1.16b
141-
; BE-NEXT: rev32 v0.16b, v0.16b
142-
; BE-NEXT: rev32 v1.16b, v1.16b
143-
; BE-NEXT: mov w8, v0.s[3]
144-
; BE-NEXT: mov w9, v1.s[3]
141+
; BE-NEXT: rev64 v0.4s, v0.4s
142+
; BE-NEXT: rev64 v1.4s, v1.4s
143+
; BE-NEXT: mov w8, v0.s[1]
144+
; BE-NEXT: mov w9, v1.s[1]
145145
; BE-NEXT: add w0, w9, w8
146146
; BE-NEXT: ret
147147
;
@@ -227,22 +227,21 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
227227
; CHECK: // %bb.0: // %Entry
228228
; CHECK-NEXT: fmov d0, x0
229229
; CHECK-NEXT: mov v0.d[1], x1
230+
; CHECK-NEXT: mov x1, xzr
230231
; CHECK-NEXT: cnt v0.16b, v0.16b
231232
; CHECK-NEXT: addv b0, v0.16b
232-
; CHECK-NEXT: mov x1, v0.d[1]
233233
; CHECK-NEXT: fmov x0, d0
234234
; CHECK-NEXT: ret
235235
;
236236
; BE-LABEL: popcount1x128:
237237
; BE: // %bb.0: // %Entry
238238
; BE-NEXT: fmov d0, x0
239+
; BE-NEXT: mov x0, xzr
239240
; BE-NEXT: mov v0.d[1], x1
240241
; BE-NEXT: rev64 v0.16b, v0.16b
241242
; BE-NEXT: cnt v0.16b, v0.16b
242243
; BE-NEXT: addv b0, v0.16b
243-
; BE-NEXT: rev64 v0.16b, v0.16b
244-
; BE-NEXT: mov x1, v0.d[1]
245-
; BE-NEXT: fmov x0, d0
244+
; BE-NEXT: fmov x1, d0
246245
; BE-NEXT: ret
247246
;
248247
; GISEL-LABEL: popcount1x128:

0 commit comments

Comments
 (0)