Skip to content

Commit 960c975

Browse files
authored
[AArch64] Expand scmp/ucmp vector operations with sub (#108830)
Unlike the scalar case, where AArch64 prefers expanding scmp/ucmp with selects, under Neon we can use the arithmetic (sub-based) expansion to generate fewer instructions. Notably, it also prevents the scalarization of vselect during vector legalization.
1 parent f5ba3e1 commit 960c975

File tree

9 files changed

+80
-341
lines changed

9 files changed

+80
-341
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -2451,7 +2451,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
24512451
CmpIntrinsic::getLTPredicate(IID),
24522452
CostKind);
24532453

2454-
if (TLI->shouldExpandCmpUsingSelects()) {
2454+
EVT VT = TLI->getValueType(DL, CmpTy, true);
2455+
if (TLI->shouldExpandCmpUsingSelects(VT)) {
24552456
// x < y ? -1 : (x > y ? 1 : 0)
24562457
Cost += 2 * thisT()->getCmpSelInstrCost(
24572458
BinaryOperator::Select, RetTy, CondTy,

llvm/include/llvm/CodeGen/TargetLowering.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -3409,7 +3409,7 @@ class TargetLoweringBase {
34093409

34103410
/// Should we expand [US]CMP nodes using two selects and two compares, or by
34113411
/// doing arithmetic on boolean types
3412-
virtual bool shouldExpandCmpUsingSelects() const { return false; }
3412+
virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; }
34133413

34143414
/// Does this target support complex deinterleaving
34153415
virtual bool isComplexDeinterleavingSupported() const { return false; }

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -10681,7 +10681,7 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
1068110681
// because one of the conditions can be merged with one of the selects.
1068210682
// And finally, if we don't know the contents of high bits of a boolean value
1068310683
// we can't perform any arithmetic either.
10684-
if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
10684+
if (shouldExpandCmpUsingSelects(VT) || BoolVT.getScalarSizeInBits() == 1 ||
1068510685
getBooleanContents(BoolVT) == UndefinedBooleanContent) {
1068610686
SDValue SelectZeroOrOne =
1068710687
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -27781,6 +27781,12 @@ bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
2778127781
return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
2778227782
}
2778327783

27784+
bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
27785+
// Expand scalar and SVE operations using selects. Neon vectors prefer sub to
27786+
// avoid vselect becoming bsl / unrolling.
27787+
return !VT.isFixedLengthVector();
27788+
}
27789+
2778427790
MachineInstr *
2778527791
AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
2778627792
MachineBasicBlock::instr_iterator &MBBI,

llvm/lib/Target/AArch64/AArch64ISelLowering.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -914,7 +914,7 @@ class AArch64TargetLowering : public TargetLowering {
914914

915915
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
916916

917-
bool shouldExpandCmpUsingSelects() const override { return true; }
917+
bool shouldExpandCmpUsingSelects(EVT VT) const override;
918918

919919
bool isComplexDeinterleavingSupported() const override;
920920
bool isComplexDeinterleavingOperationSupported(

llvm/lib/Target/SystemZ/SystemZISelLowering.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ class SystemZTargetLowering : public TargetLowering {
507507

508508
bool shouldConsiderGEPOffsetSplit() const override { return true; }
509509

510-
bool shouldExpandCmpUsingSelects() const override { return true; }
510+
bool shouldExpandCmpUsingSelects(EVT VT) const override { return true; }
511511

512512
const char *getTargetNodeName(unsigned Opcode) const override;
513513
std::pair<unsigned, const TargetRegisterClass *>

llvm/test/Analysis/CostModel/AArch64/cmp.ll

+12-12
Original file line numberDiff line numberDiff line change
@@ -128,33 +128,33 @@ define void @uscmp() {
128128
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
129129
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
130130
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
131-
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
132-
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
133-
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
131+
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
132+
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
133+
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
134134
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
135135
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
136136
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
137137
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
138-
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
139-
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
140-
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
138+
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
139+
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
140+
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
141141
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
142142
;
143143
; CHECK-SIZE-LABEL: 'uscmp'
144144
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u8 = call i8 @llvm.ucmp.i8.i8(i8 undef, i8 undef)
145145
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
146146
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
147147
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
148-
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
149-
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
150-
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
148+
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
149+
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
150+
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
151151
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
152152
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
153153
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
154154
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
155-
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
156-
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
157-
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
155+
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
156+
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
157+
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
158158
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
159159
;
160160
%u8 = call i8 @llvm.ucmp(i8 undef, i8 undef)

llvm/test/CodeGen/AArch64/scmp.ll

+28-162
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,9 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
136136
define <8 x i8> @s_v8i8(<8 x i8> %a, <8 x i8> %b) {
137137
; CHECK-SD-LABEL: s_v8i8:
138138
; CHECK-SD: // %bb.0: // %entry
139-
; CHECK-SD-NEXT: movi v2.8b, #1
140-
; CHECK-SD-NEXT: cmgt v3.8b, v0.8b, v1.8b
139+
; CHECK-SD-NEXT: cmgt v2.8b, v0.8b, v1.8b
141140
; CHECK-SD-NEXT: cmgt v0.8b, v1.8b, v0.8b
142-
; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
143-
; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
141+
; CHECK-SD-NEXT: sub v0.8b, v0.8b, v2.8b
144142
; CHECK-SD-NEXT: ret
145143
;
146144
; CHECK-GI-LABEL: s_v8i8:
@@ -160,11 +158,9 @@ entry:
160158
define <16 x i8> @s_v16i8(<16 x i8> %a, <16 x i8> %b) {
161159
; CHECK-SD-LABEL: s_v16i8:
162160
; CHECK-SD: // %bb.0: // %entry
163-
; CHECK-SD-NEXT: movi v2.16b, #1
164-
; CHECK-SD-NEXT: cmgt v3.16b, v0.16b, v1.16b
161+
; CHECK-SD-NEXT: cmgt v2.16b, v0.16b, v1.16b
165162
; CHECK-SD-NEXT: cmgt v0.16b, v1.16b, v0.16b
166-
; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
167-
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
163+
; CHECK-SD-NEXT: sub v0.16b, v0.16b, v2.16b
168164
; CHECK-SD-NEXT: ret
169165
;
170166
; CHECK-GI-LABEL: s_v16i8:
@@ -184,11 +180,9 @@ entry:
184180
define <4 x i16> @s_v4i16(<4 x i16> %a, <4 x i16> %b) {
185181
; CHECK-SD-LABEL: s_v4i16:
186182
; CHECK-SD: // %bb.0: // %entry
187-
; CHECK-SD-NEXT: movi v2.4h, #1
188-
; CHECK-SD-NEXT: cmgt v3.4h, v0.4h, v1.4h
183+
; CHECK-SD-NEXT: cmgt v2.4h, v0.4h, v1.4h
189184
; CHECK-SD-NEXT: cmgt v0.4h, v1.4h, v0.4h
190-
; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
191-
; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
185+
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v2.4h
192186
; CHECK-SD-NEXT: ret
193187
;
194188
; CHECK-GI-LABEL: s_v4i16:
@@ -208,11 +202,9 @@ entry:
208202
define <8 x i16> @s_v8i16(<8 x i16> %a, <8 x i16> %b) {
209203
; CHECK-SD-LABEL: s_v8i16:
210204
; CHECK-SD: // %bb.0: // %entry
211-
; CHECK-SD-NEXT: movi v2.8h, #1
212-
; CHECK-SD-NEXT: cmgt v3.8h, v0.8h, v1.8h
205+
; CHECK-SD-NEXT: cmgt v2.8h, v0.8h, v1.8h
213206
; CHECK-SD-NEXT: cmgt v0.8h, v1.8h, v0.8h
214-
; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
215-
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
207+
; CHECK-SD-NEXT: sub v0.8h, v0.8h, v2.8h
216208
; CHECK-SD-NEXT: ret
217209
;
218210
; CHECK-GI-LABEL: s_v8i16:
@@ -232,15 +224,12 @@ entry:
232224
define <16 x i16> @s_v16i16(<16 x i16> %a, <16 x i16> %b) {
233225
; CHECK-SD-LABEL: s_v16i16:
234226
; CHECK-SD: // %bb.0: // %entry
235-
; CHECK-SD-NEXT: movi v4.8h, #1
227+
; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
236228
; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
237-
; CHECK-SD-NEXT: cmgt v6.8h, v1.8h, v3.8h
238229
; CHECK-SD-NEXT: cmgt v0.8h, v2.8h, v0.8h
239230
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
240-
; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
241-
; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
242-
; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
243-
; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
231+
; CHECK-SD-NEXT: sub v0.8h, v0.8h, v5.8h
232+
; CHECK-SD-NEXT: sub v1.8h, v1.8h, v4.8h
244233
; CHECK-SD-NEXT: ret
245234
;
246235
; CHECK-GI-LABEL: s_v16i16:
@@ -264,11 +253,9 @@ entry:
264253
define <2 x i32> @s_v2i32(<2 x i32> %a, <2 x i32> %b) {
265254
; CHECK-SD-LABEL: s_v2i32:
266255
; CHECK-SD: // %bb.0: // %entry
267-
; CHECK-SD-NEXT: movi v2.2s, #1
268-
; CHECK-SD-NEXT: cmgt v3.2s, v0.2s, v1.2s
256+
; CHECK-SD-NEXT: cmgt v2.2s, v0.2s, v1.2s
269257
; CHECK-SD-NEXT: cmgt v0.2s, v1.2s, v0.2s
270-
; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
271-
; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
258+
; CHECK-SD-NEXT: sub v0.2s, v0.2s, v2.2s
272259
; CHECK-SD-NEXT: ret
273260
;
274261
; CHECK-GI-LABEL: s_v2i32:
@@ -288,11 +275,9 @@ entry:
288275
define <4 x i32> @s_v4i32(<4 x i32> %a, <4 x i32> %b) {
289276
; CHECK-SD-LABEL: s_v4i32:
290277
; CHECK-SD: // %bb.0: // %entry
291-
; CHECK-SD-NEXT: movi v2.4s, #1
292-
; CHECK-SD-NEXT: cmgt v3.4s, v0.4s, v1.4s
278+
; CHECK-SD-NEXT: cmgt v2.4s, v0.4s, v1.4s
293279
; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s
294-
; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
295-
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
280+
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
296281
; CHECK-SD-NEXT: ret
297282
;
298283
; CHECK-GI-LABEL: s_v4i32:
@@ -312,15 +297,12 @@ entry:
312297
define <8 x i32> @s_v8i32(<8 x i32> %a, <8 x i32> %b) {
313298
; CHECK-SD-LABEL: s_v8i32:
314299
; CHECK-SD: // %bb.0: // %entry
315-
; CHECK-SD-NEXT: movi v4.4s, #1
300+
; CHECK-SD-NEXT: cmgt v4.4s, v1.4s, v3.4s
316301
; CHECK-SD-NEXT: cmgt v5.4s, v0.4s, v2.4s
317-
; CHECK-SD-NEXT: cmgt v6.4s, v1.4s, v3.4s
318302
; CHECK-SD-NEXT: cmgt v0.4s, v2.4s, v0.4s
319303
; CHECK-SD-NEXT: cmgt v1.4s, v3.4s, v1.4s
320-
; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
321-
; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
322-
; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
323-
; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
304+
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v5.4s
305+
; CHECK-SD-NEXT: sub v1.4s, v1.4s, v4.4s
324306
; CHECK-SD-NEXT: ret
325307
;
326308
; CHECK-GI-LABEL: s_v8i32:
@@ -344,12 +326,9 @@ entry:
344326
define <2 x i64> @s_v2i64(<2 x i64> %a, <2 x i64> %b) {
345327
; CHECK-SD-LABEL: s_v2i64:
346328
; CHECK-SD: // %bb.0: // %entry
347-
; CHECK-SD-NEXT: mov w8, #1 // =0x1
348329
; CHECK-SD-NEXT: cmgt v2.2d, v0.2d, v1.2d
349330
; CHECK-SD-NEXT: cmgt v0.2d, v1.2d, v0.2d
350-
; CHECK-SD-NEXT: dup v3.2d, x8
351-
; CHECK-SD-NEXT: and v1.16b, v2.16b, v3.16b
352-
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
331+
; CHECK-SD-NEXT: sub v0.2d, v0.2d, v2.2d
353332
; CHECK-SD-NEXT: ret
354333
;
355334
; CHECK-GI-LABEL: s_v2i64:
@@ -370,16 +349,12 @@ entry:
370349
define <4 x i64> @s_v4i64(<4 x i64> %a, <4 x i64> %b) {
371350
; CHECK-SD-LABEL: s_v4i64:
372351
; CHECK-SD: // %bb.0: // %entry
373-
; CHECK-SD-NEXT: mov w8, #1 // =0x1
374-
; CHECK-SD-NEXT: cmgt v4.2d, v0.2d, v2.2d
375-
; CHECK-SD-NEXT: cmgt v6.2d, v1.2d, v3.2d
376-
; CHECK-SD-NEXT: dup v5.2d, x8
352+
; CHECK-SD-NEXT: cmgt v4.2d, v1.2d, v3.2d
353+
; CHECK-SD-NEXT: cmgt v5.2d, v0.2d, v2.2d
377354
; CHECK-SD-NEXT: cmgt v0.2d, v2.2d, v0.2d
378355
; CHECK-SD-NEXT: cmgt v1.2d, v3.2d, v1.2d
379-
; CHECK-SD-NEXT: and v2.16b, v4.16b, v5.16b
380-
; CHECK-SD-NEXT: and v3.16b, v6.16b, v5.16b
381-
; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
382-
; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
356+
; CHECK-SD-NEXT: sub v0.2d, v0.2d, v5.2d
357+
; CHECK-SD-NEXT: sub v1.2d, v1.2d, v4.2d
383358
; CHECK-SD-NEXT: ret
384359
;
385360
; CHECK-GI-LABEL: s_v4i64:
@@ -404,122 +379,13 @@ entry:
404379
define <16 x i8> @signOf_neon_scmp(<8 x i16> %s0_lo, <8 x i16> %s0_hi, <8 x i16> %s1_lo, <8 x i16> %s1_hi) {
405380
; CHECK-SD-LABEL: signOf_neon_scmp:
406381
; CHECK-SD: // %bb.0: // %entry
407-
; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
408-
; CHECK-SD-NEXT: cmgt v2.8h, v2.8h, v0.8h
409382
; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
410383
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
411-
; CHECK-SD-NEXT: umov w8, v5.h[1]
412-
; CHECK-SD-NEXT: umov w9, v2.h[1]
413-
; CHECK-SD-NEXT: umov w10, v5.h[0]
414-
; CHECK-SD-NEXT: umov w11, v2.h[0]
415-
; CHECK-SD-NEXT: tst w8, #0xffff
416-
; CHECK-SD-NEXT: cset w8, ne
417-
; CHECK-SD-NEXT: tst w9, #0xffff
418-
; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
419-
; CHECK-SD-NEXT: tst w10, #0xffff
420-
; CHECK-SD-NEXT: umov w10, v5.h[2]
421-
; CHECK-SD-NEXT: cset w9, ne
422-
; CHECK-SD-NEXT: tst w11, #0xffff
423-
; CHECK-SD-NEXT: umov w11, v2.h[2]
424-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
425-
; CHECK-SD-NEXT: fmov s0, w9
426-
; CHECK-SD-NEXT: tst w10, #0xffff
427-
; CHECK-SD-NEXT: umov w10, v2.h[3]
428-
; CHECK-SD-NEXT: cset w9, ne
429-
; CHECK-SD-NEXT: tst w11, #0xffff
430-
; CHECK-SD-NEXT: mov v0.b[1], w8
431-
; CHECK-SD-NEXT: umov w8, v5.h[3]
432-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
433-
; CHECK-SD-NEXT: mov v0.b[2], w9
434-
; CHECK-SD-NEXT: tst w8, #0xffff
435-
; CHECK-SD-NEXT: umov w8, v5.h[4]
436-
; CHECK-SD-NEXT: cset w9, ne
437-
; CHECK-SD-NEXT: tst w10, #0xffff
438-
; CHECK-SD-NEXT: umov w10, v2.h[4]
439-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
440-
; CHECK-SD-NEXT: mov v0.b[3], w9
441-
; CHECK-SD-NEXT: tst w8, #0xffff
442-
; CHECK-SD-NEXT: umov w8, v5.h[5]
443-
; CHECK-SD-NEXT: cset w9, ne
444-
; CHECK-SD-NEXT: tst w10, #0xffff
445-
; CHECK-SD-NEXT: umov w10, v2.h[5]
446-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
447-
; CHECK-SD-NEXT: mov v0.b[4], w9
448-
; CHECK-SD-NEXT: tst w8, #0xffff
449-
; CHECK-SD-NEXT: umov w8, v5.h[6]
450-
; CHECK-SD-NEXT: cset w9, ne
451-
; CHECK-SD-NEXT: tst w10, #0xffff
452-
; CHECK-SD-NEXT: umov w10, v2.h[6]
453-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
454-
; CHECK-SD-NEXT: mov v0.b[5], w9
455-
; CHECK-SD-NEXT: umov w9, v5.h[7]
456-
; CHECK-SD-NEXT: tst w8, #0xffff
457-
; CHECK-SD-NEXT: cset w8, ne
458-
; CHECK-SD-NEXT: tst w10, #0xffff
459-
; CHECK-SD-NEXT: umov w10, v2.h[7]
460-
; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
461-
; CHECK-SD-NEXT: mov v0.b[6], w8
462-
; CHECK-SD-NEXT: tst w9, #0xffff
463-
; CHECK-SD-NEXT: umov w8, v4.h[0]
464-
; CHECK-SD-NEXT: cset w9, ne
465-
; CHECK-SD-NEXT: tst w10, #0xffff
466-
; CHECK-SD-NEXT: umov w10, v1.h[0]
467-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
468-
; CHECK-SD-NEXT: mov v0.b[7], w9
469-
; CHECK-SD-NEXT: tst w8, #0xffff
470-
; CHECK-SD-NEXT: umov w8, v4.h[1]
471-
; CHECK-SD-NEXT: cset w9, ne
472-
; CHECK-SD-NEXT: tst w10, #0xffff
473-
; CHECK-SD-NEXT: umov w10, v1.h[1]
474-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
475-
; CHECK-SD-NEXT: mov v0.b[8], w9
476-
; CHECK-SD-NEXT: tst w8, #0xffff
477-
; CHECK-SD-NEXT: umov w8, v4.h[2]
478-
; CHECK-SD-NEXT: cset w9, ne
479-
; CHECK-SD-NEXT: tst w10, #0xffff
480-
; CHECK-SD-NEXT: umov w10, v1.h[2]
481-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
482-
; CHECK-SD-NEXT: mov v0.b[9], w9
483-
; CHECK-SD-NEXT: tst w8, #0xffff
484-
; CHECK-SD-NEXT: umov w8, v4.h[3]
485-
; CHECK-SD-NEXT: cset w9, ne
486-
; CHECK-SD-NEXT: tst w10, #0xffff
487-
; CHECK-SD-NEXT: umov w10, v1.h[3]
488-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
489-
; CHECK-SD-NEXT: mov v0.b[10], w9
490-
; CHECK-SD-NEXT: tst w8, #0xffff
491-
; CHECK-SD-NEXT: umov w8, v4.h[4]
492-
; CHECK-SD-NEXT: cset w9, ne
493-
; CHECK-SD-NEXT: tst w10, #0xffff
494-
; CHECK-SD-NEXT: umov w10, v1.h[4]
495-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
496-
; CHECK-SD-NEXT: mov v0.b[11], w9
497-
; CHECK-SD-NEXT: tst w8, #0xffff
498-
; CHECK-SD-NEXT: umov w8, v4.h[5]
499-
; CHECK-SD-NEXT: cset w9, ne
500-
; CHECK-SD-NEXT: tst w10, #0xffff
501-
; CHECK-SD-NEXT: umov w10, v1.h[5]
502-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
503-
; CHECK-SD-NEXT: mov v0.b[12], w9
504-
; CHECK-SD-NEXT: tst w8, #0xffff
505-
; CHECK-SD-NEXT: umov w8, v4.h[6]
506-
; CHECK-SD-NEXT: cset w9, ne
507-
; CHECK-SD-NEXT: tst w10, #0xffff
508-
; CHECK-SD-NEXT: umov w10, v1.h[6]
509-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
510-
; CHECK-SD-NEXT: mov v0.b[13], w9
511-
; CHECK-SD-NEXT: tst w8, #0xffff
512-
; CHECK-SD-NEXT: umov w8, v4.h[7]
513-
; CHECK-SD-NEXT: cset w9, ne
514-
; CHECK-SD-NEXT: tst w10, #0xffff
515-
; CHECK-SD-NEXT: umov w10, v1.h[7]
516-
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
517-
; CHECK-SD-NEXT: mov v0.b[14], w9
518-
; CHECK-SD-NEXT: tst w8, #0xffff
519-
; CHECK-SD-NEXT: cset w8, ne
520-
; CHECK-SD-NEXT: tst w10, #0xffff
521-
; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
522-
; CHECK-SD-NEXT: mov v0.b[15], w8
384+
; CHECK-SD-NEXT: cmgt v3.8h, v0.8h, v2.8h
385+
; CHECK-SD-NEXT: cmgt v0.8h, v2.8h, v0.8h
386+
; CHECK-SD-NEXT: sub v1.8h, v1.8h, v4.8h
387+
; CHECK-SD-NEXT: sub v0.8h, v0.8h, v3.8h
388+
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b
523389
; CHECK-SD-NEXT: ret
524390
;
525391
; CHECK-GI-LABEL: signOf_neon_scmp:

0 commit comments

Comments
 (0)