Skip to content

[AArch64] Expand scmp/ucmp vector operations with sub #108830

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2451,7 +2451,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
CmpIntrinsic::getLTPredicate(IID),
CostKind);

if (TLI->shouldExpandCmpUsingSelects()) {
EVT VT = TLI->getValueType(DL, CmpTy, true);
if (TLI->shouldExpandCmpUsingSelects(VT)) {
// x < y ? -1 : (x > y ? 1 : 0)
Cost += 2 * thisT()->getCmpSelInstrCost(
BinaryOperator::Select, RetTy, CondTy,
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -3409,7 +3409,7 @@ class TargetLoweringBase {

/// Should we expand [US]CMP nodes using two selects and two compares, or by
/// doing arithmetic on boolean types?
virtual bool shouldExpandCmpUsingSelects() const { return false; }
virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; }

/// Does this target support complex deinterleaving
virtual bool isComplexDeinterleavingSupported() const { return false; }
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10681,7 +10681,7 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
// because one of the conditions can be merged with one of the selects.
// And finally, if we don't know the contents of high bits of a boolean value
// we can't perform any arithmetic either.
if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
if (shouldExpandCmpUsingSelects(VT) || BoolVT.getScalarSizeInBits() == 1 ||
getBooleanContents(BoolVT) == UndefinedBooleanContent) {
SDValue SelectZeroOrOne =
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27781,6 +27781,12 @@ bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
}

bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
  // Neon (fixed-length) vectors prefer the sub-based expansion, which avoids
  // vselect becoming bsl / the operation being unrolled.
  if (VT.isFixedLengthVector())
    return false;

  // Scalar and SVE operations are expanded using selects.
  return true;
}

MachineInstr *
AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MBBI,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ class AArch64TargetLowering : public TargetLowering {

bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;

bool shouldExpandCmpUsingSelects() const override { return true; }
bool shouldExpandCmpUsingSelects(EVT VT) const override;

bool isComplexDeinterleavingSupported() const override;
bool isComplexDeinterleavingOperationSupported(
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/SystemZ/SystemZISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ class SystemZTargetLowering : public TargetLowering {

bool shouldConsiderGEPOffsetSplit() const override { return true; }

bool shouldExpandCmpUsingSelects() const override { return true; }
bool shouldExpandCmpUsingSelects(EVT VT) const override { return true; }

const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/Analysis/CostModel/AArch64/cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -128,33 +128,33 @@ define void @uscmp() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-SIZE-LABEL: 'uscmp'
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u8 = call i8 @llvm.ucmp.i8.i8(i8 undef, i8 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%u8 = call i8 @llvm.ucmp(i8 undef, i8 undef)
Expand Down
190 changes: 28 additions & 162 deletions llvm/test/CodeGen/AArch64/scmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,9 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
define <8 x i8> @s_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-LABEL: s_v8i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v2.8b, #1
; CHECK-SD-NEXT: cmgt v3.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: cmgt v2.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: cmgt v0.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: sub v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i8:
Expand All @@ -160,11 +158,9 @@ entry:
define <16 x i8> @s_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: s_v16i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v2.16b, #1
; CHECK-SD-NEXT: cmgt v3.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: cmgt v2.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: cmgt v0.16b, v1.16b, v0.16b
; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-SD-NEXT: sub v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v16i8:
Expand All @@ -184,11 +180,9 @@ entry:
define <4 x i16> @s_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-SD-LABEL: s_v4i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v2.4h, #1
; CHECK-SD-NEXT: cmgt v3.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: cmgt v2.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: cmgt v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v2.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i16:
Expand All @@ -208,11 +202,9 @@ entry:
define <8 x i16> @s_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SD-LABEL: s_v8i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v2.8h, #1
; CHECK-SD-NEXT: cmgt v3.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: cmgt v2.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: cmgt v0.8h, v1.8h, v0.8h
; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-SD-NEXT: sub v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i16:
Expand All @@ -232,15 +224,12 @@ entry:
define <16 x i16> @s_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK-SD-LABEL: s_v16i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.8h, #1
; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: cmgt v6.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v0.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
; CHECK-SD-NEXT: sub v0.8h, v0.8h, v5.8h
; CHECK-SD-NEXT: sub v1.8h, v1.8h, v4.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v16i16:
Expand All @@ -264,11 +253,9 @@ entry:
define <2 x i32> @s_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-SD-LABEL: s_v2i32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v2.2s, #1
; CHECK-SD-NEXT: cmgt v3.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: cmgt v2.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: cmgt v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: sub v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v2i32:
Expand All @@ -288,11 +275,9 @@ entry:
define <4 x i32> @s_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: s_v4i32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v2.4s, #1
; CHECK-SD-NEXT: cmgt v3.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmgt v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s
; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i32:
Expand All @@ -312,15 +297,12 @@ entry:
define <8 x i32> @s_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK-SD-LABEL: s_v8i32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.4s, #1
; CHECK-SD-NEXT: cmgt v4.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: cmgt v5.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: cmgt v6.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: cmgt v0.4s, v2.4s, v0.4s
; CHECK-SD-NEXT: cmgt v1.4s, v3.4s, v1.4s
; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v5.4s
; CHECK-SD-NEXT: sub v1.4s, v1.4s, v4.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i32:
Expand All @@ -344,12 +326,9 @@ entry:
define <2 x i64> @s_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-SD-LABEL: s_v2i64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #1 // =0x1
; CHECK-SD-NEXT: cmgt v2.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: cmgt v0.2d, v1.2d, v0.2d
; CHECK-SD-NEXT: dup v3.2d, x8
; CHECK-SD-NEXT: and v1.16b, v2.16b, v3.16b
; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-SD-NEXT: sub v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v2i64:
Expand All @@ -370,16 +349,12 @@ entry:
define <4 x i64> @s_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-SD-LABEL: s_v4i64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #1 // =0x1
; CHECK-SD-NEXT: cmgt v4.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: cmgt v6.2d, v1.2d, v3.2d
; CHECK-SD-NEXT: dup v5.2d, x8
; CHECK-SD-NEXT: cmgt v4.2d, v1.2d, v3.2d
; CHECK-SD-NEXT: cmgt v5.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: cmgt v0.2d, v2.2d, v0.2d
; CHECK-SD-NEXT: cmgt v1.2d, v3.2d, v1.2d
; CHECK-SD-NEXT: and v2.16b, v4.16b, v5.16b
; CHECK-SD-NEXT: and v3.16b, v6.16b, v5.16b
; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
; CHECK-SD-NEXT: sub v0.2d, v0.2d, v5.2d
; CHECK-SD-NEXT: sub v1.2d, v1.2d, v4.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i64:
Expand All @@ -404,122 +379,13 @@ entry:
define <16 x i8> @signOf_neon_scmp(<8 x i16> %s0_lo, <8 x i16> %s0_hi, <8 x i16> %s1_lo, <8 x i16> %s1_hi) {
; CHECK-SD-LABEL: signOf_neon_scmp:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: cmgt v2.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
; CHECK-SD-NEXT: umov w8, v5.h[1]
; CHECK-SD-NEXT: umov w9, v2.h[1]
; CHECK-SD-NEXT: umov w10, v5.h[0]
; CHECK-SD-NEXT: umov w11, v2.h[0]
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: cset w8, ne
; CHECK-SD-NEXT: tst w9, #0xffff
; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v5.h[2]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w11, #0xffff
; CHECK-SD-NEXT: umov w11, v2.h[2]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v2.h[3]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w11, #0xffff
; CHECK-SD-NEXT: mov v0.b[1], w8
; CHECK-SD-NEXT: umov w8, v5.h[3]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[2], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v5.h[4]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v2.h[4]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[3], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v5.h[5]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v2.h[5]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[4], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v5.h[6]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v2.h[6]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[5], w9
; CHECK-SD-NEXT: umov w9, v5.h[7]
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: cset w8, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v2.h[7]
; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
; CHECK-SD-NEXT: mov v0.b[6], w8
; CHECK-SD-NEXT: tst w9, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[0]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[0]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[7], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[1]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[1]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[8], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[2]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[2]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[9], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[3]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[3]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[10], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[4]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[4]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[11], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[5]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[5]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[12], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[6]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[6]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[13], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: umov w8, v4.h[7]
; CHECK-SD-NEXT: cset w9, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: umov w10, v1.h[7]
; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
; CHECK-SD-NEXT: mov v0.b[14], w9
; CHECK-SD-NEXT: tst w8, #0xffff
; CHECK-SD-NEXT: cset w8, ne
; CHECK-SD-NEXT: tst w10, #0xffff
; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
; CHECK-SD-NEXT: mov v0.b[15], w8
; CHECK-SD-NEXT: cmgt v3.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: cmgt v0.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: sub v1.8h, v1.8h, v4.8h
; CHECK-SD-NEXT: sub v0.8h, v0.8h, v3.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: signOf_neon_scmp:
Expand Down
Loading
Loading