-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[X86][AVX512] Use comx for compare #113567
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 Author: Mahesh-Attarde (mahesh-attarde) ChangesContinuing on what PR #113098 added Earlier Legalization and combine expanded
Earlier transform is mentioned here #113098 (comment) Full diff: https://github.com/llvm/llvm-project/pull/113567.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bcb84add65d83e..42c8175ccc92cb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -218,10 +218,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
- for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
- setCondCodeAction(ISD::SETOEQ, VT, Expand);
- setCondCodeAction(ISD::SETUNE, VT, Expand);
+ for (auto VT : {MVT::f32, MVT::f64}) {
+ setCondCodeAction(ISD::SETOEQ, VT,
+ Subtarget.hasAVX10_2() ? Custom : Expand);
+ setCondCodeAction(ISD::SETUNE, VT,
+ Subtarget.hasAVX10_2() ? Custom : Expand);
}
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Integer absolute.
if (Subtarget.canUseCMOV()) {
@@ -2292,8 +2296,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f16,
+ Subtarget.hasAVX10_2() ? Custom : Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f16,
+ Subtarget.hasAVX10_2() ? Custom : Expand);
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
@@ -2442,7 +2448,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
}
-
+
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
@@ -24073,6 +24079,13 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
+ if (Subtarget.hasAVX10_2_512()) {
+ if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
+ auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
+ return getSETCC(NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1),
+ dl, DAG);
+ }
+ }
// Handle floating point.
X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 625f2e01d47218..c67ef49940e513 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -1541,6 +1541,24 @@ defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_
//-------------------------------------------------
// AVX10 COMEF instructions
//-------------------------------------------------
+multiclass avx10_com_ef<bits<8> Opc, RegisterClass RC, ValueType VT,
+ SDPatternOperator OpNode, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ Domain d, X86FoldableSchedWrite sched = WriteFComX>{
+ let ExeDomain = d, mayRaiseFPException = 1, isCodeGenOnly = 1 in {
+ def rr : AVX512<Opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (VT RC:$src1), RC:$src2))]>,
+ EVEX, EVEX_V128, Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in {
+ def rm : AVX512<Opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (VT RC:$src1), (ld_frag addr:$src2)))]>,
+ EVEX, EVEX_V128, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ }
+}
+
multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
string OpcodeStr,
Domain d,
@@ -1564,6 +1582,16 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
}
let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {
+
+ defm VUCOMXSDZ : avx10_com_ef<0x2e, FR64X, f64, X86ucomi512,
+ "vucomxsd", f64mem, loadf64, SSEPackedSingle>,
+ TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
+ defm VUCOMXSHZ : avx10_com_ef<0x2e, FR16X, f16, X86ucomi512,
+ "vucomxsh", f16mem, loadf16, SSEPackedSingle>,
+ T_MAP5, XD, EVEX_CD8<16, CD8VT1>;
+ defm VUCOMXSSZ : avx10_com_ef<0x2e, FR32X, f32, X86ucomi512,
+ "vucomxss", f32mem, loadf32, SSEPackedSingle>,
+ TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VCOMXSDZ : avx10_com_ef_int<0x2f, v2f64x_info, X86comi512,
"vcomxsd", SSEPackedDouble>,
TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
diff --git a/llvm/test/CodeGen/X86/avx10_2-cmp.ll b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
new file mode 100644
index 00000000000000..62a187c3adc741
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=X86
+
+define i1 @hoeq(half %x, half %y) {
+; X64-LABEL: hoeq:
+; X64: # %bb.0:
+; X64-NEXT: vucomxsh %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-LABEL: hoeq:
+; X86: # %bb.0:
+; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT: vucomxsh {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+ %1 = fcmp oeq half %x, %y
+ ret i1 %1
+}
+
+define i1 @hune(half %x, half %y) {
+; X64-LABEL: hune:
+; X64: # %bb.0:
+; X64-NEXT: vucomxsh %xmm1, %xmm0
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-LABEL: hune:
+; X86: # %bb.0:
+; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT: vucomxsh {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+ %1 = fcmp une half %x, %y
+ ret i1 %1
+}
+
+define i1 @hoeq_mem(ptr %xp, ptr %yp) {
+; X64-LABEL: hoeq_mem:
+; X64: # %bb.0:
+; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vucomxsh (%rsi), %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-LABEL: hoeq_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT: vucomxsh (%eax), %xmm0
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+ %x = load half, ptr %xp
+ %y = load half, ptr %yp
+ %1 = fcmp oeq half %x, %y
+ ret i1 %1
+}
+
+define i1 @hune_mem(ptr %xp, ptr %yp) {
+; X64-LABEL: hune_mem:
+; X64: # %bb.0:
+; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vucomxsh (%rsi), %xmm0
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-LABEL: hune_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT: vucomxsh (%eax), %xmm0
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+ %x = load half, ptr %xp
+ %y = load half, ptr %yp
+ %1 = fcmp une half %x, %y
+ ret i1 %1
+}
+
+define i1 @foeq(float %x, float %y) {
+; X64-LABEL: foeq:
+; X64: # %bb.0:
+; X64-NEXT: vucomxss %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-LABEL: foeq:
+; X86: # %bb.0:
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+ %1 = fcmp oeq float %x, %y
+ ret i1 %1
+}
+
+define i1 @fune(float %x, float %y) {
+; X64-LABEL: fune:
+; X64: # %bb.0:
+; X64-NEXT: vucomxss %xmm1, %xmm0
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-LABEL: fune:
+; X86: # %bb.0:
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+ %1 = fcmp une float %x, %y
+ ret i1 %1
+}
+
+define i1 @foeq_mem(ptr %xp, ptr %yp) {
+; X64-LABEL: foeq_mem:
+; X64: # %bb.0:
+; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vucomxss (%rsi), %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-LABEL: foeq_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vucomxss (%eax), %xmm0
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+ %x = load float, ptr %xp
+ %y = load float, ptr %yp
+ %1 = fcmp oeq float %x, %y
+ ret i1 %1
+}
+
+define i1 @fune_mem(ptr %xp, ptr %yp) {
+; X64-LABEL: fune_mem:
+; X64: # %bb.0:
+; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vucomxss (%rsi), %xmm0
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-LABEL: fune_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vucomxss (%eax), %xmm0
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+ %x = load float, ptr %xp
+ %y = load float, ptr %yp
+ %1 = fcmp une float %x, %y
+ ret i1 %1
+}
+
+define i1 @doeq(double %x, double %y) {
+; X64-LABEL: doeq:
+; X64: # %bb.0:
+; X64-NEXT: vucomxsd %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-LABEL: doeq:
+; X86: # %bb.0:
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vucomxsd {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+ %1 = fcmp oeq double %x, %y
+ ret i1 %1
+}
+
+define i1 @dune(double %x, double %y) {
+; X64-LABEL: dune:
+; X64: # %bb.0:
+; X64-NEXT: vucomxsd %xmm1, %xmm0
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-LABEL: dune:
+; X86: # %bb.0:
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vucomxsd {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+ %1 = fcmp une double %x, %y
+ ret i1 %1
+}
+
+define i1 @doeq_mem(ptr %xp, ptr %yp) {
+; X64-LABEL: doeq_mem:
+; X64: # %bb.0:
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vucomxsd (%rsi), %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-LABEL: doeq_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vucomxsd (%eax), %xmm0
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+ %x = load double, ptr %xp
+ %y = load double, ptr %yp
+ %1 = fcmp oeq double %x, %y
+ ret i1 %1
+}
+
+define i1 @dune_mem(ptr %xp, ptr %yp) {
+; X64-LABEL: dune_mem:
+; X64: # %bb.0:
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vucomxsd (%rsi), %xmm0
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-LABEL: dune_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vucomxsd (%eax), %xmm0
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+ %x = load double, ptr %xp
+ %y = load double, ptr %yp
+ %1 = fcmp une double %x, %y
+ ret i1 %1
+}
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 85d9b02ac0cbf1..fd6ee37d27e147 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -1959,8 +1959,11 @@ static const X86FoldTableEntry Table1[] = {
{X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE},
{X86::VUCOMISSrr, X86::VUCOMISSrm, 0},
{X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE},
+ {X86::VUCOMXSDZrr, X86::VUCOMXSDZrm, 0},
{X86::VUCOMXSDZrr_Int, X86::VUCOMXSDZrm_Int, TB_NO_REVERSE},
+ {X86::VUCOMXSHZrr, X86::VUCOMXSHZrm, 0},
{X86::VUCOMXSHZrr_Int, X86::VUCOMXSHZrm_Int, TB_NO_REVERSE},
+ {X86::VUCOMXSSZrr, X86::VUCOMXSSZrm, 0},
{X86::VUCOMXSSZrr_Int, X86::VUCOMXSSZrm_Int, TB_NO_REVERSE},
{X86::XOR16ri8_ND, X86::XOR16mi8_ND, 0},
{X86::XOR16ri8_NF_ND, X86::XOR16mi8_NF_ND, 0},
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
@@ -24073,6 +24078,13 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { | |||
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; | |||
} | |||
|
|||
if (Subtarget.hasAVX10_2_512()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hasAVX10_2
llvm/test/CodeGen/X86/avx10_2-cmp.ll
Outdated
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=X64 | ||
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=X86 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+avx10.2-256
for (auto VT : {MVT::f32, MVT::f64}) { | ||
setCondCodeAction(ISD::SETOEQ, VT, | ||
Subtarget.hasAVX10_2() ? Custom : Expand); | ||
setCondCodeAction(ISD::SETUNE, VT, | ||
Subtarget.hasAVX10_2() ? Custom : Expand); | ||
} | ||
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); | ||
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can leave it as is and override later with
if (Subtarget.hasAVX10_2()) {
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
...
}
llvm/lib/Target/X86/X86InstrAVX10.td
Outdated
@@ -1564,6 +1582,16 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode, | |||
} | |||
|
|||
let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in { | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove blank line.
llvm/lib/Target/X86/X86InstrAVX10.td
Outdated
@@ -1564,6 +1582,16 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode, | |||
} | |||
|
|||
let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in { | |||
|
|||
defm VUCOMXSDZ : avx10_com_ef<0x2e, FR64X, f64, X86ucomi512, | |||
"vucomxsd", f64mem, loadf64, SSEPackedSingle>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SSEPackedDouble
setCondCodeAction(ISD::SETOEQ, MVT::f16, | ||
Subtarget.hasAVX10_2() ? Custom : Expand); | ||
setCondCodeAction(ISD::SETUNE, MVT::f16, | ||
Subtarget.hasAVX10_2() ? Custom : Expand); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why don't combine them to the loop? You can move them to line 2445.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can do that. Although setGroup(MVT::f16);
on 2286 line above has better proximity fore related code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code is in the scope of hasFP16
. We prefer to organizing them in feature bulk rather than mixing them together.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code movement result in 3 regressions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see how it results in regressions. Did you remove these 2 Expand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant this:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 676ae6b87610..1ffec84d9958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2441,6 +2441,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Custom);
}
}
+
+ for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
+ setCondCodeAction(ISD::SETOEQ, VT, Custom);
+ setCondCodeAction(ISD::SETUNE, VT, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
Notice it's under these code, so it should not affect any existing tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See 2c9b981
I don't see any problem on my side.
Maybe you was misled. The diff above was to demonstrate the change in function X86TargetLowering
, so I omitted the change in LowerSETCC
. I'm surprised you didn't explore along this way.
@@ -2442,7 +2450,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, | |||
} | |||
} | |||
} | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with 2 nits.
Can we merge this? |
316f48c
to
da9493c
Compare
if (Subtarget.hasAVX10_2()) { | ||
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) { | ||
setCondCodeAction(ISD::SETOEQ, VT, Custom); | ||
setCondCodeAction(ISD::SETUNE, VT, Custom); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this under FP16 code. We need to override f16
with this code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you mean only FP16 or Complete SetCondCodeAction for f16, f32 and f64?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All of them. See the line number of previous diff.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#113905 I tried this. Moved SETOEQ and SETUNE group after avx10_2 & softfloat feature. it did not work.
This reverts commit da9493c.
7737070
to
9f2241b
Compare
This revision LG. @mahesh-attarde can you update the description before we merging it? |
done. |
We added AVX10.2 COMEF ISA in LLVM, This does not optimize correctly in scenario mentioned below. Summary Input ``` define i1 @oeq(float %x, float %y) { %1 = fcmp oeq float %x, %y ret i1 %1 }define i1 @UNE(float %x, float %y) { %1 = fcmp une float %x, %y ret i1 %1 }define i1 @ogt(float %x, float %y) { %1 = fcmp ogt float %x, %y ret i1 %1 } // Prior AVX10.2, default code generation oeq: # @oeq cmpeqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret une: # @UNE cmpneqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret ogt: # @ogt ucomiss xmm0, xmm1 seta al ret ``` This patch will remove `cmpeqss` and `cmpneqss`. For complete transform check unit test. Continuing on what PR llvm#113098 added Earlier Legalization and combine expanded `setcc oeq:ch` node into `and` and `setcc eq` , `setcc o`. From suggestions in community new internal transform ``` Optimized type-legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 11 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t14: i8 = setcc t2, t4, setoeq:ch t10: ch,glue = CopyToReg t0, Register:i8 $al, t14 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 Optimized legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 12 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t15: i32 = X86ISD::UCOMX t2, t4 t17: i8 = X86ISD::SETCC TargetConstant:i8<4>, t15 t10: ch,glue = CopyToReg t0, Register:i8 $al, t17 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 ``` Earlier transform is mentioned here llvm#113098 (comment) --------- Co-authored-by: mattarde <[email protected]>
…2414d4826 Local branch amd-gfx 0ba2414 Merged main:44d0e9522a80e1301e96c4751b7572ae0c9cb4dd into amd-gfx:a2e75de8b74c Remote branch main e61a7dc [X86][AVX512] Use comx for compare (llvm#113567)
We added AVX10.2 COMEF ISA in LLVM, This does not optimize correctly in scenario mentioned below.
Summary
Input
This patch will remove
cmpeqss
andcmpneqss
. For complete transform check unit test.Continuing on what PR #113098 added
Earlier Legalization and combine expanded
setcc oeq:ch
node intoand
andsetcc eq
,setcc o
. From suggestions in communitynew internal transform
Earlier transform is mentioned here #113098 (comment)