-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[X86][AVX512] Use comx for compare #113098
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 Author: Mahesh-Attarde (mahesh-attarde) Changes: Prior to AVX10_2, we used COMI for ogt only. With AVX10_2, we can use vucomxss to handle oeq and une as well.
This patch will remove `cmpeqss` and `cmpneqss`. Full diff: https://github.com/llvm/llvm-project/pull/113098.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bcb84add65d83e..22fcd3bf6bc8eb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49520,6 +49520,15 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+
+ // VCOMXSS simplifies conditional code sequence into single setcc node
+ // and a CC node, Earlier until COMI, it required 2 SETCC's
+ if (Subtarget.hasAVX10_2()) {
+ return getSETCC(
+ ((cc0 == X86::COND_E) ? X86::COND_E : X86::COND_NE),
+ DAG.getNode(X86ISD::UCOMX, DL, MVT::i32, CMP00, CMP01), DL,
+ DAG);
+ }
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 625f2e01d47218..f9687897728382 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -1541,6 +1541,24 @@ defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_
//-------------------------------------------------
// AVX10 COMEF instructions
//-------------------------------------------------
+multiclass avx10_com_ef<bits<8> Opc, RegisterClass RC, ValueType VT,
+ SDPatternOperator OpNode, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ Domain d, X86FoldableSchedWrite sched = WriteFComX>{
+ let ExeDomain = d, mayRaiseFPException = 1, isCodeGenOnly = 1 in {
+ def rr : AVX512<Opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (VT RC:$src1), RC:$src2))]>,
+ EVEX, EVEX_V128, Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in {
+ def rm : AVX512<Opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (VT RC:$src1), (ld_frag addr:$src2)))]>,
+ EVEX, EVEX_V128, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ }
+}
+
multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
string OpcodeStr,
Domain d,
@@ -1564,6 +1582,11 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
}
let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {
+
+ defm VUCOMXSSZ : avx10_com_ef<0x2e, FR32X, f32, X86ucomi512,
+ "vucomxss", f32mem, loadf32, SSEPackedSingle>,
+ TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+
defm VCOMXSDZ : avx10_com_ef_int<0x2f, v2f64x_info, X86comi512,
"vcomxsd", SSEPackedDouble>,
TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
diff --git a/llvm/test/CodeGen/X86/avx10_2-cmp.ll b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
new file mode 100644
index 00000000000000..8c134e21070b82
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=AVX10_2_X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=AVX10_2_X86
+
+define i1 @oeq(float %x, float %y) {
+; AVX10_2_X64-LABEL: oeq:
+; AVX10_2_X64: # %bb.0:
+; AVX10_2_X64-NEXT: vucomxss %xmm1, %xmm0
+; AVX10_2_X64-NEXT: sete %al
+; AVX10_2_X64-NEXT: retq
+;
+; AVX10_2_X86-LABEL: oeq:
+; AVX10_2_X86: # %bb.0:
+; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
+; AVX10_2_X86-NEXT: sete %al
+; AVX10_2_X86-NEXT: retl
+ %1 = fcmp oeq float %x, %y
+ ret i1 %1
+}
+
+define i1 @une(float %x, float %y) {
+; AVX10_2_X64-LABEL: une:
+; AVX10_2_X64: # %bb.0:
+; AVX10_2_X64-NEXT: vucomxss %xmm1, %xmm0
+; AVX10_2_X64-NEXT: setne %al
+; AVX10_2_X64-NEXT: retq
+;
+; AVX10_2_X86-LABEL: une:
+; AVX10_2_X86: # %bb.0:
+; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
+; AVX10_2_X86-NEXT: setne %al
+; AVX10_2_X86-NEXT: retl
+ %1 = fcmp une float %x, %y
+ ret i1 %1
+}
+
+define i1 @ogt(float %x, float %y) {
+; AVX10_2_X64-LABEL: ogt:
+; AVX10_2_X64: # %bb.0:
+; AVX10_2_X64-NEXT: vucomiss %xmm1, %xmm0
+; AVX10_2_X64-NEXT: seta %al
+; AVX10_2_X64-NEXT: retq
+;
+; AVX10_2_X86-LABEL: ogt:
+; AVX10_2_X86: # %bb.0:
+; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X86-NEXT: vucomiss {{[0-9]+}}(%esp), %xmm0
+; AVX10_2_X86-NEXT: seta %al
+; AVX10_2_X86-NEXT: retl
+ %1 = fcmp ogt float %x, %y
+ ret i1 %1
+}
+
+define i1 @oeq_mem(ptr %xp, ptr %yp) {
+; AVX10_2_X64-LABEL: oeq_mem:
+; AVX10_2_X64: # %bb.0:
+; AVX10_2_X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X64-NEXT: vucomxss (%rsi), %xmm0
+; AVX10_2_X64-NEXT: sete %al
+; AVX10_2_X64-NEXT: retq
+;
+; AVX10_2_X86-LABEL: oeq_mem:
+; AVX10_2_X86: # %bb.0:
+; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X86-NEXT: vucomxss (%eax), %xmm0
+; AVX10_2_X86-NEXT: sete %al
+; AVX10_2_X86-NEXT: retl
+ %x = load float, ptr %xp
+ %y = load float, ptr %yp
+ %1 = fcmp oeq float %x, %y
+ ret i1 %1
+}
+
+define i1 @une_mem(ptr %xp, ptr %yp) {
+; AVX10_2_X64-LABEL: une_mem:
+; AVX10_2_X64: # %bb.0:
+; AVX10_2_X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X64-NEXT: vucomxss (%rsi), %xmm0
+; AVX10_2_X64-NEXT: setne %al
+; AVX10_2_X64-NEXT: retq
+;
+; AVX10_2_X86-LABEL: une_mem:
+; AVX10_2_X86: # %bb.0:
+; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X86-NEXT: vucomxss (%eax), %xmm0
+; AVX10_2_X86-NEXT: setne %al
+; AVX10_2_X86-NEXT: retl
+ %x = load float, ptr %xp
+ %y = load float, ptr %yp
+ %1 = fcmp une float %x, %y
+ ret i1 %1
+}
+
+
+define i1 @ogt_mem(ptr %xp, ptr %yp) {
+; AVX10_2_X64-LABEL: ogt_mem:
+; AVX10_2_X64: # %bb.0:
+; AVX10_2_X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X64-NEXT: vucomiss (%rsi), %xmm0
+; AVX10_2_X64-NEXT: seta %al
+; AVX10_2_X64-NEXT: retq
+;
+; AVX10_2_X86-LABEL: ogt_mem:
+; AVX10_2_X86: # %bb.0:
+; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX10_2_X86-NEXT: vucomiss (%eax), %xmm0
+; AVX10_2_X86-NEXT: seta %al
+; AVX10_2_X86-NEXT: retl
+ %x = load float, ptr %xp
+ %y = load float, ptr %yp
+ %1 = fcmp ogt float %x, %y
+ ret i1 %1
+}
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 85d9b02ac0cbf1..e444c61354abe6 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -1961,6 +1961,7 @@ static const X86FoldTableEntry Table1[] = {
{X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE},
{X86::VUCOMXSDZrr_Int, X86::VUCOMXSDZrm_Int, TB_NO_REVERSE},
{X86::VUCOMXSHZrr_Int, X86::VUCOMXSHZrm_Int, TB_NO_REVERSE},
+ {X86::VUCOMXSSZrr, X86::VUCOMXSSZrm, 0},
{X86::VUCOMXSSZrr_Int, X86::VUCOMXSSZrm_Int, TB_NO_REVERSE},
{X86::XOR16ri8_ND, X86::XOR16mi8_ND, 0},
{X86::XOR16ri8_NF_ND, X86::XOR16mi8_NF_ND, 0},
|
llvm/test/CodeGen/X86/avx10_2-cmp.ll
Outdated
@@ -0,0 +1,121 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | |||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=AVX10_2_X64 | |||
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=AVX10_2_X86 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just use X86/X64 for check prefixes
✅ With the latest revision this PR passed the C/C++ code formatter. |
// VCOMXSS simplifies conditional code sequence into single setcc node. | ||
// Earlier until COMI, it required upto 2 SETCC's to test CC. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clang-format.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand the logic here. Why do we require 2 CC for comx? I don't see a test case using 2 CC. Can we just return `SDValue()` to break the combine?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On the first line, I state the intent of this legalization. COMX tests more flags, so a single SETCC node is enough to infer the result.
Earlier Attempts used chain of SETCC like in X86ISelLowering.cpp snippet
case COMI: { // Comparison intrinsics
....
....
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: {
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); // FIRST SETCC
if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1
break;
// (ZF = 1 and PF = 0)
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); // SECOND SETCC
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
Maybe instead of "2 CC" I should write "2 flags"; is that right?
If we return `SDValue()`, we select `vucomiss`, so to get the desired selection of `vucomxss` we need to change this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. Can we generate `UCOMX` there instead of generating 2 CC and then combining them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mahesh-attarde Did you investigate this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Optimized type-legalized selection DAG: %bb.0 'oeq:'
SelectionDAG has 11 nodes:
t0: ch,glue = EntryToken
t2: f32,ch = CopyFromReg t0, Register:f32 %0
t4: f32,ch = CopyFromReg t0, Register:f32 %1
t14: i8 = setcc t2, t4, setoeq:ch
t10: ch,glue = CopyToReg t0, Register:i8 $al, t14
t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1
Legalized selection DAG: %bb.0 'oeq:'
SelectionDAG has 15 nodes:
t0: ch,glue = EntryToken
t24: i8 = X86ISD::SETCC TargetConstant:i8<4>, t20
t22: i8 = X86ISD::SETCC TargetConstant:i8<11>, t20
t19: i8 = and t24, t22
t10: ch,glue = CopyToReg t0, Register:i8 $al, t19
t2: f32,ch = CopyFromReg t0, Register:f32 %0
t4: f32,ch = CopyFromReg t0, Register:f32 %1
t20: i32 = X86ISD::FCMP t2, t4
t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1
If I understand correctly, you are asking to remove t19, t22 and t24. Is that correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#113567 added here. Can you review?
defm VUCOMXSSZ : avx10_com_ef<0x2e, FR32X, f32, X86ucomi512, | ||
"vucomxss", f32mem, loadf32, SSEPackedSingle>, | ||
TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SD and SH?
llvm/test/CodeGen/X86/avx10_2-cmp.ll
Outdated
ret i1 %1 | ||
} | ||
|
||
define i1 @ogt(float %x, float %y) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`ogt` generates the expected code without this patch. No need to test it.
llvm/test/CodeGen/X86/avx10_2-cmp.ll
Outdated
} | ||
|
||
|
||
define i1 @ogt_mem(ptr %xp, ptr %yp) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto.
Failure looks unrelated
ping @RKSimon @phoebewang |
We added AVX10.2 COMEF ISA in LLVM, This does not optimize correctly in scenario mentioned below. Summary Input ``` define i1 @oeq(float %x, float %y) { %1 = fcmp oeq float %x, %y ret i1 %1 }define i1 @UNE(float %x, float %y) { %1 = fcmp une float %x, %y ret i1 %1 }define i1 @ogt(float %x, float %y) { %1 = fcmp ogt float %x, %y ret i1 %1 } // Prior AVX10.2, default code generation oeq: # @oeq cmpeqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret une: # @UNE cmpneqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret ogt: # @ogt ucomiss xmm0, xmm1 seta al ret ``` This patch will remove `cmpeqss` and `cmpneqss`. For complete transform check unit test. Continuing on what PR #113098 added Earlier Legalization and combine expanded `setcc oeq:ch` node into `and` and `setcc eq` , `setcc o`. From suggestions in community new internal transform ``` Optimized type-legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 11 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t14: i8 = setcc t2, t4, setoeq:ch t10: ch,glue = CopyToReg t0, Register:i8 $al, t14 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 Optimized legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 12 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t15: i32 = X86ISD::UCOMX t2, t4 t17: i8 = X86ISD::SETCC TargetConstant:i8<4>, t15 t10: ch,glue = CopyToReg t0, Register:i8 $al, t17 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 ``` Earlier transform is mentioned here #113098 (comment) --------- Co-authored-by: mattarde <[email protected]>
We added AVX10.2 COMEF ISA in LLVM, This does not optimize correctly in scenario mentioned below. Summary Input ``` define i1 @oeq(float %x, float %y) { %1 = fcmp oeq float %x, %y ret i1 %1 }define i1 @UNE(float %x, float %y) { %1 = fcmp une float %x, %y ret i1 %1 }define i1 @ogt(float %x, float %y) { %1 = fcmp ogt float %x, %y ret i1 %1 } // Prior AVX10.2, default code generation oeq: # @oeq cmpeqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret une: # @UNE cmpneqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret ogt: # @ogt ucomiss xmm0, xmm1 seta al ret ``` This patch will remove `cmpeqss` and `cmpneqss`. For complete transform check unit test. Continuing on what PR llvm#113098 added Earlier Legalization and combine expanded `setcc oeq:ch` node into `and` and `setcc eq` , `setcc o`. From suggestions in community new internal transform ``` Optimized type-legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 11 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t14: i8 = setcc t2, t4, setoeq:ch t10: ch,glue = CopyToReg t0, Register:i8 $al, t14 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 Optimized legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 12 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t15: i32 = X86ISD::UCOMX t2, t4 t17: i8 = X86ISD::SETCC TargetConstant:i8<4>, t15 t10: ch,glue = CopyToReg t0, Register:i8 $al, t17 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 ``` Earlier transform is mentioned here llvm#113098 (comment) --------- Co-authored-by: mattarde <[email protected]>
We added AVX10.2 COMEF ISA in LLVM, This does not optimize correctly in scenario mentioned below. Summary Input ``` define i1 @oeq(float %x, float %y) { %1 = fcmp oeq float %x, %y ret i1 %1 }define i1 @UNE(float %x, float %y) { %1 = fcmp une float %x, %y ret i1 %1 }define i1 @ogt(float %x, float %y) { %1 = fcmp ogt float %x, %y ret i1 %1 } // Prior AVX10.2, default code generation oeq: # @oeq cmpeqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret une: # @UNE cmpneqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret ogt: # @ogt ucomiss xmm0, xmm1 seta al ret ``` This patch will remove `cmpeqss` and `cmpneqss`. For complete transform check unit test. Continuing on what PR llvm/llvm-project#113098 added Earlier Legalization and combine expanded `setcc oeq:ch` node into `and` and `setcc eq` , `setcc o`. From suggestions in community new internal transform ``` Optimized type-legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 11 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t14: i8 = setcc t2, t4, setoeq:ch t10: ch,glue = CopyToReg t0, Register:i8 $al, t14 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 Optimized legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 12 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t15: i32 = X86ISD::UCOMX t2, t4 t17: i8 = X86ISD::SETCC TargetConstant:i8<4>, t15 t10: ch,glue = CopyToReg t0, Register:i8 $al, t17 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 ``` Earlier transform is mentioned here llvm/llvm-project#113098 (comment) --------- Co-authored-by: mattarde <[email protected]>
We added AVX10.2 COMEF ISA in LLVM, This does not optimize correctly in scenario mentioned below. Summary Input ``` define i1 @oeq(float %x, float %y) { %1 = fcmp oeq float %x, %y ret i1 %1 }define i1 @UNE(float %x, float %y) { %1 = fcmp une float %x, %y ret i1 %1 }define i1 @ogt(float %x, float %y) { %1 = fcmp ogt float %x, %y ret i1 %1 } // Prior AVX10.2, default code generation oeq: # @oeq cmpeqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret une: # @UNE cmpneqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret ogt: # @ogt ucomiss xmm0, xmm1 seta al ret ``` This patch will remove `cmpeqss` and `cmpneqss`. For complete transform check unit test. Continuing on what PR llvm/llvm-project#113098 added Earlier Legalization and combine expanded `setcc oeq:ch` node into `and` and `setcc eq` , `setcc o`. From suggestions in community new internal transform ``` Optimized type-legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 11 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t14: i8 = setcc t2, t4, setoeq:ch t10: ch,glue = CopyToReg t0, Register:i8 $al, t14 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 Optimized legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 12 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t15: i32 = X86ISD::UCOMX t2, t4 t17: i8 = X86ISD::SETCC TargetConstant:i8<4>, t15 t10: ch,glue = CopyToReg t0, Register:i8 $al, t17 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 ``` Earlier transform is mentioned here llvm/llvm-project#113098 (comment) --------- Co-authored-by: mattarde <[email protected]>
We added the AVX10.2 COMEF ISA in LLVM. This does not optimize correctly in the scenario mentioned below.
Summary
Input
This patch will remove `cmpeqss` and `cmpneqss`. For the complete transform, check the unit test. I