Skip to content

Commit e61a7dc

Browse files
mahesh-attardemattarde
and
mattarde
authored
[X86][AVX512] Use comx for compare (llvm#113567)
We added the AVX10.2 COMEF ISA to LLVM, but it is not yet used to optimize the scenario shown below. Summary Input ``` define i1 @oeq(float %x, float %y) { %1 = fcmp oeq float %x, %y ret i1 %1 } define i1 @une(float %x, float %y) { %1 = fcmp une float %x, %y ret i1 %1 } define i1 @ogt(float %x, float %y) { %1 = fcmp ogt float %x, %y ret i1 %1 } // Prior to AVX10.2, default code generation oeq: # @oeq cmpeqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret une: # @une cmpneqss xmm0, xmm1 movd eax, xmm0 and eax, 1 ret ogt: # @ogt ucomiss xmm0, xmm1 seta al ret ``` This patch removes `cmpeqss` and `cmpneqss`. For the complete transform, see the unit test. This continues the work added in PR llvm#113098. Earlier, legalization and combine expanded the `setcc oeq:ch` node into an `and` of `setcc eq` and `setcc o`. Following suggestions from the community, the new internal transform is: ``` Optimized type-legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 11 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t14: i8 = setcc t2, t4, setoeq:ch t10: ch,glue = CopyToReg t0, Register:i8 $al, t14 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 Optimized legalized selection DAG: %bb.0 'hoeq:' SelectionDAG has 12 nodes: t0: ch,glue = EntryToken t2: f16,ch = CopyFromReg t0, Register:f16 %0 t4: f16,ch = CopyFromReg t0, Register:f16 %1 t15: i32 = X86ISD::UCOMX t2, t4 t17: i8 = X86ISD::SETCC TargetConstant:i8<4>, t15 t10: ch,glue = CopyToReg t0, Register:i8 $al, t17 t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1 ``` The earlier transform is described in llvm#113098 (comment). --------- Co-authored-by: mattarde <[email protected]>
1 parent f358422 commit e61a7dc

File tree

4 files changed

+278
-0
lines changed

4 files changed

+278
-0
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -2440,6 +2440,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
24402440
setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
24412441
setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
24422442
}
2443+
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2444+
setCondCodeAction(ISD::SETOEQ, VT, Custom);
2445+
setCondCodeAction(ISD::SETUNE, VT, Custom);
2446+
}
24432447
}
24442448

24452449
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
@@ -24072,6 +24076,13 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
2407224076
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
2407324077
}
2407424078

24079+
if (Subtarget.hasAVX10_2()) {
24080+
if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24081+
auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24082+
return getSETCC(NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1),
24083+
dl, DAG);
24084+
}
24085+
}
2407524086
// Handle floating point.
2407624087
X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
2407724088
if (CondCode == X86::COND_INVALID)

llvm/lib/Target/X86/X86InstrAVX10.td

+27
Original file line numberDiff line numberDiff line change
@@ -1541,6 +1541,24 @@ defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_
15411541
//-------------------------------------------------
15421542
// AVX10 COMEF instructions
15431543
//-------------------------------------------------
1544+
// Register-class forms of the AVX10 COMEF compare instructions.
//
// Each instantiation produces two defs, `rr` (register, register) and `rm`
// (register, memory). Both have no outputs and carry a selection pattern that
// sets EFLAGS from `OpNode` applied to the two sources.
//
// Parameters:
//   Opc       - opcode byte passed to the AVX512 instruction class.
//   RC        - register class of $src1 (and of $src2 in the `rr` form).
//   VT        - value type given to $src1 in the selection pattern.
//   OpNode    - pattern operator whose result is written to EFLAGS.
//   OpcodeStr - mnemonic prepended to the operand string.
//   x86memop  - memory operand type of $src2 in the `rm` form.
//   ld_frag   - load fragment matched on addr:$src2 in the `rm` form.
//   d         - value assigned to ExeDomain.
//   sched     - scheduling class; defaults to WriteFComX.
multiclass avx10_com_ef<bits<8> Opc, RegisterClass RC, ValueType VT,
1545+
SDPatternOperator OpNode, string OpcodeStr,
1546+
X86MemOperand x86memop, PatFrag ld_frag,
1547+
Domain d, X86FoldableSchedWrite sched = WriteFComX>{
1548+
// Both forms may raise FP exceptions and are flagged isCodeGenOnly.
// NOTE(review): presumably the assembler/disassembler use the `_Int`
// variants of these mnemonics instead - confirm.
let ExeDomain = d, mayRaiseFPException = 1, isCodeGenOnly = 1 in {
1549+
// Register/register form: compares RC:$src1 with RC:$src2.
def rr : AVX512<Opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1550+
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1551+
[(set EFLAGS, (OpNode (VT RC:$src1), RC:$src2))]>,
1552+
EVEX, EVEX_V128, Sched<[sched]>, SIMD_EXC;
1553+
// Register/memory form: the second source is loaded through ld_frag, so the
// def is marked mayLoad and uses the folded scheduling classes.
let mayLoad = 1 in {
1554+
def rm : AVX512<Opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1555+
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1556+
[(set EFLAGS, (OpNode (VT RC:$src1), (ld_frag addr:$src2)))]>,
1557+
EVEX, EVEX_V128, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1558+
}
1559+
}
1560+
}
1561+
15441562
multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
15451563
string OpcodeStr,
15461564
Domain d,
@@ -1564,6 +1582,15 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
15641582
}
15651583

15661584
let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {
1585+
defm VUCOMXSDZ : avx10_com_ef<0x2e, FR64X, f64, X86ucomi512,
1586+
"vucomxsd", f64mem, loadf64, SSEPackedDouble>,
1587+
TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
1588+
defm VUCOMXSHZ : avx10_com_ef<0x2e, FR16X, f16, X86ucomi512,
1589+
"vucomxsh", f16mem, loadf16, SSEPackedSingle>,
1590+
T_MAP5, XD, EVEX_CD8<16, CD8VT1>;
1591+
defm VUCOMXSSZ : avx10_com_ef<0x2e, FR32X, f32, X86ucomi512,
1592+
"vucomxss", f32mem, loadf32, SSEPackedSingle>,
1593+
TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
15671594
defm VCOMXSDZ : avx10_com_ef_int<0x2f, v2f64x_info, X86comi512,
15681595
"vcomxsd", SSEPackedDouble>,
15691596
TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;

llvm/test/CodeGen/X86/avx10_2-cmp.ll

+237
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X64
3+
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X86
4+
5+
define i1 @hoeq(half %x, half %y) {
6+
; X64-LABEL: hoeq:
7+
; X64: # %bb.0:
8+
; X64-NEXT: vucomxsh %xmm1, %xmm0
9+
; X64-NEXT: sete %al
10+
; X64-NEXT: retq
11+
;
12+
; X86-LABEL: hoeq:
13+
; X86: # %bb.0:
14+
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
15+
; X86-NEXT: vucomxsh {{[0-9]+}}(%esp), %xmm0
16+
; X86-NEXT: sete %al
17+
; X86-NEXT: retl
18+
%1 = fcmp oeq half %x, %y
19+
ret i1 %1
20+
}
21+
22+
define i1 @hune(half %x, half %y) {
23+
; X64-LABEL: hune:
24+
; X64: # %bb.0:
25+
; X64-NEXT: vucomxsh %xmm1, %xmm0
26+
; X64-NEXT: setne %al
27+
; X64-NEXT: retq
28+
;
29+
; X86-LABEL: hune:
30+
; X86: # %bb.0:
31+
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
32+
; X86-NEXT: vucomxsh {{[0-9]+}}(%esp), %xmm0
33+
; X86-NEXT: setne %al
34+
; X86-NEXT: retl
35+
%1 = fcmp une half %x, %y
36+
ret i1 %1
37+
}
38+
39+
define i1 @hoeq_mem(ptr %xp, ptr %yp) {
40+
; X64-LABEL: hoeq_mem:
41+
; X64: # %bb.0:
42+
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
43+
; X64-NEXT: vucomxsh (%rsi), %xmm0
44+
; X64-NEXT: sete %al
45+
; X64-NEXT: retq
46+
;
47+
; X86-LABEL: hoeq_mem:
48+
; X86: # %bb.0:
49+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
50+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
51+
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
52+
; X86-NEXT: vucomxsh (%eax), %xmm0
53+
; X86-NEXT: sete %al
54+
; X86-NEXT: retl
55+
%x = load half, ptr %xp
56+
%y = load half, ptr %yp
57+
%1 = fcmp oeq half %x, %y
58+
ret i1 %1
59+
}
60+
61+
define i1 @hune_mem(ptr %xp, ptr %yp) {
62+
; X64-LABEL: hune_mem:
63+
; X64: # %bb.0:
64+
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
65+
; X64-NEXT: vucomxsh (%rsi), %xmm0
66+
; X64-NEXT: setne %al
67+
; X64-NEXT: retq
68+
;
69+
; X86-LABEL: hune_mem:
70+
; X86: # %bb.0:
71+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
72+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
73+
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
74+
; X86-NEXT: vucomxsh (%eax), %xmm0
75+
; X86-NEXT: setne %al
76+
; X86-NEXT: retl
77+
%x = load half, ptr %xp
78+
%y = load half, ptr %yp
79+
%1 = fcmp une half %x, %y
80+
ret i1 %1
81+
}
82+
83+
define i1 @foeq(float %x, float %y) {
84+
; X64-LABEL: foeq:
85+
; X64: # %bb.0:
86+
; X64-NEXT: vucomxss %xmm1, %xmm0
87+
; X64-NEXT: sete %al
88+
; X64-NEXT: retq
89+
;
90+
; X86-LABEL: foeq:
91+
; X86: # %bb.0:
92+
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
93+
; X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
94+
; X86-NEXT: sete %al
95+
; X86-NEXT: retl
96+
%1 = fcmp oeq float %x, %y
97+
ret i1 %1
98+
}
99+
100+
define i1 @fune(float %x, float %y) {
101+
; X64-LABEL: fune:
102+
; X64: # %bb.0:
103+
; X64-NEXT: vucomxss %xmm1, %xmm0
104+
; X64-NEXT: setne %al
105+
; X64-NEXT: retq
106+
;
107+
; X86-LABEL: fune:
108+
; X86: # %bb.0:
109+
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
110+
; X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
111+
; X86-NEXT: setne %al
112+
; X86-NEXT: retl
113+
%1 = fcmp une float %x, %y
114+
ret i1 %1
115+
}
116+
117+
define i1 @foeq_mem(ptr %xp, ptr %yp) {
118+
; X64-LABEL: foeq_mem:
119+
; X64: # %bb.0:
120+
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
121+
; X64-NEXT: vucomxss (%rsi), %xmm0
122+
; X64-NEXT: sete %al
123+
; X64-NEXT: retq
124+
;
125+
; X86-LABEL: foeq_mem:
126+
; X86: # %bb.0:
127+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
128+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
129+
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
130+
; X86-NEXT: vucomxss (%eax), %xmm0
131+
; X86-NEXT: sete %al
132+
; X86-NEXT: retl
133+
%x = load float, ptr %xp
134+
%y = load float, ptr %yp
135+
%1 = fcmp oeq float %x, %y
136+
ret i1 %1
137+
}
138+
139+
define i1 @fune_mem(ptr %xp, ptr %yp) {
140+
; X64-LABEL: fune_mem:
141+
; X64: # %bb.0:
142+
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
143+
; X64-NEXT: vucomxss (%rsi), %xmm0
144+
; X64-NEXT: setne %al
145+
; X64-NEXT: retq
146+
;
147+
; X86-LABEL: fune_mem:
148+
; X86: # %bb.0:
149+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
150+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
151+
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
152+
; X86-NEXT: vucomxss (%eax), %xmm0
153+
; X86-NEXT: setne %al
154+
; X86-NEXT: retl
155+
%x = load float, ptr %xp
156+
%y = load float, ptr %yp
157+
%1 = fcmp une float %x, %y
158+
ret i1 %1
159+
}
160+
161+
define i1 @doeq(double %x, double %y) {
162+
; X64-LABEL: doeq:
163+
; X64: # %bb.0:
164+
; X64-NEXT: vucomxsd %xmm1, %xmm0
165+
; X64-NEXT: sete %al
166+
; X64-NEXT: retq
167+
;
168+
; X86-LABEL: doeq:
169+
; X86: # %bb.0:
170+
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
171+
; X86-NEXT: vucomxsd {{[0-9]+}}(%esp), %xmm0
172+
; X86-NEXT: sete %al
173+
; X86-NEXT: retl
174+
%1 = fcmp oeq double %x, %y
175+
ret i1 %1
176+
}
177+
178+
define i1 @dune(double %x, double %y) {
179+
; X64-LABEL: dune:
180+
; X64: # %bb.0:
181+
; X64-NEXT: vucomxsd %xmm1, %xmm0
182+
; X64-NEXT: setne %al
183+
; X64-NEXT: retq
184+
;
185+
; X86-LABEL: dune:
186+
; X86: # %bb.0:
187+
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
188+
; X86-NEXT: vucomxsd {{[0-9]+}}(%esp), %xmm0
189+
; X86-NEXT: setne %al
190+
; X86-NEXT: retl
191+
%1 = fcmp une double %x, %y
192+
ret i1 %1
193+
}
194+
195+
define i1 @doeq_mem(ptr %xp, ptr %yp) {
196+
; X64-LABEL: doeq_mem:
197+
; X64: # %bb.0:
198+
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
199+
; X64-NEXT: vucomxsd (%rsi), %xmm0
200+
; X64-NEXT: sete %al
201+
; X64-NEXT: retq
202+
;
203+
; X86-LABEL: doeq_mem:
204+
; X86: # %bb.0:
205+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
206+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
207+
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
208+
; X86-NEXT: vucomxsd (%eax), %xmm0
209+
; X86-NEXT: sete %al
210+
; X86-NEXT: retl
211+
%x = load double, ptr %xp
212+
%y = load double, ptr %yp
213+
%1 = fcmp oeq double %x, %y
214+
ret i1 %1
215+
}
216+
217+
define i1 @dune_mem(ptr %xp, ptr %yp) {
218+
; X64-LABEL: dune_mem:
219+
; X64: # %bb.0:
220+
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
221+
; X64-NEXT: vucomxsd (%rsi), %xmm0
222+
; X64-NEXT: setne %al
223+
; X64-NEXT: retq
224+
;
225+
; X86-LABEL: dune_mem:
226+
; X86: # %bb.0:
227+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
228+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
229+
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
230+
; X86-NEXT: vucomxsd (%eax), %xmm0
231+
; X86-NEXT: setne %al
232+
; X86-NEXT: retl
233+
%x = load double, ptr %xp
234+
%y = load double, ptr %yp
235+
%1 = fcmp une double %x, %y
236+
ret i1 %1
237+
}

llvm/test/TableGen/x86-fold-tables.inc

+3
Original file line numberDiff line numberDiff line change
@@ -1959,8 +1959,11 @@ static const X86FoldTableEntry Table1[] = {
19591959
{X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE},
19601960
{X86::VUCOMISSrr, X86::VUCOMISSrm, 0},
19611961
{X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE},
1962+
{X86::VUCOMXSDZrr, X86::VUCOMXSDZrm, 0},
19621963
{X86::VUCOMXSDZrr_Int, X86::VUCOMXSDZrm_Int, TB_NO_REVERSE},
1964+
{X86::VUCOMXSHZrr, X86::VUCOMXSHZrm, 0},
19631965
{X86::VUCOMXSHZrr_Int, X86::VUCOMXSHZrm_Int, TB_NO_REVERSE},
1966+
{X86::VUCOMXSSZrr, X86::VUCOMXSSZrm, 0},
19641967
{X86::VUCOMXSSZrr_Int, X86::VUCOMXSSZrm_Int, TB_NO_REVERSE},
19651968
{X86::XOR16ri8_ND, X86::XOR16mi8_ND, 0},
19661969
{X86::XOR16ri8_NF_ND, X86::XOR16mi8_NF_ND, 0},

0 commit comments

Comments
 (0)