[AArch64] Materialize constants via fneg. #80641
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This is something that is already done as a special case for copysign; this patch extends it to be more generally applied. If we are trying to materialize a negative constant (notably -0.0, 0x80000000), then there may be no movi encoding that creates the immediate, but a fneg(movi) might. Some of the existing patterns for RADDHN needed to be adjusted to keep them in line with the new immediates.

Patch is 22.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80641.diff

9 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b59f8d7306046..bbe000395162d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12935,42 +12935,67 @@ static SDValue NormalizeBuildVector(SDValue Op,
return DAG.getBuildVector(VT, dl, Ops);
}
-static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
+static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
+ const AArch64Subtarget *ST) {
EVT VT = Op.getValueType();
+ assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
+ "Expected a legal NEON vector");
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
- SDValue NewOp;
- if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
- return NewOp;
-
- DefBits = ~DefBits;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
- return NewOp;
-
- DefBits = UndefBits;
- if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
- return NewOp;
+ auto TryMOVIWithBits = [&](APInt DefBits) {
+ SDValue NewOp;
+ if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
+ (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
+ return NewOp;
+
+ APInt NotDefBits = ~DefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)) ||
+ (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, NotDefBits)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
+ return NewOp;
+ return SDValue();
+ };
+ if (SDValue R = TryMOVIWithBits(DefBits))
+ return R;
+ if (SDValue R = TryMOVIWithBits(UndefBits))
+ return R;
- DefBits = ~UndefBits;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
- return NewOp;
+ // See if a fneg of the constant can be materialized with a MOVI, etc
+ auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
+ // FNegate each sub-element of the constant
+ assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
+ APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
+ .zext(VT.getSizeInBits());
+ APInt NegBits(VT.getSizeInBits(), 0);
+ unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
+ for (unsigned i = 0; i < NumElts; i++)
+ NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
+ NegBits = DefBits ^ NegBits;
+
+ // Try to create the new constants with MOVI, and if so generate a fneg
+ // for it.
+ if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
+ SDLoc DL(Op);
+ MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
+ return DAG.getNode(
+ AArch64ISD::NVCAST, DL, VT,
+ DAG.getNode(ISD::FNEG, DL, VFVT,
+ DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
+ }
+ return SDValue();
+ };
+ SDValue R;
+ if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
+ (R = TryWithFNeg(DefBits, MVT::f64)) ||
+ (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
+ return R;
}
return SDValue();
@@ -13019,7 +13044,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
return Op;
}
- if (SDValue V = ConstantBuildVector(Op, DAG))
+ if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
return V;
// Scan through the operands to find some interesting properties we can
@@ -13244,7 +13269,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
!ConstantValueAPInt.isAllOnes()) {
- Val = ConstantBuildVector(Val, DAG);
+ Val = ConstantBuildVector(Val, DAG, Subtarget);
if (!Val)
// Otherwise, materialize the constant and splat it.
Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
@@ -23175,9 +23200,12 @@ static SDValue performDUPCombine(SDNode *N,
}
/// Get rid of unnecessary NVCASTs (that don't change the type).
-static SDValue performNVCASTCombine(SDNode *N) {
+static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
return N->getOperand(0);
+ if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
+ return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
+ N->getOperand(0).getOperand(0));
return SDValue();
}
@@ -24171,7 +24199,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case AArch64ISD::DUPLANE128:
return performDupLane128Combine(N, DAG);
case AArch64ISD::NVCAST:
- return performNVCASTCombine(N);
+ return performNVCASTCombine(N, DAG);
case AArch64ISD::SPLICE:
return performSpliceCombine(N, DAG);
case AArch64ISD::UUNPKLO:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c476617e679f3..1797b8aea5091 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7558,13 +7558,17 @@ defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+def VImm0080: PatLeaf<(AArch64movi_shift (i32 128), (i32 0))>;
+def VImm00008000: PatLeaf<(AArch64movi_shift (i32 128), (i32 8))>;
+def VImm0000000080000000: PatLeaf<(AArch64NvCast (v2f64 (fneg (AArch64NvCast (v4i32 (AArch64movi_shift (i32 128), (i32 24)))))))>;
+
// RADDHN patterns for when RSHRN shifts by half the size of the vector element
-def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))),
+def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))),
(RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
-def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))),
+def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))),
(RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
let AddedComplexity = 5 in
-def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))),
+def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))),
(RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
(RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
@@ -7576,20 +7580,20 @@ def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element
def : Pat<(v16i8 (concat_vectors
(v8i8 V64:$Vd),
- (v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))))),
+ (v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))))),
(RADDHNv8i16_v16i8
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
(v8i16 (MOVIv2d_ns (i32 0))))>;
def : Pat<(v8i16 (concat_vectors
(v4i16 V64:$Vd),
- (v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))))),
+ (v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))))),
(RADDHNv4i32_v8i16
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
(v4i32 (MOVIv2d_ns (i32 0))))>;
let AddedComplexity = 5 in
def : Pat<(v4i32 (concat_vectors
(v2i32 V64:$Vd),
- (v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))))),
+ (v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))))),
(RADDHNv2i64_v4i32
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
(v2i64 (MOVIv2d_ns (i32 0))))>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
index 68c56d765cbb9..f7ba1ec5e28f9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -120,8 +120,8 @@ define <2 x double> @poszero_v2f64(<2 x double> %a) {
define <2 x double> @negzero_v2f64(<2 x double> %a) {
; CHECK-LABEL: negzero_v2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: fneg v1.2d, v1.2d
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%b = fmul <2 x double> %a, <double -0.0, double -0.0>
diff --git a/llvm/test/CodeGen/AArch64/fabs-combine.ll b/llvm/test/CodeGen/AArch64/fabs-combine.ll
index 23bf7a699195f..d083f2006575b 100644
--- a/llvm/test/CodeGen/AArch64/fabs-combine.ll
+++ b/llvm/test/CodeGen/AArch64/fabs-combine.ll
@@ -71,8 +71,8 @@ define <4 x float> @nabsv4f32(<4 x float> %a) {
define <2 x double> @nabsv2d64(<2 x double> %a) {
; CHECK-LABEL: nabsv2d64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-9223372036854775808
-; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: fneg v1.2d, v1.2d
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%conv = bitcast <2 x double> %a to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index b5b9055fbc02f..37133cf0aa1df 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -110,8 +110,8 @@ define <2 x i32> @test9(<2 x float> %f) {
define <2 x i32> @test10(<2 x float> %f) {
; CHECK-LABEL: test10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: mvni v0.2s, #63, msl #16
+; CHECK-NEXT: fneg v0.2s, v0.2s
; CHECK-NEXT: fcvtzu v0.2s, v0.2s
; CHECK-NEXT: ret
%mul.i = fmul <2 x float> %f, <float undef, float undef>
@@ -426,8 +426,8 @@ define <2 x i32> @test9_sat(<2 x float> %f) {
define <2 x i32> @test10_sat(<2 x float> %f) {
; CHECK-LABEL: test10_sat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: mvni v0.2s, #63, msl #16
+; CHECK-NEXT: fneg v0.2s, v0.2s
; CHECK-NEXT: fcvtzu v0.2s, v0.2s
; CHECK-NEXT: ret
%mul.i = fmul <2 x float> %f, <float undef, float undef>
diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
index 219c8b53243e6..7effdc97993c1 100644
--- a/llvm/test/CodeGen/AArch64/neon-mov.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -111,16 +111,14 @@ define <4 x i32> @movi4s_lsl16() {
define <4 x i32> @movi4s_fneg() {
; CHECK-NOFP16-SD-LABEL: movi4s_fneg:
; CHECK-NOFP16-SD: // %bb.0:
-; CHECK-NOFP16-SD-NEXT: mov w8, #61440 // =0xf000
-; CHECK-NOFP16-SD-NEXT: movk w8, #32768, lsl #16
-; CHECK-NOFP16-SD-NEXT: dup v0.4s, w8
+; CHECK-NOFP16-SD-NEXT: movi v0.4s, #240, lsl #8
+; CHECK-NOFP16-SD-NEXT: fneg v0.4s, v0.4s
; CHECK-NOFP16-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: movi4s_fneg:
; CHECK-FP16-SD: // %bb.0:
-; CHECK-FP16-SD-NEXT: mov w8, #61440 // =0xf000
-; CHECK-FP16-SD-NEXT: movk w8, #32768, lsl #16
-; CHECK-FP16-SD-NEXT: dup v0.4s, w8
+; CHECK-FP16-SD-NEXT: movi v0.4s, #240, lsl #8
+; CHECK-FP16-SD-NEXT: fneg v0.4s, v0.4s
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: movi4s_fneg:
@@ -178,11 +176,29 @@ define <8 x i16> @movi8h_lsl8() {
}
define <8 x i16> @movi8h_fneg() {
-; CHECK-LABEL: movi8h_fneg:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI19_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT: ret
+; CHECK-NOFP16-SD-LABEL: movi8h_fneg:
+; CHECK-NOFP16-SD: // %bb.0:
+; CHECK-NOFP16-SD-NEXT: movi v0.8h, #127, lsl #8
+; CHECK-NOFP16-SD-NEXT: fneg v0.4s, v0.4s
+; CHECK-NOFP16-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: movi8h_fneg:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: movi v0.8h, #127, lsl #8
+; CHECK-FP16-SD-NEXT: fneg v0.4s, v0.4s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-NOFP16-GI-LABEL: movi8h_fneg:
+; CHECK-NOFP16-GI: // %bb.0:
+; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI19_0
+; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
+; CHECK-NOFP16-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: movi8h_fneg:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: adrp x8, .LCPI19_0
+; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
+; CHECK-FP16-GI-NEXT: ret
ret <8 x i16> <i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280>
}
@@ -294,8 +310,8 @@ define <8 x i16> @mvni8h_neg() {
;
; CHECK-FP16-SD-LABEL: mvni8h_neg:
; CHECK-FP16-SD: // %bb.0:
-; CHECK-FP16-SD-NEXT: mov w8, #33008 // =0x80f0
-; CHECK-FP16-SD-NEXT: dup v0.8h, w8
+; CHECK-FP16-SD-NEXT: movi v0.8h, #240
+; CHECK-FP16-SD-NEXT: fneg v0.8h, v0.8h
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: mvni8h_neg:
@@ -480,14 +496,14 @@ define <2 x double> @fmov2d() {
define <2 x double> @fmov2d_neg0() {
; CHECK-NOFP16-SD-LABEL: fmov2d_neg0:
; CHECK-NOFP16-SD: // %bb.0:
-; CHECK-NOFP16-SD-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-NOFP16-SD-NEXT: dup v0.2d, x8
+; CHECK-NOFP16-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NOFP16-SD-NEXT: fneg v0.2d, v0.2d
; CHECK-NOFP16-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: fmov2d_neg0:
; CHECK-FP16-SD: // %bb.0:
-; CHECK-FP16-SD-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-FP16-SD-NEXT: dup v0.2d, x8
+; CHECK-FP16-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-FP16-SD-NEXT: fneg v0.2d, v0.2d
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: fmov2d_neg0:
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
index f8c6f4193959d..1ebfe308e9af9 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -35,18 +35,17 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_eq:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.16b, #153
; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: mov w9, #39321 // =0x9999
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: adrp x8, .LCPI1_0
-; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fneg v1.4s, v1.4s
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -56,18 +55,17 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_ne:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.16b, #153
; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: mov w9, #39321 // =0x9999
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fneg v1.4s, v1.4s
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
%cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -269,18 +267,17 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_one:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.16b, #153
; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: mov w9, #39321 // =0x9999
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: adrp x8, .LCPI10_0
-; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fneg v1.4s, v1.4s
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 5, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -522,18 +519,17 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_and_one:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.16b, #153
; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: mov w9, #39321 // =0x9999
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: adrp x8, .LCPI19_0
-; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fneg v1.4s, v1.4s
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
This is a Global ISel equivalent of llvm#80641, creating fneg(movi) instead of the alternative constant pool load or gpr dup.
This is something that is already done as a special case for copysign; this patch extends it to be more generally applied. If we are trying to materialize a negative constant (notably -0.0, 0x80000000), then there may be no movi encoding that creates the immediate, but a fneg(movi) might. Some of the existing patterns for RADDHN needed to be adjusted to keep them in line with the new immediates.
1babf7f to 35ba589
Ping. Thanks
Looks very reasonable to me.
Thanks for clarifying a few details in person. Seems sensible.
This is a Global ISel equivalent of #80641, creating fneg(movi) instead of the alternative constant pool load or gpr dup.
Hi, looks like the test
Hi - are you sure that wasn't the updated tablegen patterns in #81960? They undo some of the changes made in this commit.
Oh yes, looks like I did an incorrect git rebase/merge. |
This is something that is already done as a special case for copysign; this patch extends it to be more generally applied. If we are trying to materialize a negative constant (notably -0.0, 0x80000000), then there may be no movi encoding that creates the immediate, but a fneg(movi) might.
Some of the existing patterns for RADDHN needed to be adjusted to keep them in line with the new immediates.
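As a rough standalone illustration of the idea (not part of the patch; the function name and values below are made up for the example), the new TryWithFNeg path effectively XORs a sign bit into every FP-sized lane of the constant and asks whether the flipped pattern has a MOVI/MVNI encoding; if it does, the constant can be built as a movi followed by a fneg. A minimal C++ sketch of that per-lane sign flip:

#include <cstdint>
#include <cstdio>

// Flip the sign bit of every LaneBits-wide lane in a 64-bit constant,
// mirroring what a per-lane fneg does to the raw bit pattern.
static uint64_t flipLaneSigns(uint64_t Bits, unsigned LaneBits) {
  uint64_t SignMask = 0;
  for (unsigned Shift = LaneBits - 1; Shift < 64; Shift += LaneBits)
    SignMask |= uint64_t(1) << Shift;
  return Bits ^ SignMask;
}

int main() {
  // A v2f32 splat of -0.0f (0x80000000 per lane) has no direct MOVI
  // encoding, but flipping each lane's sign bit gives all zeros, which
  // MOVI trivially produces -- hence the movi #0 + fneg in the new output.
  uint64_t NegZero = 0x8000000080000000ULL;
  printf("0x%016llx -> 0x%016llx\n", (unsigned long long)NegZero,
         (unsigned long long)flipLaneSigns(NegZero, 32));
  return 0;
}

In the patch itself the same check is applied per lane width (f32, f64, and f16 when full FP16 is available) via the TryWithFNeg lambda in ConstantBuildVector above.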