[KnownBits] Make nuw and nsw support in computeForAddSub optimal

goldsteinn · goldsteinn · commit 99422c763bee · 2024-03-02T12:22:20.000-06:00
Use the nsw/nuw flags in KnownBits::computeForAddSub to deduce additional known bits (high bits and the sign bit) of add/sub results, strengthening the analysis.
diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
@@ -62,6 +62,11 @@ struct KnownBits {
   /// Returns true if we don't know any bits.
   bool isUnknown() const { return Zero.isZero() && One.isZero(); }
 
+  /// Returns true if we don't know the sign bit.
+  bool isSignUnknown() const {
+    return !Zero.isSignBitSet() && !One.isSignBitSet();
+  }
+
   /// Resets the known state of all bits.
   void resetAll() {
     Zero.clearAllBits();
@@ -330,7 +335,7 @@ struct KnownBits {
 
   /// Compute known bits resulting from adding LHS and RHS.
   static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW,
-                                    const KnownBits &LHS, KnownBits RHS);
+                                    const KnownBits &LHS, const KnownBits &RHS);
 
   /// Compute known bits results from subtracting RHS from LHS with 1-bit
   /// Borrow.
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
@@ -54,32 +54,183 @@ KnownBits KnownBits::computeForAddCarry(
       LHS, RHS, Carry.Zero.getBoolValue(), Carry.One.getBoolValue());
 }
 
-KnownBits KnownBits::computeForAddSub(bool Add, bool NSW, bool /*NUW*/,
-                                      const KnownBits &LHS, KnownBits RHS) {
+KnownBits KnownBits::computeForAddSub(bool Add, bool NSW, bool NUW,
+                                      const KnownBits &LHS,
+                                      const KnownBits &RHS) {
+  // This can be a relatively expensive helper, so optimistically save some
+  // work.
+  if (LHS.isUnknown() && RHS.isUnknown())
+    return LHS;
   KnownBits KnownOut;
   if (Add) {
     // Sum = LHS + RHS + 0
-    KnownOut = ::computeForAddCarry(
-        LHS, RHS, /*CarryZero*/true, /*CarryOne*/false);
+    KnownOut =
+        ::computeForAddCarry(LHS, RHS, /*CarryZero*/ true, /*CarryOne*/ false);
   } else {
     // Sum = LHS + ~RHS + 1
-    std::swap(RHS.Zero, RHS.One);
-    KnownOut = ::computeForAddCarry(
-        LHS, RHS, /*CarryZero*/false, /*CarryOne*/true);
+    KnownBits NotRHS = RHS;
+    std::swap(NotRHS.Zero, NotRHS.One);
+    KnownOut = ::computeForAddCarry(LHS, NotRHS, /*CarryZero*/ false,
+                                    /*CarryOne*/ true);
   }
+  if (!NSW && !NUW)
+    return KnownOut;
 
-  // Are we still trying to solve for the sign bit?
-  if (!KnownOut.isNegative() && !KnownOut.isNonNegative()) {
-    if (NSW) {
-      // Adding two non-negative numbers, or subtracting a negative number from
-      // a non-negative one, can't wrap into negative.
-      if (LHS.isNonNegative() && RHS.isNonNegative())
+  auto GetMinMaxVal = [Add](bool ForNSW, bool ForMax, const KnownBits &L,
+                            const KnownBits &R, bool &OV) {
+    APInt LVal = ForMax ? L.getMaxValue() : L.getMinValue();
+    APInt RVal = Add == ForMax ? R.getMaxValue() : R.getMinValue();
+
+    if (ForNSW) {
+      LVal.clearSignBit();
+      RVal.clearSignBit();
+    }
+    APInt Res = Add ? LVal.uadd_ov(RVal, OV) : LVal.usub_ov(RVal, OV);
+    if (ForNSW) {
+      OV = Res.isSignBitSet();
+      Res.clearSignBit();
+      if (Res.getBitWidth() > 1 && Res[Res.getBitWidth() - 2])
+        Res.setSignBit();
+    }
+    return Res;
+  };
+
+  auto GetMaxVal = [&GetMinMaxVal](bool ForNSW, const KnownBits &L,
+                                   const KnownBits &R, bool &OV) {
+    return GetMinMaxVal(ForNSW, /*ForMax=*/true, L, R, OV);
+  };
+
+  auto GetMinVal = [&GetMinMaxVal](bool ForNSW, const KnownBits &L,
+                                   const KnownBits &R, bool &OV) {
+    return GetMinMaxVal(ForNSW, /*ForMax=*/false, L, R, OV);
+  };
+
+  auto ForceNegative = [](KnownBits &Known) {
+    Known.Zero.clearSignBit();
+    Known.One.setSignBit();
+  };
+
+  auto ForcePositive = [](KnownBits &Known) {
+    Known.One.clearSignBit();
+    Known.Zero.setSignBit();
+  };
+
+  // Handle add/sub given nsw and/or nuw.
+  //
+  // Possible TODO: Add/Sub implementations mirror one another in many ways.
+  // They could probably be compressed into a single implementation of roughly
+  // half the total LOC. Leaving separate for now to increase clarity.
+  // NB: We handle NSW by essentially treating as nuw of bitwidth - 1 then
+  // deducing bits based on the known sign result.
+  if (Add) {
+    if (NUW || (LHS.isNonNegative() && RHS.isNonNegative())) {
+      bool OverflowMin;
+      APInt MinVal;
+      if (NSW) {
+        MinVal = GetMinVal(/*ForNSW=*/true, LHS, RHS, OverflowMin);
+        // (add nsw nuw) or (add nsw PosX, PosY)
+
+        // None of the adds can end up overflowing, so min consecutive
+        // highbits in minimum possible of X + Y must all remain set.
+        KnownOut.One.setHighBits(MinVal.countLeadingOnes());
+
+        // NSW and Positive arguments leads to positive result.
+        if (LHS.isNonNegative() && RHS.isNonNegative())
+          ForcePositive(KnownOut);
+      }
+      if (NUW) {
+        KnownOut.One.clearSignBit();
+        // (add nuw X, Y)
+        MinVal = GetMinVal(/*ForNSW=*/false, LHS, RHS, OverflowMin);
+        // Same as (add nsw PosX, PosY), basically since we can't overflow,
+        // the high bits of minimum possible X + Y must remain set.
+        KnownOut.One.setHighBits(MinVal.countLeadingOnes());
+      }
+    } else if (LHS.isNegative() && RHS.isNegative()) {
+      bool OverflowMax;
+      APInt MaxVal = GetMaxVal(/*ForNSW=*/true, LHS, RHS, OverflowMax);
+      // (add nsw NegX, NegY)
+
+      // We need to re-overflow the signbit, so we are looking for sequence
+      // of 0s from consecutive overflows.
+      KnownOut.Zero.setHighBits(MaxVal.countLeadingZeros());
+      ForceNegative(KnownOut);
+    } else if (!KnownOut.isSignUnknown()) {
+      // Pass, avoid extra work if we already know the sign bit.
+    } else if (LHS.isNonNegative() || RHS.isNonNegative()) {
+      bool OverflowMin;
+      (void)GetMinVal(/*ForNSW=*/true, LHS, RHS, OverflowMin);
+      // (add nsw PosX, ?Y)
+
+      // If the minimal possible of X + Y overflows the signbit, then Y must
+      // have been signed (which will cause unsigned overflow otherwise nsw
+      // will be violated) leading to unsigned result.
+      if (OverflowMin)
         KnownOut.makeNonNegative();
-      // Adding two negative numbers, or subtracting a non-negative number from
-      // a negative one, can't wrap into non-negative.
-      else if (LHS.isNegative() && RHS.isNegative())
+    } else if (LHS.isNegative() || RHS.isNegative()) {
+      bool OverflowMax;
+      (void)GetMaxVal(/*ForNSW=*/true, LHS, RHS, OverflowMax);
+      // (add nsw NegX, ?Y)
+
+      // If the maximum possible of X + Y doesn't overflow the signbit,
+      // then Y must have been unsigned (otherwise nsw violated) so NegX +
+      // PosY w.o overflowing the signbit results in Negative.
+      if (!OverflowMax)
         KnownOut.makeNegative();
     }
+  } else {
+    if (NUW || (LHS.isNegative() && RHS.isNonNegative())) {
+      bool OverflowMax;
+      APInt MaxVal;
+      if (NSW) {
+        MaxVal = GetMaxVal(/*ForNSW=*/true, LHS, RHS, OverflowMax);
+        // (sub nsw nuw) or (sub nsw NegX, PosY)
+
+        // None of the subs can overflow at any point, so any common high bits
+        // will subtract away and result in zeros.
+        KnownOut.Zero.setHighBits(MaxVal.countLeadingZeros());
+        if (LHS.isNegative() && RHS.isNonNegative())
+          ForceNegative(KnownOut);
+      }
+      if (NUW) {
+        KnownOut.Zero.clearSignBit();
+        // (sub nuw X, Y)
+        MaxVal = GetMaxVal(/*ForNSW=*/false, LHS, RHS, OverflowMax);
+
+        // Basically all common high bits between X/Y will cancel out as
+        // leading zeros.
+        KnownOut.Zero.setHighBits(MaxVal.countLeadingZeros());
+      }
+    } else if (LHS.isNonNegative() && RHS.isNegative()) {
+      bool OverflowMin;
+      APInt MinVal = GetMinVal(/*ForNSW=*/true, LHS, RHS, OverflowMin);
+      // (sub nsw PosX, NegY)
+
+      // Opposite case of above, we must "re-overflow" the signbit, so
+      // minimal set of high bits will be fixed.
+      KnownOut.One.setHighBits(MinVal.countLeadingOnes());
+      ForcePositive(KnownOut);
+    } else if (!KnownOut.isSignUnknown()) {
+      // Pass, avoid extra work if we already know the sign bit.
+    } else if (LHS.isNegative() || RHS.isNonNegative()) {
+      bool OverflowMax;
+      (void)GetMaxVal(/*ForNSW=*/true, LHS, RHS, OverflowMax);
+      // (sub nsw NegX/?X, ?Y/PosY)
+      if (OverflowMax)
+        KnownOut.makeNegative();
+    } else if (LHS.isNonNegative() || RHS.isNegative()) {
+      bool OverflowMin;
+      (void)GetMinVal(/*ForNSW=*/true, LHS, RHS, OverflowMin);
+      // (sub nsw PosX/?X, ?Y/NegY)
+      if (!OverflowMin)
+        KnownOut.makeNonNegative();
+    }
+  }
+
+  // Just return 0 if the nsw/nuw is violated and we have poison.
+  if (KnownOut.hasConflict()) {
+    KnownOut.setAllZero();
+    return KnownOut;
   }
 
   return KnownOut;
diff --git a/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll b/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll
@@ -114,9 +114,12 @@ define i1 @foo_last(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: foo_last:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    ptest p0, p1.b
-; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    lastb w8, p1, z0.s
+; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %vcond = fcmp oeq <vscale x 4 x float> %a, %b
   %vscale = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -614,9 +614,11 @@ define i1 @test_lane9_8xi1(<vscale x 8 x i1> %a) #0 {
 define i1 @test_last_8xi1(<vscale x 8 x i1> %a) #0 {
 ; CHECK-LABEL: test_last_8xi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    ptest p1, p0.b
-; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.h
+; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %shl = shl nuw nsw i64 %vscale, 3
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -137,49 +137,46 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; CI-NEXT:    s_mov_b64 vcc, 0
-; CI-NEXT:    v_not_b32_e32 v0, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v1, s0
-; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_div_fmas_f32 v0, v0, v0, v0
 ; CI-NEXT:    s_mov_b32 s0, 0
-; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s1, s0
-; CI-NEXT:    ds_write_b32 v0, v2 offset:65532
-; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; CI-NEXT:    ds_write_b32 v2, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b64 vcc, 0
-; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    ds_write_b32 v4, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_div_fmas_f32 v2, v0, v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    ds_write_b32 v3, v4 offset:65532
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-NEXT:    v_not_b32_e32 v0, v0
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    ds_write_b32 v2, v3 offset:65532
+; GFX10-NEXT:    ds_write_b32 v3, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
 ; GFX10-NEXT:    global_store_dword v[0:1], v4, off
@@ -189,13 +186,11 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT:    v_not_b32_e32 v0, v0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v2, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    ds_store_b32 v2, v3 offset:65532
+; GFX11-NEXT:    ds_store_b32 v3, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
diff --git a/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll b/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll
@@ -43,7 +43,7 @@ define i64 @log2_ceil_idiom_zext(i32 %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[X]], -1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP1]], i1 false), !range [[RNG0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 32, [[TMP2]]
-; CHECK-NEXT:    [[RET:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[RET:%.*]] = zext nneg i32 [[TMP3]] to i64
 ; CHECK-NEXT:    ret i64 [[RET]]
 ;
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
diff --git a/llvm/test/Transforms/InstCombine/icmp-sub.ll b/llvm/test/Transforms/InstCombine/icmp-sub.ll
@@ -36,7 +36,7 @@ define i1 @test_nuw_nsw_and_unsigned_pred(i64 %x) {
 
 define i1 @test_nuw_nsw_and_signed_pred(i64 %x) {
 ; CHECK-LABEL: @test_nuw_nsw_and_signed_pred(
-; CHECK-NEXT:    [[Z:%.*]] = icmp sgt i64 [[X:%.*]], 7
+; CHECK-NEXT:    [[Z:%.*]] = icmp ugt i64 [[X:%.*]], 7
 ; CHECK-NEXT:    ret i1 [[Z]]
 ;
   %y = sub nuw nsw i64 10, %x
@@ -46,8 +46,7 @@ define i1 @test_nuw_nsw_and_signed_pred(i64 %x) {
 
 define i1 @test_negative_nuw_and_signed_pred(i64 %x) {
 ; CHECK-LABEL: @test_negative_nuw_and_signed_pred(
-; CHECK-NEXT:    [[NOTSUB:%.*]] = add nuw i64 [[X:%.*]], -11
-; CHECK-NEXT:    [[Z:%.*]] = icmp sgt i64 [[NOTSUB]], -4
+; CHECK-NEXT:    [[Z:%.*]] = icmp ugt i64 [[X:%.*]], 7
 ; CHECK-NEXT:    ret i1 [[Z]]
 ;
   %y = sub nuw i64 10, %x
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
@@ -2367,7 +2367,7 @@ define <2 x i8> @sub_to_and_vector3(<2 x i8> %x) {
 ; CHECK-LABEL: @sub_to_and_vector3(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nuw <2 x i8> <i8 71, i8 71>, [[X:%.*]]
 ; CHECK-NEXT:    [[AND:%.*]] = and <2 x i8> [[SUB]], <i8 120, i8 undef>
-; CHECK-NEXT:    [[R:%.*]] = sub <2 x i8> <i8 44, i8 44>, [[AND]]
+; CHECK-NEXT:    [[R:%.*]] = sub nsw <2 x i8> <i8 44, i8 44>, [[AND]]
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %sub = sub nuw <2 x i8> <i8 71, i8 71>, %x
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ define i64 @log2_ceil_idiom_zext(i32 %x) {`
`43`	`43`	`; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X]], -1`
`44`	`44`	`; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP1]], i1 false), !range [[RNG0]]`
`45`	`45`	`; CHECK-NEXT: [[TMP3:%.*]] = sub nuw nsw i32 32, [[TMP2]]`
`46`		`-; CHECK-NEXT: [[RET:%.*]] = zext i32 [[TMP3]] to i64`
	`46`	`+; CHECK-NEXT: [[RET:%.*]] = zext nneg i32 [[TMP3]] to i64`
`47`	`47`	`; CHECK-NEXT: ret i64 [[RET]]`
`48`	`48`	`;`
`49`	`49`	`%ctlz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)`
Original file line number	Diff line number	Diff line change
`@@ -2367,7 +2367,7 @@ define <2 x i8> @sub_to_and_vector3(<2 x i8> %x) {`
`2367`	`2367`	`; CHECK-LABEL: @sub_to_and_vector3(`
`2368`	`2368`	`; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> <i8 71, i8 71>, [[X:%.*]]`
`2369`	`2369`	`; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], <i8 120, i8 undef>`
`2370`		`-; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> <i8 44, i8 44>, [[AND]]`
	`2370`	`+; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> <i8 44, i8 44>, [[AND]]`
`2371`	`2371`	`; CHECK-NEXT: ret <2 x i8> [[R]]`
`2372`	`2372`	`;`
`2373`	`2373`	`%sub = sub nuw <2 x i8> <i8 71, i8 71>, %x`