From 5b7b0090c80f0ef1b25f7814f3682ad7099ab556 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Fri, 9 Apr 2021 11:21:49 -0700 Subject: [PATCH 1/7] [WebAssembly] Add shuffles as an option for lowering BUILD_VECTOR When lowering a BUILD_VECTOR SDNode, we choose among various possible vector creation instructions in an attempt to minimize the total number of instructions used. We previously considered using swizzles, consts, and splats, and this patch adds shuffles as well. A common pattern that now lowers to shuffles is when two 64-bit vectors are concatenated. Previously, concatenations generally lowered to sequences of extract_lane and replace_lane instructions when they could have been a single shuffle. Differential Revision: https://reviews.llvm.org/D100018 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 95 +++++++++++++++++-- .../CodeGen/WebAssembly/simd-build-vector.ll | 16 ++++ llvm/test/CodeGen/WebAssembly/simd-concat.ll | 79 +++++++++++++++ 3 files changed, 184 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/simd-concat.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index c519c7d76c54c..322020638d9fe 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1600,8 +1600,8 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, // TODO: Tune this. For example, lanewise swizzling is very expensive, so // swizzled lanes should be given greater weight. - // TODO: Investigate building vectors by shuffling together vectors built by - // separately specialized means. + // TODO: Investigate looping rather than always extracting/replacing specific + // lanes to fill gaps. auto IsConstant = [](const SDValue &V) { return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP; @@ -1632,12 +1632,30 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, return std::make_pair(SwizzleSrc, SwizzleIndices); }; + // If the lane is extracted from another vector at a constant index, return + // that vector. The source vector must not have more lanes than the dest + // because the shufflevector indices are in terms of the destination lanes and + // would not be able to address the smaller individual source lanes. 
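+  // For example, when building a v4i32, a lane extracted from a v8i16
+  // source cannot be used, because each v4i32 shuffle index selects a whole
+  // 32-bit destination lane and cannot address an individual 16-bit source
+  // lane.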
+  auto GetShuffleSrc = [&](const SDValue &Lane) {
+    if (Lane->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+    if (!isa<ConstantSDNode>(Lane->getOperand(1).getNode()))
+      return SDValue();
+    if (Lane->getOperand(0).getValueType().getVectorNumElements() >
+        VecT.getVectorNumElements())
+      return SDValue();
+    return Lane->getOperand(0);
+  };
+
   using ValueEntry = std::pair<SDValue, size_t>;
   SmallVector<ValueEntry, 16> SplatValueCounts;

   using SwizzleEntry = std::pair<std::pair<SDValue, SDValue>, size_t>;
   SmallVector<SwizzleEntry, 16> SwizzleCounts;

+  using ShuffleEntry = std::pair<SDValue, size_t>;
+  SmallVector<ShuffleEntry, 16> ShuffleCounts;
+
   auto AddCount = [](auto &Counts, const auto &Val) {
     auto CountIt =
         llvm::find_if(Counts, [&Val](auto E) { return E.first == Val; });
@@ -1666,9 +1684,11 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,

     AddCount(SplatValueCounts, Lane);

-    if (IsConstant(Lane)) {
+    if (IsConstant(Lane))
       NumConstantLanes++;
-    } else if (CanSwizzle) {
+    if (auto ShuffleSrc = GetShuffleSrc(Lane))
+      AddCount(ShuffleCounts, ShuffleSrc);
+    if (CanSwizzle) {
       auto SwizzleSrcs = GetSwizzleSrcs(I, Lane);
       if (SwizzleSrcs.first)
         AddCount(SwizzleCounts, SwizzleSrcs);
@@ -1686,18 +1706,81 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   std::forward_as_tuple(std::tie(SwizzleSrc, SwizzleIndices), NumSwizzleLanes) =
       GetMostCommon(SwizzleCounts);

+  // Shuffles can draw from up to two vectors, so find the two most common
+  // sources.
+  SDValue ShuffleSrc1, ShuffleSrc2;
+  size_t NumShuffleLanes = 0;
+  if (ShuffleCounts.size()) {
+    std::tie(ShuffleSrc1, NumShuffleLanes) = GetMostCommon(ShuffleCounts);
+    ShuffleCounts.erase(std::remove_if(ShuffleCounts.begin(),
+                                       ShuffleCounts.end(),
+                                       [&](const auto &Pair) {
+                                         return Pair.first == ShuffleSrc1;
+                                       }),
+                        ShuffleCounts.end());
+  }
+  if (ShuffleCounts.size()) {
+    size_t AdditionalShuffleLanes;
+    std::tie(ShuffleSrc2, AdditionalShuffleLanes) =
+        GetMostCommon(ShuffleCounts);
+    NumShuffleLanes += AdditionalShuffleLanes;
+  }
+
   // Predicate returning true if the lane is properly initialized by the
   // original instruction
   std::function<bool(size_t, const SDValue &)> IsLaneConstructed;
   SDValue Result;
-  // Prefer swizzles over vector consts over splats
-  if (NumSwizzleLanes >= NumSplatLanes && NumSwizzleLanes >= NumConstantLanes) {
+  // Prefer swizzles over shuffles over vector consts over splats
+  if (NumSwizzleLanes >= NumShuffleLanes &&
+      NumSwizzleLanes >= NumConstantLanes && NumSwizzleLanes >= NumSplatLanes) {
     Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc,
                          SwizzleIndices);
     auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices);
     IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) {
       return Swizzled == GetSwizzleSrcs(I, Lane);
     };
+  } else if (NumShuffleLanes >= NumConstantLanes &&
+             NumShuffleLanes >= NumSplatLanes) {
+    size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits() / 8;
+    size_t DestLaneCount = VecT.getVectorNumElements();
+    size_t Scale1 = 1;
+    size_t Scale2 = 1;
+    SDValue Src1 = ShuffleSrc1;
+    SDValue Src2 = ShuffleSrc2 ? ShuffleSrc2 : DAG.getUNDEF(VecT);
+    if (Src1.getValueType() != VecT) {
+      size_t LaneSize =
+          Src1.getValueType().getVectorElementType().getFixedSizeInBits() / 8;
+      assert(LaneSize > DestLaneSize);
+      Scale1 = LaneSize / DestLaneSize;
+      Src1 = DAG.getBitcast(VecT, Src1);
+    }
+    if (Src2.getValueType() != VecT) {
+      size_t LaneSize =
+          Src2.getValueType().getVectorElementType().getFixedSizeInBits() / 8;
+      assert(LaneSize > DestLaneSize);
+      Scale2 = LaneSize / DestLaneSize;
+      Src2 = DAG.getBitcast(VecT, Src2);
+    }
+
+    int Mask[16];
+    assert(DestLaneCount <= 16);
+    for (size_t I = 0; I < DestLaneCount; ++I) {
+      const SDValue &Lane = Op->getOperand(I);
+      SDValue Src = GetShuffleSrc(Lane);
+      if (Src == ShuffleSrc1) {
+        Mask[I] = Lane->getConstantOperandVal(1) * Scale1;
+      } else if (Src && Src == ShuffleSrc2) {
+        Mask[I] = DestLaneCount + Lane->getConstantOperandVal(1) * Scale2;
+      } else {
+        Mask[I] = -1;
+      }
+    }
+    ArrayRef<int> MaskRef(Mask, DestLaneCount);
+    Result = DAG.getVectorShuffle(VecT, DL, Src1, Src2, MaskRef);
+    IsLaneConstructed = [&](size_t, const SDValue &Lane) {
+      auto Src = GetShuffleSrc(Lane);
+      return Src == ShuffleSrc1 || (Src && Src == ShuffleSrc2);
+    };
   } else if (NumConstantLanes >= NumSplatLanes) {
     SmallVector<SDValue, 16> ConstLanes;
     for (const SDValue &Lane : Op->op_values()) {
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
index c1060ea1101fe..7003714098f25 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -165,6 +165,22 @@ define <8 x i16> @swizzle_one_i16x8(<8 x i16> %src, <8 x i16> %mask) {
   ret <8 x i16> %v0
 }

+; CHECK-LABEL: half_shuffle_i32x4:
+; CHECK-NEXT:  .functype half_shuffle_i32x4 (v128) -> (v128)
+; CHECK:       i8x16.shuffle $push[[L0:[0-9]+]]=, $0, $0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 0, 0, 0
+; CHECK:       i32x4.replace_lane
+; CHECK:       i32x4.replace_lane
+; CHECK:       return
+define <4 x i32> @half_shuffle_i32x4(<4 x i32> %src) {
+  %s0 = extractelement <4 x i32> %src, i32 0
+  %s2 = extractelement <4 x i32> %src, i32 2
+  %v0 = insertelement <4 x i32> undef, i32 0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %s2, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 %s0, i32 2
+  %v3 = insertelement <4 x i32> %v2, i32 3, i32 3
+  ret <4 x i32> %v3
+}
+
 ; CHECK-LABEL: mashup_swizzle_i8x16:
 ; CHECK-NEXT:  .functype mashup_swizzle_i8x16 (v128, v128, i32) -> (v128)
 ; CHECK-NEXT:  i8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1
diff --git a/llvm/test/CodeGen/WebAssembly/simd-concat.ll b/llvm/test/CodeGen/WebAssembly/simd-concat.ll
new file mode 100644
index 0000000000000..21fe627f125c1
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-concat.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Check that all varieties of vector concatenations get lowered to shuffles.
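+;
+; Note on the expected masks: inputs with illegal types (e.g. <8 x i8>) are
+; legalized by promoting their elements to wider integer lanes, so on this
+; little-endian target the payload bytes land at even offsets. That is why
+; the i8x16.shuffle masks below select bytes 0, 2, 4, ... for <8 x i8>
+; inputs rather than consecutive bytes.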
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown--wasm"
+
+define <16 x i8> @concat_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: concat_v8i8:
+; CHECK:         .functype concat_v8i8 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %v
+}
+
+define <8 x i8> @concat_v4i8(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: concat_v4i8:
+; CHECK:         .functype concat_v4i8 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %v
+}
+
+define <8 x i16> @concat_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: concat_v4i16:
+; CHECK:         .functype concat_v4i16 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %v
+}
+
+define <4 x i8> @concat_v2i8(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: concat_v2i8:
+; CHECK:         .functype concat_v2i8 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i8> %v
+}
+
+define <4 x i16> @concat_v2i16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: concat_v2i16:
+; CHECK:         .functype concat_v2i16 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %v
+}
+
+define <4 x i32> @concat_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: concat_v2i32:
+; CHECK:         .functype concat_v2i32 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT:    # fallthrough-return
+  %v = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %v
+}

From e07c0f553c2333562173c0ffc57fa87d7d70e4d3 Mon Sep 17 00:00:00 2001
From: Thomas Lively
Date: Sun, 11 Apr 2021 11:13:16 -0700
Subject: [PATCH 2/7] [WebAssembly] Update v128.any_true

In the final SIMD spec, there is only a single v128.any_true instruction,
rather than one for each lane interpretation because the semantics do not
depend on the lane interpretation.
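For example, the v16i8 and v8i16 any_true reductions now lower to the same
instruction (an illustrative pair of IR snippets mirroring the updated
tests):

  %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x)
  %b = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %y)

Both calls select to v128.any_true, encoded as 0xfd 0x53.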
Differential Revision: https://reviews.llvm.org/D100241
---
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 36 +++++++++++++++----
 .../CodeGen/WebAssembly/simd-intrinsics.ll    |  8 ++---
 .../CodeGen/WebAssembly/simd-reductions.ll    | 24 ++++++-------
 .../test/MC/Disassembler/WebAssembly/wasm.txt |  3 +-
 llvm/test/MC/WebAssembly/simd-encodings.s     |  3 +-
 5 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index f8df4d35de92e..9bdeab5f4db96 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -822,22 +822,44 @@ defm ABS : SIMDUnaryInt<abs, "abs", 0x60>;
 // Integer negation: neg
 defm NEG : SIMDUnaryInt<ivneg, "neg", 0x61>;

+// Population count: popcnt
+defm POPCNT : SIMDUnary<I8x16, int_wasm_popcnt, "popcnt", 0x62>;
+
 // Any lane true: any_true
-defm ANYTRUE : SIMDReduce<int_wasm_anytrue, "any_true", 0x62>;
+defm ANYTRUE : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins), [],
+                      "v128.any_true\t$dst, $vec", "v128.any_true", 0x53>;
+
+foreach vec = IntVecs in
+def : Pat<(int_wasm_anytrue (vec.vt V128:$vec)), (ANYTRUE V128:$vec)>;

 // All lanes true: all_true
-defm ALLTRUE : SIMDReduce<int_wasm_alltrue, "all_true", 0x63>;
+multiclass SIMDAllTrue<Vec vec, bits<32> simdop> {
+  defm ALLTRUE_#vec : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
+                             [(set I32:$dst,
+                                   (i32 (int_wasm_alltrue (vec.vt V128:$vec))))],
+                             vec.prefix#".all_true\t$dst, $vec",
+                             vec.prefix#".all_true", simdop>;
+}

-// Population count: popcnt
-defm POPCNT : SIMDUnary<I8x16, int_wasm_popcnt, "popcnt", 0x62>;
+defm "" : SIMDAllTrue<I8x16, 0x63>;
+defm "" : SIMDAllTrue<I16x8, 0x83>;
+defm "" : SIMDAllTrue<I32x4, 0xa3>;
+defm "" : SIMDAllTrue<I64x2, 0xc3>;

 // Reductions already return 0 or 1, so and 1, setne 0, and seteq 1
 // can be folded out
 foreach reduction =
-  [["int_wasm_anytrue", "ANYTRUE"], ["int_wasm_alltrue", "ALLTRUE"]] in
-foreach vec = IntVecs in {
+  [["int_wasm_anytrue", "ANYTRUE", "I8x16"],
+   ["int_wasm_anytrue", "ANYTRUE", "I16x8"],
+   ["int_wasm_anytrue", "ANYTRUE", "I32x4"],
+   ["int_wasm_anytrue", "ANYTRUE", "I64x2"],
+   ["int_wasm_alltrue", "ALLTRUE_I8x16", "I8x16"],
+   ["int_wasm_alltrue", "ALLTRUE_I16x8", "I16x8"],
+   ["int_wasm_alltrue", "ALLTRUE_I32x4", "I32x4"],
+   ["int_wasm_alltrue", "ALLTRUE_I64x2", "I64x2"]] in {
 defvar intrinsic = !cast<Intrinsic>(reduction[0]);
-defvar inst = !cast<NI>(reduction[1]#"_"#vec);
+defvar inst = !cast<NI>(reduction[1]);
+defvar vec = !cast<Vec>(reduction[2]);
 def : Pat<(i32 (and (i32 (intrinsic (vec.vt V128:$x))), (i32 1))), (inst $x)>;
 def : Pat<(i32 (setne (i32 (intrinsic (vec.vt V128:$x))), (i32 0))), (inst $x)>;
 def : Pat<(i32 (seteq (i32 (intrinsic (vec.vt V128:$x))), (i32 1))), (inst $x)>;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index e78b167ed0ed6..5d98f2b563783 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -87,7 +87,7 @@ define <16 x i8> @popcnt_v16i8(<16 x i8> %x) {

 ; CHECK-LABEL: any_v16i8:
 ; CHECK-NEXT: .functype any_v16i8 (v128) -> (i32){{$}}
-; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v16i8(<16 x i8>)
 define i32 @any_v16i8(<16 x i8> %x) {
@@ -319,7 +319,7 @@ define <8 x i16> @extadd_pairwise_u_v8i16(<16 x i8> %x) {

 ; CHECK-LABEL: any_v8i16:
 ; CHECK-NEXT: .functype any_v8i16 (v128) -> (i32){{$}}
-; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32
@llvm.wasm.anytrue.v8i16(<8 x i16>) define i32 @any_v8i16(<8 x i16> %x) { @@ -468,7 +468,7 @@ define <4 x i32> @extadd_pairwise_u_v4i32(<8 x i16> %x) { ; CHECK-LABEL: any_v4i32: ; CHECK-NEXT: .functype any_v4i32 (v128) -> (i32){{$}} -; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} declare i32 @llvm.wasm.anytrue.v4i32(<4 x i32>) define i32 @any_v4i32(<4 x i32> %x) { @@ -643,7 +643,7 @@ define <2 x i64> @extmul_high_u_v2i64(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: any_v2i64: ; CHECK-NEXT: .functype any_v2i64 (v128) -> (i32){{$}} -; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} declare i32 @llvm.wasm.anytrue.v2i64(<2 x i64>) define i32 @any_v2i64(<2 x i64> %x) { diff --git a/llvm/test/CodeGen/WebAssembly/simd-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-reductions.ll index 259ef3b3a81fe..500a4495028f0 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-reductions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-reductions.ll @@ -14,7 +14,7 @@ declare i32 @llvm.wasm.alltrue.v16i8(<16 x i8>) ; CHECK-LABEL: any_v16i8_trunc: ; CHECK-NEXT: .functype any_v16i8_trunc (v128) -> (i32){{$}} -; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v16i8_trunc(<16 x i8> %x) { %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x) @@ -25,7 +25,7 @@ define i32 @any_v16i8_trunc(<16 x i8> %x) { ; CHECK-LABEL: any_v16i8_ne: ; CHECK-NEXT: .functype any_v16i8_ne (v128) -> (i32){{$}} -; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v16i8_ne(<16 x i8> %x) { %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x) @@ -36,7 +36,7 @@ define i32 @any_v16i8_ne(<16 x i8> %x) { ; CHECK-LABEL: any_v16i8_eq: ; CHECK-NEXT: .functype any_v16i8_eq (v128) -> (i32){{$}} -; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v16i8_eq(<16 x i8> %x) { %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x) @@ -86,7 +86,7 @@ declare i32 @llvm.wasm.alltrue.v8i16(<8 x i16>) ; CHECK-LABEL: any_v8i16_trunc: ; CHECK-NEXT: .functype any_v8i16_trunc (v128) -> (i32){{$}} -; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v8i16_trunc(<8 x i16> %x) { %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x) @@ -97,7 +97,7 @@ define i32 @any_v8i16_trunc(<8 x i16> %x) { ; CHECK-LABEL: any_v8i16_ne: ; CHECK-NEXT: .functype any_v8i16_ne (v128) -> (i32){{$}} -; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v8i16_ne(<8 x i16> %x) { %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x) @@ -108,7 +108,7 @@ define i32 @any_v8i16_ne(<8 x i16> %x) { ; CHECK-LABEL: any_v8i16_eq: ; CHECK-NEXT: .functype any_v8i16_eq (v128) -> (i32){{$}} -; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v8i16_eq(<8 x i16> %x) { %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x) @@ -158,7 +158,7 @@ declare 
i32 @llvm.wasm.alltrue.v4i32(<4 x i32>) ; CHECK-LABEL: any_v4i32_trunc: ; CHECK-NEXT: .functype any_v4i32_trunc (v128) -> (i32){{$}} -; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v4i32_trunc(<4 x i32> %x) { %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x) @@ -169,7 +169,7 @@ define i32 @any_v4i32_trunc(<4 x i32> %x) { ; CHECK-LABEL: any_v4i32_ne: ; CHECK-NEXT: .functype any_v4i32_ne (v128) -> (i32){{$}} -; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v4i32_ne(<4 x i32> %x) { %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x) @@ -180,7 +180,7 @@ define i32 @any_v4i32_ne(<4 x i32> %x) { ; CHECK-LABEL: any_v4i32_eq: ; CHECK-NEXT: .functype any_v4i32_eq (v128) -> (i32){{$}} -; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v4i32_eq(<4 x i32> %x) { %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x) @@ -230,7 +230,7 @@ declare i32 @llvm.wasm.alltrue.v2i64(<2 x i64>) ; CHECK-LABEL: any_v2i64_trunc: ; CHECK-NEXT: .functype any_v2i64_trunc (v128) -> (i32){{$}} -; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v2i64_trunc(<2 x i64> %x) { %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x) @@ -241,7 +241,7 @@ define i32 @any_v2i64_trunc(<2 x i64> %x) { ; CHECK-LABEL: any_v2i64_ne: ; CHECK-NEXT: .functype any_v2i64_ne (v128) -> (i32){{$}} -; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v2i64_ne(<2 x i64> %x) { %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x) @@ -252,7 +252,7 @@ define i32 @any_v2i64_ne(<2 x i64> %x) { ; CHECK-LABEL: any_v2i64_eq: ; CHECK-NEXT: .functype any_v2i64_eq (v128) -> (i32){{$}} -; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: v128.any_true $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v2i64_eq(<2 x i64> %x) { %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x) diff --git a/llvm/test/MC/Disassembler/WebAssembly/wasm.txt b/llvm/test/MC/Disassembler/WebAssembly/wasm.txt index 783d59416f9de..0cbf584d9688e 100644 --- a/llvm/test/MC/Disassembler/WebAssembly/wasm.txt +++ b/llvm/test/MC/Disassembler/WebAssembly/wasm.txt @@ -43,8 +43,7 @@ 0xFD 0x83 0x01 # Including non-canonical LEB128 encodings -# CHECK: i16x8.any_true -# CHECK-NOT: i16x8.neg +# CHECK: i16x8.q15mulr_sat_s 0xFD 0x82 0x81 0x80 0x80 0x80 0x80 0x00 # Check br_table, which has its own operand type. 
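# (The i16x8.q15mulr_sat_s check above decodes a non-canonical, over-long
# LEB128 encoding of the opcode: 0x82 0x81 0x80 0x80 0x80 0x80 0x00
# encodes 2 + (1 << 7) = 130 = 0x82, which is i16x8.q15mulr_sat_s in the
# final spec rather than the removed i16x8.any_true.)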
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 2ce4eb622906d..c1047add02b48 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -280,7 +280,8 @@ main: # CHECK: v128.bitselect # encoding: [0xfd,0x52] v128.bitselect - # TODO: v128.any_true # encoding: [0xfd,0x53] + # CHECK: v128.any_true # encoding: [0xfd,0x53] + v128.any_true # CHECK: v128.load8_lane 32, 1 # encoding: [0xfd,0x54,0x00,0x20,0x01] v128.load8_lane 32, 1 From 9890f5e00119278aa8a91a2bc46b98ee4cc1f4cc Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Tue, 13 Apr 2021 00:06:25 -0700 Subject: [PATCH 3/7] [WebAssembly] Test i64x2.abs encoding This test was disabled despite the instruction having been implemented for a long time. This commit just enables the test. Differential Revision: https://reviews.llvm.org/D100345 --- llvm/test/MC/WebAssembly/simd-encodings.s | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index c1047add02b48..3093fb2963444 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -597,7 +597,8 @@ main: # CHECK: i32x4.extmul_high_i16x8_u # encoding: [0xfd,0xbf,0x01] i32x4.extmul_high_i16x8_u - # TODO: i64x2.abs # encoding: [0xfd,0xc0,0x01] + # CHECK: i64x2.abs # encoding: [0xfd,0xc0,0x01] + i64x2.abs # CHECK: i64x2.neg # encoding: [0xfd,0xc1,0x01] i64x2.neg From 78fe0296bb1b07e54549043c4851c759990e90a8 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 14 Apr 2021 09:19:26 -0700 Subject: [PATCH 4/7] [WebAssembly] Use standard intrinsics for f32x4 and f64x2 ops Now that these instructions are no longer prototypes, we do not need to be careful about keeping them opt-in and can use the standard LLVM infrastructure for them. This commit removes the bespoke intrinsics we were using to represent these operations in favor of the corresponding target-independent intrinsics. The clang builtins are preserved because there is no standard way to easily represent these operations in C/C++. For consistency with the scalar codegen in the Wasm backend, the intrinsic used to represent {f32x4,f64x2}.nearest is @llvm.nearbyint even though @llvm.roundeven better captures the semantics of the underlying Wasm instruction. Replacing our use of @llvm.nearbyint with use of @llvm.roundeven is left to a potential future patch. 
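As a rough illustration of why @llvm.nearbyint is an acceptable stand-in:
in the default floating-point environment both it and @llvm.roundeven round
halfway cases to even, matching the Wasm instruction, e.g.

  f32x4.nearest([-0.5, 0.5, 1.5, 2.5]) == [-0.0, 0.0, 2.0, 2.0]

They differ in that nearbyint follows the dynamic rounding mode, while
roundeven (and the Wasm instruction) always rounds ties to even.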
Differential Revision: https://reviews.llvm.org/D100411 --- clang/lib/CodeGen/CGBuiltin.cpp | 8 +-- clang/test/CodeGen/builtins-wasm.c | 16 ++--- llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 20 ------ .../WebAssembly/WebAssemblyISelLowering.cpp | 3 +- .../WebAssembly/WebAssemblyInstrSIMD.td | 16 ++--- .../CodeGen/WebAssembly/simd-intrinsics.ll | 32 +++++----- .../CodeGen/WebAssembly/simd-unsupported.ll | 64 ------------------- 7 files changed, 37 insertions(+), 122 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1fd79306596e6..ea6d039fd2038 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16819,19 +16819,19 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_ceil_f64x2: - IntNo = Intrinsic::wasm_ceil; + IntNo = Intrinsic::ceil; break; case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_floor_f64x2: - IntNo = Intrinsic::wasm_floor; + IntNo = Intrinsic::floor; break; case WebAssembly::BI__builtin_wasm_trunc_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f64x2: - IntNo = Intrinsic::wasm_trunc; + IntNo = Intrinsic::trunc; break; case WebAssembly::BI__builtin_wasm_nearest_f32x4: case WebAssembly::BI__builtin_wasm_nearest_f64x2: - IntNo = Intrinsic::wasm_nearest; + IntNo = Intrinsic::nearbyint; break; default: llvm_unreachable("unexpected builtin ID"); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index c27be6d909c08..7b7965c026e1a 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -792,49 +792,49 @@ f64x2 pmax_f64x2(f64x2 x, f64x2 y) { f32x4 ceil_f32x4(f32x4 x) { return __builtin_wasm_ceil_f32x4(x); - // WEBASSEMBLY: call <4 x float> @llvm.wasm.ceil.v4f32(<4 x float> %x) + // WEBASSEMBLY: call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) // WEBASSEMBLY: ret } f32x4 floor_f32x4(f32x4 x) { return __builtin_wasm_floor_f32x4(x); - // WEBASSEMBLY: call <4 x float> @llvm.wasm.floor.v4f32(<4 x float> %x) + // WEBASSEMBLY: call <4 x float> @llvm.floor.v4f32(<4 x float> %x) // WEBASSEMBLY: ret } f32x4 trunc_f32x4(f32x4 x) { return __builtin_wasm_trunc_f32x4(x); - // WEBASSEMBLY: call <4 x float> @llvm.wasm.trunc.v4f32(<4 x float> %x) + // WEBASSEMBLY: call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) // WEBASSEMBLY: ret } f32x4 nearest_f32x4(f32x4 x) { return __builtin_wasm_nearest_f32x4(x); - // WEBASSEMBLY: call <4 x float> @llvm.wasm.nearest.v4f32(<4 x float> %x) + // WEBASSEMBLY: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) // WEBASSEMBLY: ret } f64x2 ceil_f64x2(f64x2 x) { return __builtin_wasm_ceil_f64x2(x); - // WEBASSEMBLY: call <2 x double> @llvm.wasm.ceil.v2f64(<2 x double> %x) + // WEBASSEMBLY: call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) // WEBASSEMBLY: ret } f64x2 floor_f64x2(f64x2 x) { return __builtin_wasm_floor_f64x2(x); - // WEBASSEMBLY: call <2 x double> @llvm.wasm.floor.v2f64(<2 x double> %x) + // WEBASSEMBLY: call <2 x double> @llvm.floor.v2f64(<2 x double> %x) // WEBASSEMBLY: ret } f64x2 trunc_f64x2(f64x2 x) { return __builtin_wasm_trunc_f64x2(x); - // WEBASSEMBLY: call <2 x double> @llvm.wasm.trunc.v2f64(<2 x double> %x) + // WEBASSEMBLY: call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) // WEBASSEMBLY: ret } f64x2 nearest_f64x2(f64x2 x) { return __builtin_wasm_nearest_f64x2(x); - // WEBASSEMBLY: call <2 x double> 
@llvm.wasm.nearest.v2f64(<2 x double> %x)
+  // WEBASSEMBLY: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %x)
   // WEBASSEMBLY: ret
 }

diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index a57080d1d95b4..f4bdd07b81082 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -183,26 +183,6 @@ def int_wasm_pmax :
             [LLVMMatchType<0>, LLVMMatchType<0>],
             [IntrNoMem, IntrSpeculatable]>;

-// TODO: Replace these instrinsics with normal ISel patterns once the
-// rounding instructions are merged to the proposal
-// (https://github.com/WebAssembly/simd/pull/232).
-def int_wasm_ceil :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_floor :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_trunc :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_nearest :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
-
 // TODO: Replace these intrinsic with normal ISel patterns once the
 // load_zero instructions are merged to the proposal.
 def int_wasm_load32_zero :
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 322020638d9fe..bce8f8ef2105d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -180,8 +180,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       setOperationAction(Op, T, Legal);

     // Expand float operations supported for scalars but not SIMD
-    for (auto Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT,
-                    ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
+    for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
                     ISD::FEXP, ISD::FEXP2, ISD::FRINT})
       for (auto T : {MVT::v4f32, MVT::v2f64})
         setOperationAction(Op, T, Expand);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 9bdeab5f4db96..1588f6aac8610 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1044,14 +1044,14 @@ defm NEG : SIMDUnaryFP<fneg, "neg", 0xe1>;
 defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 0xe3>;

 // Rounding: ceil, floor, trunc, nearest
-defm CEIL : SIMDUnary<F32x4, int_wasm_ceil, "ceil", 0x67>;
-defm FLOOR : SIMDUnary<F32x4, int_wasm_floor, "floor", 0x68>;
-defm TRUNC: SIMDUnary<F32x4, int_wasm_trunc, "trunc", 0x69>;
-defm NEAREST: SIMDUnary<F32x4, int_wasm_nearest, "nearest", 0x6a>;
-defm CEIL : SIMDUnary<F64x2, int_wasm_ceil, "ceil", 0x74>;
-defm FLOOR : SIMDUnary<F64x2, int_wasm_floor, "floor", 0x75>;
-defm TRUNC: SIMDUnary<F64x2, int_wasm_trunc, "trunc", 0x7a>;
-defm NEAREST: SIMDUnary<F64x2, int_wasm_nearest, "nearest", 0x94>;
+defm CEIL : SIMDUnary<F32x4, fceil, "ceil", 0x67>;
+defm FLOOR : SIMDUnary<F32x4, ffloor, "floor", 0x68>;
+defm TRUNC: SIMDUnary<F32x4, ftrunc, "trunc", 0x69>;
+defm NEAREST: SIMDUnary<F32x4, fnearbyint, "nearest", 0x6a>;
+defm CEIL : SIMDUnary<F64x2, fceil, "ceil", 0x74>;
+defm FLOOR : SIMDUnary<F64x2, ffloor, "floor", 0x75>;
+defm TRUNC: SIMDUnary<F64x2, ftrunc, "trunc", 0x7a>;
+defm NEAREST: SIMDUnary<F64x2, fnearbyint, "nearest", 0x94>;

 //===----------------------------------------------------------------------===//
 // Floating-point binary arithmetic
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index 5d98f2b563783..f28eb0b242a52 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -722,9 +722,9 @@ define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}}
 ; CHECK-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}}
 ; CHECK-NEXT: return $pop[[R]]{{$}}
-declare <4 x float> @llvm.wasm.ceil.v4f32(<4 x float>)
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
define <4 x float> @ceil_v4f32(<4 x float> %a) { - %v = call <4 x float> @llvm.wasm.ceil.v4f32(<4 x float> %a) + %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) ret <4 x float> %v } @@ -732,9 +732,9 @@ define <4 x float> @ceil_v4f32(<4 x float> %a) { ; CHECK-NEXT: .functype floor_v4f32 (v128) -> (v128){{$}} ; CHECK-NEXT: f32x4.floor $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x float> @llvm.wasm.floor.v4f32(<4 x float>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) define <4 x float> @floor_v4f32(<4 x float> %a) { - %v = call <4 x float> @llvm.wasm.floor.v4f32(<4 x float> %a) + %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %a) ret <4 x float> %v } @@ -742,9 +742,9 @@ define <4 x float> @floor_v4f32(<4 x float> %a) { ; CHECK-NEXT: .functype trunc_v4f32 (v128) -> (v128){{$}} ; CHECK-NEXT: f32x4.trunc $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x float> @llvm.wasm.trunc.v4f32(<4 x float>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) define <4 x float> @trunc_v4f32(<4 x float> %a) { - %v = call <4 x float> @llvm.wasm.trunc.v4f32(<4 x float> %a) + %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) ret <4 x float> %v } @@ -752,9 +752,9 @@ define <4 x float> @trunc_v4f32(<4 x float> %a) { ; CHECK-NEXT: .functype nearest_v4f32 (v128) -> (v128){{$}} ; CHECK-NEXT: f32x4.nearest $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x float> @llvm.wasm.nearest.v4f32(<4 x float>) +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) define <4 x float> @nearest_v4f32(<4 x float> %a) { - %v = call <4 x float> @llvm.wasm.nearest.v4f32(<4 x float> %a) + %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) ret <4 x float> %v } @@ -807,9 +807,9 @@ define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.ceil.v2f64(<2 x double>) +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) define <2 x double> @ceil_v2f64(<2 x double> %a) { - %v = call <2 x double> @llvm.wasm.ceil.v2f64(<2 x double> %a) + %v = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) ret <2 x double> %v } @@ -817,9 +817,9 @@ define <2 x double> @ceil_v2f64(<2 x double> %a) { ; CHECK-NEXT: .functype floor_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.floor $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.floor.v2f64(<2 x double>) +declare <2 x double> @llvm.floor.v2f64(<2 x double>) define <2 x double> @floor_v2f64(<2 x double> %a) { - %v = call <2 x double> @llvm.wasm.floor.v2f64(<2 x double> %a) + %v = call <2 x double> @llvm.floor.v2f64(<2 x double> %a) ret <2 x double> %v } @@ -827,9 +827,9 @@ define <2 x double> @floor_v2f64(<2 x double> %a) { ; CHECK-NEXT: .functype trunc_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.trunc $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.trunc.v2f64(<2 x double>) +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) define <2 x double> @trunc_v2f64(<2 x double> %a) { - %v = call <2 x double> @llvm.wasm.trunc.v2f64(<2 x double> %a) + %v = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) ret <2 x double> %v } @@ -837,9 +837,9 @@ define <2 x double> @trunc_v2f64(<2 x double> %a) { ; CHECK-NEXT: .functype nearest_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.nearest $push[[R:[0-9]+]]=, $0{{$}} 
; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.nearest.v2f64(<2 x double>) +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) define <2 x double> @nearest_v2f64(<2 x double> %a) { - %v = call <2 x double> @llvm.wasm.nearest.v2f64(<2 x double> %a) + %v = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) ret <2 x double> %v } diff --git a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll index 1fc0a92b9032d..9332f51c7a911 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll @@ -366,38 +366,6 @@ define <2 x i64> @rotr_v2i64(<2 x i64> %x, <2 x i64> %y) { ; 4 x f32 ; ============================================================================== -; CHECK-LABEL: ceil_v4f32: -; CHECK: f32.ceil -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) -define <4 x float> @ceil_v4f32(<4 x float> %x) { - %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) - ret <4 x float> %v -} - -; CHECK-LABEL: floor_v4f32: -; CHECK: f32.floor -declare <4 x float> @llvm.floor.v4f32(<4 x float>) -define <4 x float> @floor_v4f32(<4 x float> %x) { - %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x) - ret <4 x float> %v -} - -; CHECK-LABEL: trunc_v4f32: -; CHECK: f32.trunc -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) -define <4 x float> @trunc_v4f32(<4 x float> %x) { - %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) - ret <4 x float> %v -} - -; CHECK-LABEL: nearbyint_v4f32: -; CHECK: f32.nearest -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) -define <4 x float> @nearbyint_v4f32(<4 x float> %x) { - %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) - ret <4 x float> %v -} - ; CHECK-LABEL: copysign_v4f32: ; CHECK: f32.copysign declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) @@ -498,38 +466,6 @@ define <4 x float> @round_v4f32(<4 x float> %x) { ; 2 x f64 ; ============================================================================== -; CHECK-LABEL: ceil_v2f64: -; CHECK: f64.ceil -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) -define <2 x double> @ceil_v2f64(<2 x double> %x) { - %v = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) - ret <2 x double> %v -} - -; CHECK-LABEL: floor_v2f64: -; CHECK: f64.floor -declare <2 x double> @llvm.floor.v2f64(<2 x double>) -define <2 x double> @floor_v2f64(<2 x double> %x) { - %v = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) - ret <2 x double> %v -} - -; CHECK-LABEL: trunc_v2f64: -; CHECK: f64.trunc -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) -define <2 x double> @trunc_v2f64(<2 x double> %x) { - %v = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) - ret <2 x double> %v -} - -; CHECK-LABEL: nearbyint_v2f64: -; CHECK: f64.nearest -declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) -define <2 x double> @nearbyint_v2f64(<2 x double> %x) { - %v = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %x) - ret <2 x double> %v -} - ; CHECK-LABEL: copysign_v2f64: ; CHECK: f64.copysign declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) From fd7d65ac3c8ff426d4b88ecdcdbf821a929ae006 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 14 Apr 2021 10:42:44 -0700 Subject: [PATCH 5/7] [WebAssembly] Codegen for f64x2.convert_low_i32x4_{s,u} Add a custom DAG combine and ISD opcode for detecting patterns like (uint_to_fp (extract_subvector ...)) before the extract_subvector is expanded to ensure that they will ultimately lower to 
f64x2.convert_low_i32x4_{s,u} instructions. Since these instructions are no longer prototypes and can now be produced via standard IR, this commit also removes the target intrinsics and builtins that had been used to prototype the instructions. Differential Revision: https://reviews.llvm.org/D100425 --- .../clang/Basic/BuiltinsWebAssembly.def | 2 - clang/lib/CodeGen/CGBuiltin.cpp | 15 ------- clang/test/CodeGen/builtins-wasm.c | 12 ------ llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 6 --- .../lib/Target/WebAssembly/WebAssemblyISD.def | 2 + .../WebAssembly/WebAssemblyISelLowering.cpp | 41 +++++++++++++++++++ .../WebAssembly/WebAssemblyInstrSIMD.td | 17 ++++---- .../CodeGen/WebAssembly/simd-conversions.ll | 22 ++++++++++ .../CodeGen/WebAssembly/simd-intrinsics.ll | 20 --------- 9 files changed, 74 insertions(+), 63 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index 3f8b050aabfd1..db8ec8ebeb302 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -196,8 +196,6 @@ TARGET_BUILTIN(__builtin_wasm_extend_high_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd TARGET_BUILTIN(__builtin_wasm_extend_low_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_extend_high_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_convert_low_s_i32x4_f64x2, "V2dV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_convert_low_u_i32x4_f64x2, "V2dV4Ui", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_demote_zero_f64x2_f32x4, "V4fV2d", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ea6d039fd2038..0dd0dc07d7693 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17181,21 +17181,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(IntNo); return Builder.CreateCall(Callee, Vec); } - case WebAssembly::BI__builtin_wasm_convert_low_s_i32x4_f64x2: - case WebAssembly::BI__builtin_wasm_convert_low_u_i32x4_f64x2: { - Value *Vec = EmitScalarExpr(E->getArg(0)); - unsigned IntNo; - switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_convert_low_s_i32x4_f64x2: - IntNo = Intrinsic::wasm_convert_low_signed; - break; - case WebAssembly::BI__builtin_wasm_convert_low_u_i32x4_f64x2: - IntNo = Intrinsic::wasm_convert_low_unsigned; - break; - } - Function *Callee = CGM.getIntrinsic(IntNo); - return Builder.CreateCall(Callee, Vec); - } case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4: case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4: { Value *Vec = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 7b7965c026e1a..a5c6f4423c3b4 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -914,18 +914,6 @@ u64x2 extend_high_u_i32x4_i64x2(u32x4 x) { // WEBASSEMBLY: ret } -f64x2 convert_low_s_i32x4_f64x2(i32x4 x) { - return __builtin_wasm_convert_low_s_i32x4_f64x2(x); - // WEBASSEMBLY: call <2 x double> @llvm.wasm.convert.low.signed(<4 x i32> %x) - // WEBASSEMBLY: ret -} - -f64x2 convert_low_u_i32x4_f64x2(u32x4 x) { - return __builtin_wasm_convert_low_u_i32x4_f64x2(x); - // WEBASSEMBLY: call <2 x double> 
@llvm.wasm.convert.low.unsigned(<4 x i32> %x)
-  // WEBASSEMBLY: ret
-}
-
 i32x4 trunc_sat_zero_s_f64x2_i32x4(f64x2 x) {
   return __builtin_wasm_trunc_sat_zero_s_f64x2_i32x4(x);
   // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double> %x)
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index f4bdd07b81082..977647db92adf 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -275,12 +275,6 @@ def int_wasm_extadd_pairwise_unsigned :
             [IntrNoMem, IntrSpeculatable]>;

 // TODO: Remove these if possible if they are merged to the spec.
-def int_wasm_convert_low_signed :
-  Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty],
-            [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_convert_low_unsigned :
-  Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty],
-            [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_trunc_sat_zero_signed :
   Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty],
             [IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index 3a82dd45a5f65..c73ce43057f85 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -33,6 +33,8 @@ HANDLE_NODETYPE(EXTEND_LOW_S)
 HANDLE_NODETYPE(EXTEND_LOW_U)
 HANDLE_NODETYPE(EXTEND_HIGH_S)
 HANDLE_NODETYPE(EXTEND_HIGH_U)
+HANDLE_NODETYPE(CONVERT_LOW_S)
+HANDLE_NODETYPE(CONVERT_LOW_U)
 HANDLE_NODETYPE(THROW)
 HANDLE_NODETYPE(CATCH)
 HANDLE_NODETYPE(MEMORY_COPY)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index bce8f8ef2105d..5fa5cf22a090f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -130,6 +130,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);

+    // Combine {s,u}int_to_fp of extract_vectors into conversion ops
+    setTargetDAGCombine(ISD::SINT_TO_FP);
+    setTargetDAGCombine(ISD::UINT_TO_FP);
+
     // Support saturating add for i8x16 and i16x8
     for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
       for (auto T : {MVT::v16i8, MVT::v8i16})
@@ -2016,6 +2020,40 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   return DAG.getNode(Op, SDLoc(N), ResVT, Source);
 }

+static SDValue
+performVectorConvertLowCombine(SDNode *N,
+                               TargetLowering::DAGCombinerInfo &DCI) {
+  auto &DAG = DCI.DAG;
+  assert(N->getOpcode() == ISD::SINT_TO_FP ||
+         N->getOpcode() == ISD::UINT_TO_FP);
+
+  // Combine ({s,u}int_to_fp (extract_subvector ... 0)) to an
+  // f64x2.convert_low_i32x4_{s,u} SDNode.
+  auto Extract = N->getOperand(0);
+  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+  auto Source = Extract.getOperand(0);
+  auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+  if (IndexNode == nullptr)
+    return SDValue();
+  auto Index = IndexNode->getZExtValue();
+
+  // The types must be correct.
+  EVT ResVT = N->getValueType(0);
+  if (ResVT != MVT::v2f64 || Extract.getValueType() != MVT::v2i32)
+    return SDValue();
+
+  // The extracted vector must be the low half.
+  if (Index != 0)
+    return SDValue();
+
+  unsigned Op = N->getOpcode() == ISD::SINT_TO_FP
+                    ? WebAssemblyISD::CONVERT_LOW_S
+                    : WebAssemblyISD::CONVERT_LOW_U;
+
+  return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+}
+
 SDValue
 WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
@@ -2027,5 +2065,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
     return performVectorExtendCombine(N, DCI);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return performVectorConvertLowCombine(N, DCI);
   }
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 1588f6aac8610..cfbb3ffec5393 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1104,16 +1104,21 @@ multiclass SIMDConvert<Vec vec, Vec arg, SDNode op, string name,
 defm "" : SIMDConvert<I32x4, F32x4, fp_to_sint, "trunc_sat_f32x4_s", 0xf8>;
 defm "" : SIMDConvert<I32x4, F32x4, fp_to_uint, "trunc_sat_f32x4_u", 0xf9>;

-// Integer to floating point: convert
-defm "" : SIMDConvert<F32x4, I32x4, sint_to_fp, "convert_i32x4_s", 0xfa>;
-defm "" : SIMDConvert<F32x4, I32x4, uint_to_fp, "convert_i32x4_u", 0xfb>;
-
 // Lower llvm.wasm.trunc.sat.* to saturating instructions
 def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
           (fp_to_sint_I32x4 $src)>;
 def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
           (fp_to_uint_I32x4 $src)>;

+// Integer to floating point: convert
+def convert_low_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def convert_low_s : SDNode<"WebAssemblyISD::CONVERT_LOW_S", convert_low_t>;
+def convert_low_u : SDNode<"WebAssemblyISD::CONVERT_LOW_U", convert_low_t>;
+defm "" : SIMDConvert<F32x4, I32x4, sint_to_fp, "convert_i32x4_s", 0xfa>;
+defm "" : SIMDConvert<F32x4, I32x4, uint_to_fp, "convert_i32x4_u", 0xfb>;
+defm "" : SIMDConvert<F64x2, I32x4, convert_low_s, "convert_low_i32x4_s", 0xfe>;
+defm "" : SIMDConvert<F64x2, I32x4, convert_low_u, "convert_low_i32x4_u", 0xff>;
+
 // Extending operations
 def extend_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 def extend_low_s : SDNode<"WebAssemblyISD::EXTEND_LOW_S", extend_t>;
@@ -1268,10 +1273,6 @@ defm "" : SIMDConvert<I32x4, F64x2, int_wasm_trunc_sat_zero_signed,
 defm "" : SIMDConvert<I32x4, F64x2, int_wasm_trunc_sat_zero_unsigned,
                       "trunc_sat_zero_f64x2_u", 0xfd>;

-defm "" : SIMDConvert<F64x2, I32x4, int_wasm_convert_low_signed,
-                      "convert_low_i32x4_s", 0xfe>;
-defm "" : SIMDConvert<F64x2, I32x4, int_wasm_convert_low_unsigned,
-                      "convert_low_i32x4_u", 0xff>;

 //===----------------------------------------------------------------------===//
 // Saturating Rounding Q-Format Multiplication
diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
index 36856336e65e5..431d559220409 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -81,3 +81,25 @@ define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) {
   %a = fptoui <2 x double> %x to <2 x i64>
   ret <2 x i64> %a
 }
+
+; CHECK-LABEL: convert_low_s_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .functype convert_low_s_v2f64 (v128) -> (v128){{$}}
+; SIMD128-NEXT: f64x2.convert_low_i32x4_s $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+define <2 x double> @convert_low_s_v2f64(<4 x i32> %x) {
+  %v = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %a = sitofp <2 x i32> %v to <2 x double>
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: convert_low_u_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .functype convert_low_u_v2f64 (v128) -> (v128){{$}}
+; SIMD128-NEXT: f64x2.convert_low_i32x4_u $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+define <2 x double> @convert_low_u_v2f64(<4 x i32> %x) {
+  %v = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %a = uitofp <2 x i32> %v to <2 x double>
+  ret <2 x double> %a
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index f28eb0b242a52..5df5ae9a21bde 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -843,26 +843,6 @@ define <2 x double> @nearest_v2f64(<2 x double> %a) {
   ret <2 x double> %v
 }

-;
CHECK-LABEL: convert_low_signed_v2f64: -; CHECK-NEXT: .functype convert_low_signed_v2f64 (v128) -> (v128){{$}} -; CHECK-NEXT: f64x2.convert_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.convert.low.signed(<4 x i32>) -define <2 x double> @convert_low_signed_v2f64(<4 x i32> %a) { - %v = call <2 x double> @llvm.wasm.convert.low.signed(<4 x i32> %a) - ret <2 x double> %v -} - -; CHECK-LABEL: convert_low_unsigned_v2f64: -; CHECK-NEXT: .functype convert_low_unsigned_v2f64 (v128) -> (v128){{$}} -; CHECK-NEXT: f64x2.convert_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32>) -define <2 x double> @convert_low_unsigned_v2f64(<4 x i32> %a) { - %v = call <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32> %a) - ret <2 x double> %v -} - ; CHECK-LABEL: promote_low_v2f64: ; CHECK-NEXT: .functype promote_low_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.promote_low_f32x4 $push[[R:[0-9]+]]=, $0{{$}} From 6ffbea3802fe554dfeb3d253b932c1a52f8b6fcd Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 14 Apr 2021 13:43:09 -0700 Subject: [PATCH 6/7] [WebAssembly] Codegen for i64x2.extend_{low,high}_i32x4_{s,u} Removes the builtins and intrinsics used to opt in to using these instructions and replaces them with normal ISel patterns now that they are no longer prototypes. Differential Revision: https://reviews.llvm.org/D100402 --- .../clang/Basic/BuiltinsWebAssembly.def | 5 -- clang/lib/CodeGen/CGBuiltin.cpp | 23 -------- clang/test/CodeGen/builtins-wasm.c | 24 --------- llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 11 ---- .../WebAssembly/WebAssemblyISelLowering.cpp | 8 ++- .../WebAssembly/WebAssemblyInstrSIMD.td | 14 ++--- .../CodeGen/WebAssembly/simd-extending.ll | 52 +++++++++++++++++++ .../CodeGen/WebAssembly/simd-intrinsics.ll | 40 -------------- 8 files changed, 61 insertions(+), 116 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index db8ec8ebeb302..bc0c37a11207f 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -191,11 +191,6 @@ TARGET_BUILTIN(__builtin_wasm_narrow_u_i8x16_i16x8, "V16UcV8UsV8Us", "nc", "simd TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8UsV4UiV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_extend_low_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_extend_high_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_extend_low_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_extend_high_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") - TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_demote_zero_f64x2_f32x4, "V4fV2d", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 0dd0dc07d7693..9322f04250fcd 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17158,29 +17158,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()}); return Builder.CreateCall(Callee, {Low, High}); } - case 
WebAssembly::BI__builtin_wasm_extend_low_s_i32x4_i64x2: - case WebAssembly::BI__builtin_wasm_extend_high_s_i32x4_i64x2: - case WebAssembly::BI__builtin_wasm_extend_low_u_i32x4_i64x2: - case WebAssembly::BI__builtin_wasm_extend_high_u_i32x4_i64x2: { - Value *Vec = EmitScalarExpr(E->getArg(0)); - unsigned IntNo; - switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_extend_low_s_i32x4_i64x2: - IntNo = Intrinsic::wasm_extend_low_signed; - break; - case WebAssembly::BI__builtin_wasm_extend_high_s_i32x4_i64x2: - IntNo = Intrinsic::wasm_extend_high_signed; - break; - case WebAssembly::BI__builtin_wasm_extend_low_u_i32x4_i64x2: - IntNo = Intrinsic::wasm_extend_low_unsigned; - break; - case WebAssembly::BI__builtin_wasm_extend_high_u_i32x4_i64x2: - IntNo = Intrinsic::wasm_extend_high_unsigned; - break; - } - Function *Callee = CGM.getIntrinsic(IntNo); - return Builder.CreateCall(Callee, Vec); - } case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4: case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4: { Value *Vec = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index a5c6f4423c3b4..1a986f03dc498 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -890,30 +890,6 @@ u16x8 narrow_u_i16x8_i32x4(u32x4 low, u32x4 high) { // WEBASSEMBLY: ret } -i64x2 extend_low_s_i32x4_i64x2(i32x4 x) { - return __builtin_wasm_extend_low_s_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.low.signed(<4 x i32> %x) - // WEBASSEMBLY: ret -} - -i64x2 extend_high_s_i32x4_i64x2(i32x4 x) { - return __builtin_wasm_extend_high_s_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.high.signed(<4 x i32> %x) - // WEBASSEMBLY: ret -} - -u64x2 extend_low_u_i32x4_i64x2(u32x4 x) { - return __builtin_wasm_extend_low_u_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.low.unsigned(<4 x i32> %x) - // WEBASSEMBLY: ret -} - -u64x2 extend_high_u_i32x4_i64x2(u32x4 x) { - return __builtin_wasm_extend_high_u_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.high.unsigned(<4 x i32> %x) - // WEBASSEMBLY: ret -} - i32x4 trunc_sat_zero_s_f64x2_i32x4(f64x2 x) { return __builtin_wasm_trunc_sat_zero_s_f64x2_i32x4(x); // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double> %x) diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 977647db92adf..4e2d557f1f083 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -157,17 +157,6 @@ def int_wasm_narrow_unsigned : [llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem, IntrSpeculatable]>; -// TODO: Replace these intrinsics with normal ISel patterns once i32x4 to i64x2 -// extending is merged to the proposal. 
-def int_wasm_extend_low_signed : - Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_extend_high_signed : - Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_extend_low_unsigned : - Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_extend_high_unsigned : - Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; - def int_wasm_q15mulr_sat_signed : Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 5fa5cf22a090f..a9cbe75271226 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1994,8 +1994,8 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return SDValue(); auto Index = IndexNode->getZExtValue(); - // Only v8i8 and v4i16 extracts can be widened, and only if the extracted - // subvector is the low or high half of its source. + // Only v8i8, v4i16, and v2i32 extracts can be widened, and only if the + // extracted subvector is the low or high half of its source. EVT ResVT = N->getValueType(0); if (ResVT == MVT::v8i16) { if (Extract.getValueType() != MVT::v8i8 || @@ -2005,6 +2005,10 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (Extract.getValueType() != MVT::v4i16 || Source.getValueType() != MVT::v8i16 || (Index != 0 && Index != 4)) return SDValue(); + } else if (ResVT == MVT::v2i64) { + if (Extract.getValueType() != MVT::v2i32 || + Source.getValueType() != MVT::v4i32 || (Index != 0 && Index != 2)) + return SDValue(); } else { return SDValue(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index cfbb3ffec5393..7cf3cb1854fb1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1138,17 +1138,9 @@ multiclass SIMDExtend baseInst> { "extend_high_"#vec.split.prefix#"_u", !add(baseInst, 3)>; } -defm "" : SIMDExtend; -defm "" : SIMDExtend; - -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; +defm "" : SIMDExtend; +defm "" : SIMDExtend; +defm "" : SIMDExtend; // Narrowing operations multiclass SIMDNarrow baseInst> { diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending.ll b/llvm/test/CodeGen/WebAssembly/simd-extending.ll index 3f512cd2678e0..9ecee61424e40 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-extending.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending.ll @@ -110,6 +110,58 @@ define <4 x i32> @extend_high_i16x8_u(<8 x i16> %v) { ret <4 x i32> %extended } +define <2 x i64> @extend_low_i32x4_s(<4 x i32> %v) { +; CHECK-LABEL: extend_low_i32x4_s: +; CHECK: .functype extend_low_i32x4_s (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i64x2.extend_low_i32x4_s +; CHECK-NEXT: # fallthrough-return + %low = shufflevector <4 x i32> %v, <4 x i32> undef, + <2 x i32> + %extended = sext <2 x i32> %low to <2 x i64> + ret <2 x i64> %extended +} + +define <2 x i64> @extend_low_i32x4_u(<4 x i32> %v) { +; CHECK-LABEL: extend_low_i32x4_u: +; CHECK: .functype extend_low_i32x4_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i64x2.extend_low_i32x4_u +; CHECK-NEXT: # fallthrough-return + %low = shufflevector <4 x i32> %v, 
<4 x i32> undef, + <2 x i32> <i32 0, i32 1> + %extended = zext <2 x i32> %low to <2 x i64> + ret <2 x i64> %extended +} + +define <2 x i64> @extend_high_i32x4_s(<4 x i32> %v) { +; CHECK-LABEL: extend_high_i32x4_s: +; CHECK: .functype extend_high_i32x4_s (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i64x2.extend_high_i32x4_s +; CHECK-NEXT: # fallthrough-return + %low = shufflevector <4 x i32> %v, <4 x i32> undef, + <2 x i32> <i32 2, i32 3> + %extended = sext <2 x i32> %low to <2 x i64> + ret <2 x i64> %extended +} + +define <2 x i64> @extend_high_i32x4_u(<4 x i32> %v) { +; CHECK-LABEL: extend_high_i32x4_u: +; CHECK: .functype extend_high_i32x4_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i64x2.extend_high_i32x4_u +; CHECK-NEXT: # fallthrough-return + %low = shufflevector <4 x i32> %v, <4 x i32> undef, + <2 x i32> <i32 2, i32 3> + %extended = zext <2 x i32> %low to <2 x i64> + ret <2 x i64> %extended +} + ;; Also test that similar patterns with offsets not corresponding to ;; the low or high half are correctly expanded. diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll index 5df5ae9a21bde..6e8e5a2fed71b 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -553,46 +553,6 @@ define <4 x i32> @trunc_sat_zero_unsigned_v4i32(<2 x double> %a) { ; ============================================================================== ; 2 x i64 ; ============================================================================== -; CHECK-LABEL: extend_low_s_v2i64: -; CHECK-NEXT: .functype extend_low_s_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.extend_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.extend.low.signed(<4 x i32>) -define <2 x i64> @extend_low_s_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.extend.low.signed(<4 x i32> %x) - ret <2 x i64> %a -} - -; CHECK-LABEL: extend_high_s_v2i64: -; CHECK-NEXT: .functype extend_high_s_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.extend_high_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.extend.high.signed(<4 x i32>) -define <2 x i64> @extend_high_s_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.extend.high.signed(<4 x i32> %x) - ret <2 x i64> %a -} - -; CHECK-LABEL: extend_low_u_v2i64: -; CHECK-NEXT: .functype extend_low_u_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.extend_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.extend.low.unsigned(<4 x i32>) -define <2 x i64> @extend_low_u_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.extend.low.unsigned(<4 x i32> %x) - ret <2 x i64> %a -} - -; CHECK-LABEL: extend_high_u_v2i64: -; CHECK-NEXT: .functype extend_high_u_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.extend_high_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.extend.high.unsigned(<4 x i32>) -define <2 x i64> @extend_high_u_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.extend.high.unsigned(<4 x i32> %x) - ret <2 x i64> %a -} - ; CHECK-LABEL: extmul_low_s_v2i64: ; CHECK-NEXT: .functype extmul_low_s_v2i64 (v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64x2.extmul_low_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
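For reference, the i32x4 tests added above mirror the narrower cases this combine already handled; the existing i8x16-to-i16x8 test in simd-extending.ll has the same shape (sketched here from memory, so details may differ slightly from the checked-in file):

define <8 x i16> @extend_low_i8x16_s(<16 x i8> %v) {
  ; extracting the low half and sign-extending selects to a single
  ; i16x8.extend_low_i8x16_s instead of per-lane extracts and replaces
  %low = shufflevector <16 x i8> %v, <16 x i8> undef,
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %extended = sext <8 x i8> %low to <8 x i16>
  ret <8 x i16> %extended
}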
From 05751c82ecb45c069fbeee6887903d020cff4b87 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Fri, 16 Apr 2021 12:11:20 -0700 Subject: [PATCH 7/7] [WebAssembly] Remove saturating fp-to-int target intrinsics Use the target-independent @llvm.fptosi.sat and @llvm.fptoui.sat intrinsics instead. This includes removing the intrinsics for i32x4.trunc_sat_zero_f64x2_{s,u}, which are now represented in IR as a saturating truncation to a v2i32 followed by a concatenation with a zero vector. Differential Revision: https://reviews.llvm.org/D100596 --- clang/lib/CodeGen/CGBuiltin.cpp | 24 +- clang/test/CodeGen/builtins-wasm.c | 30 +- llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 6 - llvm/lib/Analysis/ConstantFolding.cpp | 24 +- .../lib/Target/WebAssembly/WebAssemblyISD.def | 2 + .../WebAssembly/WebAssemblyISelLowering.cpp | 77 +++ .../WebAssembly/WebAssemblyISelLowering.h | 1 + .../WebAssembly/WebAssemblyInstrConv.td | 26 +- .../WebAssembly/WebAssemblyInstrSIMD.td | 24 +- llvm/test/CodeGen/WebAssembly/conv.ll | 77 ++- .../CodeGen/WebAssembly/simd-intrinsics.ll | 36 +- .../ConstProp/WebAssembly/trunc_saturate.ll | 610 ------------------ 12 files changed, 220 insertions(+), 717 deletions(-) delete mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/trunc_saturate.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9322f04250fcd..6a197ed1cbecd 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16756,8 +16756,8 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: { Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Type *ResT = ConvertType(E->getType()); - Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_saturate_signed, - {ResT, Src->getType()}); + Function *Callee = + CGM.getIntrinsic(Intrinsic::fptosi_sat, {ResT, Src->getType()}); return Builder.CreateCall(Callee, {Src}); } case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f32: @@ -16767,8 +16767,8 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: { Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Type *ResT = ConvertType(E->getType()); - Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_saturate_unsigned, - {ResT, Src->getType()}); + Function *Callee = + CGM.getIntrinsic(Intrinsic::fptoui_sat, {ResT, Src->getType()}); return Builder.CreateCall(Callee, {Src}); } case WebAssembly::BI__builtin_wasm_min_f32: @@ -17164,14 +17164,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, unsigned IntNo; switch (BuiltinID) { case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4: - IntNo = Intrinsic::wasm_trunc_sat_zero_signed; + IntNo = Intrinsic::fptosi_sat; break; case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4: - IntNo = Intrinsic::wasm_trunc_sat_zero_unsigned; + IntNo = Intrinsic::fptoui_sat; break; } - Function *Callee = CGM.getIntrinsic(IntNo); - return Builder.CreateCall(Callee, Vec); + llvm::Type *SrcT = Vec->getType(); + llvm::Type *TruncT = + SrcT->getWithNewType(llvm::IntegerType::get(getLLVMContext(), 32)); + Function *Callee = CGM.getIntrinsic(IntNo, {TruncT, SrcT}); + Value *Trunc = Builder.CreateCall(Callee, Vec); + Value *Splat = Builder.CreateVectorSplat(2, Builder.getInt32(0)); + Value *ConcatMask = + llvm::ConstantVector::get({Builder.getInt32(0), Builder.getInt32(1), + Builder.getInt32(2), Builder.getInt32(3)}); + return Builder.CreateShuffleVector(Trunc, Splat, ConcatMask); } case WebAssembly::BI__builtin_wasm_demote_zero_f64x2_f32x4: { Value
*Vec = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 1a986f03dc498..d20b6a739f94a 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -123,49 +123,49 @@ long long trunc_u_i64_f64(double f) { int trunc_saturate_s_i32_f32(float f) { return __builtin_wasm_trunc_saturate_s_i32_f32(f); - // WEBASSEMBLY: call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float %f) + // WEBASSEMBLY: call i32 @llvm.fptosi.sat.i32.f32(float %f) // WEBASSEMBLY-NEXT: ret } int trunc_saturate_u_i32_f32(float f) { return __builtin_wasm_trunc_saturate_u_i32_f32(f); - // WEBASSEMBLY: call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float %f) + // WEBASSEMBLY: call i32 @llvm.fptoui.sat.i32.f32(float %f) // WEBASSEMBLY-NEXT: ret } int trunc_saturate_s_i32_f64(double f) { return __builtin_wasm_trunc_saturate_s_i32_f64(f); - // WEBASSEMBLY: call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double %f) + // WEBASSEMBLY: call i32 @llvm.fptosi.sat.i32.f64(double %f) // WEBASSEMBLY-NEXT: ret } int trunc_saturate_u_i32_f64(double f) { return __builtin_wasm_trunc_saturate_u_i32_f64(f); - // WEBASSEMBLY: call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double %f) + // WEBASSEMBLY: call i32 @llvm.fptoui.sat.i32.f64(double %f) // WEBASSEMBLY-NEXT: ret } long long trunc_saturate_s_i64_f32(float f) { return __builtin_wasm_trunc_saturate_s_i64_f32(f); - // WEBASSEMBLY: call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float %f) + // WEBASSEMBLY: call i64 @llvm.fptosi.sat.i64.f32(float %f) // WEBASSEMBLY-NEXT: ret } long long trunc_saturate_u_i64_f32(float f) { return __builtin_wasm_trunc_saturate_u_i64_f32(f); - // WEBASSEMBLY: call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float %f) + // WEBASSEMBLY: call i64 @llvm.fptoui.sat.i64.f32(float %f) // WEBASSEMBLY-NEXT: ret } long long trunc_saturate_s_i64_f64(double f) { return __builtin_wasm_trunc_saturate_s_i64_f64(f); - // WEBASSEMBLY: call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double %f) + // WEBASSEMBLY: call i64 @llvm.fptosi.sat.i64.f64(double %f) // WEBASSEMBLY-NEXT: ret } long long trunc_saturate_u_i64_f64(double f) { return __builtin_wasm_trunc_saturate_u_i64_f64(f); - // WEBASSEMBLY: call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double %f) + // WEBASSEMBLY: call i64 @llvm.fptoui.sat.i64.f64(double %f) // WEBASSEMBLY-NEXT: ret } @@ -852,13 +852,13 @@ f64x2 sqrt_f64x2(f64x2 x) { i32x4 trunc_saturate_s_i32x4_f32x4(f32x4 f) { return __builtin_wasm_trunc_saturate_s_i32x4_f32x4(f); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %f) + // WEBASSEMBLY: call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f) // WEBASSEMBLY-NEXT: ret } i32x4 trunc_saturate_u_i32x4_f32x4(f32x4 f) { return __builtin_wasm_trunc_saturate_u_i32x4_f32x4(f); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float> %f) + // WEBASSEMBLY: call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f) // WEBASSEMBLY-NEXT: ret } @@ -892,14 +892,16 @@ u16x8 narrow_u_i16x8_i32x4(u32x4 low, u32x4 high) { i32x4 trunc_sat_zero_s_f64x2_i32x4(f64x2 x) { return __builtin_wasm_trunc_sat_zero_s_f64x2_i32x4(x); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double> %x) - // WEBASSEMBLY: ret + // WEBASSEMBLY: %0 = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %x) + // WEBASSEMBLY: %1 = shufflevector <2 x i32> %0, <2 x i32> zeroinitializer, <4 x i32> + // WEBASSEMBLY: ret <4 x i32> 
%1 } u32x4 trunc_sat_zero_u_f64x2_i32x4(f64x2 x) { return __builtin_wasm_trunc_sat_zero_u_f64x2_i32x4(x); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.sat.zero.unsigned(<2 x double> %x) - // WEBASSEMBLY: ret + // WEBASSEMBLY: %0 = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %x) + // WEBASSEMBLY: %1 = shufflevector <2 x i32> %0, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // WEBASSEMBLY: ret <4 x i32> %1 } f32x4 wasm_demote_zero_f64x2_f32x4(f64x2 x) { diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 4e2d557f1f083..abeb4c0a19f34 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -264,12 +264,6 @@ def int_wasm_extadd_pairwise_unsigned : [IntrNoMem, IntrSpeculatable]>; // TODO: Remove these if possible if they are merged to the spec. -def int_wasm_trunc_sat_zero_signed : - Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_trunc_sat_zero_unsigned : - Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], - [IntrNoMem, IntrSpeculatable]>; def int_wasm_demote_zero : Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem, IntrSpeculatable]>; diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index f73890d548f09..5b592c17e8de1 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1493,8 +1493,6 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { // WebAssembly float semantics are always known case Intrinsic::wasm_trunc_signed: case Intrinsic::wasm_trunc_unsigned: - case Intrinsic::wasm_trunc_saturate_signed: - case Intrinsic::wasm_trunc_saturate_unsigned: return true; // Floating point operations cannot be folded in strictfp functions in @@ -1896,17 +1894,11 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, APFloat U = Op->getValueAPF(); if (IntrinsicID == Intrinsic::wasm_trunc_signed || - IntrinsicID == Intrinsic::wasm_trunc_unsigned || - IntrinsicID == Intrinsic::wasm_trunc_saturate_signed || - IntrinsicID == Intrinsic::wasm_trunc_saturate_unsigned) { - - bool Saturating = IntrinsicID == Intrinsic::wasm_trunc_saturate_signed || - IntrinsicID == Intrinsic::wasm_trunc_saturate_unsigned; - bool Signed = IntrinsicID == Intrinsic::wasm_trunc_signed || - IntrinsicID == Intrinsic::wasm_trunc_saturate_signed; + IntrinsicID == Intrinsic::wasm_trunc_unsigned) { + bool Signed = IntrinsicID == Intrinsic::wasm_trunc_signed; if (U.isNaN()) - return Saturating ? ConstantInt::get(Ty, 0) : nullptr; + return nullptr; unsigned Width = Ty->getIntegerBitWidth(); APSInt Int(Width, !Signed); @@ -1917,15 +1909,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (Status == APFloat::opOK || Status == APFloat::opInexact) return ConstantInt::get(Ty, Int); - if (!Saturating) - return nullptr; - - if (U.isNegative()) - return Signed ? ConstantInt::get(Ty, APInt::getSignedMinValue(Width)) - : ConstantInt::get(Ty, APInt::getMinValue(Width)); - else - return Signed ? ConstantInt::get(Ty, APInt::getSignedMaxValue(Width)) - : ConstantInt::get(Ty, APInt::getMaxValue(Width)); + return nullptr; } if (IntrinsicID == Intrinsic::fptoui_sat ||
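Note that dropping the WebAssembly-specific saturating folds loses no coverage: ConstantFolding already handles the generic @llvm.fptosi.sat/@llvm.fptoui.sat intrinsics (see the fptoui_sat case kept just above), including the NaN-goes-to-zero behavior the deleted trunc_saturate.ll tests below exercise. A minimal sketch of the equivalent fold through the generic intrinsic (function name is ours):

declare i32 @llvm.fptosi.sat.i32.f32(float)
define i32 @fold_nan_to_zero() {
  ; instsimplify constant-folds this call to 'ret i32 0';
  ; fptosi.sat is defined to return 0 for a NaN input
  %r = call i32 @llvm.fptosi.sat.i32.f32(float 0x7ff8000000000000)
  ret i32 %r
}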
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index c73ce43057f85..33fd2ae11154f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -35,6 +35,8 @@ HANDLE_NODETYPE(EXTEND_HIGH_S) HANDLE_NODETYPE(EXTEND_HIGH_U) HANDLE_NODETYPE(CONVERT_LOW_S) HANDLE_NODETYPE(CONVERT_LOW_U) +HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) +HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(CATCH) HANDLE_NODETYPE(MEMORY_COPY) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index a9cbe75271226..29742177626fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -121,6 +121,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(Op, T, Expand); } + if (Subtarget->hasNontrappingFPToInt()) + for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) + for (auto T : {MVT::i32, MVT::i64}) + setOperationAction(Op, T, Custom); + // SIMD-specific configuration if (Subtarget->hasSIMD128()) { // Hoist bitcasts out of shuffles @@ -134,6 +139,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + // Combine concat of fp_to_{s,u}int_sat results into i32x4.trunc_sat_zero_f64x2_{s,u} + setTargetDAGCombine(ISD::CONCAT_VECTORS); + // Support saturating add for i8x16 and i16x8 for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) for (auto T : {MVT::v16i8, MVT::v8i16}) setOperationAction(Op, T, Legal); @@ -198,6 +206,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}) for (auto T : {MVT::v2i64, MVT::v2f64}) setOperationAction(Op, T, Expand); + + // But saturating fp_to_int conversions are + for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) + setOperationAction(Op, MVT::v4i32, Custom); } // As a special case, these operators use the type to mean the type to @@ -1233,6 +1245,9 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, case ISD::SRA: case ISD::SRL: return LowerShift(Op, DAG); + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: + return LowerFP_TO_INT_SAT(Op, DAG); } } @@ -1949,6 +1964,21 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op, return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0), ShiftVal); } +SDValue WebAssemblyTargetLowering::LowerFP_TO_INT_SAT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT ResT = Op.getValueType(); + uint64_t Width = Op.getConstantOperandVal(1); + + if ((ResT == MVT::i32 || ResT == MVT::i64) && (Width == 32 || Width == 64)) + return Op; + + if (ResT == MVT::v4i32 && Width == 32) + return Op; + + return SDValue(); +} + //===----------------------------------------------------------------------===// // Custom DAG combine hooks //===----------------------------------------------------------------------===// @@ -2037,6 +2067,8 @@ performVectorConvertLowCombine(SDNode *N, if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); auto Source = Extract.getOperand(0); + if (Source.getValueType() != MVT::v4i32) + return SDValue(); auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1)); if (IndexNode == nullptr) return SDValue(); @@ -2058,6 +2090,49 @@ performVectorConvertLowCombine, return DAG.getNode(Op, SDLoc(N), ResVT, Source); }
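At the IR level, performVectorTruncSatLowCombine (added below) targets exactly the pattern the updated builtin lowering emits: a saturating v2f64-to-v2i32 truncation concatenated with zeros. A minimal sketch (function and value names are ours):

declare <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double>)
define <4 x i32> @trunc_sat_zero_example(<2 x double> %x) {
  ; saturating truncation of both f64 lanes to i32
  %trunc = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %x)
  ; concatenate with a zero vector to widen back to v4i32
  %zeroed = shufflevector <2 x i32> %trunc, <2 x i32> zeroinitializer,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %zeroed
}

In the DAG this becomes (concat_vectors (fp_to_sint_sat $x, 32), (splat 0)), which the combine rewrites to a single TRUNC_SAT_ZERO_S node, i.e. one i32x4.trunc_sat_zero_f64x2_s instruction.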
+static SDValue +performVectorTruncSatLowCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + assert(N->getOpcode() == ISD::CONCAT_VECTORS); + + // Combine this: + // + // (concat_vectors (v2i32 (fp_to_{s,u}int_sat $x, 32)), (v2i32 (splat 0))) + // + // into (i32x4.trunc_sat_zero_f64x2_{s,u} $x). + EVT ResVT = N->getValueType(0); + if (ResVT != MVT::v4i32) + return SDValue(); + + auto FPToInt = N->getOperand(0); + auto FPToIntOp = FPToInt.getOpcode(); + if (FPToIntOp != ISD::FP_TO_SINT_SAT && FPToIntOp != ISD::FP_TO_UINT_SAT) + return SDValue(); + if (FPToInt.getConstantOperandVal(1) != 32) + return SDValue(); + + auto Source = FPToInt.getOperand(0); + if (Source.getValueType() != MVT::v2f64) + return SDValue(); + + auto *Splat = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!Splat || !Splat->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs)) + return SDValue(); + if (SplatValue != 0) + return SDValue(); + + unsigned Op = FPToIntOp == ISD::FP_TO_SINT_SAT + ? WebAssemblyISD::TRUNC_SAT_ZERO_S + : WebAssemblyISD::TRUNC_SAT_ZERO_U; + + return DAG.getNode(Op, SDLoc(N), ResVT, Source); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -2072,5 +2147,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performVectorConvertLowCombine(N, DCI); + case ISD::CONCAT_VECTORS: + return performVectorTruncSatLowCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index c8a052d011997..10aca7708c239 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -119,6 +119,7 @@ class WebAssemblyTargetLowering final : public TargetLowering { SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; // Custom DAG combine hooks SDValue diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td index f3d9c5d5032cb..68ef43f6af36d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -96,23 +96,15 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins), "i64.trunc_sat_f64_u", 0xfc07>, Requires<[HasNontrappingFPToInt]>; -// Lower llvm.wasm.trunc.saturate.* to saturating instructions -def : Pat<(int_wasm_trunc_saturate_signed F32:$src), - (I32_TRUNC_S_SAT_F32 F32:$src)>; -def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src), - (I32_TRUNC_U_SAT_F32 F32:$src)>; -def : Pat<(int_wasm_trunc_saturate_signed F64:$src), - (I32_TRUNC_S_SAT_F64 F64:$src)>; -def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src), - (I32_TRUNC_U_SAT_F64 F64:$src)>; -def : Pat<(int_wasm_trunc_saturate_signed F32:$src), - (I64_TRUNC_S_SAT_F32 F32:$src)>; -def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src), - (I64_TRUNC_U_SAT_F32 F32:$src)>; -def : Pat<(int_wasm_trunc_saturate_signed F64:$src), - (I64_TRUNC_S_SAT_F64 F64:$src)>; -def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src), - (I64_TRUNC_U_SAT_F64 F64:$src)>;
+// Support the explicitly saturating operations as well. +def : Pat<(fp_to_sint_sat F32:$src, (i32 32)), (I32_TRUNC_S_SAT_F32 F32:$src)>; +def : Pat<(fp_to_uint_sat F32:$src, (i32 32)), (I32_TRUNC_U_SAT_F32 F32:$src)>; +def : Pat<(fp_to_sint_sat F64:$src, (i32 32)), (I32_TRUNC_S_SAT_F64 F64:$src)>; +def : Pat<(fp_to_uint_sat F64:$src, (i32 32)), (I32_TRUNC_U_SAT_F64 F64:$src)>; +def : Pat<(fp_to_sint_sat F32:$src, (i32 64)), (I64_TRUNC_S_SAT_F32 F32:$src)>; +def : Pat<(fp_to_uint_sat F32:$src, (i32 64)), (I64_TRUNC_U_SAT_F32 F32:$src)>; +def : Pat<(fp_to_sint_sat F64:$src, (i32 64)), (I64_TRUNC_S_SAT_F64 F64:$src)>; +def : Pat<(fp_to_uint_sat F64:$src, (i32 64)), (I64_TRUNC_U_SAT_F64 F64:$src)>; // Conversion from floating point to integer pseudo-instructions which don't // trap on overflow or invalid. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 7cf3cb1854fb1..efcdf0368488b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1104,11 +1104,21 @@ multiclass SIMDConvert; defm "" : SIMDConvert; -// Lower llvm.wasm.trunc.sat.* to saturating instructions -def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), - (fp_to_sint_I32x4 $src)>; -def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))), - (fp_to_uint_I32x4 $src)>; +// Support the saturating variety as well. +def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, (i32 32))>; +def trunc_u_sat32 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, (i32 32))>; +def : Pat<(v4i32 (trunc_s_sat32 (v4f32 V128:$src))), (fp_to_sint_I32x4 $src)>; +def : Pat<(v4i32 (trunc_u_sat32 (v4f32 V128:$src))), (fp_to_uint_I32x4 $src)>; + +def trunc_sat_zero_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; +def trunc_sat_zero_s : + SDNode<"WebAssemblyISD::TRUNC_SAT_ZERO_S", trunc_sat_zero_t>; +def trunc_sat_zero_u : + SDNode<"WebAssemblyISD::TRUNC_SAT_ZERO_U", trunc_sat_zero_t>; +defm "" : SIMDConvert; +defm "" : SIMDConvert; // Integer to floating point: convert def convert_low_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; @@ -1261,10 +1271,6 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; //===----------------------------------------------------------------------===// // Saturating Rounding Q-Format Multiplication diff --git a/llvm/test/CodeGen/WebAssembly/conv.ll b/llvm/test/CodeGen/WebAssembly/conv.ll index 68f941546ce10..5699c7b9adc5b 100644 --- a/llvm/test/CodeGen/WebAssembly/conv.ll +++ b/llvm/test/CodeGen/WebAssembly/conv.ll @@ -45,9 +45,9 @@ define i32 @i32_trunc_s_f32(float %x) { ; CHECK-NEXT: .functype i32_trunc_sat_s_f32 (f32) -> (i32){{$}} ; CHECK-NEXT: i32.trunc_sat_f32_s $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float) +declare i32 @llvm.fptosi.sat.i32.f32(float) define i32 @i32_trunc_sat_s_f32(float %x) { - %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float %x) + %a = call i32 @llvm.fptosi.sat.i32.f32(float %x) ret i32 %a } @@ -64,9 +64,9 @@ define i32 @i32_trunc_u_f32(float %x) { ; CHECK-NEXT: .functype i32_trunc_sat_u_f32 (f32) -> (i32){{$}} ; CHECK-NEXT: i32.trunc_sat_f32_u $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float) +declare i32 @llvm.fptoui.sat.i32.f32(float) define i32 @i32_trunc_sat_u_f32(float %x) { - %a = call i32 
@llvm.wasm.trunc.saturate.unsigned.i32.f32(float %x) + %a = call i32 @llvm.fptoui.sat.i32.f32(float %x) ret i32 %a } @@ -83,9 +83,9 @@ define i32 @i32_trunc_s_f64(double %x) { ; CHECK-NEXT: .functype i32_trunc_sat_s_f64 (f64) -> (i32){{$}} ; CHECK-NEXT: i32.trunc_sat_f64_s $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double) +declare i32 @llvm.fptosi.sat.i32.f64(double) define i32 @i32_trunc_sat_s_f64(double %x) { - %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double %x) + %a = call i32 @llvm.fptosi.sat.i32.f64(double %x) ret i32 %a } @@ -102,9 +102,9 @@ define i32 @i32_trunc_u_f64(double %x) { ; CHECK-NEXT: .functype i32_trunc_sat_u_f64 (f64) -> (i32){{$}} ; CHECK-NEXT: i32.trunc_sat_f64_u $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double) +declare i32 @llvm.fptoui.sat.i32.f64(double) define i32 @i32_trunc_sat_u_f64(double %x) { - %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double %x) + %a = call i32 @llvm.fptoui.sat.i32.f64(double %x) ret i32 %a } @@ -121,9 +121,9 @@ define i64 @i64_trunc_s_f32(float %x) { ; CHECK-NEXT: .functype i64_trunc_sat_s_f32 (f32) -> (i64){{$}} ; CHECK-NEXT: i64.trunc_sat_f32_s $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float) +declare i64 @llvm.fptosi.sat.i64.f32(float) define i64 @i64_trunc_sat_s_f32(float %x) { - %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float %x) + %a = call i64 @llvm.fptosi.sat.i64.f32(float %x) ret i64 %a } @@ -140,9 +140,9 @@ define i64 @i64_trunc_u_f32(float %x) { ; CHECK-NEXT: .functype i64_trunc_sat_u_f32 (f32) -> (i64){{$}} ; CHECK-NEXT: i64.trunc_sat_f32_u $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float) +declare i64 @llvm.fptoui.sat.i64.f32(float) define i64 @i64_trunc_sat_u_f32(float %x) { - %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float %x) + %a = call i64 @llvm.fptoui.sat.i64.f32(float %x) ret i64 %a } @@ -159,9 +159,9 @@ define i64 @i64_trunc_s_f64(double %x) { ; CHECK-NEXT: .functype i64_trunc_sat_s_f64 (f64) -> (i64){{$}} ; CHECK-NEXT: i64.trunc_sat_f64_s $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double) +declare i64 @llvm.fptosi.sat.i64.f64(double) define i64 @i64_trunc_sat_s_f64(double %x) { - %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double %x) + %a = call i64 @llvm.fptosi.sat.i64.f64(double %x) ret i64 %a } @@ -178,9 +178,9 @@ define i64 @i64_trunc_u_f64(double %x) { ; CHECK-NEXT: .functype i64_trunc_sat_u_f64 (f64) -> (i64){{$}} ; CHECK-NEXT: i64.trunc_sat_f64_u $push[[NUM:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} -declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double) +declare i64 @llvm.fptoui.sat.i64.f64(double) define i64 @i64_trunc_sat_u_f64(double %x) { - %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double %x) + %a = call i64 @llvm.fptoui.sat.i64.f64(double %x) ret i64 %a } @@ -274,7 +274,7 @@ define float @f32_demote_f64(double %x) { ret float %a } -; If the high its are unused, LLVM will optimize sext/zext into anyext, which +; If the high bits are unused, LLVM will optimize sext/zext into anyext, which ; we need to patterm-match back to a specific instruction. 
; CHECK-LABEL: anyext: @@ -312,3 +312,46 @@ define i64 @bitcast_double_to_i64(double %a) { %t = bitcast double %a to i64 ret i64 %t } + +; Check that saturating fptoint with unsupported target bit widths is lowered +; correctly. + +; CHECK-LABEL: i16_trunc_sat_s_f32: +; CHECK-NEXT: .functype i16_trunc_sat_s_f32 (f32) -> (i32){{$}} +; CHECK: i32.select +; CHECK: return +declare i16 @llvm.fptosi.sat.i16.f32(float) +define i16 @i16_trunc_sat_s_f32(float %x) { + %a = call i16 @llvm.fptosi.sat.i16.f32(float %x) + ret i16 %a +} + +; CHECK-LABEL: i16_trunc_sat_u_f32: +; CHECK-NEXT: .functype i16_trunc_sat_u_f32 (f32) -> (i32){{$}} +; CHECK: i32.select +; CHECK: return +declare i16 @llvm.fptoui.sat.i16.f32(float) +define i16 @i16_trunc_sat_u_f32(float %x) { + %a = call i16 @llvm.fptoui.sat.i16.f32(float %x) + ret i16 %a +} + +; CHECK-LABEL: i16_trunc_sat_s_f64: +; CHECK-NEXT: .functype i16_trunc_sat_s_f64 (f64) -> (i32){{$}} +; CHECK: i32.select +; CHECK: return +declare i16 @llvm.fptosi.sat.i16.f64(double) +define i16 @i16_trunc_sat_s_f64(double %x) { + %a = call i16 @llvm.fptosi.sat.i16.f64(double %x) + ret i16 %a +} + +; CHECK-LABEL: i16_trunc_sat_u_f64: +; CHECK-NEXT: .functype i16_trunc_sat_u_f64 (f64) -> (i32){{$}} +; CHECK: i32.select +; CHECK: return +declare i16 @llvm.fptoui.sat.i16.f64(double) +define i16 @i16_trunc_sat_u_f64(double %x) { + %a = call i16 @llvm.fptoui.sat.i16.f64(double %x) + ret i16 %a +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll index 6e8e5a2fed71b..0fc008d3ef9a3 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -513,9 +513,9 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) { ; CHECK-NEXT: .functype trunc_sat_s_v4i32 (v128) -> (v128){{$}} ; CHECK-NEXT: i32x4.trunc_sat_f32x4_s $push[[R:[0-9]+]]=, $0 ; CHECK-NEXT: return $pop[[R]] -declare <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float>) +declare <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float>) define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) { - %a = call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %x) + %a = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %x) ret <4 x i32> %a } @@ -524,30 +524,34 @@ define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) { ; CHECK-NEXT: .functype trunc_sat_u_v4i32 (v128) -> (v128){{$}} ; CHECK-NEXT: i32x4.trunc_sat_f32x4_u $push[[R:[0-9]+]]=, $0 ; CHECK-NEXT: return $pop[[R]] -declare <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float>) +declare <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float>) define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) { - %a = call <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float> %x) + %a = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %x) ret <4 x i32> %a } -; CHECK-LABEL: trunc_sat_zero_signed_v4i32: -; CHECK-NEXT: .functype trunc_sat_zero_signed_v4i32 (v128) -> (v128){{$}} +; CHECK-LABEL: trunc_sat_zero_s_v4i32: +; CHECK-NEXT: .functype trunc_sat_zero_s_v4i32 (v128) -> (v128){{$}} ; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_s $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double>) -define <4 x i32> @trunc_sat_zero_signed_v4i32(<2 x double> %a) { - %v = call <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double> %a) - ret <4 x i32> %v +declare <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double>) +define <4 x i32> 
@trunc_sat_zero_s_v4i32(<2 x double> %x) { + %v = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %x) + %a = shufflevector <2 x i32> %v, <2 x i32> <i32 0, i32 0>, + <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %a } -; CHECK-LABEL: trunc_sat_zero_unsigned_v4i32: -; CHECK-NEXT: .functype trunc_sat_zero_unsigned_v4i32 (v128) -> (v128){{$}} +; CHECK-LABEL: trunc_sat_zero_u_v4i32: +; CHECK-NEXT: .functype trunc_sat_zero_u_v4i32 (v128) -> (v128){{$}} ; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_u $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.trunc.sat.zero.unsigned(<2 x double>) -define <4 x i32> @trunc_sat_zero_unsigned_v4i32(<2 x double> %a) { - %v = call <4 x i32> @llvm.wasm.trunc.sat.zero.unsigned(<2 x double> %a) - ret <4 x i32> %v +declare <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double>) +define <4 x i32> @trunc_sat_zero_u_v4i32(<2 x double> %x) { + %v = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %x) + %a = shufflevector <2 x i32> %v, <2 x i32> <i32 0, i32 0>, + <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %a } ; ============================================================================== diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/trunc_saturate.ll b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/trunc_saturate.ll deleted file mode 100644 index a7cd6066785aa..0000000000000 --- a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/trunc_saturate.ll +++ /dev/null @@ -1,610 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instsimplify -S | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" -target triple = "wasm32-unknown-unknown" - -declare i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float) -declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float) -declare i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double) -declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double) -declare i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float) -declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float) -declare i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double) -declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double) - -define void @test_i32_trunc_sat_f32_s(i32* %p) { -; CHECK-LABEL: @test_i32_trunc_sat_f32_s( -; CHECK-NEXT: store volatile i32 0, i32* [[P:%.*]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2147483520, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2147483647, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2147483647, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store
volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: ret void -; - %t0 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float +0.0) - store volatile i32 %t0, i32* %p - %t1 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float -0.0) - store volatile i32 %t1, i32* %p - %t2 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0x36a0000000000000); 0x1p-149 - store volatile i32 %t2, i32* %p - %t3 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0xb6a0000000000000); -0x1p-149 - store volatile i32 %t3, i32* %p - %t4 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 1.0) - store volatile i32 %t4, i32* %p - %t5 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0x3ff19999a0000000); 0x1.19999ap+0 - store volatile i32 %t5, i32* %p - %t6 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 1.5) - store volatile i32 %t6, i32* %p - %t7 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float -1.0) - store volatile i32 %t7, i32* %p - %t8 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0xbff19999a0000000); -0x1.19999ap+0 - store volatile i32 %t8, i32* %p - %t9 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float -1.5) - store volatile i32 %t9, i32* %p - %t10 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0xbffe666660000000); -1.9 - store volatile i32 %t10, i32* %p - %t11 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float -2.0) - store volatile i32 %t11, i32* %p - %t12 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 2147483520.0) - store volatile i32 %t12, i32* %p - %t13 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float -2147483648.0) - store volatile i32 %t13, i32* %p - %t14 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 2147483648.0) - store volatile i32 %t14, i32* %p - %t15 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float -2147483904.0) - store volatile i32 %t15, i32* %p - %t16 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0x7ff0000000000000); inf - store volatile i32 %t16, i32* %p - %t17 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0xfff0000000000000); -inf - store volatile i32 %t17, i32* %p - %t18 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0x7ff8000000000000); nan - store volatile i32 %t18, i32* %p - %t19 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0x7ffa000000000000); nan:0x200000 - store volatile i32 %t19, i32* %p - %t20 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0xfff8000000000000); -nan - store volatile i32 %t20, i32* %p - %t21 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float 0xfffa000000000000); -nan:0x200000 - store volatile i32 %t21, i32* %p - ret void -} - -define void @test_i32_trunc_sat_f32_u(i32* %p) { -; CHECK-LABEL: @test_i32_trunc_sat_f32_u( -; CHECK-NEXT: store volatile i32 0, i32* [[P:%.*]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -256, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: 
store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: ret void -; - %t0 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float +0.0) - store volatile i32 %t0, i32* %p - %t1 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float -0.0) - store volatile i32 %t1, i32* %p - %t2 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0x36a0000000000000); 0x1p-149 - store volatile i32 %t2, i32* %p - %t3 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0xb6a0000000000000); -0x1p-149 - store volatile i32 %t3, i32* %p - %t4 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 1.0) - store volatile i32 %t4, i32* %p - %t5 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0x3ff19999a0000000); 0x1.19999ap+0 - store volatile i32 %t5, i32* %p - %t6 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 1.5) - store volatile i32 %t6, i32* %p - %t7 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0x3ffe666660000000); 1.9 - store volatile i32 %t7, i32* %p - %t8 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 2.0) - store volatile i32 %t8, i32* %p - %t9 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 2147483648.0) - store volatile i32 %t9, i32* %p - %t10 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 4294967040.0) - store volatile i32 %t10, i32* %p - %t11 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0xbfecccccc0000000); -0x1.ccccccp-1 - store volatile i32 %t11, i32* %p - %t12 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0xbfefffffe0000000); -0x1.fffffep-1 - store volatile i32 %t12, i32* %p - %t13 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 4294967296.0) - store volatile i32 %t13, i32* %p - %t14 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float -1.0) - store volatile i32 %t14, i32* %p - %t15 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0x7ff0000000000000); inf - store volatile i32 %t15, i32* %p - %t16 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0xfff0000000000000); -inf - store volatile i32 %t16, i32* %p - %t17 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0x7ff8000000000000); nan - store volatile i32 %t17, i32* %p - %t18 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0x7ffa000000000000); nan:0x200000 - store volatile i32 %t18, i32* %p - %t19 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0xfff8000000000000); -nan - store volatile i32 %t19, i32* %p - %t20 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float 0xfffa000000000000); -nan:0x200000 - store volatile i32 %t20, i32* %p - ret void -} - -define void @test_i32_trunc_sat_f64_s(i32* %p) { -; CHECK-LABEL: @test_i32_trunc_sat_f64_s( -; CHECK-NEXT: store volatile i32 0, i32* [[P:%.*]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; 
CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2147483647, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2147483647, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2147483647, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: ret void -; - %t0 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double +0.0) - store volatile i32 %t0, i32* %p - %t1 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double -0.0) - store volatile i32 %t1, i32* %p - %t2 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0x0010000000000001); 0x0.0000000000001p-1022 - store volatile i32 %t2, i32* %p - %t3 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0x8010000000000001); -0x1.0000000000001p-1022 - store volatile i32 %t3, i32* %p - %t4 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 1.0) - store volatile i32 %t4, i32* %p - %t5 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0x3ff199999999999a); 0x1.199999999999ap+0 - store volatile i32 %t5, i32* %p - %t6 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 1.5) - store volatile i32 %t6, i32* %p - %t7 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double -1.0) - store volatile i32 %t7, i32* %p - %t8 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0xbff199999999999a); -0x1.199999999999ap+0 - store volatile i32 %t8, i32* %p - %t9 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double -1.5) - store volatile i32 %t9, i32* %p - %t10 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0xbffe666666666666); -1.9 - store volatile i32 %t10, i32* %p - %t11 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double -2.0) - store volatile i32 %t11, i32* %p - %t12 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 2147483647.0) - store volatile i32 %t12, i32* %p - %t13 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double -2147483648.0) - store volatile i32 %t13, i32* %p - %t14 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 2147483648.0) - store volatile i32 %t14, i32* %p - %t15 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double -2147483649.0) - store volatile i32 %t15, i32* %p - %t16 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0x7ff0000000000000); inf - store volatile i32 %t16, i32* %p - %t17 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0xfff0000000000000); -inf - store volatile i32 %t17, i32* %p - %t18 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0x7ff8000000000000); nan - store volatile i32 %t18, i32* %p - %t19 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0x7ff4000000000000); nan:0x4000000000000 - store volatile i32 %t19, i32* %p - %t20 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 0xfff8000000000000); -nan - store volatile i32 %t20, i32* %p - %t21 = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double 
0x7ff4000000000000); -nan:0x4000000000000 - store volatile i32 %t21, i32* %p - ret void -} - -define void @test_i32_trunc_sat_f64_u(i32* %p) { -; CHECK-LABEL: @test_i32_trunc_sat_f64_u( -; CHECK-NEXT: store volatile i32 0, i32* [[P:%.*]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 2, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -2147483648, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 100000000, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4 -; CHECK-NEXT: ret void -; - %t0 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double +0.0) - store volatile i32 %t0, i32* %p - %t1 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double -0.0) - store volatile i32 %t1, i32* %p - %t2 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x0010000000000001); 0x0.0000000000001p-1022 - store volatile i32 %t2, i32* %p - %t3 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x8010000000000001); -0x0.0000000000001p-1022 - store volatile i32 %t3, i32* %p - %t4 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 1.0) - store volatile i32 %t4, i32* %p - %t5 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x3ff199999999999a); 0x1.199999999999ap+0 - store volatile i32 %t5, i32* %p - %t6 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 1.5) - store volatile i32 %t6, i32* %p - %t7 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x3ffe666666666666); 1.9 - store volatile i32 %t7, i32* %p - %t8 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 2.0) - store volatile i32 %t8, i32* %p - %t9 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 2147483648.0) - store volatile i32 %t9, i32* %p - %t10 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 4294967295.0) - store volatile i32 %t10, i32* %p - %t11 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0xbfeccccccccccccd); -0x1.ccccccccccccdp-1 - store volatile i32 %t11, i32* %p - %t12 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0xbfefffffffffffff); -0x1.fffffffffffffp-1 - store volatile i32 %t12, i32* %p - %t13 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 100000000.0); 1e8 - store volatile i32 %t13, i32* %p - %t14 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 4294967296.0) - store volatile i32 %t14, i32* %p - %t15 = call i32 
@llvm.wasm.trunc.saturate.unsigned.i32.f64(double -1.0) - store volatile i32 %t15, i32* %p - %t16 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 10000000000000000.0); 1e16 - store volatile i32 %t16, i32* %p - %t17 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 1000000000000000000000000000000.0); 1e30 - store volatile i32 %t17, i32* %p - %t18 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 9223372036854775808.0) - store volatile i32 %t18, i32* %p - %t19 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x7ff0000000000000); inf - store volatile i32 %t19, i32* %p - %t20 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0xfff0000000000000); -inf - store volatile i32 %t20, i32* %p - %t21 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x7ff8000000000000); nan - store volatile i32 %t21, i32* %p - %t22 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0x7ff4000000000000); nan:0x4000000000000 - store volatile i32 %t22, i32* %p - %t23 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0xfff8000000000000); -nan - store volatile i32 %t23, i32* %p - %t24 = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double 0xfff4000000000000); -nan:0x4000000000000 - store volatile i32 %t24, i32* %p - ret void -} - -define void @test_i64_trunc_sat_f32_s(i64* %p) { -; CHECK-LABEL: @test_i64_trunc_sat_f32_s( -; CHECK-NEXT: store volatile i64 0, i64* [[P:%.*]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -1, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -2, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 4294967296, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -4294967296, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 9223371487098961920, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -9223372036854775808, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 9223372036854775807, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -9223372036854775808, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 9223372036854775807, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 -9223372036854775808, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: store volatile i64 0, i64* [[P]], align 8 -; CHECK-NEXT: ret void -; - %t0 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float +0.0) - store volatile i64 %t0, i64* %p - %t1 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -0.0) - store volatile i64 %t1, i64* %p - %t2 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0x36a0000000000000); 0x1p-149 - store volatile i64 %t2, i64* %p - %t3 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0xb6a0000000000000); -0x1p-149 - store volatile i64 %t3, i64* %p - %t4 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 1.0) - store volatile i64 %t4, i64* %p - %t5 = call i64 
@llvm.wasm.trunc.saturate.signed.i64.f32(float 0x3ff19999a0000000); 0x1.19999ap+0
-  store volatile i64 %t5, i64* %p
-  %t6 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 1.5)
-  store volatile i64 %t6, i64* %p
-  %t7 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -1.0)
-  store volatile i64 %t7, i64* %p
-  %t8 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0xbff19999a0000000); -0x1.19999ap+0
-  store volatile i64 %t8, i64* %p
-  %t9 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -1.5)
-  store volatile i64 %t9, i64* %p
-  %t10 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0xbffe666660000000); -1.9
-  store volatile i64 %t10, i64* %p
-  %t11 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -2.0)
-  store volatile i64 %t11, i64* %p
-  %t12 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 4294967296.0)
-  store volatile i64 %t12, i64* %p
-  %t13 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -4294967296.0)
-  store volatile i64 %t13, i64* %p
-  %t14 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 9223371487098961920.0)
-  store volatile i64 %t14, i64* %p
-  %t15 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -9223372036854775808.0)
-  store volatile i64 %t15, i64* %p
-  %t16 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 9223372036854775808.0)
-  store volatile i64 %t16, i64* %p
-  %t17 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float -9223373136366403584.0)
-  store volatile i64 %t17, i64* %p
-  %t18 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0x7ff0000000000000); inf
-  store volatile i64 %t18, i64* %p
-  %t19 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0xfff0000000000000); -inf
-  store volatile i64 %t19, i64* %p
-  %t20 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0x7ff8000000000000); nan
-  store volatile i64 %t20, i64* %p
-  %t21 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0x7ffa000000000000); nan:0x200000
-  store volatile i64 %t21, i64* %p
-  %t22 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0xfff8000000000000); -nan
-  store volatile i64 %t22, i64* %p
-  %t23 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 0xfffa000000000000); -nan:0x200000
-  store volatile i64 %t23, i64* %p
-  ret void
-}
-
-define void @test_i64_trunc_sat_f32_u(i64* %p) {
-; CHECK-LABEL: @test_i64_trunc_sat_f32_u(
-; CHECK-NEXT:    store volatile i64 0, i64* [[P:%.*]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 4294967296, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1099511627776, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    ret void
-;
-  %t0 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float +0.0)
-  store volatile i64 %t0, i64* %p
-  %t1 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float -0.0)
-  store volatile i64 %t1, i64* %p
-  %t2 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0x36a0000000000000); 0x1p-149
-  store volatile i64 %t2, i64* %p
-  %t3 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0xb6a0000000000000); -0x1p-149
-  store volatile i64 %t3, i64* %p
-  %t4 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 1.0)
-  store volatile i64 %t4, i64* %p
-  %t5 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0x3ff19999a0000000); 0x1.19999ap+0
-  store volatile i64 %t5, i64* %p
-  %t6 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 1.5)
-  store volatile i64 %t6, i64* %p
-  %t7 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 4294967296.0)
-  store volatile i64 %t7, i64* %p
-  %t8 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 18446742974197923840.0)
-  store volatile i64 %t8, i64* %p
-  %t9 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0xbfecccccc0000000); -0x1.ccccccp-1
-  store volatile i64 %t9, i64* %p
-  %t10 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0xbfefffffe0000000); -0x1.fffffep-1
-  store volatile i64 %t10, i64* %p
-  %t11 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 18446744073709551616.0)
-  store volatile i64 %t11, i64* %p
-  %t12 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float -1.0)
-  store volatile i64 %t12, i64* %p
-  %t13 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0x7ff0000000000000); inf
-  store volatile i64 %t13, i64* %p
-  %t14 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0xfff0000000000000); -inf
-  store volatile i64 %t14, i64* %p
-  %t15 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0x7ff8000000000000); nan
-  store volatile i64 %t15, i64* %p
-  %t16 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0x7ffa000000000000); nan:0x200000
-  store volatile i64 %t16, i64* %p
-  %t17 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0xfff8000000000000); -nan
-  store volatile i64 %t17, i64* %p
-  %t18 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float 0xfffa000000000000); -nan:0x200000
-  store volatile i64 %t18, i64* %p
-  ret void
-}
-
-define void @test_i64_trunc_sat_f64_s(i64* %p) {
-; CHECK-LABEL: @test_i64_trunc_sat_f64_s(
-; CHECK-NEXT:    store volatile i64 0, i64* [[P:%.*]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -2, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 4294967296, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -4294967296, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 9223372036854774784, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -9223372036854775808, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 9223372036854775807, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -9223372036854775808, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 9223372036854775807, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -9223372036854775808, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    ret void
-;
-  %t0 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double +0.0)
-  store volatile i64 %t0, i64* %p
-  %t1 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -0.0)
-  store volatile i64 %t1, i64* %p
-  %t2 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x0010000000000001); 0x0.0000000000001p-1022
-  store volatile i64 %t2, i64* %p
-  %t3 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x8010000000000001); -0x1.0000000000001p-1022
-  store volatile i64 %t3, i64* %p
-  %t4 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 1.0)
-  store volatile i64 %t4, i64* %p
-  %t5 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x3ff199999999999a); 0x1.199999999999ap+0
-  store volatile i64 %t5, i64* %p
-  %t6 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 1.5)
-  store volatile i64 %t6, i64* %p
-  %t7 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -1.0)
-  store volatile i64 %t7, i64* %p
-  %t8 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0xbff199999999999a); -0x1.199999999999ap+0
-  store volatile i64 %t8, i64* %p
-  %t9 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -1.5)
-  store volatile i64 %t9, i64* %p
-  %t10 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0xbffe666666666666); -1.9
-  store volatile i64 %t10, i64* %p
-  %t11 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -2.0)
-  store volatile i64 %t11, i64* %p
-  %t12 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 4294967296.0)
-  store volatile i64 %t12, i64* %p
-  %t13 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -4294967296.0)
-  store volatile i64 %t13, i64* %p
-  %t14 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 9223372036854774784.0)
-  store volatile i64 %t14, i64* %p
-  %t15 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -9223372036854775808.0)
-  store volatile i64 %t15, i64* %p
-  %t16 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 9223372036854775808.0)
-  store volatile i64 %t16, i64* %p
-  %t17 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double -9223372036854777856.0)
-  store volatile i64 %t17, i64* %p
-  %t18 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x7ff0000000000000); inf
-  store volatile i64 %t18, i64* %p
-  %t19 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0xfff0000000000000); -inf
-  store volatile i64 %t19, i64* %p
-  %t20 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x7ff8000000000000); nan
-  store volatile i64 %t20, i64* %p
-  %t21 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x7ff4000000000000); nan:0x4000000000000
-  store volatile i64 %t21, i64* %p
-  %t22 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0xfff8000000000000); -nan
-  store volatile i64 %t22, i64* %p
-  %t23 = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double 0x7ff4000000000000); -nan:0x4000000000000
-  store volatile i64 %t23, i64* %p
-  ret void
-}
-
-define void @test_i64_trunc_sat_f64_u(i64* %p) {
-; CHECK-LABEL: @test_i64_trunc_sat_f64_u(
-; CHECK-NEXT:    store volatile i64 0, i64* [[P:%.*]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 4294967295, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 4294967296, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -2048, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 100000000, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 10000000000000000, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -9223372036854775808, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 -1, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    store volatile i64 0, i64* [[P]], align 8
-; CHECK-NEXT:    ret void
-;
-  %t0 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double +0.0)
-  store volatile i64 %t0, i64* %p
-  %t1 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double -0.0)
-  store volatile i64 %t1, i64* %p
-  %t2 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0x0010000000000001); 0x0.0000000000001p-1022
-  store volatile i64 %t2, i64* %p
-  %t3 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0x8010000000000001); -0x0.0000000000001p-1022
-  store volatile i64 %t3, i64* %p
-  %t4 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 1.0)
-  store volatile i64 %t4, i64* %p
-  %t5 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0x3ff199999999999a); 0x1.199999999999ap+0
-  store volatile i64 %t5, i64* %p
-  %t6 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 1.5)
-  store volatile i64 %t6, i64* %p
-  %t7 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 4294967295.0)
-  store volatile i64 %t7, i64* %p
-  %t8 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 4294967296.0)
-  store volatile i64 %t8, i64* %p
-  %t9 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 18446744073709549568.0)
-  store volatile i64 %t9, i64* %p
-  %t10 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0xbfeccccccccccccd); -0x1.ccccccccccccdp-1
-  store volatile i64 %t10, i64* %p
-  %t11 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0xbfefffffffffffff); -0x1.fffffffffffffp-1
-  store volatile i64 %t11, i64* %p
-  %t12 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 100000000.0); 1e8
-  store volatile i64 %t12, i64* %p
-  %t13 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 10000000000000000.0); 1e16
-  store volatile i64 %t13, i64* %p
-  %t14 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 9223372036854775808.0);
-  store volatile i64 %t14, i64* %p
-  %t15 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 18446744073709551616.0)
-  store volatile i64 %t15, i64* %p
-  %t16 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double -1.0)
-  store volatile i64 %t16, i64* %p
-  %t17 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0x7ff0000000000000); inf
-  store volatile i64 %t17, i64* %p
-  %t18 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0xfff0000000000000); -inf
-  store volatile i64 %t18, i64* %p
-  %t19 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0x7ff8000000000000); nan
-  store volatile i64 %t19, i64* %p
-  %t20 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0x7ff4000000000000); nan:0x4000000000000
-  store volatile i64 %t20, i64* %p
-  %t21 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0xfff8000000000000); -nan
-  store volatile i64 %t21, i64* %p
-  %t22 = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double 0xfff4000000000000); -nan:0x4000000000000
-  store volatile i64 %t22, i64* %p
-  ret void
-}