AMDGPU: Add clamp bit to dot intrinsics

kzhuravl · kzhuravl · commit bb30ef7af4ba · 2018-08-01T01:31:30.000Z
Differential Revision: https://reviews.llvm.org/D49874 llvm-svn: 338470
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1191,7 +1191,7 @@ def int_amdgcn_ds_bpermute :
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
 
-// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
+// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
 def int_amdgcn_fdot2 :
   GCCBuiltin<"__builtin_amdgcn_fdot2">,
@@ -1200,12 +1200,13 @@ def int_amdgcn_fdot2 :
     [
       llvm_v2f16_ty, // %a
       llvm_v2f16_ty, // %b
-      llvm_float_ty  // %c
+      llvm_float_ty, // %c
+      llvm_i1_ty     // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
 
-// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
+// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
 def int_amdgcn_sdot2 :
   GCCBuiltin<"__builtin_amdgcn_sdot2">,
@@ -1214,12 +1215,13 @@ def int_amdgcn_sdot2 :
     [
       llvm_v2i16_ty, // %a
       llvm_v2i16_ty, // %b
-      llvm_i32_ty    // %c
+      llvm_i32_ty,   // %c
+      llvm_i1_ty     // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
 
-// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
+// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
 def int_amdgcn_udot2 :
   GCCBuiltin<"__builtin_amdgcn_udot2">,
@@ -1228,12 +1230,13 @@ def int_amdgcn_udot2 :
     [
       llvm_v2i16_ty, // %a
       llvm_v2i16_ty, // %b
-      llvm_i32_ty    // %c
+      llvm_i32_ty,   // %c
+      llvm_i1_ty     // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
 
-// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
+// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
 def int_amdgcn_sdot4 :
   GCCBuiltin<"__builtin_amdgcn_sdot4">,
@@ -1242,12 +1245,13 @@ def int_amdgcn_sdot4 :
     [
       llvm_i32_ty, // %a
       llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
 
-// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
+// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
 def int_amdgcn_udot4 :
   GCCBuiltin<"__builtin_amdgcn_udot4">,
@@ -1256,12 +1260,13 @@ def int_amdgcn_udot4 :
     [
       llvm_i32_ty, // %a
       llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
 
-// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
+// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
 //        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
 def int_amdgcn_sdot8 :
@@ -1271,12 +1276,13 @@ def int_amdgcn_sdot8 :
     [
       llvm_i32_ty, // %a
       llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
 
-// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
+// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
 //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
 //        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
 def int_amdgcn_udot8 :
@@ -1286,7 +1292,8 @@ def int_amdgcn_udot8 :
     [
       llvm_i32_ty, // %a
       llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,8 +342,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
 def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
 
 def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
-                  SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
-                                       SDTCisFP<0>, SDTCisVec<1>]>,
+                  SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<0>, SDTCisVec<1>,
+                                       SDTCisInt<4>]>,
                   []>;
 
 def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5010,7 +5010,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_fdot2:
     return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                       Op.getOperand(4));
   case Intrinsic::amdgcn_fmul_legacy:
     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                        Op.getOperand(1), Op.getOperand(2));
@@ -7613,8 +7614,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
       return SDValue();
 
     if ((Vec1 == Vec3 && Vec2 == Vec4) ||
-        (Vec1 == Vec4 && Vec2 == Vec3))
-      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+        (Vec1 == Vec4 && Vec2 == Vec3)) {
+      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
+                         DAG.getTargetConstant(0, SL, MVT::i1));
+    }
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -167,13 +167,30 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
 
 let SubtargetPredicate = HasDLInsts in {
 
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
-def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
-def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
-def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
-def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+multiclass DotPats<SDPatternOperator dot_op,
+                   VOP3PInst dot_inst> {
+  def : GCNPat <
+    (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
+            (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
+            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
+    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
+}
+
+defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
+defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
+defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
+defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
+defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
+defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
+defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
 
 } // End SubtargetPredicate = HasDLInsts
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906
 
-declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c)
+declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
 
-; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2
-; GFX906: v_dot2_f32_f16
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2(
+; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
+; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
     float addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
   %c.val = load float, float addrspace(1)* %c
-  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val)
+  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1)
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
+; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp(
+    float addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    float addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load float, float addrspace(1)* %c
+  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0)
   store float %r.val, float addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2
-; GFX906: v_dot2_i32_i16
-define amdgpu_kernel void @test_llvm_amdgcn_sdot2(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_clamp
+; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp(
     i32 addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a,
     <2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
   %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
   %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_no_clamp
+; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp(
+    i32 addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a,
+    <2 x i16> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4
-; GFX906: v_dot4_i32_i8
-define amdgpu_kernel void @test_llvm_amdgcn_sdot4(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp
+; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
     i32 addrspace(1)* %r,
     <4 x i8> addrspace(1)* %a,
     <4 x i8> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
   %a.val.cast = bitcast <4 x i8> %a.val to i32
   %b.val.cast = bitcast <4 x i8> %b.val to i32
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp
+; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(
+    i32 addrspace(1)* %r,
+    <4 x i8> addrspace(1)* %a,
+    <4 x i8> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
+  %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+  %a.val.cast = bitcast <4 x i8> %a.val to i32
+  %b.val.cast = bitcast <4 x i8> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8
-; GFX906: v_dot8_i32_i4
-define amdgpu_kernel void @test_llvm_amdgcn_sdot8(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp
+; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
     i32 addrspace(1)* %r,
     <8 x i4> addrspace(1)* %a,
     <8 x i4> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
   %a.val.cast = bitcast <8 x i4> %a.val to i32
   %b.val.cast = bitcast <8 x i4> %b.val to i32
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp
+; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
+    i32 addrspace(1)* %r,
+    <8 x i4> addrspace(1)* %a,
+    <8 x i4> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
+  %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+  %a.val.cast = bitcast <8 x i4> %a.val to i32
+  %b.val.cast = bitcast <8 x i4> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
+declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2
-; GFX906: v_dot2_u32_u16
-define amdgpu_kernel void @test_llvm_amdgcn_udot2(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp
+; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp(
     i32 addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a,
     <2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
   %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
   %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp
+; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp(
+    i32 addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a,
+    <2 x i16> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll