Use UMad for dot products on uint vectors (cp to 1.8.2502) (microsoft#7092)

pow2clk · web-flow · commit e72bc64f2c4f · 2025-01-27T18:13:05.000-07:00
Cherry-pick of microsoft#7059 to release-1.8.2502. Dot products on uint vectors did not use the unsigned mad operations when they were formed with the dot intrinsic, though they did when they resulted from the mul intrinsic. Such operations aren't very common, and cases where the unsigned variant makes a difference are less common still, but this was inconsistent both internally and with the clang implementation. This corrects the problem by using a dummy op for udot, similar to how other intrinsics pass the unsigned information to operation lowering. Incidentally adds unsigned notations to all the mul intrinsic entries. This is a non-functional change, as any one overload that specifies an unsigned variant will apply for all of them, but it is less confusing than having half of them do it and the other half not. The test verifies the output of a few dot operations with sint, float, and uint types for both the dot and mul intrinsics. Fixes microsoft#7058. (cherry picked from commit 1fdebc4)
diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h
@@ -367,6 +367,7 @@ enum class IntrinsicOp {
   IOP_WavePrefixUSum,
   IOP_uabs,
   IOP_uclamp,
+  IOP_udot,
   IOP_ufirstbithigh,
   IOP_umad,
   IOP_umax,
@@ -391,6 +392,7 @@ inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) {
   case IntrinsicOp::IOP_WavePrefixSum:
   case IntrinsicOp::IOP_abs:
   case IntrinsicOp::IOP_clamp:
+  case IntrinsicOp::IOP_dot:
   case IntrinsicOp::IOP_firstbithigh:
   case IntrinsicOp::IOP_mad:
   case IntrinsicOp::IOP_max:
@@ -432,6 +434,8 @@ inline unsigned GetUnsignedIntrinsicOpcode(IntrinsicOp opcode) {
     return static_cast<unsigned>(IntrinsicOp::IOP_uabs);
   case IntrinsicOp::IOP_clamp:
     return static_cast<unsigned>(IntrinsicOp::IOP_uclamp);
+  case IntrinsicOp::IOP_dot:
+    return static_cast<unsigned>(IntrinsicOp::IOP_udot);
   case IntrinsicOp::IOP_firstbithigh:
     return static_cast<unsigned>(IntrinsicOp::IOP_ufirstbithigh);
   case IntrinsicOp::IOP_mad:
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
@@ -2480,7 +2480,8 @@ Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   if (Ty->getScalarType()->isFloatingPointTy()) {
     return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder);
   } else {
-    return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder);
+    return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder,
+                         IOP == IntrinsicOp::IOP_udot);
   }
 }
 
@@ -6789,6 +6790,7 @@ IntrinsicLower gLowerTable[] = {
      DXIL::OpCode::WavePrefixOp},
     {IntrinsicOp::IOP_uabs, TranslateUAbs, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_uclamp, TranslateClamp, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_udot, TranslateDot, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_ufirstbithigh, TranslateFirstbitHi,
      DXIL::OpCode::FirstbitHi},
     {IntrinsicOp::IOP_umad, TranslateFUITrinary, DXIL::OpCode::UMad},
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -8957,6 +8957,7 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
     return nullptr;
   }
   case hlsl::IntrinsicOp::IOP_dot:
+  case hlsl::IntrinsicOp::IOP_udot:
     retVal = processIntrinsicDot(callExpr);
     break;
   case hlsl::IntrinsicOp::IOP_GroupMemoryBarrier:
diff --git a/tools/clang/test/CodeGenHLSL/dot.hlsl b/tools/clang/test/CodeGenHLSL/dot.hlsl
@@ -0,0 +1,161 @@
+// RUN: %dxc -T vs_6_0 -DFUNC=dot %s | FileCheck %s
+// RUN: %dxc -T vs_6_0 -DFUNC=mul %s | FileCheck %s
+// RUN: %dxc -T vs_6_0 -DFUNC=dot -fcgl %s | FileCheck %s --check-prefix=CGLDOT
+// RUN: %dxc -T vs_6_0 -DFUNC=mul -fcgl %s | FileCheck %s --check-prefix=CGLMUL
+
+// Verifies correct implementation of dot and mul with vectors for various sizes and types.
+
+// Partially pilfered from SPIRV's intrinsic.dot.hlsl
+
+float4 main(int1 i1[2] : IO, int2 i2[2] : IT, int3 i3[2] : IH, int4 i4[2] : IF,
+            float1 f1[2] : FO, float2 f2[2] : FT, float3 f3[2] : FH, float4 f4[2] : FF,
+            uint1 u1[2] : UO, uint2 u2[2] : UT, uint3 u3[2] : UH, uint4 u4[2] : UF) : SV_Position {
+  int i = 0; // accumulates the results computed from the int inputs
+  // CHECK-DAG: [[I0:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 1, i8 0, i32 undef)
+  // CHECK: mul i32 [[I0]], [[I1]]
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 134, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 167, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  i += FUNC(i1[0], i1[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 1, i8 1, i32 undef)
+
+  // CHECK: [[MUL:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 [[I01]], i32 [[I11]], i32 [[MUL]])  ; IMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 134, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 167, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  i += FUNC(i2[0], i2[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 1, i8 2, i32 undef)
+
+  // PING and PONG alternate as capture names to track the result as it accumulates,
+  // since a single check line can't both match and redefine the same variable.
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; IMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; IMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 134, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 167, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  i += FUNC(i3[0], i3[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I03:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 3, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 2, i32 undef)
+  // CHECK-DAG: [[I13:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 3, i32 undef)
+
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; IMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; IMad(a,b,c)
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I03]], i32 [[I13]], i32 [[PING]])  ; IMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 134, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 167, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  i += FUNC(i4[0], i4[1]);
+
+  float f = 0.0; // accumulates the results computed from the float inputs
+
+  // CHECK-DAG: [[F0:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 4, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F1:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 4, i32 1, i8 0, i32 undef)
+  // CHECK: mul fast float [[F0]], [[F1]]
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <1 x float>, <1 x float>)"(i32 134, <1 x float> %{{.*}}, <1 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <1 x float>, <1 x float>)"(i32 167, <1 x float> %{{.*}}, <1 x float> %{{.*}})
+  f += FUNC(f1[0], f1[1]);
+
+  // CHECK-DAG: [[F00:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F01:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[F10:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[F11:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 1, i8 1, i32 undef)
+
+  // CHECK: call float @dx.op.dot2.f32(i32 54, float [[F00]], float [[F01]], float [[F10]], float [[F11]])
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <2 x float>, <2 x float>)"(i32 134, <2 x float> %{{.*}}, <2 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <2 x float>, <2 x float>)"(i32 167, <2 x float> %{{.*}}, <2 x float> %{{.*}})
+  f += FUNC(f2[0], f2[1]);
+
+  // CHECK-DAG: [[F00:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F01:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[F02:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[F10:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[F11:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[F12:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 1, i8 2, i32 undef)
+
+  // CHECK: call float @dx.op.dot3.f32(i32 55, float [[F00]], float [[F01]], float [[F02]], float [[F10]], float [[F11]], float [[F12]])
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <3 x float>, <3 x float>)"(i32 134, <3 x float> %{{.*}}, <3 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <3 x float>, <3 x float>)"(i32 167, <3 x float> %{{.*}}, <3 x float> %{{.*}})
+  f += FUNC(f3[0], f3[1]);
+
+  // CHECK-DAG: [[F00:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F01:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[F02:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[F03:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 3, i32 undef)
+  // CHECK-DAG: [[F10:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[F11:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[F12:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 2, i32 undef)
+  // CHECK-DAG: [[F13:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 3, i32 undef)
+
+  // CHECK: call float @dx.op.dot4.f32(i32 56, float [[F00]], float [[F01]], float [[F02]], float [[F03]], float [[F10]], float [[F11]], float [[F12]], float [[F13]])
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <4 x float>, <4 x float>)"(i32 134, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <4 x float>, <4 x float>)"(i32 167, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  f += FUNC(f4[0], f4[1]);
+
+  uint u = 0; // unsigned accumulator, matching the uint inputs it sums results from
+  // CHECK-DAG: [[I0:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 8, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 8, i32 1, i8 0, i32 undef)
+  // CHECK: mul i32 [[I0]], [[I1]]
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 349, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 354, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  u += FUNC(u1[0], u1[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 1, i8 1, i32 undef)
+
+  // CHECK: [[MUL:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 [[I01]], i32 [[I11]], i32 [[MUL]])  ; UMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 349, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 354, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  u += FUNC(u2[0], u2[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 1, i8 2, i32 undef)
+
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; UMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; UMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 349, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 354, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  u += FUNC(u3[0], u3[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I03:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 3, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 2, i32 undef)
+  // CHECK-DAG: [[I13:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 3, i32 undef)
+
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; UMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; UMad(a,b,c)
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I03]], i32 [[I13]], i32 [[PING]])  ; UMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 349, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 354, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  u += FUNC(u4[0], u4[1]);
+
+  return float4(i, f, u, 0); // every accumulator feeds the returned position
+}
diff --git a/tools/clang/test/HLSLFileCheck/samples/d3d11/FluidCS11_BuildGridCS.hlsl b/tools/clang/test/HLSLFileCheck/samples/d3d11/FluidCS11_BuildGridCS.hlsl
@@ -4,7 +4,7 @@
 // CHECK: bufferLoad
 // CHECK: FMax
 // CHECK: FMin
-// CHECK: IMad
+// CHECK: UMad
 // CHECK: bufferStore
 
 //--------------------------------------------------------------------------------------
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
@@ -124,7 +124,7 @@ $match<0, 1> float_like [[rn]] determinant(in float_like<r, r> x);
 void [[]] DeviceMemoryBarrier() : syncdevicememory_ug;
 void [[]] DeviceMemoryBarrierWithGroupSync() : syncgroupanddevicememory_ug;
 $match<0, 1> float_like [[rn]] distance(in float_like<c> a, in $type1 b);
-$match<0, 1> numeric [[rn]] dot(in numeric<c> a, in $type1 b);
+$match<0, 1> numeric [[rn,unsigned_op=udot]] dot(in numeric<c> a, in $type1 b);
 $type1 [[rn]] dst(in numeric<4> a, in $type1 b);
 // void errorf(in string Format, ...);
 $type1 [[rn]] EvaluateAttributeAtSample(in numeric<> value, in uint index);
@@ -198,13 +198,13 @@ $type1 [[rn,unsigned_op=umax]] max(in numeric<> a, in $type1 b);
 $type1 [[rn,unsigned_op=umin]] min(in numeric<> a, in $type1 b);
 $type1 [[]] modf(in float_like<> x, out $type1 ip);
 uint<4> [[rn]] msad4(in uint reference, in uint<2> source, in uint<4> accum);
-numeric [[rn]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric b) : mul_ss;
-numeric<c2> [[rn]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<c2> b) : mul_sv;
-numeric<r2, c2> [[rn]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<r2, c2> b) : mul_sm;
-numeric<c> [[rn]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric b) : mul_vs;
-numeric [[rn]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric<c> b) : mul_vv;
+numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric b) : mul_ss;
+numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<c2> b) : mul_sv;
+numeric<r2, c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<r2, c2> b) : mul_sm;
+numeric<c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric b) : mul_vs;
+numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric<c> b) : mul_vv;
 numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in col_major $match<2, 0> numeric<c, c2> b) : mul_vm;
-numeric<r, c> [[rn]] mul(in $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric b) : mul_ms;
+numeric<r, c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric b) : mul_ms;
 numeric<r> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric<c> b) : mul_mv;
 numeric<r, c2> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c> a, in col_major $match<2, 0> numeric<c, c2> b) : mul_mm;
 $type1 [[rn]] normalize(in float_like<c> x);

Original file line number	Diff line number	Diff line change
`@@ -8957,6 +8957,7 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {`
`8957`	`8957`	`return nullptr;`
`8958`	`8958`	`}`
`8959`	`8959`	`case hlsl::IntrinsicOp::IOP_dot:`
	`8960`	`+ case hlsl::IntrinsicOp::IOP_udot:`
`8960`	`8961`	`retVal = processIntrinsicDot(callExpr);`
`8961`	`8962`	`break;`
`8962`	`8963`	`case hlsl::IntrinsicOp::IOP_GroupMemoryBarrier:`