Skip to content

Commit bb30ef7

Browse files
committed
AMDGPU: Add clamp bit to dot intrinsics
Differential Revision: https://reviews.llvm.org/D49874
llvm-svn: 338470
1 parent 7a70be6 commit bb30ef7

11 files changed

+209 -61 lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

+21-14
Original file line numberDiff line numberDiff line change
@@ -1191,7 +1191,7 @@ def int_amdgcn_ds_bpermute :
11911191
// Deep learning intrinsics.
11921192
//===----------------------------------------------------------------------===//
11931193

1194-
// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
1194+
// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
11951195
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
11961196
def int_amdgcn_fdot2 :
11971197
GCCBuiltin<"__builtin_amdgcn_fdot2">,
@@ -1200,12 +1200,13 @@ def int_amdgcn_fdot2 :
12001200
[
12011201
llvm_v2f16_ty, // %a
12021202
llvm_v2f16_ty, // %b
1203-
llvm_float_ty // %c
1203+
llvm_float_ty, // %c
1204+
llvm_i1_ty // %clamp
12041205
],
12051206
[IntrNoMem, IntrSpeculatable]
12061207
>;
12071208

1208-
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
1209+
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
12091210
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
12101211
def int_amdgcn_sdot2 :
12111212
GCCBuiltin<"__builtin_amdgcn_sdot2">,
@@ -1214,12 +1215,13 @@ def int_amdgcn_sdot2 :
12141215
[
12151216
llvm_v2i16_ty, // %a
12161217
llvm_v2i16_ty, // %b
1217-
llvm_i32_ty // %c
1218+
llvm_i32_ty, // %c
1219+
llvm_i1_ty // %clamp
12181220
],
12191221
[IntrNoMem, IntrSpeculatable]
12201222
>;
12211223

1222-
// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
1224+
// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
12231225
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
12241226
def int_amdgcn_udot2 :
12251227
GCCBuiltin<"__builtin_amdgcn_udot2">,
@@ -1228,12 +1230,13 @@ def int_amdgcn_udot2 :
12281230
[
12291231
llvm_v2i16_ty, // %a
12301232
llvm_v2i16_ty, // %b
1231-
llvm_i32_ty // %c
1233+
llvm_i32_ty, // %c
1234+
llvm_i1_ty // %clamp
12321235
],
12331236
[IntrNoMem, IntrSpeculatable]
12341237
>;
12351238

1236-
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
1239+
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
12371240
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
12381241
def int_amdgcn_sdot4 :
12391242
GCCBuiltin<"__builtin_amdgcn_sdot4">,
@@ -1242,12 +1245,13 @@ def int_amdgcn_sdot4 :
12421245
[
12431246
llvm_i32_ty, // %a
12441247
llvm_i32_ty, // %b
1245-
llvm_i32_ty // %c
1248+
llvm_i32_ty, // %c
1249+
llvm_i1_ty // %clamp
12461250
],
12471251
[IntrNoMem, IntrSpeculatable]
12481252
>;
12491253

1250-
// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
1254+
// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
12511255
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
12521256
def int_amdgcn_udot4 :
12531257
GCCBuiltin<"__builtin_amdgcn_udot4">,
@@ -1256,12 +1260,13 @@ def int_amdgcn_udot4 :
12561260
[
12571261
llvm_i32_ty, // %a
12581262
llvm_i32_ty, // %b
1259-
llvm_i32_ty // %c
1263+
llvm_i32_ty, // %c
1264+
llvm_i1_ty // %clamp
12601265
],
12611266
[IntrNoMem, IntrSpeculatable]
12621267
>;
12631268

1264-
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
1269+
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
12651270
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
12661271
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
12671272
def int_amdgcn_sdot8 :
@@ -1271,12 +1276,13 @@ def int_amdgcn_sdot8 :
12711276
[
12721277
llvm_i32_ty, // %a
12731278
llvm_i32_ty, // %b
1274-
llvm_i32_ty // %c
1279+
llvm_i32_ty, // %c
1280+
llvm_i1_ty // %clamp
12751281
],
12761282
[IntrNoMem, IntrSpeculatable]
12771283
>;
12781284

1279-
// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
1285+
// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
12801286
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
12811287
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
12821288
def int_amdgcn_udot8 :
@@ -1286,7 +1292,8 @@ def int_amdgcn_udot8 :
12861292
[
12871293
llvm_i32_ty, // %a
12881294
llvm_i32_ty, // %b
1289-
llvm_i32_ty // %c
1295+
llvm_i32_ty, // %c
1296+
llvm_i1_ty // %clamp
12901297
],
12911298
[IntrNoMem, IntrSpeculatable]
12921299
>;

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

+3-2
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
342342
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
343343

344344
def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
345-
SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
346-
SDTCisFP<0>, SDTCisVec<1>]>,
345+
SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
346+
SDTCisFP<0>, SDTCisVec<1>,
347+
SDTCisInt<4>]>,
347348
[]>;
348349

349350
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+6-3
Original file line numberDiff line numberDiff line change
@@ -5010,7 +5010,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
50105010
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
50115011
case Intrinsic::amdgcn_fdot2:
50125012
return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5013-
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5013+
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5014+
Op.getOperand(4));
50145015
case Intrinsic::amdgcn_fmul_legacy:
50155016
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
50165017
Op.getOperand(1), Op.getOperand(2));
@@ -7613,8 +7614,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
76137614
return SDValue();
76147615

76157616
if ((Vec1 == Vec3 && Vec2 == Vec4) ||
7616-
(Vec1 == Vec4 && Vec2 == Vec3))
7617-
return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
7617+
(Vec1 == Vec4 && Vec2 == Vec3)) {
7618+
return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
7619+
DAG.getTargetConstant(0, SL, MVT::i1));
7620+
}
76187621
}
76197622
return SDValue();
76207623
}

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

+24-7
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,30 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
167167

168168
let SubtargetPredicate = HasDLInsts in {
169169

170-
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
171-
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
172-
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
173-
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
174-
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
175-
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
176-
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
170+
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
171+
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
172+
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
173+
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
174+
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
175+
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
176+
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
177+
178+
multiclass DotPats<SDPatternOperator dot_op,
179+
VOP3PInst dot_inst> {
180+
def : GCNPat <
181+
(dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
182+
(dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
183+
(dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
184+
(dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
185+
}
186+
187+
defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
188+
defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
189+
defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
190+
defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
191+
defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
192+
defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
193+
defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
177194

178195
} // End SubtargetPredicate = HasDLInsts
179196

+21-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906
22

3-
declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c)
3+
declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
44

5-
; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2
6-
; GFX906: v_dot2_f32_f16
7-
define amdgpu_kernel void @test_llvm_amdgcn_fdot2(
5+
; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
6+
; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
7+
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
88
float addrspace(1)* %r,
99
<2 x half> addrspace(1)* %a,
1010
<2 x half> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
1313
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
1414
%b.val = load <2 x half>, <2 x half> addrspace(1)* %b
1515
%c.val = load float, float addrspace(1)* %c
16-
%r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val)
16+
%r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1)
17+
store float %r.val, float addrspace(1)* %r
18+
ret void
19+
}
20+
21+
; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
22+
; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
23+
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp(
24+
float addrspace(1)* %r,
25+
<2 x half> addrspace(1)* %a,
26+
<2 x half> addrspace(1)* %b,
27+
float addrspace(1)* %c) {
28+
entry:
29+
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
30+
%b.val = load <2 x half>, <2 x half> addrspace(1)* %b
31+
%c.val = load float, float addrspace(1)* %c
32+
%r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0)
1733
store float %r.val, float addrspace(1)* %r
1834
ret void
1935
}
+21-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
22

3-
declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
3+
declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
44

5-
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2
6-
; GFX906: v_dot2_i32_i16
7-
define amdgpu_kernel void @test_llvm_amdgcn_sdot2(
5+
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_clamp
6+
; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
7+
define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp(
88
i32 addrspace(1)* %r,
99
<2 x i16> addrspace(1)* %a,
1010
<2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
1313
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
1414
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
1515
%c.val = load i32, i32 addrspace(1)* %c
16-
%r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
16+
%r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
17+
store i32 %r.val, i32 addrspace(1)* %r
18+
ret void
19+
}
20+
21+
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_no_clamp
22+
; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
23+
define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp(
24+
i32 addrspace(1)* %r,
25+
<2 x i16> addrspace(1)* %a,
26+
<2 x i16> addrspace(1)* %b,
27+
i32 addrspace(1)* %c) {
28+
entry:
29+
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
30+
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
31+
%c.val = load i32, i32 addrspace(1)* %c
32+
%r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
1733
store i32 %r.val, i32 addrspace(1)* %r
1834
ret void
1935
}
+23-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
22

3-
declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c)
3+
declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
44

5-
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4
6-
; GFX906: v_dot4_i32_i8
7-
define amdgpu_kernel void @test_llvm_amdgcn_sdot4(
5+
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp
6+
; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
7+
define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
88
i32 addrspace(1)* %r,
99
<4 x i8> addrspace(1)* %a,
1010
<4 x i8> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
1515
%a.val.cast = bitcast <4 x i8> %a.val to i32
1616
%b.val.cast = bitcast <4 x i8> %b.val to i32
1717
%c.val = load i32, i32 addrspace(1)* %c
18-
%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
18+
%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
19+
store i32 %r.val, i32 addrspace(1)* %r
20+
ret void
21+
}
22+
23+
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp
24+
; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
25+
define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(
26+
i32 addrspace(1)* %r,
27+
<4 x i8> addrspace(1)* %a,
28+
<4 x i8> addrspace(1)* %b,
29+
i32 addrspace(1)* %c) {
30+
entry:
31+
%a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
32+
%b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
33+
%a.val.cast = bitcast <4 x i8> %a.val to i32
34+
%b.val.cast = bitcast <4 x i8> %b.val to i32
35+
%c.val = load i32, i32 addrspace(1)* %c
36+
%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
1937
store i32 %r.val, i32 addrspace(1)* %r
2038
ret void
2139
}
+23-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
22

3-
declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c)
3+
declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
44

5-
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8
6-
; GFX906: v_dot8_i32_i4
7-
define amdgpu_kernel void @test_llvm_amdgcn_sdot8(
5+
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp
6+
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
7+
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
88
i32 addrspace(1)* %r,
99
<8 x i4> addrspace(1)* %a,
1010
<8 x i4> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
1515
%a.val.cast = bitcast <8 x i4> %a.val to i32
1616
%b.val.cast = bitcast <8 x i4> %b.val to i32
1717
%c.val = load i32, i32 addrspace(1)* %c
18-
%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
18+
%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
19+
store i32 %r.val, i32 addrspace(1)* %r
20+
ret void
21+
}
22+
23+
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp
24+
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
25+
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
26+
i32 addrspace(1)* %r,
27+
<8 x i4> addrspace(1)* %a,
28+
<8 x i4> addrspace(1)* %b,
29+
i32 addrspace(1)* %c) {
30+
entry:
31+
%a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
32+
%b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
33+
%a.val.cast = bitcast <8 x i4> %a.val to i32
34+
%b.val.cast = bitcast <8 x i4> %b.val to i32
35+
%c.val = load i32, i32 addrspace(1)* %c
36+
%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
1937
store i32 %r.val, i32 addrspace(1)* %r
2038
ret void
2139
}
+21-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
22

3-
declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
3+
declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
44

5-
; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2
6-
; GFX906: v_dot2_u32_u16
7-
define amdgpu_kernel void @test_llvm_amdgcn_udot2(
5+
; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp
6+
; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
7+
define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp(
88
i32 addrspace(1)* %r,
99
<2 x i16> addrspace(1)* %a,
1010
<2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
1313
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
1414
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
1515
%c.val = load i32, i32 addrspace(1)* %c
16-
%r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
16+
%r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
17+
store i32 %r.val, i32 addrspace(1)* %r
18+
ret void
19+
}
20+
21+
; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp
22+
; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
23+
define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp(
24+
i32 addrspace(1)* %r,
25+
<2 x i16> addrspace(1)* %a,
26+
<2 x i16> addrspace(1)* %b,
27+
i32 addrspace(1)* %c) {
28+
entry:
29+
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
30+
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
31+
%c.val = load i32, i32 addrspace(1)* %c
32+
%r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
1733
store i32 %r.val, i32 addrspace(1)* %r
1834
ret void
1935
}

0 commit comments

Comments
 (0)