Skip to content

[AMDGPU] Extend llvm.amdgcn.update.dpp intrinsic to support f64 #91190

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3338,7 +3338,7 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;

foreach vt = [i64, f64] in {
foreach vt = Reg64Types.types in {
def : GCNPat <
(vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {

def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll need to do something about other address spaces for globalisel, eventually


let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
Expand Down
166 changes: 165 additions & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,174 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
ret void
}

define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
; GFX8-LABEL: update_dppv2i32_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppv2i32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppv2i32_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x i32>, ptr addrspace(1) %gep
%tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #1
store <2 x i32> %tmp0, ptr addrspace(1) %gep
ret void
}

define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
; GFX8-LABEL: update_dppv2f32_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppv2f32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppv2f32_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep
%tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #1
store <2 x float> %tmp0, ptr addrspace(1) %gep
ret void
}

define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p0_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
%load = load ptr, ptr addrspace(1) %gep
%tmp0 = call ptr @llvm.amdgcn.update.dpp.v2f32(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #1
store ptr %tmp0, ptr addrspace(1) %gep
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
declare double @llvm.amdgcn.update.dpp.f64(double, double, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { convergent nounwind readnone }
40 changes: 39 additions & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,45 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
ret void
}

; GCN-LABEL: {{^}}update_dppv2i32_test:
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x i32>, ptr addrspace(1) %gep
%tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #0
store <2 x i32> %tmp0, ptr addrspace(1) %gep
ret void
}

; GCN-LABEL: {{^}}update_dppv2f32_test:
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep
%tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #0
store <2 x float> %tmp0, ptr addrspace(1) %gep
ret void
}

; GCN-LABEL: {{^}}update_dpp_p0_test:
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
%load = load ptr, ptr addrspace(1) %gep
%tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #0
store ptr %tmp0, ptr addrspace(1) %gep
ret void
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should have some 32-bit pointer tests too, don't think we have those

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added new tests

; GCN-LABEL: {{^}}update_dppi64_imm_old_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
Expand Down Expand Up @@ -510,6 +549,5 @@ declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32,
declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0
declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
declare double @llvm.amdgcn.update.dpp.f64(double, double, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }
Loading