Skip to content

Commit 4e958ab

Browse files
authored
[AMDGPU][PromoteAlloca] Support memsets to ptr allocas (#80678)
Fixes #80366
1 parent ff9af4c commit 4e958ab

File tree

2 files changed

+66
-4
lines changed

2 files changed

+66
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -521,10 +521,18 @@ static Value *promoteAllocaUserToVector(
521521
// For memset, we don't need to know the previous value because we
522522
// currently only allow memsets that cover the whole alloca.
523523
Value *Elt = MSI->getOperand(1);
524-
if (DL.getTypeStoreSize(VecEltTy) > 1) {
525-
Value *EltBytes =
526-
Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
527-
Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
524+
const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
525+
if (BytesPerElt > 1) {
526+
Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
527+
528+
// If the element type of the vector is a pointer, we need to first cast
529+
// to an integer, then use an IntToPtr cast.
530+
if (VecEltTy->isPointerTy()) {
531+
Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
532+
Elt = Builder.CreateBitCast(EltBytes, PtrInt);
533+
Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
534+
} else
535+
Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
528536
}
529537

530538
return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);

llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,58 @@ entry:
8484
ret void
8585
}
8686

87+
define amdgpu_kernel void @memset_array_ptr_alloca(ptr %out) {
88+
; CHECK-LABEL: @memset_array_ptr_alloca(
89+
; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8
90+
; CHECK-NEXT: ret void
91+
;
92+
%alloca = alloca [6 x ptr], align 16, addrspace(5)
93+
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
94+
%load = load i64, ptr addrspace(5) %alloca
95+
store i64 %load, ptr %out
96+
ret void
97+
}
98+
99+
define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) {
100+
; CHECK-LABEL: @memset_vector_ptr_alloca(
101+
; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8
102+
; CHECK-NEXT: ret void
103+
;
104+
%alloca = alloca <6 x ptr>, align 16, addrspace(5)
105+
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
106+
%load = load i64, ptr addrspace(5) %alloca
107+
store i64 %load, ptr %out
108+
ret void
109+
}
110+
111+
define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) {
112+
; CHECK-LABEL: @memset_array_of_array_ptr_alloca(
113+
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5)
114+
; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
115+
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
116+
; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
117+
; CHECK-NEXT: ret void
118+
;
119+
%alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
120+
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
121+
%load = load i64, ptr addrspace(5) %alloca
122+
store i64 %load, ptr %out
123+
ret void
124+
}
125+
126+
define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) {
127+
; CHECK-LABEL: @memset_array_of_vec_ptr_alloca(
128+
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5)
129+
; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
130+
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
131+
; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
132+
; CHECK-NEXT: ret void
133+
;
134+
%alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5)
135+
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
136+
%load = load i64, ptr addrspace(5) %alloca
137+
store i64 %load, ptr %out
138+
ret void
139+
}
140+
87141
declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)

0 commit comments

Comments
 (0)