@@ -53,15 +53,15 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not.
53
53
ret float %add
54
54
}
55
55
56
- define void @my_kernel(float* %ptr) {
57
- %val = load float, float* %ptr
56
+ define void @my_kernel(ptr %ptr) {
57
+ %val = load float, ptr %ptr
58
58
%ret = call float @my_fmad(float %val, float %val, float %val)
59
- store float %ret, float* %ptr
59
+ store float %ret, ptr %ptr
60
60
ret void
61
61
}
62
62
63
63
!nvvm.annotations = !{!1}
64
- !1 = !{void (float*)* @my_kernel, !"kernel", i32 1}
64
+ !1 = !{ptr @my_kernel, !"kernel", i32 1}
65
65
66
66
When compiled, the PTX kernel functions are callable by host-side code.
67
67
@@ -140,10 +140,10 @@ These are overloaded intrinsics. You can use these on any pointer types.
140
140
141
141
.. code-block:: llvm
142
142
143
- declare i8* @llvm.nvvm.ptr.global.to.gen.p0i8.p1i8(i8 addrspace(1)* )
144
- declare i8* @llvm.nvvm.ptr.shared.to.gen.p0i8.p3i8(i8 addrspace(3)* )
145
- declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* )
146
- declare i8* @llvm.nvvm.ptr.local.to.gen.p0i8.p5i8(i8 addrspace(5)* )
143
+ declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1))
144
+ declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3))
145
+ declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4))
146
+ declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5))
147
147
148
148
Overview:
149
149
"""""""""
@@ -168,10 +168,10 @@ These are overloaded intrinsics. You can use these on any pointer types.
168
168
169
169
.. code-block:: llvm
170
170
171
- declare i8 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i8.p0i8(i8* )
172
- declare i8 addrspace(3)* @llvm.nvvm.ptr.gen.to.shared.p3i8.p0i8(i8* )
173
- declare i8 addrspace(4)* @llvm.nvvm.ptr.gen.to.constant.p4i8.p0i8(i8* )
174
- declare i8 addrspace(5)* @llvm.nvvm.ptr.gen.to.local.p5i8.p0i8(i8* )
171
+ declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr )
172
+ declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr )
173
+ declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr )
174
+ declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr )
175
175
176
176
Overview:
177
177
"""""""""
@@ -436,35 +436,33 @@ The Kernel
436
436
; Intrinsic to read X component of thread ID
437
437
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
438
438
439
- define void @kernel(float addrspace(1)* %A,
440
- float addrspace(1)* %B,
441
- float addrspace(1)* %C) {
439
+ define void @kernel(ptr addrspace(1) %A,
440
+ ptr addrspace(1) %B,
441
+ ptr addrspace(1) %C) {
442
442
entry:
443
443
; What is my ID?
444
444
%id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
445
445
446
446
; Compute pointers into A, B, and C
447
- %ptrA = getelementptr float, float addrspace(1)* %A, i32 %id
448
- %ptrB = getelementptr float, float addrspace(1)* %B, i32 %id
449
- %ptrC = getelementptr float, float addrspace(1)* %C, i32 %id
447
+ %ptrA = getelementptr float, ptr addrspace(1) %A, i32 %id
448
+ %ptrB = getelementptr float, ptr addrspace(1) %B, i32 %id
449
+ %ptrC = getelementptr float, ptr addrspace(1) %C, i32 %id
450
450
451
451
; Read A, B
452
- %valA = load float, float addrspace(1)* %ptrA, align 4
453
- %valB = load float, float addrspace(1)* %ptrB, align 4
452
+ %valA = load float, ptr addrspace(1) %ptrA, align 4
453
+ %valB = load float, ptr addrspace(1) %ptrB, align 4
454
454
455
455
; Compute C = A + B
456
456
%valC = fadd float %valA, %valB
457
457
458
458
; Store back to C
459
- store float %valC, float addrspace(1)* %ptrC, align 4
459
+ store float %valC, ptr addrspace(1) %ptrC, align 4
460
460
461
461
ret void
462
462
}
463
463
464
464
!nvvm.annotations = !{!0}
465
- !0 = !{void (float addrspace(1)*,
466
- float addrspace(1)*,
467
- float addrspace(1)*)* @kernel, !"kernel", i32 1}
465
+ !0 = !{ptr @kernel, !"kernel", i32 1}
468
466
469
467
470
468
We can use the LLVM ``llc`` tool to directly run the NVPTX code generator:
@@ -613,9 +611,7 @@ For the previous example, we have:
613
611
.. code-block:: llvm
614
612
615
613
!nvvm.annotations = !{!0}
616
- !0 = !{void (float addrspace(1)*,
617
- float addrspace(1)*,
618
- float addrspace(1)*)* @kernel, !"kernel", i32 1}
614
+ !0 = !{ptr @kernel, !"kernel", i32 1}
619
615
620
616
Here, we have a single metadata declaration in ``nvvm.annotations``. This
621
617
metadata annotates our ``@kernel`` function with the ``kernel`` attribute.
@@ -820,35 +816,33 @@ Libdevice provides an ``__nv_powf`` function that we will use.
820
816
; libdevice function
821
817
declare float @__nv_powf(float, float)
822
818
823
- define void @kernel(float addrspace(1)* %A,
824
- float addrspace(1)* %B,
825
- float addrspace(1)* %C) {
819
+ define void @kernel(ptr addrspace(1) %A,
820
+ ptr addrspace(1) %B,
821
+ ptr addrspace(1) %C) {
826
822
entry:
827
823
; What is my ID?
828
824
%id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
829
825
830
826
; Compute pointers into A, B, and C
831
- %ptrA = getelementptr float, float addrspace(1)* %A, i32 %id
832
- %ptrB = getelementptr float, float addrspace(1)* %B, i32 %id
833
- %ptrC = getelementptr float, float addrspace(1)* %C, i32 %id
827
+ %ptrA = getelementptr float, ptr addrspace(1) %A, i32 %id
828
+ %ptrB = getelementptr float, ptr addrspace(1) %B, i32 %id
829
+ %ptrC = getelementptr float, ptr addrspace(1) %C, i32 %id
834
830
835
831
; Read A, B
836
- %valA = load float, float addrspace(1)* %ptrA, align 4
837
- %valB = load float, float addrspace(1)* %ptrB, align 4
832
+ %valA = load float, ptr addrspace(1) %ptrA, align 4
833
+ %valB = load float, ptr addrspace(1) %ptrB, align 4
838
834
839
835
; Compute C = pow(A, B)
840
836
%valC = call float @__nv_powf(float %valA, float %valB)
841
837
842
838
; Store back to C
843
- store float %valC, float addrspace(1)* %ptrC, align 4
839
+ store float %valC, ptr addrspace(1) %ptrC, align 4
844
840
845
841
ret void
846
842
}
847
843
848
844
!nvvm.annotations = !{!0}
849
- !0 = !{void (float addrspace(1)*,
850
- float addrspace(1)*,
851
- float addrspace(1)*)* @kernel, !"kernel", i32 1}
845
+ !0 = !{ptr @kernel, !"kernel", i32 1}
852
846
853
847
854
848
To compile this kernel, we perform the following steps:
0 commit comments