Skip to content

Commit 35eb411

Browse files
committed
micro clinfo, vector increment with global larger than 1
1 parent 3b4a930 commit 35eb411

12 files changed

+168
-59
lines changed

opencl/README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,22 @@
44
1. Examples
55
1. [Increment](inc.c)
66
1. [Increment vector](inc_vector.c)
7+
1. [Increment vector globals](inc_vector_globals.c)
78
1. [Pass by value](pass_by_value.c)
89
1. [Work item built-ins](work_item_builtin.c)
910
1. [Vector type](vector_type.c)
11+
1. [clinfo](clinfo.c)
1012
1. Tools
1113
1. [clinfo](clinfo.md)
1214
1. [Benchmarks](benchmarks.md)
1315
1. Theory
1416
1. [Introduction](introduction.md)
17+
1. [Applications](applications.md)
1518
1. [Implementations](implementations.md)
1619
1. [Alternatives](alternatives.md)
1720
1. [CUDA](cuda.md)
1821
1. [Architecture](architecture.md)
1922
1. [C](c.md)
2023
1. [Host API](host-api.md)
21-
1. [Use cases](use-cases.md)
2224
1. [Bibliography](bibliography.md)
2325
1. [TODO](TODO.md)

opencl/applications.md

+13-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ For an application to experience speedup compared to the CPU, it must:
55
- be highly parallelizable
66
- do a lot of work per input byte, because IO is very expensive
77

8-
## Actual applications
8+
Minimal example request: <http://stackoverflow.com/questions/7663343/simplest-possible-example-to-show-gpu-outperform-cpu-using-cuda>
9+
10+
## Examples
911

1012
- Monte Carlo
1113

@@ -14,10 +16,11 @@ For an application to experience speedup compared to the CPU, it must:
1416
- <https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model>
1517
- Reverse Time Migration: RTM <http://www.slb.com/services/seismic/geophysical_processing_characterization/dp/technologies/depth/prestackdepth/rtm.aspx>
1618

17-
Matrix multiplication:
19+
### Matrix multiplication
1820

1921
- <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
2022
- <https://developer.nvidia.com/cublas>
23+
- <http://stackoverflow.com/questions/16748604/opencl-matrix-multiplication-should-be-faster>
2124

2225
Not surprising, since rendering is just a bunch of matrix multiplications, with fixed matrices and varying vectors.
2326

@@ -28,3 +31,11 @@ Bolt: C++ STL GPU powered implementation by AMD: <http://developer.amd.com/tools
2831
## Non-applications
2932

3033
Vector addition. Too little work per input byte (1 CPU cycle). <https://forums.khronos.org/showthread.php/7741-CPU-faster-in-vector-addition-than-GPU>, <http://stackoverflow.com/questions/15194798/vector-step-addition-slower-on-cuda> <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
34+
35+
## Projects using OpenCL
36+
37+
Notable users:
38+
39+
- OpenCV
40+
- Bullet physics
41+
- VP9 decoding 2013 by Ittiam: <http://malideveloper.arm.com/partner-showroom/ittiam-vp9-decoder-using-opencl/>

opencl/architecture.md

+2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ But memory localization on GPUs is important enough that OpenCL exposes this ext
5050

5151
Synchronization only works inside a single work group: http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups
5252

53+
TODO: can a single work group be run in parallel on the GPU?
54+
5355
### Local size
5456

5557
Size of the work group.

opencl/clinfo.c

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
Just a small subset of clinfo, for parameters that we actually need
to query at run time, like work group size.

Full list at:
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.html
*/

#include "common.h"

/* Query a scalar size_t device parameter and print it.
 * Wrapped in do { } while (0) so the macro behaves as one statement. */
#define PRINT_SIZE_T(id) \
    do { \
        clGetDeviceInfo(device, CL_ ## id, sizeof(size_t), &buf_size_t, NULL); \
        printf(" " #id " = %zu\n", buf_size_t); \
    } while (0)

/* Query a scalar cl_uint device parameter and print it.
 * Cast through uintmax_t so the format specifier is portable. */
#define PRINT_CL_UINT(id) \
    do { \
        clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &buf_cl_uint, NULL); \
        printf(" " #id " = %ju\n", (uintmax_t)buf_cl_uint); \
    } while (0)

int main(void) {
    cl_platform_id platform;
    cl_device_id device;
    size_t buf_size_t;
    cl_uint buf_cl_uint;
    cl_uint dims;
    cl_uint i;
    size_t *item_sizes;

    /* Setup. */
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);

    /* Print. */
    puts("clinfo");
    PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS);
    PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE);
    /* CL_DEVICE_MAX_WORK_ITEM_SIZES returns an ARRAY with one size_t per
     * dimension, so it cannot go through PRINT_SIZE_T: requesting only
     * sizeof(size_t) bytes makes clGetDeviceInfo fail with CL_INVALID_VALUE
     * and leaves the buffer untouched. Query the dimension count first,
     * then fetch the whole array. */
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
    item_sizes = malloc(dims * sizeof(*item_sizes));
    if (item_sizes != NULL) {
        clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, dims * sizeof(*item_sizes), item_sizes, NULL);
        for (i = 0; i < dims; ++i) {
            printf(" DEVICE_MAX_WORK_ITEM_SIZES[%ju] = %zu\n", (uintmax_t)i, item_sizes[i]);
        }
        free(item_sizes);
    }

    /* Cleanup. */
    clReleaseDevice(device);
    return EXIT_SUCCESS;
}

opencl/common.h

+8-1
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,19 @@
22
#define COMMON_H
33

44
#include <assert.h>
5+
#include <stdint.h>
56
#include <stdio.h>
67
#include <stdlib.h>
8+
#include <string.h>
79

8-
// http://stackoverflow.com/questions/28500496/opencl-function-found-deprecated-by-visual-studio
10+
/* http://stackoverflow.com/questions/28500496/opencl-function-found-deprecated-by-visual-studio */
911
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
1012
#include <CL/cl.h>
1113

14+
/* Encapsulates objects that we use the same way in most programs.
15+
* This excludes, notably, buffers. */
1216
typedef struct {
17+
cl_command_queue command_queue;
1318
cl_context context;
1419
cl_device_id device;
1520
cl_kernel kernel;
@@ -39,6 +44,7 @@ static void common_init(
3944
exit(EXIT_FAILURE);
4045
}
4146
common->kernel = clCreateKernel(common->program, "main", NULL);
47+
common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
4248
}
4349

4450
static char * common_read_file(const char *path) {
@@ -70,6 +76,7 @@ static void common_init_file(
7076
static void common_deinit(
7177
Common *common
7278
) {
79+
clReleaseCommandQueue(common->command_queue);
7380
clReleaseProgram(common->program);
7481
clReleaseKernel(common->kernel);
7582
clReleaseContext(common->context);

opencl/inc_vector.c

+7-10
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,28 @@ it's just a clEnqueueNDRangeKernel + get_global_id hello world.
1414
int main(void) {
1515
const char *source =
1616
"__kernel void main(__global int *out) {\n"
17-
" out[get_global_id(0)]++;\n"
17+
" out[get_global_id(0)]++;\n"
1818
"}\n";
19-
cl_command_queue command_queue;
2019
cl_int input[] = {1, 2};
2120
cl_mem buffer;
22-
const size_t global_work_size = sizeof(input) / sizeof(cl_int);
2321
Common common;
22+
const size_t global_work_size = sizeof(input) / sizeof(cl_int);
2423

2524
/* Run kernel. */
2625
common_init(&common, source);
27-
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), &input, NULL);
26+
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
2827
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
29-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
30-
clEnqueueNDRangeKernel(command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
31-
clFlush(command_queue);
32-
clFinish(command_queue);
33-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
28+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
29+
clFlush(common.command_queue);
30+
clFinish(common.command_queue);
31+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(input), input, 0, NULL, NULL);
3432

3533
/* Assertions. */
3634
assert(input[0] == 2);
3735
assert(input[1] == 3);
3836

3937
/* Cleanup. */
4038
clReleaseMemObject(buffer);
41-
clReleaseCommandQueue(command_queue);
4239
common_deinit(&common);
4340
return EXIT_SUCCESS;
4441
}

opencl/inc_vector_globals.c

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
API exercise, increment a vector with fewer global work items than integers,
which forces us to put a for loop in the kernel.

I don't think we can get the size of each global work group from the kernel,
so we just calculate it on the CPU and pass it as a parameter.

This is how the work will be split:

| work group 0 | work group 1 | work group 2 |
| in[0] in[1]  | in[2] in[3]  | in[4]        |
*/

#include "common.h"

/* Pretend the device can only run this many elements per work item,
 * to force the multi-element-per-item code path. */
#define FAKE_MAX_GROUP_NELEMS 2

int main(void) {
    const char *source =
        "__kernel void main(uint group_nlems, __global int *out) {\n"
        "    uint i_min = get_global_id(0) * group_nlems;\n"
        "    uint i_max = i_min + group_nlems;\n"
        "    for (uint i = i_min; i < i_max; ++i) {\n"
        "        out[i]++;\n"
        "    };\n"
        "}\n";
    /* Not a multiple of the work size on purpose, so we have to handle
     * the edge case. The options are:
     * - add an if to the kernel. But I don't want to do that as it slows
     *   every kernel down.
     * - pad with trash to a multiple, which is what we do here (io_align).
     */
    cl_int io[] = {1, 2, 3, 4, 5}, *io_align;
    cl_mem buffer;
    Common common;
    const cl_uint nelems = sizeof(io) / sizeof(io[0]);
    const cl_uint group_nelems = FAKE_MAX_GROUP_NELEMS;
    /* Ceiling division: enough work items to cover every element. */
    const size_t global_work_size = 1 + (nelems - 1) / FAKE_MAX_GROUP_NELEMS;
    const size_t nelems_align = global_work_size * group_nelems;
    const size_t io_align_sizeof = nelems_align * sizeof(*io_align);

    /* Run kernel. */
    io_align = malloc(io_align_sizeof);
    if (io_align == NULL) {
        fprintf(stderr, "error: malloc failed\n");
        exit(EXIT_FAILURE);
    }
    memcpy(io_align, io, sizeof(io));
    common_init(&common, source);
    clSetKernelArg(common.kernel, 0, sizeof(group_nelems), &group_nelems);
    buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, io_align_sizeof, io_align, NULL);
    clSetKernelArg(common.kernel, 1, sizeof(buffer), &buffer);
    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    clFlush(common.command_queue);
    clFinish(common.command_queue);
    clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, io_align_sizeof, io_align, 0, NULL, NULL);

    /* Assertions. */
    assert(io_align[0] == 2);
    assert(io_align[1] == 3);
    assert(io_align[2] == 4);
    assert(io_align[3] == 5);
    assert(io_align[4] == 6);

    /* Cleanup. */
    free(io_align);
    clReleaseMemObject(buffer);
    common_deinit(&common);
    return EXIT_SUCCESS;
}

opencl/pass_by_value.c

+11-12
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
/*
2-
Kernel takes an integer value instead of a pointer.
2+
Kernel takes an integer value `int` instead of a pointer.
33
44
cl_int is passed directly to clSetKernelArg instead of using
55
a buffer obtained from clCreateBuffer.
66
7-
Increment a vector. It is useless to do this on a GPU, not enough work / IO.
7+
Does not need to be __global because it is not a pointer.
8+
9+
In practice, this is often used to pass problem size parameters to the kernel.
810
*/
911

1012
#include "common.h"
@@ -14,28 +16,25 @@ int main(void) {
1416
"__kernel void main(int in, __global int *out) {\n"
1517
" out[0] = in + 1;\n"
1618
"}\n";
17-
cl_command_queue command_queue;
1819
cl_int input = 1;
1920
cl_mem buffer;
2021
Common common;
2122

2223
/* Run kernel. */
2324
common_init(&common, source);
24-
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL);
25-
clSetKernelArg(common.kernel, 0, sizeof(cl_int), &input);
26-
clSetKernelArg(common.kernel, 1, sizeof(cl_mem), &buffer);
27-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
28-
clEnqueueTask(command_queue, common.kernel, 0, NULL, NULL);
29-
clFlush(command_queue);
30-
clFinish(command_queue);
31-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL);
25+
clSetKernelArg(common.kernel, 0, sizeof(input), &input);
26+
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE, sizeof(input), NULL, NULL);
27+
clSetKernelArg(common.kernel, 1, sizeof(buffer), &buffer);
28+
clEnqueueTask(common.command_queue, common.kernel, 0, NULL, NULL);
29+
clFlush(common.command_queue);
30+
clFinish(common.command_queue);
31+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
3232

3333
/* Assertions. */
3434
assert(input == 2);
3535

3636
/* Cleanup. */
3737
clReleaseMemObject(buffer);
38-
clReleaseCommandQueue(command_queue);
3938
common_deinit(&common);
4039
return EXIT_SUCCESS;
4140
}

opencl/use-cases.md

-15
This file was deleted.

opencl/vector_type.c

+5-8
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,19 @@ int main(void) {
1515
"__kernel void main(__global int2 *out) {\n"
1616
" out[get_global_id(0)]++;\n"
1717
"}\n";
18-
cl_command_queue command_queue;
1918
cl_int input[] = {0, 1, 2, 3};
2019
cl_mem buffer;
2120
Common common;
2221
const size_t global_work_size = sizeof(input) / sizeof(cl_int2);
2322

2423
/* Run kernel. */
2524
common_init(&common, source);
26-
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), &input, NULL);
25+
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
2726
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
28-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
29-
clEnqueueNDRangeKernel(command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
30-
clFlush(command_queue);
31-
clFinish(command_queue);
32-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
27+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
28+
clFlush(common.command_queue);
29+
clFinish(common.command_queue);
30+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(input), input, 0, NULL, NULL);
3331

3432
/* Assertions. */
3533
assert(input[0] == 1);
@@ -39,7 +37,6 @@ int main(void) {
3937

4038
/* Cleanup. */
4139
clReleaseMemObject(buffer);
42-
clReleaseCommandQueue(command_queue);
4340
common_deinit(&common);
4441
return EXIT_SUCCESS;
4542
}

opencl/work_item_builtin.c

+6-9
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,24 @@ Play with some misc work item built-in functions.
88

99
#define NUM_FUNCTIONS (8)
1010
/* Play around with those parameters. */
11-
static size_t offset = 4;
11+
static size_t offset = 0;
1212
static size_t global = 1;
1313
static size_t local = 1;
1414

1515
int main(void) {
16-
cl_command_queue command_queue;
1716
cl_int ret;
1817
cl_mem buffer;
1918
cl_uint output[NUM_FUNCTIONS];
2019
Common common;
2120

2221
/* Run kernel. */
2322
common_init_file(&common, "work_item_builtin.cl");
24-
buffer = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, NUM_FUNCTIONS * sizeof(cl_uint), NULL, NULL);
23+
buffer = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, sizeof(output), NULL, NULL);
2524
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
26-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
27-
clEnqueueNDRangeKernel(command_queue, common.kernel, 1, &offset, &global, &local, 0, NULL, NULL);
28-
clFlush(command_queue);
29-
clFinish(command_queue);
30-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, NUM_FUNCTIONS * sizeof(cl_uint), &output, 0, NULL, NULL);
25+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, &offset, &global, &local, 0, NULL, NULL);
26+
clFlush(common.command_queue);
27+
clFinish(common.command_queue);
28+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(output), output, 0, NULL, NULL);
3129

3230
/* Check the values. */
3331
printf("work_dim = %d\n", output[0]);
@@ -41,7 +39,6 @@ int main(void) {
4139

4240
/* Cleanup. */
4341
clReleaseMemObject(buffer);
44-
clReleaseCommandQueue(command_queue);
4542
common_deinit(&common);
4643
return EXIT_SUCCESS;
4744
}

0 commit comments

Comments
 (0)