|
| 1 | +/* |
| 2 | +API exercise, increment a vector with fewer global work groups than integers, |
| 3 | +which forces us to put a for loop in the kernel. |
| 4 | +
|
| 5 | +I don't think we can get the size of each global work group from the kernel, |
| 6 | +so we just calculate it on the CPU and pass it as a parameter. |
| 7 | +
|
| 8 | +This is how the work will be split: |
| 9 | +
|
| 10 | + | work group 0 | work group 1 | work group 2 | |
| 11 | + | in[0] in[1] | in[2] in[3] | in[4] | |
| 12 | +*/ |
| 13 | + |
| 14 | +#include "common.h" |
| 15 | + |
| 16 | +#define FAKE_MAX_GROUP_NELEMS 2 |
| 17 | + |
| 18 | +int main(void) { /* Increments every element of io on the device, with each work item handling group_nelems consecutive elements. */ |
| 19 | + const char *source = |
| 20 | + "__kernel void main(uint group_nlems, __global int *out) {\n" |
| 21 | + " uint i_min = get_global_id(0) * group_nlems;\n" |
| 22 | + " uint i_max = i_min + group_nlems;\n" |
| 23 | + " for (uint i = i_min; i < i_max; ++i) {\n" |
| 24 | + " out[i]++;\n" |
| 25 | + " };\n" |
| 26 | + "}\n"; |
| 27 | + /* The number of elements is not a multiple of FAKE_MAX_GROUP_NELEMS on purpose, so we have to think about the edge case. |
| 28 | + * We can either: |
| 29 | + * - add an if to the kernel. But I don't want to do that as it slows every kernel down. |
| 30 | + * - pad with trash to a multiple of FAKE_MAX_GROUP_NELEMS, which is what is done below through io_align. |
| 31 | + * */ |
| 32 | + cl_int io[] = {1, 2, 3, 4, 5}, *io_align; |
| 33 | + cl_mem buffer; |
| 34 | + Common common; |
| 35 | + const cl_uint nelems = sizeof(io) / sizeof(io[0]); |
| 36 | + const cl_uint group_nelems = FAKE_MAX_GROUP_NELEMS; |
| 37 | + const size_t global_work_size = 1 + (nelems - 1) / FAKE_MAX_GROUP_NELEMS; /* nelems / FAKE_MAX_GROUP_NELEMS rounded up (nelems > 0): 3 work items here */ |
| 38 | + const size_t nelems_align = global_work_size * group_nelems; /* element count rounded up to a whole number of groups: 6 here */ |
| 39 | + const size_t io_align_sizeof = nelems_align * sizeof(*io_align); |
| 40 | + |
| 41 | + /* Run kernel: copy the input into the padded host buffer, create the device buffer from it, launch global_work_size work items and read the result back. Return values of malloc and of the OpenCL calls are not checked. */ |
| 42 | + io_align = malloc(io_align_sizeof); |
| 43 | + memcpy(io_align, io, sizeof(io)); /* only the nelems real elements are copied; the padding element keeps whatever malloc returned */ |
| 44 | + common_init(&common, source); /* NOTE(review): presumably builds the kernel from source and fills common.context / kernel / command_queue - confirm in common.h */ |
| 45 | + clSetKernelArg(common.kernel, 0, sizeof(group_nelems), &group_nelems); |
| 46 | + buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, io_align_sizeof, io_align, NULL); |
| 47 | + clSetKernelArg(common.kernel, 1, sizeof(buffer), &buffer); |
| 48 | + clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); |
| 49 | + clFlush(common.command_queue); |
| 50 | + clFinish(common.command_queue); |
| 51 | + clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, io_align_sizeof, io_align, 0, NULL, NULL); /* CL_TRUE requests a blocking read */ |
| 52 | + |
| 53 | + /* Assertions: each of the nelems real elements must have been incremented exactly once; the padding element is not checked. */ |
| 54 | + assert(io_align[0] == 2); |
| 55 | + assert(io_align[1] == 3); |
| 56 | + assert(io_align[2] == 4); |
| 57 | + assert(io_align[3] == 5); |
| 58 | + assert(io_align[4] == 6); |
| 59 | + |
| 60 | + /* Cleanup. */ |
| 61 | + free(io_align); |
| 62 | + clReleaseMemObject(buffer); |
| 63 | + common_deinit(&common); |
| 64 | + return EXIT_SUCCESS; |
| 65 | +} |
0 commit comments