Skip to content

Commit 35eb411

Browse files
committed
micro clinfo, vector increment with global larger than 1
1 parent 3b4a930 commit 35eb411

12 files changed

+168
-59
lines changed

opencl/README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,22 @@
44
1. Examples
55
1. [Increment](inc.c)
66
1. [Increment vector](inc_vector.c)
7+
1. [Increment vector globals](inc_vector_globals.c)
78
1. [Pass by value](pass_by_value.c)
89
1. [Work item built-ins](work_item_builtin.c)
910
1. [Vector type](vector_type.c)
11+
1. [clinfo](clinfo.c)
1012
1. Tools
1113
1. [clinfo](clinfo.md)
1214
1. [Benchmarks](benchmarks.md)
1315
1. Theory
1416
1. [Introduction](introduction.md)
17+
1. [Applications](applications.md)
1518
1. [Implementations](implementations.md)
1619
1. [Alternatives](alternatives.md)
1720
1. [CUDA](cuda.md)
1821
1. [Architecture](architecture.md)
1922
1. [C](c.md)
2023
1. [Host API](host-api.md)
21-
1. [Use cases](use-cases.md)
2224
1. [Bibliography](bibliography.md)
2325
1. [TODO](TODO.md)

opencl/applications.md

+13-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ For an application to experience speedup compared to the CPU, it must:
55
- be highly parallelizable
66
- do a lot of work per input byte, because IO is very expensive
77

8-
## Actual applications
8+
Minimal example request: <http://stackoverflow.com/questions/7663343/simplest-possible-example-to-show-gpu-outperform-cpu-using-cuda>
9+
10+
## Examples
911

1012
- Monte Carlo
1113

@@ -14,10 +16,11 @@ For an application to experience speedup compared to the CPU, it must:
1416
- <https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model>
1517
- Reverse Time Migration: RTM <http://www.slb.com/services/seismic/geophysical_processing_characterization/dp/technologies/depth/prestackdepth/rtm.aspx>
1618

17-
Matrix multiplication:
19+
### Matrix multiplication
1820

1921
- <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
2022
- <https://developer.nvidia.com/cublas>
23+
- <http://stackoverflow.com/questions/16748604/opencl-matrix-multiplication-should-be-faster>
2124

2225
Not surprising, since rendering is just a bunch of matrix multiplications, with fixed matrices and varying vectors.
2326

@@ -28,3 +31,11 @@ Bolt: C++ STL GPU powered implementation by AMD: <http://developer.amd.com/tools
2831
## Non-applications
2932

3033
Vector addition. Too little work per input byte (1 CPU cycle). <https://forums.khronos.org/showthread.php/7741-CPU-faster-in-vector-addition-than-GPU>, <http://stackoverflow.com/questions/15194798/vector-step-addition-slower-on-cuda> <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
34+
35+
## Projects using OpenCL
36+
37+
Notable users:
38+
39+
- OpenCV
40+
- Bullet physics
41+
- VP9 decoding 2013 by Ittiam: <http://malideveloper.arm.com/partner-showroom/ittiam-vp9-decoder-using-opencl/>

opencl/architecture.md

+2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ But memory localization on GPUs is important enough that OpenCL exposes this ext
5050

5151
Synchronization only works inside a single work group: http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups
5252

53+
TODO: can a single work group be run in parallel on the GPU?
54+
5355
### Local size
5456

5557
Size of the work group.

opencl/clinfo.c

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
Just a small subset of clinfo, for parameters that we actually need
to query at run time, like work group size.

Full list at:
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.html
*/

#include "common.h"

/* Query a scalar size_t device parameter and print it.
 * Wrapped in do { } while (0) so the macro behaves as one statement. */
#define PRINT_SIZE_T(id) \
    do { \
        clGetDeviceInfo(device, CL_ ## id, sizeof(size_t), &buf_size_t, NULL); \
        printf(" " #id " = %zu\n", buf_size_t); \
    } while (0)

/* Query a scalar cl_uint device parameter and print it.
 * Cast through uintmax_t so the format specifier is portable. */
#define PRINT_CL_UINT(id) \
    do { \
        clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &buf_cl_uint, NULL); \
        printf(" " #id " = %ju\n", (uintmax_t)buf_cl_uint); \
    } while (0)

int main(void) {
    cl_platform_id platform;
    cl_device_id device;
    size_t buf_size_t;
    cl_uint buf_cl_uint;
    cl_uint dims;
    cl_uint i;
    size_t *item_sizes;

    /* Setup. */
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);

    /* Print. */
    puts("clinfo");
    PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS);
    PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE);
    /* CL_DEVICE_MAX_WORK_ITEM_SIZES returns an ARRAY with one size_t per
     * dimension, so it cannot go through PRINT_SIZE_T: requesting only
     * sizeof(size_t) bytes makes clGetDeviceInfo fail with CL_INVALID_VALUE
     * and leaves the buffer untouched. Query the dimension count first,
     * then fetch the whole array. */
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
    item_sizes = malloc(dims * sizeof(*item_sizes));
    if (item_sizes != NULL) {
        clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, dims * sizeof(*item_sizes), item_sizes, NULL);
        for (i = 0; i < dims; ++i) {
            printf(" DEVICE_MAX_WORK_ITEM_SIZES[%ju] = %zu\n", (uintmax_t)i, item_sizes[i]);
        }
        free(item_sizes);
    }

    /* Cleanup. */
    clReleaseDevice(device);
    return EXIT_SUCCESS;
}

opencl/common.h

+8-1
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,19 @@
22
#define COMMON_H
33

44
#include <assert.h>
5+
#include <stdint.h>
56
#include <stdio.h>
67
#include <stdlib.h>
8+
#include <string.h>
79

8-
// http://stackoverflow.com/questions/28500496/opencl-function-found-deprecated-by-visual-studio
10+
/* http://stackoverflow.com/questions/28500496/opencl-function-found-deprecated-by-visual-studio */
911
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
1012
#include <CL/cl.h>
1113

14+
/* Encapsulates objects that we use the same way in most programs.
15+
* This excludes, notably, buffers. */
1216
typedef struct {
17+
cl_command_queue command_queue;
1318
cl_context context;
1419
cl_device_id device;
1520
cl_kernel kernel;
@@ -39,6 +44,7 @@ static void common_init(
3944
exit(EXIT_FAILURE);
4045
}
4146
common->kernel = clCreateKernel(common->program, "main", NULL);
47+
common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
4248
}
4349

4450
static char * common_read_file(const char *path) {
@@ -70,6 +76,7 @@ static void common_init_file(
7076
static void common_deinit(
7177
Common *common
7278
) {
79+
clReleaseCommandQueue(common->command_queue);
7380
clReleaseProgram(common->program);
7481
clReleaseKernel(common->kernel);
7582
clReleaseContext(common->context);

opencl/inc_vector.c

+7-10
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,28 @@ it's just a clEnqueueNDRangeKernel + get_global_id hello world.
1414
int main(void) {
1515
const char *source =
1616
"__kernel void main(__global int *out) {\n"
17-
" out[get_global_id(0)]++;\n"
17+
" out[get_global_id(0)]++;\n"
1818
"}\n";
19-
cl_command_queue command_queue;
2019
cl_int input[] = {1, 2};
2120
cl_mem buffer;
22-
const size_t global_work_size = sizeof(input) / sizeof(cl_int);
2321
Common common;
22+
const size_t global_work_size = sizeof(input) / sizeof(cl_int);
2423

2524
/* Run kernel. */
2625
common_init(&common, source);
27-
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), &input, NULL);
26+
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
2827
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
29-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
30-
clEnqueueNDRangeKernel(command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
31-
clFlush(command_queue);
32-
clFinish(command_queue);
33-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
28+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
29+
clFlush(common.command_queue);
30+
clFinish(common.command_queue);
31+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(input), input, 0, NULL, NULL);
3432

3533
/* Assertions. */
3634
assert(input[0] == 2);
3735
assert(input[1] == 3);
3836

3937
/* Cleanup. */
4038
clReleaseMemObject(buffer);
41-
clReleaseCommandQueue(command_queue);
4239
common_deinit(&common);
4340
return EXIT_SUCCESS;
4441
}

opencl/inc_vector_globals.c

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
API exercise, increment a vector with fewer global work items than integers,
which forces us to put a for loop in the kernel.

I don't think we can get the size of each global work group from the kernel,
so we just calculate it on the CPU and pass it as a parameter.

This is how the work will be split:

| work group 0 | work group 1 | work group 2 |
| in[0] in[1]  | in[2] in[3]  | in[4]        |
*/

#include "common.h"

/* Pretend the device can only run this many elements per work item,
 * to force the multi-element-per-item code path. */
#define FAKE_MAX_GROUP_NELEMS 2

int main(void) {
    const char *source =
        "__kernel void main(uint group_nlems, __global int *out) {\n"
        "    uint i_min = get_global_id(0) * group_nlems;\n"
        "    uint i_max = i_min + group_nlems;\n"
        "    for (uint i = i_min; i < i_max; ++i) {\n"
        "        out[i]++;\n"
        "    };\n"
        "}\n";
    /* Not a multiple of the work size on purpose, so we have to handle
     * the edge case. The options are:
     * - add an if to the kernel. But I don't want to do that as it slows
     *   every kernel down.
     * - pad with trash to a multiple, which is what we do here (io_align).
     */
    cl_int io[] = {1, 2, 3, 4, 5}, *io_align;
    cl_mem buffer;
    Common common;
    const cl_uint nelems = sizeof(io) / sizeof(io[0]);
    const cl_uint group_nelems = FAKE_MAX_GROUP_NELEMS;
    /* Ceiling division: enough work items to cover every element. */
    const size_t global_work_size = 1 + (nelems - 1) / FAKE_MAX_GROUP_NELEMS;
    const size_t nelems_align = global_work_size * group_nelems;
    const size_t io_align_sizeof = nelems_align * sizeof(*io_align);

    /* Run kernel. */
    io_align = malloc(io_align_sizeof);
    if (io_align == NULL) {
        fprintf(stderr, "error: malloc failed\n");
        exit(EXIT_FAILURE);
    }
    memcpy(io_align, io, sizeof(io));
    common_init(&common, source);
    clSetKernelArg(common.kernel, 0, sizeof(group_nelems), &group_nelems);
    buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, io_align_sizeof, io_align, NULL);
    clSetKernelArg(common.kernel, 1, sizeof(buffer), &buffer);
    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    clFlush(common.command_queue);
    clFinish(common.command_queue);
    clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, io_align_sizeof, io_align, 0, NULL, NULL);

    /* Assertions. */
    assert(io_align[0] == 2);
    assert(io_align[1] == 3);
    assert(io_align[2] == 4);
    assert(io_align[3] == 5);
    assert(io_align[4] == 6);

    /* Cleanup. */
    free(io_align);
    clReleaseMemObject(buffer);
    common_deinit(&common);
    return EXIT_SUCCESS;
}

opencl/pass_by_value.c

+11-12
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
/*
2-
Kernel takes an integer value instead of a pointer.
2+
Kernel takes an integer value `int` instead of a pointer.
33
44
cl_int is passed directly to clSetKernelArg instead of using
55
a buffer obtained from clCreateBuffer.
66
7-
Increment a vector. It is useless to do this on a GPU, not enough work / IO.
7+
Does not need to be __global because it is not a pointer.
8+
9+
In practice, this is often used to pass problem size parameters to the kernel.
810
*/
911

1012
#include "common.h"
@@ -14,28 +16,25 @@ int main(void) {
1416
"__kernel void main(int in, __global int *out) {\n"
1517
" out[0] = in + 1;\n"
1618
"}\n";
17-
cl_command_queue command_queue;
1819
cl_int input = 1;
1920
cl_mem buffer;
2021
Common common;
2122

2223
/* Run kernel. */
2324
common_init(&common, source);
24-
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL);
25-
clSetKernelArg(common.kernel, 0, sizeof(cl_int), &input);
26-
clSetKernelArg(common.kernel, 1, sizeof(cl_mem), &buffer);
27-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
28-
clEnqueueTask(command_queue, common.kernel, 0, NULL, NULL);
29-
clFlush(command_queue);
30-
clFinish(command_queue);
31-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL);
25+
clSetKernelArg(common.kernel, 0, sizeof(input), &input);
26+
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE, sizeof(input), NULL, NULL);
27+
clSetKernelArg(common.kernel, 1, sizeof(buffer), &buffer);
28+
clEnqueueTask(common.command_queue, common.kernel, 0, NULL, NULL);
29+
clFlush(common.command_queue);
30+
clFinish(common.command_queue);
31+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
3232

3333
/* Assertions. */
3434
assert(input == 2);
3535

3636
/* Cleanup. */
3737
clReleaseMemObject(buffer);
38-
clReleaseCommandQueue(command_queue);
3938
common_deinit(&common);
4039
return EXIT_SUCCESS;
4140
}

opencl/use-cases.md

-15
This file was deleted.

opencl/vector_type.c

+5-8
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,19 @@ int main(void) {
1515
"__kernel void main(__global int2 *out) {\n"
1616
" out[get_global_id(0)]++;\n"
1717
"}\n";
18-
cl_command_queue command_queue;
1918
cl_int input[] = {0, 1, 2, 3};
2019
cl_mem buffer;
2120
Common common;
2221
const size_t global_work_size = sizeof(input) / sizeof(cl_int2);
2322

2423
/* Run kernel. */
2524
common_init(&common, source);
26-
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), &input, NULL);
25+
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
2726
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
28-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
29-
clEnqueueNDRangeKernel(command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
30-
clFlush(command_queue);
31-
clFinish(command_queue);
32-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL);
27+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
28+
clFlush(common.command_queue);
29+
clFinish(common.command_queue);
30+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(input), input, 0, NULL, NULL);
3331

3432
/* Assertions. */
3533
assert(input[0] == 1);
@@ -39,7 +37,6 @@ int main(void) {
3937

4038
/* Cleanup. */
4139
clReleaseMemObject(buffer);
42-
clReleaseCommandQueue(command_queue);
4340
common_deinit(&common);
4441
return EXIT_SUCCESS;
4542
}

opencl/work_item_builtin.c

+6-9
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,24 @@ Play with some misc work item built-in functions.
88

99
#define NUM_FUNCTIONS (8)
1010
/* Play around with those parameters. */
11-
static size_t offset = 4;
11+
static size_t offset = 0;
1212
static size_t global = 1;
1313
static size_t local = 1;
1414

1515
int main(void) {
16-
cl_command_queue command_queue;
1716
cl_int ret;
1817
cl_mem buffer;
1918
cl_uint output[NUM_FUNCTIONS];
2019
Common common;
2120

2221
/* Run kernel. */
2322
common_init_file(&common, "work_item_builtin.cl");
24-
buffer = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, NUM_FUNCTIONS * sizeof(cl_uint), NULL, NULL);
23+
buffer = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, sizeof(output), NULL, NULL);
2524
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
26-
command_queue = clCreateCommandQueue(common.context, common.device, 0, NULL);
27-
clEnqueueNDRangeKernel(command_queue, common.kernel, 1, &offset, &global, &local, 0, NULL, NULL);
28-
clFlush(command_queue);
29-
clFinish(command_queue);
30-
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, NUM_FUNCTIONS * sizeof(cl_uint), &output, 0, NULL, NULL);
25+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, &offset, &global, &local, 0, NULL, NULL);
26+
clFlush(common.command_queue);
27+
clFinish(common.command_queue);
28+
clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(output), output, 0, NULL, NULL);
3129

3230
/* Check the values. */
3331
printf("work_dim = %d\n", output[0]);
@@ -41,7 +39,6 @@ int main(void) {
4139

4240
/* Cleanup. */
4341
clReleaseMemObject(buffer);
44-
clReleaseCommandQueue(command_queue);
4542
common_deinit(&common);
4643
return EXIT_SUCCESS;
4744
}

0 commit comments

Comments
 (0)