CL benchmark matrix multiplication boilerplate

cirosantilli · cirosantilli · commit 62b8ece87562 · 2017-03-13T14:40:48.000Z
diff --git a/c/rand.c b/c/rand.c
@@ -46,7 +46,7 @@ void random_array(char * const arr, const size_t len) {
         arr[i] = rand() % SCHAR_MAX;
 }
 
-int main() {
+int main(void) {
     srand(time(NULL));
 
     /*
diff --git a/opencl/applications.md b/opencl/applications.md
@@ -16,11 +16,19 @@ Minimal example request: <http://stackoverflow.com/questions/7663343/simplest-po
     - <https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model>
     - Reverse Time Migration: RTM <http://www.slb.com/services/seismic/geophysical_processing_characterization/dp/technologies/depth/prestackdepth/rtm.aspx>
 
+-   clMathLibraries organization, by AMD employees
+
+    -   <https://github.com/clMathLibraries/clFFT> FFT
+    -   <https://github.com/clMathLibraries/clRNG> random number generation
+
 ### Matrix multiplication
 
 - <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
 - <https://developer.nvidia.com/cublas>
+- <https://github.com/clMathLibraries/clBLAS>
+- <https://github.com/clMathLibraries/clSPARSE>
 - <http://stackoverflow.com/questions/16748604/opencl-matrix-multiplication-should-be-faster>
+- <http://stackoverflow.com/questions/33086029/multiply-matrices-in-c-or-in-glsl>
 
 Not surprising, since rendering is just a bunch of matrix multiplications, with fixed matrices and varying vectors.
 
diff --git a/opencl/architecture.md b/opencl/architecture.md
@@ -94,3 +94,10 @@ It also shows how we must make an explicit copy to use private memory.
 - <http://stackoverflow.com/questions/8888718/how-to-declare-local-memory-in-opencl>
 - <http://stackoverflow.com/questions/2541929/how-do-i-use-local-memory-in-opencl>
 - <http://stackoverflow.com/questions/17574570/create-local-array-dynamic-inside-opencl-kernel>
+
+## Pinned memory
+
+TODO.
+
+- <http://stackoverflow.com/questions/25496656/cl-mem-use-host-ptr-vs-cl-mem-copy-host-ptr-vs-cl-mem-alloc-host-ptr>
+- <http://stackoverflow.com/questions/24158909/how-to-use-pinned-memory-mapped-memory-in-opencl>
diff --git a/opencl/clinfo.c b/opencl/clinfo.c
@@ -10,11 +10,11 @@ Full list at:
 
 #define PRINT_SIZE_T(id) \
     clGetDeviceInfo(device, CL_ ## id, sizeof(size_t), &(buf_size_t), NULL); \
-    printf("  " #id " = %zu\n", buf_size_t);
+    printf(#id " = %zu\n", buf_size_t);
 
 #define PRINT_CL_UINT(id) \
     clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &(buf_cl_uint), NULL); \
-    printf("  " #id " = %ju\n", (uintmax_t)buf_cl_uint);
+    printf(#id " = %ju\n", (uintmax_t)buf_cl_uint);
 
 int main(void) {
     cl_platform_id platform;
@@ -27,12 +27,14 @@ int main(void) {
     clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
 
     /* Print. */
-    puts("clinfo");
+    puts("#clinfo");
     PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS)
     PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE)
     PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES)
 
     /* Cleanup. */
-    clReleaseDevice(device);
+#ifdef CL_1_2
+	clReleaseDevice(device);
+#endif
     return EXIT_SUCCESS;
 }
diff --git a/opencl/common.h b/opencl/common.h
@@ -2,10 +2,12 @@
 #define COMMON_H
 
 #include <assert.h>
+#include <math.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>
 
 /* http://stackoverflow.com/questions/28500496/opencl-function-found-deprecated-by-visual-studio */
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
@@ -47,18 +49,18 @@ static void common_init(
     common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
 }
 
-static char * common_read_file(const char *path) {
+static char* common_read_file(const char *path) {
     char *buffer;
     FILE *f;
     long length;
 
     f = fopen(path, "r");
-    fseek (f, 0, SEEK_END);
+    fseek(f, 0, SEEK_END);
     length = ftell(f);
     fseek(f, 0, SEEK_SET);
     buffer = calloc(1, length + 1);
-    fread (buffer, 1, length, f);
-    fclose (f);
+    fread(buffer, 1, length, f);
+    fclose(f);
     buffer[length] = '\0';
     return buffer;
 }
@@ -80,7 +82,15 @@ static void common_deinit(
     clReleaseProgram(common->program);
     clReleaseKernel(common->kernel);
     clReleaseContext(common->context);
-    clReleaseDevice(common->device);
+#ifdef CL_1_2
+	clReleaseDevice(common->device);
+#endif
+}
+
+/*
+Read the current TIME_UTC calendar time.
+
+Despite the name, the returned value is in seconds: the whole seconds
+plus the nanosecond field scaled down to a fraction of a second.
+The return value of timespec_get is not checked.
+*/
+static double common_get_nanos(void) {
+    struct timespec now;
+    double whole, frac;
+
+    timespec_get(&now, TIME_UTC);
+    whole = now.tv_sec;
+    frac = now.tv_nsec / 1000000000.0;
+    return whole + frac;
 }
 
 #endif
diff --git a/opencl/implementations.md b/opencl/implementations.md
@@ -33,18 +33,16 @@ Threads:
 - <http://stackoverflow.com/questions/3271243/clcreatesubbuffer-not-found-oo>
 - <https://devtalk.nvidia.com/default/topic/486564/nvidia-39-s-opencl-1-1-and-clcreatesubbuffer/>
 
-No OpenCL 2 planned as of 2016:
+## OpenCL
+
+OpenCL 2 announced in 2017:
 
 - <http://stackoverflow.com/questions/29219307/opencl-2-0-on-nvidia-graphics-cards>
 - <https://devtalk.nvidia.com/default/topic/954622/opencl-2-x-support-plans-/>
 
 but hardware support will very likely be / is already there because of Vulkan / OpenCL 2 convergence.
 
-Linux dependencies for 340.65:
-
-- `libdl.so.2`
-- `libpthread.so.0`
-- `libc.so.6`
+OpenCL 1.2 apparently added in driver 350.12, on Kepler hardware and up.
 
 ## Intel
 
diff --git a/opencl/inc.c b/opencl/inc.c
@@ -10,6 +10,7 @@ This is our OpenCL hello world, so we are not doing:
 #include <assert.h>
 #include <stdio.h>
 
+/* To prevent deprecation warnings when headers are 2.0. */
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #include <CL/cl.h>
 
@@ -53,6 +54,8 @@ int main(void) {
     clReleaseProgram(program);
     clReleaseKernel(kernel);
     clReleaseContext(context);
-    clReleaseDevice(device);
+#ifdef CL_1_2
+	clReleaseDevice(device);
+#endif
     return EXIT_SUCCESS;
 }
diff --git a/opencl/inc_vector.c b/opencl/inc_vector.c
@@ -19,12 +19,12 @@ int main(void) {
     cl_int input[] = {1, 2};
     cl_mem buffer;
     Common common;
-    const size_t global_work_size = sizeof(input) / sizeof(cl_int);
+    const size_t global_work_size = sizeof(input) / sizeof(input[0]);
 
 	/* Run kernel. */
     common_init(&common, source);
     buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
-    clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
+    clSetKernelArg(common.kernel, 0, sizeof(buffer), &buffer);
     clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
     clFlush(common.command_queue);
     clFinish(common.command_queue);
diff --git a/opencl/matmul.c b/opencl/matmul.c
@@ -0,0 +1,187 @@
+/*
+Matrix multiplication.
+
+The most basic / useful application where OpenCL might be faster than CPU.
+
+TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations:
+
+- http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c
+- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
+- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
+*/
+
+#include "common.h"
+
+typedef cl_float F;
+
+/*
+Naive CPU reference: C = A*B for square n x n matrices stored row by row.
+
+Element (i, j) of C is the dot product of row i of A with column j of B,
+accumulated in increasing k order.
+*/
+void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) {
+	size_t row, col, k;
+
+	for (row = 0; row < n; ++row) {
+		const F *a_row = A + row * n;
+		F *c_row = C + row * n;
+
+		for (col = 0; col < n; ++col) {
+			F acc = 0;
+
+			for (k = 0; k < n; ++k) {
+				acc += a_row[k] * B[k * n + col];
+			}
+			c_row[col] = acc;
+		}
+	}
+}
+
+/*
+Simplest possible OpenCL implementation: C = A*B for n x n matrices.
+
+A and B are copied into read-only device buffers, the kernel is enqueued on
+an n x n global range (one work item per element of C), and the result is
+read back into C with a blocking read.
+
+common_init_file is handed "matmul.cl"; presumably it builds the kernel from
+that file (defined in common.h, confirm there).
+
+No OpenCL return code is checked.
+*/
+void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
+	cl_mem buf_a, buf_b, buf_c;
+	Common common;
+	cl_uint ncl;
+	size_t global_work_size[2], mat_sizeof, n2;
+
+	/* Setup variables. */
+	global_work_size[0] = n;
+	global_work_size[1] = n;
+	n2 = n * n;
+	mat_sizeof = n2 * sizeof(F);
+	/* The kernel takes the width as a uint, so convert from size_t here. */
+	ncl = n;
+
+	/* Run kernel. */
+	common_init_file(&common, "matmul.cl");
+	/* The casts only drop const: CL_MEM_COPY_HOST_PTR copies the host data at creation. */
+	buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
+	buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL);
+	/*
+	host_ptr must be NULL unless CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR
+	is set, otherwise clCreateBuffer fails with CL_INVALID_HOST_PTR.
+	The result is fetched explicitly with clEnqueueReadBuffer below.
+	*/
+	buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, NULL, NULL);
+	clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
+	clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
+	clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
+	clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl);
+	clEnqueueNDRangeKernel(common.command_queue, common.kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+	/* clFinish already issues all queued commands, so no separate clFlush is needed. */
+	clFinish(common.command_queue);
+	clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
+
+	/* Cleanup. */
+	clReleaseMemObject(buf_a);
+	clReleaseMemObject(buf_b);
+	clReleaseMemObject(buf_c);
+	common_deinit(&common);
+}
+
+/*
+Check if two n x n matrices are approximately equal.
+
+The error measure is the Euclidean norm of the element-wise difference
+divided by the number of elements; the matrices compare equal when that
+value is below err_max.
+
+Returns 1 if equal within tolerance, 0 otherwise.
+Empty matrices (n == 0) compare equal.
+*/
+int mat_eq(const F *A, const F *B, size_t n) {
+	const F err_max = 10e-3;
+	F err, diff;
+	size_t i, i_max;
+
+	i_max = n*n;
+	/* Without this, 0.0 / 0 yields NaN and empty matrices would compare unequal. */
+	if (i_max == 0) {
+		return 1;
+	}
+	err = 0.0;
+	for (i = 0; i < i_max; ++i) {
+		diff = A[i] - B[i];
+		err += diff * diff;
+	}
+	return (sqrt(err) / i_max) < err_max;
+}
+
+/*
+Print an n x n matrix to stdout for debugging: one row per line,
+every element formatted with %f and followed by a single space.
+*/
+void mat_print(const F *A, size_t n) {
+	const F *cur = A;
+	size_t row, col;
+
+	for (row = 0; row < n; ++row) {
+		/* cur walks the elements in storage order, n per output line. */
+		for (col = 0; col < n; ++col, ++cur) {
+			printf("%f ", *cur);
+		}
+		puts("");
+	}
+}
+
+/*
+Zero all n*n elements of a matrix.
+
+Returns A, so that the declared F * return type is actually honoured
+(flowing off the end of a non-void function is undefined if the value is used).
+*/
+F * mat_zero(F *A, size_t n) {
+	size_t i, n2;
+	n2 = n*n;
+	for (i = 0; i < n2; ++i) {
+		A[i] = 0.0;
+	}
+	return A;
+}
+
+/*
+Fill all n*n elements of a matrix with pseudo-random values in [0, 1],
+taken from rand() scaled by RAND_MAX. Seeding is left to the caller.
+
+Returns A, so that the declared F * return type is actually honoured
+(flowing off the end of a non-void function is undefined if the value is used).
+*/
+F * mat_rand(F *A, size_t n) {
+	size_t i, n2;
+	n2 = n*n;
+	for (i = 0; i < n2; ++i) {
+		A[i] = ((float)rand()) / ((float)RAND_MAX);
+	}
+	return A;
+}
+
+/*
+Unit test both implementations on a known 2x2 product, then benchmark them
+on random matrices whose width doubles each round, stopping once the CPU
+version takes more than 4 seconds (or memory runs out).
+
+Benchmark output is a header followed by one "n cpu_seconds cl_seconds"
+row per width.
+*/
+int main(void) {
+	srand(time(NULL));
+
+	/* Unit test our implementations. */
+	{
+		const F A[] = {
+			1.0, 2.0,
+			3.0, 4.0
+		};
+		const F B[] = {
+			5.0, 6.0,
+			7.0, 8.0
+		};
+		/* Width of the square matrix, recovered from its element count. */
+		size_t n = sqrt(sizeof(A)/sizeof(A[0]));
+		F C[n*n];
+		const F C_expect[] = {
+			19.0, 22.0,
+			43.0, 50.0
+		};
+
+		mat_zero(C, n);
+		mat_mul_cpu(A, B, C, n);
+		assert(mat_eq(C, C_expect, n));
+
+		mat_zero(C, n);
+		mat_mul_cl(A, B, C, n);
+		assert(mat_eq(C, C_expect, n));
+	}
+
+	/* Benchmarks. */
+	{
+		F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
+		F **bufs[] = {&A, &B, &C_ref, &C};
+		F *resized;
+		/* t0 rather than "time", to avoid shadowing the time() function. */
+		double dt, t0;
+		size_t k, n = 1, n2, a_sizeof;
+		int alloc_ok;
+
+		puts("#matmul");
+		puts("n mat_mul_cpu mat_mul_cl");
+		while (1) {
+			printf("%zu ", n);
+			n2 = n * n;
+			a_sizeof = n2 * sizeof(F);
+			/*
+			Grow every buffer, C_ref included, through a temporary: a failed
+			realloc then leaves the old pointer intact so it is still freed below.
+			*/
+			alloc_ok = 1;
+			for (k = 0; k < sizeof(bufs) / sizeof(bufs[0]); ++k) {
+				resized = realloc(*bufs[k], a_sizeof);
+				if (resized == NULL) {
+					alloc_ok = 0;
+					break;
+				}
+				*bufs[k] = resized;
+			}
+			if (!alloc_ok) {
+				printf("Could not allocate memory for n = %zu", n);
+				break;
+			}
+			mat_rand(A, n);
+			mat_rand(B, n);
+
+			t0 = common_get_nanos();
+			mat_mul_cpu(A, B, C_ref, n);
+			dt = common_get_nanos() - t0;
+			printf("%f ", dt);
+
+			t0 = common_get_nanos();
+			mat_mul_cl(A, B, C, n);
+			printf("%f", common_get_nanos() - t0);
+
+			assert(mat_eq(C, C_ref, n));
+			puts("");
+			/* common_get_nanos returns seconds, so this is a 4 second budget. */
+			if (dt > 4.0) {
+				break;
+			}
+			n *= 2;
+		}
+		free(A);
+		free(B);
+		free(C);
+		free(C_ref);
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/opencl/matmul.cl b/opencl/matmul.cl
@@ -0,0 +1,16 @@
+/*
+Naive matrix multiplication C = A*B for N x N matrices stored row by row.
+
+Expects a 2D global range: dimension 0 is the row i and dimension 1 the
+column j of the single output element computed by this work item.
+There is no bounds check, so the global size must not exceed N in either
+dimension.
+*/
+__kernel void main(
+    __global const float *A,
+    __global const float *B,
+    __global float *C,
+    const uint N
+) {
+    const uint i = get_global_id(0);
+    const uint j = get_global_id(1);
+    uint k;
+    /* Float literal: an unsuffixed 0.0 is a double, and double support is optional. */
+    float tmp = 0.0f;
+
+    for (k = 0; k < N; ++k) {
+        tmp += A[i*N+k] * B[k*N+j];
+    }
+    C[i*N+j] = tmp;
+}
diff --git a/opencl/pass_by_value.c b/opencl/pass_by_value.c
diff --git a/opencl/work_item_builtin.c b/opencl/work_item_builtin.c

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ void random_array(char * const arr, const size_t len) {`
`46`	`46`	`arr[i] = rand() % SCHAR_MAX;`
`47`	`47`	`}`
`48`	`48`
`49`		`-int main() {`
	`49`	`+int main(void) {`
`50`	`50`	`srand(time(NULL));`
`51`	`51`
`52`	`52`	`/*`