Skip to content

Commit 62b8ece

Browse files
committed
CL benchmark matrix multiplication boilerplate
1 parent 35eb411 commit 62b8ece

12 files changed

+256
-24
lines changed

c/rand.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ void random_array(char * const arr, const size_t len) {
4646
arr[i] = rand() % SCHAR_MAX;
4747
}
4848

49-
int main() {
49+
int main(void) {
5050
srand(time(NULL));
5151

5252
/*

opencl/applications.md

+8
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,19 @@ Minimal example request: <http://stackoverflow.com/questions/7663343/simplest-po
1616
- <https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model>
1717
- Reverse Time Migration: RTM <http://www.slb.com/services/seismic/geophysical_processing_characterization/dp/technologies/depth/prestackdepth/rtm.aspx>
1818

19+
- clMathLibraries organization, by AMD employees
20+
21+
- <https://github.com/clMathLibraries/clFFT> FFT
22+
- <https://github.com/clMathLibraries/clRNG> random number generation
23+
1924
### Matrix multiplication
2025

2126
- <http://hpclab.blogspot.fr/2011/09/is-gpu-good-for-large-vector-addition.html>
2227
- <https://developer.nvidia.com/cublas>
28+
- <https://github.com/clMathLibraries/clBLAS>
29+
- <https://github.com/clMathLibraries/clSPARSE>
2330
- <http://stackoverflow.com/questions/16748604/opencl-matrix-multiplication-should-be-faster>
31+
- <http://stackoverflow.com/questions/33086029/multiply-matrices-in-c-or-in-glsl>
2432

2533
Not surprising, since rendering is just a bunch of matrix multiplications, with fixed matrices and varying vectors.
2634

opencl/architecture.md

+7
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,10 @@ It also shows how we must make an explicit copy to use private memory.
9494
- <http://stackoverflow.com/questions/8888718/how-to-declare-local-memory-in-opencl>
9595
- <http://stackoverflow.com/questions/2541929/how-do-i-use-local-memory-in-opencl>
9696
- <http://stackoverflow.com/questions/17574570/create-local-array-dynamic-inside-opencl-kernel>
97+
98+
## Pinned memory
99+
100+
TODO.
101+
102+
- <http://stackoverflow.com/questions/25496656/cl-mem-use-host-ptr-vs-cl-mem-copy-host-ptr-vs-cl-mem-alloc-host-ptr>
103+
- <http://stackoverflow.com/questions/24158909/how-to-use-pinned-memory-mapped-memory-in-opencl>

opencl/clinfo.c

+6-4
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ Full list at:
1010

1111
#define PRINT_SIZE_T(id) \
1212
clGetDeviceInfo(device, CL_ ## id, sizeof(size_t), &(buf_size_t), NULL); \
13-
printf(" " #id " = %zu\n", buf_size_t);
13+
printf(#id " = %zu\n", buf_size_t);
1414

1515
#define PRINT_CL_UINT(id) \
1616
clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &(buf_cl_uint), NULL); \
17-
printf(" " #id " = %ju\n", (uintmax_t)buf_cl_uint);
17+
printf(#id " = %ju\n", (uintmax_t)buf_cl_uint);
1818

1919
int main(void) {
2020
cl_platform_id platform;
@@ -27,12 +27,14 @@ int main(void) {
2727
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
2828

2929
/* Print. */
30-
puts("clinfo");
30+
puts("#clinfo");
3131
PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS)
3232
PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE)
3333
PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES)
3434

3535
/* Cleanup. */
36-
clReleaseDevice(device);
36+
#ifdef CL_1_2
37+
clReleaseDevice(device);
38+
#endif
3739
return EXIT_SUCCESS;
3840
}

opencl/common.h

+15-5
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
#define COMMON_H
33

44
#include <assert.h>
5+
#include <math.h>
56
#include <stdint.h>
67
#include <stdio.h>
78
#include <stdlib.h>
89
#include <string.h>
10+
#include <time.h>
911

1012
/* http://stackoverflow.com/questions/28500496/opencl-function-found-deprecated-by-visual-studio */
1113
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
@@ -47,18 +49,18 @@ static void common_init(
4749
common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
4850
}
4951

50-
static char * common_read_file(const char *path) {
52+
/*
Read the whole file at `path` into a newly allocated, NUL-terminated buffer.

Returns the buffer, which the caller must free(), or NULL if the file
cannot be opened, measured or read, or if the allocation fails.
*/
static char* common_read_file(const char *path) {
    char *buffer = NULL;
    FILE *f;
    long length;
    size_t nread;

    f = fopen(path, "r");
    if (f == NULL)
        return NULL;
    if (fseek(f, 0, SEEK_END) != 0)
        goto out;
    length = ftell(f);
    /* ftell returns -1 on failure: never use that as a size. */
    if (length < 0)
        goto out;
    if (fseek(f, 0, SEEK_SET) != 0)
        goto out;
    buffer = calloc(1, (size_t)length + 1);
    if (buffer == NULL)
        goto out;
    nread = fread(buffer, 1, (size_t)length, f);
    if (ferror(f)) {
        free(buffer);
        buffer = NULL;
        goto out;
    }
    /* In text mode fewer bytes than `length` may be delivered: terminate at what was actually read. */
    buffer[nread] = '\0';
out:
    fclose(f);
    return buffer;
}
@@ -80,7 +82,15 @@ static void common_deinit(
8082
clReleaseProgram(common->program);
8183
clReleaseKernel(common->kernel);
8284
clReleaseContext(common->context);
83-
clReleaseDevice(common->device);
85+
#ifdef CL_1_2
86+
clReleaseDevice(common->device);
87+
#endif
88+
}
89+
90+
/*
Get the current calendar time (TIME_UTC) as a floating point number.

Despite the name, the returned value is in seconds: the nanosecond field
of the struct timespec only contributes the fractional part.

Returns 0.0 if the time cannot be obtained.
*/
static double common_get_nanos(void) {
    struct timespec ts;

    /* timespec_get returns the time base on success and 0 on failure, in which case ts must not be read. */
    if (timespec_get(&ts, TIME_UTC) != TIME_UTC)
        return 0.0;
    return ts.tv_sec + ts.tv_nsec / 1000000000.0;
}
8595

8696
#endif

opencl/implementations.md

+4-6
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,16 @@ Threads:
3333
- <http://stackoverflow.com/questions/3271243/clcreatesubbuffer-not-found-oo>
3434
- <https://devtalk.nvidia.com/default/topic/486564/nvidia-39-s-opencl-1-1-and-clcreatesubbuffer/>
3535

36-
No OpenCL 2 planned as of 2016:
36+
## OpenCL
37+
38+
OpenCL 2 announced in 2017:
3739

3840
- <http://stackoverflow.com/questions/29219307/opencl-2-0-on-nvidia-graphics-cards>
3941
- <https://devtalk.nvidia.com/default/topic/954622/opencl-2-x-support-plans-/>
4042

4143
but hardware support will very likely be / is already there because of Vulkan / OpenCL 2 convergence.
4244

43-
Linux dependencies for 340.65:
44-
45-
- `libdl.so.2`
46-
- `libpthread.so.0`
47-
- `libc.so.6`
45+
OpenCL 1.2 apparently added in driver 350.12, on Kepler hardware and up.
4846

4947
## Intel
5048

opencl/inc.c

+4-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ This is our OpenCL hello world, so we are not doing:
1010
#include <assert.h>
1111
#include <stdio.h>
1212

13+
/* To prevent deprecation warnings when headers are 2.0. */
1314
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
1415
#include <CL/cl.h>
1516

@@ -53,6 +54,8 @@ int main(void) {
5354
clReleaseProgram(program);
5455
clReleaseKernel(kernel);
5556
clReleaseContext(context);
56-
clReleaseDevice(device);
57+
#ifdef CL_1_2
58+
clReleaseDevice(device);
59+
#endif
5760
return EXIT_SUCCESS;
5861
}

opencl/inc_vector.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ int main(void) {
1919
cl_int input[] = {1, 2};
2020
cl_mem buffer;
2121
Common common;
22-
const size_t global_work_size = sizeof(input) / sizeof(cl_int);
22+
const size_t global_work_size = sizeof(input) / sizeof(input[0]);
2323

2424
/* Run kernel. */
2525
common_init(&common, source);
2626
buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
27-
clSetKernelArg(common.kernel, 0, sizeof(cl_mem), &buffer);
27+
clSetKernelArg(common.kernel, 0, sizeof(buffer), &buffer);
2828
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
2929
clFlush(common.command_queue);
3030
clFinish(common.command_queue);

opencl/matmul.c

+187
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
/*
2+
Matrix multiplication.
3+
4+
The most basic / useful application where OpenCL might be faster than CPU.
5+
6+
TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations:
7+
8+
- http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c
9+
- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
10+
- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
11+
*/
12+
13+
#include "common.h"
14+
15+
typedef cl_float F;
16+
17+
/* C = A*B, width n, naive. */
18+
void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) {
19+
F tmp;
20+
size_t i, j, k;
21+
22+
for (i = 0; i < n; ++i) {
23+
for (j = 0; j < n; ++j) {
24+
tmp = 0;
25+
for (k = 0; k < n; ++k) {
26+
tmp += A[i*n+k] * B[k*n+j];
27+
}
28+
C[i*n+j] = tmp;
29+
}
30+
}
31+
}
32+
33+
/* Simplest possible implementation. */
34+
void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
35+
cl_mem buf_a, buf_b, buf_c;
36+
Common common;
37+
cl_uint ncl;
38+
size_t global_work_size[2], mat_sizeof, n2;
39+
40+
/* Setup variables. */
41+
global_work_size[0] = n;
42+
global_work_size[1] = n;
43+
n2 = n * n;
44+
mat_sizeof = n2 * sizeof(F);
45+
ncl = n;
46+
47+
/* Run kernel. */
48+
common_init_file(&common, "matmul.cl");
49+
buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
50+
buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL);
51+
buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL);
52+
clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
53+
clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
54+
clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
55+
clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl);
56+
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
57+
clFlush(common.command_queue);
58+
clFinish(common.command_queue);
59+
clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
60+
61+
/* Cleanup. */
62+
clReleaseMemObject(buf_a);
63+
clReleaseMemObject(buf_b);
64+
clReleaseMemObject(buf_c);
65+
common_deinit(&common);
66+
}
67+
68+
/* Check if two matrices are equal with given mean squared err_maxor. */
69+
int mat_eq(const F *A, const F *B, size_t n) {
70+
const F err_max = 10e-3;
71+
F err, diff, a, b;
72+
size_t i, i_max;
73+
74+
err = 0.0;
75+
i_max = n*n;
76+
for (i = 0; i < i_max; ++i) {
77+
a = A[i];
78+
b = B[i];
79+
diff = a - b;
80+
err += diff * diff;
81+
}
82+
return (sqrt(err) / i_max) < err_max;
83+
}
84+
85+
/* No, this was not created for debugging, my code is flawless from the first try. */
86+
/* Print the matrix A of width n to stdout: one row per line, every element followed by a space. */
void mat_print(const F *A, size_t n) {
    for (size_t row = 0; row < n; ++row) {
        const F *cur = A + row*n;

        for (size_t col = 0; col < n; ++col) {
            printf("%f ", cur[col]);
        }
        puts("");
    }
}
95+
96+
/* Zero a matrix. */
97+
F * mat_zero(F *A, size_t n) {
98+
size_t i, n2;
99+
n2 = n*n;
100+
for (i = 0; i < n2; ++i) {
101+
A[i] = 0.0;
102+
}
103+
}
104+
105+
/* Initialize a random matrix. */
106+
F * mat_rand(F *A, size_t n) {
107+
size_t i, n2;
108+
n2 = n*n;
109+
for (i = 0; i < n2; ++i) {
110+
A[i] = ((float)rand()) / ((float)RAND_MAX);
111+
}
112+
}
113+
114+
int main(void) {
115+
srand(time(NULL));
116+
117+
/* Unit test our implementations. */
118+
{
119+
const F A[] = {
120+
1.0, 2.0,
121+
3.0, 4.0
122+
};
123+
const F B[] = {
124+
5.0, 6.0,
125+
7.0, 8.0
126+
};
127+
size_t n = sqrt(sizeof(A)/sizeof(A[0]));
128+
F C[n*n];
129+
const F C_expect[] = {
130+
19.0, 22.0,
131+
43.0, 50.0
132+
};
133+
134+
mat_zero(C, n);
135+
mat_mul_cpu(A, B, C, n);
136+
assert(mat_eq(C, C_expect, n));
137+
138+
mat_zero(C, n);
139+
mat_mul_cl(A, B, C, n);
140+
assert(mat_eq(C, C_expect, n));
141+
}
142+
143+
/* Benchmarks. */
144+
{
145+
F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
146+
double dt, time;
147+
size_t i, n = 1, n2, a_sizeof;
148+
149+
puts("#matmul");
150+
puts("n mat_mul_cpu mat_mul_cl");
151+
while(1) {
152+
printf("%zu ", n);
153+
n2 = n * n;
154+
a_sizeof = n2 * sizeof(F);
155+
A = realloc(A, a_sizeof);
156+
B = realloc(B, a_sizeof);
157+
C_ref = realloc(C_ref, a_sizeof);
158+
C = realloc(C, a_sizeof);
159+
if (A == NULL || B == NULL || C == NULL) {
160+
printf("Could not allocate memory for n = %zu", n);
161+
break;
162+
}
163+
mat_rand(A, n);
164+
mat_rand(B, n);
165+
166+
time = common_get_nanos();
167+
mat_mul_cpu(A, B, C_ref, n);
168+
dt = common_get_nanos() - time;
169+
printf("%f ", dt);
170+
171+
time = common_get_nanos();
172+
mat_mul_cl(A, B, C, n);
173+
printf("%f", common_get_nanos() - time);
174+
175+
assert(mat_eq(C, C_ref, n));
176+
puts("");
177+
if (dt > 4.0)
178+
break;
179+
n *= 2;
180+
}
181+
free(A);
182+
free(B);
183+
free(C);
184+
}
185+
186+
return EXIT_SUCCESS;
187+
}

opencl/matmul.cl

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/*
Naive matrix multiplication kernel: C = A * B for matrices of width N,
indexed as M[row*N + column].

Each work-item computes a single element of C: global id 0 selects the row
and global id 1 the column.
*/
__kernel void main(
    __global float *A,
    __global float *B,
    __global float *C,
    const uint N
) {
    const uint row = get_global_id(0);
    const uint col = get_global_id(1);
    float acc = 0.0;

    for (uint k = 0; k < N; ++k) {
        acc += A[row*N + k] * B[k*N + col];
    }
    C[row*N + col] = acc;
}

0 commit comments

Comments
 (0)