|
| 1 | +/* |
| 2 | +Matrix multiplication. |
| 3 | +
|
| 4 | +The most basic / useful application where OpenCL might be faster than CPU. |
| 5 | +
|
| 6 | +TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations: |
| 7 | +
|
| 8 | +- http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c |
| 9 | +- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c |
| 10 | +- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++ |
| 11 | +*/ |
| 12 | + |
| 13 | +#include "common.h" |
| 14 | + |
| 15 | +typedef cl_float F; |
| 16 | + |
| 17 | +/* C = A*B, width n, naive. */ |
| 18 | +void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) { |
| 19 | + F tmp; |
| 20 | + size_t i, j, k; |
| 21 | + |
| 22 | + for (i = 0; i < n; ++i) { |
| 23 | + for (j = 0; j < n; ++j) { |
| 24 | + tmp = 0; |
| 25 | + for (k = 0; k < n; ++k) { |
| 26 | + tmp += A[i*n+k] * B[k*n+j]; |
| 27 | + } |
| 28 | + C[i*n+j] = tmp; |
| 29 | + } |
| 30 | + } |
| 31 | +} |
| 32 | + |
| 33 | +/* Simplest possible implementation. */ |
| 34 | +void mat_mul_cl(const F *A, const F *B, F *C, size_t n) { |
| 35 | + cl_mem buf_a, buf_b, buf_c; |
| 36 | + Common common; |
| 37 | + cl_uint ncl; |
| 38 | + size_t global_work_size[2], mat_sizeof, n2; |
| 39 | + |
| 40 | + /* Setup variables. */ |
| 41 | + global_work_size[0] = n; |
| 42 | + global_work_size[1] = n; |
| 43 | + n2 = n * n; |
| 44 | + mat_sizeof = n2 * sizeof(F); |
| 45 | + ncl = n; |
| 46 | + |
| 47 | + /* Run kernel. */ |
| 48 | + common_init_file(&common, "matmul.cl"); |
| 49 | + buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL); |
| 50 | + buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL); |
| 51 | + buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL); |
| 52 | + clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a); |
| 53 | + clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b); |
| 54 | + clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c); |
| 55 | + clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl); |
| 56 | + clEnqueueNDRangeKernel(common.command_queue, common.kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); |
| 57 | + clFlush(common.command_queue); |
| 58 | + clFinish(common.command_queue); |
| 59 | + clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); |
| 60 | + |
| 61 | + /* Cleanup. */ |
| 62 | + clReleaseMemObject(buf_a); |
| 63 | + clReleaseMemObject(buf_b); |
| 64 | + clReleaseMemObject(buf_c); |
| 65 | + common_deinit(&common); |
| 66 | +} |
| 67 | + |
| 68 | +/* Check if two matrices are equal with given mean squared err_maxor. */ |
| 69 | +int mat_eq(const F *A, const F *B, size_t n) { |
| 70 | + const F err_max = 10e-3; |
| 71 | + F err, diff, a, b; |
| 72 | + size_t i, i_max; |
| 73 | + |
| 74 | + err = 0.0; |
| 75 | + i_max = n*n; |
| 76 | + for (i = 0; i < i_max; ++i) { |
| 77 | + a = A[i]; |
| 78 | + b = B[i]; |
| 79 | + diff = a - b; |
| 80 | + err += diff * diff; |
| 81 | + } |
| 82 | + return (sqrt(err) / i_max) < err_max; |
| 83 | +} |
| 84 | + |
| 85 | +/* No, this was not created for debugging, my code is flawless from the first try. */ |
| 86 | +void mat_print(const F *A, size_t n) { |
| 87 | + size_t i, j; |
| 88 | + for (i = 0; i < n; ++i) { |
| 89 | + for (j = 0; j < n; ++j) { |
| 90 | + printf("%f ", A[i*n+j]); |
| 91 | + } |
| 92 | + puts(""); |
| 93 | + } |
| 94 | +} |
| 95 | + |
| 96 | +/* Zero a matrix. */ |
| 97 | +F * mat_zero(F *A, size_t n) { |
| 98 | + size_t i, n2; |
| 99 | + n2 = n*n; |
| 100 | + for (i = 0; i < n2; ++i) { |
| 101 | + A[i] = 0.0; |
| 102 | + } |
| 103 | +} |
| 104 | + |
| 105 | +/* Initialize a random matrix. */ |
| 106 | +F * mat_rand(F *A, size_t n) { |
| 107 | + size_t i, n2; |
| 108 | + n2 = n*n; |
| 109 | + for (i = 0; i < n2; ++i) { |
| 110 | + A[i] = ((float)rand()) / ((float)RAND_MAX); |
| 111 | + } |
| 112 | +} |
| 113 | + |
| 114 | +int main(void) { |
| 115 | + srand(time(NULL)); |
| 116 | + |
| 117 | + /* Unit test our implementations. */ |
| 118 | + { |
| 119 | + const F A[] = { |
| 120 | + 1.0, 2.0, |
| 121 | + 3.0, 4.0 |
| 122 | + }; |
| 123 | + const F B[] = { |
| 124 | + 5.0, 6.0, |
| 125 | + 7.0, 8.0 |
| 126 | + }; |
| 127 | + size_t n = sqrt(sizeof(A)/sizeof(A[0])); |
| 128 | + F C[n*n]; |
| 129 | + const F C_expect[] = { |
| 130 | + 19.0, 22.0, |
| 131 | + 43.0, 50.0 |
| 132 | + }; |
| 133 | + |
| 134 | + mat_zero(C, n); |
| 135 | + mat_mul_cpu(A, B, C, n); |
| 136 | + assert(mat_eq(C, C_expect, n)); |
| 137 | + |
| 138 | + mat_zero(C, n); |
| 139 | + mat_mul_cl(A, B, C, n); |
| 140 | + assert(mat_eq(C, C_expect, n)); |
| 141 | + } |
| 142 | + |
| 143 | + /* Benchmarks. */ |
| 144 | + { |
| 145 | + F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL; |
| 146 | + double dt, time; |
| 147 | + size_t i, n = 1, n2, a_sizeof; |
| 148 | + |
| 149 | + puts("#matmul"); |
| 150 | + puts("n mat_mul_cpu mat_mul_cl"); |
| 151 | + while(1) { |
| 152 | + printf("%zu ", n); |
| 153 | + n2 = n * n; |
| 154 | + a_sizeof = n2 * sizeof(F); |
| 155 | + A = realloc(A, a_sizeof); |
| 156 | + B = realloc(B, a_sizeof); |
| 157 | + C_ref = realloc(C_ref, a_sizeof); |
| 158 | + C = realloc(C, a_sizeof); |
| 159 | + if (A == NULL || B == NULL || C == NULL) { |
| 160 | + printf("Could not allocate memory for n = %zu", n); |
| 161 | + break; |
| 162 | + } |
| 163 | + mat_rand(A, n); |
| 164 | + mat_rand(B, n); |
| 165 | + |
| 166 | + time = common_get_nanos(); |
| 167 | + mat_mul_cpu(A, B, C_ref, n); |
| 168 | + dt = common_get_nanos() - time; |
| 169 | + printf("%f ", dt); |
| 170 | + |
| 171 | + time = common_get_nanos(); |
| 172 | + mat_mul_cl(A, B, C, n); |
| 173 | + printf("%f", common_get_nanos() - time); |
| 174 | + |
| 175 | + assert(mat_eq(C, C_ref, n)); |
| 176 | + puts(""); |
| 177 | + if (dt > 4.0) |
| 178 | + break; |
| 179 | + n *= 2; |
| 180 | + } |
| 181 | + free(A); |
| 182 | + free(B); |
| 183 | + free(C); |
| 184 | + } |
| 185 | + |
| 186 | + return EXIT_SUCCESS; |
| 187 | +} |
0 commit comments