Commit 83e6b15

priv_cols

1 parent 48a7819

8 files changed: +226 -90 lines

lapack/README.md (+24 -9)

@@ -1,6 +1,4 @@
-# BLAS and LAPACK
-
-TODO BROKEN: how to install LAPACKE on Ubuntu 12.04?
+# LAPACK
 
 BLAS and LAPACK are:
 
@@ -16,9 +14,9 @@ BLAS and LAPACK are:
 
 It might be a good idea to understand how to interface Fortran with C before the C interfaces.
 
-## Related projects
+Many implementations have been made, so they may be considered interfaces derived from an initial implementation nowadays.
 
-### BLAS
+## BLAS
 
 <http://www.netlib.org/blas/>
 
@@ -30,24 +28,27 @@ BLAS contains low level functions such as:
 - vector matrix multiplication
 - matrix matrix multiplication
 
-### LAPACK
+LAPACK uses BLAS
+
+### BLAS vs LAPACK
 
 LAPACK contains higher level functions such as:
 
 - solving linear systems
+- least squares
 - eigenvalue/eigenvector calculations
 
-It now includes an official C interface called LAPACKE.
+It now includes an official C interface called LAPACKE, which other implementations also implement.
 
-This does not ship with the Ubuntu `liblapack-dev` package at the time of writing, but there is a `liblapacke-dev` package available which provides it.
+## Implementations
 
 ### ScaLAPACK
 
 <http://www.netlib.org/scalapack/>
 
 Continuation of LAPACK.
 
-Considers parallelism.
+Considers parallelism distributed across machines.
 
 ### ATLAS
 
@@ -59,6 +60,20 @@ Implements full BLAS, but only part of LAPACK.
 
 Has C interface.
 
+### OpenBLAS
+
+<https://github.com/xianyi/OpenBLAS>
+
+### PBLAS
+
+<https://en.wikipedia.org/wiki/PBLAS>
+
+Created and used by ScaLAPACK.
+
+### MKL
+
+Intel's closed source implementation.
+
 ## C interface
 
 The BLAS project provides `cblas.h`, which contains a C interface for BLAS (TODO but also an implementation?)
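
As a worked example of that `cblas.h` interface (a minimal sketch, not part of this commit; assumes a package that ships `cblas.h`, such as `libblas-dev`, and linking with `-lblas`), this matches the 2x2 `C_ref` values visible in the `opencl/matmul.c` diff below:

    /* gcc gemm2x2.c -lblas   (file name hypothetical) */
    #include <stdio.h>
    #include <cblas.h>

    int main(void) {
        const double A[] = {1.0, 2.0, 3.0, 4.0};
        const double B[] = {5.0, 6.0, 7.0, 8.0};
        double C[4] = {0.0};
        /* C = 1.0 * A * B + 0.0 * C, row-major 2x2 matrices. */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);
        /* Prints: 19 22 / 43 50. */
        printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]);
        return 0;
    }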

opencl/Makefile_params (+4 -2)

@@ -1,3 +1,5 @@
 LIBS := -lm -pthread -lOpenCL -lblas -lclBLAS
-# blas from libatlas-base-dev
-# cblas from libblas-dev
+# Alternative blas implementations with identical interfaces:
+# - cblas from libblas-dev
+# - blas from libatlas-base-dev
+# - openblas from libopenblas-dev
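
Since all of these export the same `cblas_*` / BLAS symbols, switching implementations should just be a matter of swapping the `-lblas` flag above, e.g. for `-lopenblas`; no source changes needed.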

opencl/clinfo.c (+11)

@@ -8,6 +8,10 @@ Full list at:
 
 #include "common.h"
 
+#define PRINT_CHAR(id) \
+    clGetDeviceInfo(device, CL_ ## id, sizeof(buf_char), buf_char, NULL); \
+    printf(#id " = %s\n", buf_char);
+
 #define PRINT_SIZE_T(id) \
     clGetDeviceInfo(device, CL_ ## id, sizeof(buf_size_t), &(buf_size_t), NULL); \
     printf(#id " = %zu\n", buf_size_t);
@@ -21,6 +25,7 @@ Full list at:
     printf(#id " = 0x%lx\n", (uintmax_t)buf_cl_ulong);
 
 int main(void) {
+    char buf_char[256];
     cl_device_id device;
     cl_platform_id platform;
     cl_uint buf_cl_uint;
@@ -33,6 +38,12 @@ int main(void) {
 
     /* Print. */
     puts("#clinfo");
+    PRINT_CHAR(DEVICE_EXTENSIONS);
+    PRINT_CHAR(DEVICE_NAME);
+    PRINT_CHAR(DEVICE_PROFILE);
+    PRINT_CHAR(DEVICE_VENDOR);
+    PRINT_CHAR(DEVICE_VERSION);
+    PRINT_CHAR(DRIVER_VERSION);
     PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE);
     PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS);
     /* TODO this is wrong, it is actually an array of
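
On the truncated TODO above: `CL_DEVICE_MAX_WORK_ITEM_SIZES` indeed returns an array of `size_t` with one entry per dimension, so it needs its own query pattern instead of `PRINT_SIZE_T`. A minimal sketch, not part of the commit:

    cl_uint d, ndims;
    size_t sizes[16]; /* assumes ndims <= 16 */
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
            sizeof(ndims), &ndims, NULL);
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
            ndims * sizeof(size_t), sizes, NULL);
    for (d = 0; d < ndims; ++d)
        printf("DEVICE_MAX_WORK_ITEM_SIZES[%u] = %zu\n", (unsigned)d, sizes[d]);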

opencl/implementations.md (+8)

@@ -2,8 +2,16 @@
 
 <https://en.wikipedia.org/wiki/OpenCL#Implementations>
 
+## ICD
+
+## Installable client driver
+
 There is a certain "installable client driver loader (ICD loader)" which forwards calls to the proprietary implementation.
 
+<https://www.khronos.org/news/permalink/opencl-installable-client-driver-icd-loader>
+
+TODO how to use it.
+
 ## Gallium Compute
 
 <http://www.x.org/wiki/XorgEVoC/GalliumCompute/>
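
Partial answer to that TODO, as an assumption from common Linux setups rather than from this commit: the ICD loader usually discovers vendor implementations through one-line files under `/etc/OpenCL/vendors/*.icd` that name the vendor library, and programs simply link `-lOpenCL` against the loader and enumerate whatever platforms it found:

    #include <stdio.h>
    #include <CL/cl.h>

    int main(void) {
        char name[256];
        cl_platform_id platforms[16];
        cl_uint i, nplatforms;
        /* Each installed ICD typically appears as one platform. */
        clGetPlatformIDs(16, platforms, &nplatforms);
        for (i = 0; i < nplatforms; ++i) {
            clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME,
                    sizeof(name), name, NULL);
            printf("platform %u: %s\n", (unsigned)i, name);
        }
        return 0;
    }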

opencl/matmul.c (+84 -30)

@@ -432,7 +432,7 @@ void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n) {
  * This leads to a thread blockage / memory access tradeoff.
  *
  * We make work groups as large as possible to reload memory less times. */
-void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n) {
+void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n) {
     char options[256];
     cl_mem buf_a, buf_b, buf_c;
     Common common;
@@ -441,13 +441,13 @@ void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n)
 
     /* Setup variables. */
     global_work_size = n;
-    local_work_size = 0;
     mat_sizeof = n * n * sizeof(F);
     ncl = n;
 
     /* Run kernel. */
     snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
     common_init_file_options(&common, "matmul_row_priv_col_local.cl", options);
+    local_work_size = 0;
     clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
     local_work_size = zmin(local_work_size, n);
     buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
@@ -458,7 +458,61 @@ void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n)
     clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
     clSetKernelArg(common.kernel, 3, n * sizeof(F), NULL);
     clSetKernelArg(common.kernel, 4, sizeof(ncl), &ncl);
-    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
+    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
+    clFlush(common.command_queue);
+    clFinish(common.command_queue);
+    clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
+
+    /* Cleanup. */
+    clReleaseMemObject(buf_a);
+    clReleaseMemObject(buf_b);
+    clReleaseMemObject(buf_c);
+    common_deinit(&common);
+}
+
+/* Copy as many cols from B as possible to the local memory, only then start multiplying.
+ * This leads to less memory barrier hits.
+ * How many cols we copy is limited by the local memory size, ideally the entire matrix will fit. */
+void mat_mul_cl_row_priv_cols_local(const F *A, const F *B, F *C, size_t n) {
+    char options[256];
+    cl_mem buf_a, buf_b, buf_c;
+    Common common;
+    cl_uint ncl, n_local_cols;
+    cl_ulong local_mem_size;
+    size_t col_size, global_work_size, local_work_size, mat_sizeof;
+
+    /* Setup variables. */
+    col_size = n * sizeof(F);
+    global_work_size = n;
+    mat_sizeof = n * n * sizeof(F);
+    ncl = n;
+
+    /* Run kernel. */
+    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
+    common_init_file_options(&common, "matmul_row_priv_cols_local.cl", options);
+    local_work_size = 0;
+    clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
+    local_work_size = zmin(local_work_size, n);
+    local_mem_size = 0;
+    clGetDeviceInfo(common.device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
+    /* TODO: can blow up without that - 1. Why?
+     * It only reaches the max without it, not crosses, right?
+     * So bug in the kernel? */
+    n_local_cols = zmin(local_mem_size / col_size, n) - 1;
+    /*puts("");*/
+    /*printf("max memory %llu\n", (unsigned long long)local_mem_size);*/
+    /*printf("n_local_cols %llu\n", (unsigned long long)n_local_cols);*/
+    /*printf("memory %llu\n", (unsigned long long)n_local_cols * n * sizeof(F));*/
+    buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
+    buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL);
+    buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL);
+    clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
+    clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
+    clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
+    clSetKernelArg(common.kernel, 3, n_local_cols * col_size, NULL);
+    clSetKernelArg(common.kernel, 4, sizeof(ncl), &ncl);
+    clSetKernelArg(common.kernel, 5, sizeof(n_local_cols), &n_local_cols);
+    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
     clFlush(common.command_queue);
     clFinish(common.command_queue);
     clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
@@ -477,8 +531,6 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     size_t global_work_size[2], local_work_size[2], mat_sizeof, nblk;
 
     /* Setup variables. */
-    /* Cannot be larger than 1 on this example, otherwise memory conflicts
-     * will happen between work items. */
     global_work_size[0] = n;
     global_work_size[1] = n;
     mat_sizeof = n * n * sizeof(F);
@@ -488,6 +540,7 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     common_init_file(&common, "matmul_block.cl");
     clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nblk), &nblk, NULL);
     nblk = sqrt(zmin(nblk, n));
+    nblk = zmin(nblk, 3);
     nblkcl = nblk;
     local_work_size[0] = nblk;
     local_work_size[1] = nblk;
@@ -498,6 +551,8 @@
     clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
     clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
     clSetKernelArg(common.kernel, 3, nblk * nblk * sizeof(F), NULL);
+    printf("nblk = %llu\n", (unsigned long long)nblk);
+    printf("local memory = %llu\n", (unsigned long long)2 * nblk * nblk * sizeof(F));
     clSetKernelArg(common.kernel, 4, nblk * nblk * sizeof(F), NULL);
     clSetKernelArg(common.kernel, 5, sizeof(ncl), &ncl);
     clSetKernelArg(common.kernel, 6, sizeof(nblkcl), &nblkcl);
@@ -534,17 +589,18 @@ int main(int argc, char **argv) {
     double max_runtime;
     /* Overly slow ones commented out by default. */
     MatMul mat_mul_funcs[] = {
-        mat_mul_cpu_trans,
-        mat_mul_cpu_trans_vec,
-        mat_mul_cpu_block,
+        /*mat_mul_cpu_trans,*/
+        /*mat_mul_cpu_trans_vec,*/
+        /*mat_mul_cpu_block,*/
         mat_mul_cpu_cblas,
-        mat_mul_cl,
-        mat_mul_cl_row_priv,
-        mat_mul_cl_row_local,
-        mat_mul_cl_row_priv_priv_col_local,
-        /* TODO broken. */
-        /*mat_mul_cl_block,*/
-        mat_mul_cl_clblas,
+        /*mat_mul_cl,*/
+        /*mat_mul_cl_row_priv,*/
+        /*mat_mul_cl_row_local,*/
+        /*mat_mul_cl_row_priv_col_local,*/
+        /*mat_mul_cl_row_priv_cols_local,*/
+        /* TODO broken for 32 or up, some cells contain trash. */
+        mat_mul_cl_block,
+        /*mat_mul_cl_clblas,*/
     };
     int first, func_done[NELEMS(mat_mul_funcs)] = {0};
     size_t f, i;
@@ -572,7 +628,6 @@
             19.0, 22.0,
             43.0, 50.0
         };
-
         for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
             mat_zero(C, n);
             mat_mul_funcs[f](A, B, C, n);
@@ -583,26 +638,25 @@
     /* Unit test 4x4. */
     {
         const F A[] = {
-            1.0, 2.0, 3.0, 4.0,
-            5.0, 6.0, 7.0, 8.0,
-            9.0, 10.0, 11.0, 12.0,
-            13.0, 14.0, 15.0, 16.0,
+             1.0,  2.0,  3.0,  4.0,
+             5.0,  6.0,  7.0,  8.0,
+             9.0, 10.0, 11.0, 12.0,
+            13.0, 14.0, 15.0, 16.0,
         };
         const F B[] = {
-            17.0, 18.0, 19.0, 20.0,
-            21.0, 22.0, 23.0, 24.0,
-            25.0, 26.0, 27.0, 28.0,
-            29.0, 30.0, 31.0, 32.0,
+            17.0, 18.0, 19.0, 20.0,
+            21.0, 22.0, 23.0, 24.0,
+            25.0, 26.0, 27.0, 28.0,
+            29.0, 30.0, 31.0, 32.0,
         };
         const F C_ref[] = {
-            250.000000, 260.000000, 270.000000, 280.000000,
-            618.000000, 644.000000, 670.000000, 696.000000,
-            986.000000, 1028.000000, 1070.000000, 1112.000000,
-            1354.000000, 1412.000000, 1470.000000, 1528.000000,
+             250.0,  260.0,  270.0,  280.0,
+             618.0,  644.0,  670.0,  696.0,
+             986.0, 1028.0, 1070.0, 1112.0,
+            1354.0, 1412.0, 1470.0, 1528.0,
         };
         enum N { n = 4 };
         F C[n*n];
-
        for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n);
@@ -615,7 +669,7 @@
     double dt;
     F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL, *dst = NULL, *ref = NULL;
     int done;
-    size_t n = 4, a_sizeof;
+    size_t n = 2, a_sizeof;
 
     done = 0;
     puts("#matmul");
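
`matmul_row_priv_cols_local.cl` itself is not shown in this diff, so the following is only a sketch of a kernel that would match the `clSetKernelArg` calls in `mat_mul_cl_row_priv_cols_local` above: one work item per row of C, a private copy of that row of A sized by the `-DPRIV_ROW_SIZE` option, and chunks of `n_local_cols` columns of B staged in local memory between barriers. All names except the kernel arguments are hypothetical:

    __kernel void main(
        __global const float *A,
        __global const float *B,
        __global float *C,
        __local float *B_local, /* n_local_cols columns of B */
        const uint n,
        const uint n_local_cols
    ) {
        const uint row = get_global_id(0);
        float A_priv[PRIV_ROW_SIZE]; /* private copy of one row of A */
        uint j, jblk, jmax, k;
        for (k = 0; k < n; ++k)
            A_priv[k] = A[row * n + k];
        /* Consume B in column chunks that fit in local memory. */
        for (jblk = 0; jblk < n; jblk += n_local_cols) {
            jmax = min(jblk + n_local_cols, n);
            /* Cooperative load of the chunk by the work group. */
            for (j = jblk + get_local_id(0); j < jmax; j += get_local_size(0))
                for (k = 0; k < n; ++k)
                    B_local[(j - jblk) * n + k] = B[k * n + j];
            barrier(CLK_LOCAL_MEM_FENCE);
            for (j = jblk; j < jmax; ++j) {
                float sum = 0.0f;
                for (k = 0; k < n; ++k)
                    sum += A_priv[k] * B_local[(j - jblk) * n + k];
                C[row * n + j] = sum;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }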

opencl/matmul_block.cl (+29 -26)

@@ -2,34 +2,37 @@ __kernel void main(
     __global const float* restrict A,
     __global const float* restrict B,
     __global float* restrict C,
-    __local float* restrict Awrk,
-    __local float* restrict Bwrk,
+    __local float* restrict Aloc,
+    __local float* restrict Bloc,
     const uint n,
     const uint blksz
 ) {
-    int kloc, Kblk;
-    float Ctmp=0.0f;
-    const uint i = get_global_id(0);
-    const uint j = get_global_id(1);
-    const uint Iblk = get_group_id(0);
-    const uint Jblk = get_group_id(1);
-    const uint iloc = get_local_id(0);
-    const uint jloc = get_local_id(1);
-    const uint nblks = n/blksz;
-    uint Abase = Iblk*n*blksz;
-    const uint Ainc = blksz;
-    uint Bbase = Jblk*blksz;
-    const uint Binc = blksz*n;
-    for (Kblk = 0; Kblk<nblks; Kblk++) {
-        Awrk[jloc*blksz+iloc] = A[Abase+jloc*n+iloc];
-        Bwrk[jloc*blksz+iloc] = B[Bbase+jloc*n+iloc];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (kloc=0; kloc<blksz; kloc++)
-            Ctmp += Awrk[jloc*blksz+kloc] * Bwrk[kloc*blksz+iloc];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        Abase += Ainc;
-        Bbase += Binc;
+    const uint
+        iloc = get_local_id(0),
+        jloc = get_local_id(1),
+        nblks = n / blksz,
+        b_inc = blksz * n
+    ;
+    float c_tmp = 0.0;
+    uint
+        a_base = get_group_id(1) * blksz * n,
+        b_base = get_group_id(0) * blksz,
+        iload,
+        iload_loc,
+        kloc,
+        kblk
+    ;
+    for (kblk = 0; kblk < nblks; kblk++) {
+        iload_loc = jloc * blksz + iloc;
+        iload = jloc * n + iloc;
+        Aloc[iload_loc] = A[a_base + iload];
+        Bloc[iload_loc] = B[b_base + iload];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (kloc = 0; kloc < blksz; kloc++)
+            c_tmp += Aloc[jloc * blksz + kloc] * Bloc[kloc * blksz + iloc];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        a_base += blksz;
+        b_base += b_inc;
     }
-    C[j*n+i] = Ctmp;
-
+    C[get_global_id(1) * n + get_global_id(0)] = c_tmp;
 }
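
For scale: with the new host-side cap `nblk = zmin(nblk, 3)` from `matmul.c` and assuming `F` is `float`, the two `__local` tiles `Aloc` and `Bloc` occupy only 2 * 3 * 3 * sizeof(float) = 72 bytes, far below the tens of KiB that `CL_DEVICE_LOCAL_MEM_SIZE` typically reports, so the "broken for 32 or up" TODO looks more like an indexing bug than local memory exhaustion.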
