Commit 83e6b15

priv_cols

1 parent 48a7819

8 files changed: +226 -90 lines

lapack/README.md (+24 -9)

@@ -1,6 +1,4 @@
-# BLAS and LAPACK
-
-TODO BROKEN: how to install LAPACKE on Ubuntu 12.04?
+# LAPACK
 
 BLAS and LAPACK are:
 
@@ -16,9 +14,9 @@ BLAS and LAPACK are:
 
 It might be a good idea to understand how to interface Fortran with C before the C interfaces.
 
-## Related projects
+Many implementations have been made, so they may be considered interfaces derived from an initial implementation nowadays.
 
-### BLAS
+## BLAS
 
 <http://www.netlib.org/blas/>
 
@@ -30,24 +28,27 @@ BLAS contains low level functions such as:
 - vector matrix multiplication
 - matrix matrix multiplication
 
-### LAPACK
+LAPACK uses BLAS
+
+### BLAS vs LAPACK
 
 LAPACK contains higher level functions such as:
 
 - solving linear systems
+- least squares
 - eigenvalue/eigenvector calculations
 
-It now includes an official C interface called LAPACKE.
+It now includes an official C interface called LAPACKE, which other implementations also implement.
 
-This does not ship with the Ubuntu `liblapack-dev` package at the time of writing, but there is a `liblapacke-dev` package available which provides it.
+## Implementations
 
 ### ScaLAPACK
 
 <http://www.netlib.org/scalapack/>
 
 Continuation of LAPACK.
 
-Considers parallelism.
+Considers parallelism distributed across machines.
 
 ### ATLAS
 
@@ -59,6 +60,20 @@ Implements full BLAS, but only part of LAPACK.
 
 Has C interface.
 
+### OpenBLAS
+
+<https://github.com/xianyi/OpenBLAS>
+
+### PBLAS
+
+<https://en.wikipedia.org/wiki/PBLAS>
+
+Created and used by ScaLAPACK.
+
+### MKL
+
+Intel's closed source implementation.
+
 ## C interface
 
 The BLAS project provides `cblas.h`, which contains a C interface for BLAS (TODO but also an implementation?)
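
As a worked example of that `cblas.h` interface (a minimal sketch, not part of this commit; assumes a package that ships `cblas.h`, such as `libblas-dev`, and linking with `-lblas`), this matches the 2x2 `C_ref` values visible in the `opencl/matmul.c` diff below:

    /* gcc gemm2x2.c -lblas   (file name hypothetical) */
    #include <stdio.h>
    #include <cblas.h>

    int main(void) {
        const double A[] = {1.0, 2.0, 3.0, 4.0};
        const double B[] = {5.0, 6.0, 7.0, 8.0};
        double C[4] = {0.0};
        /* C = 1.0 * A * B + 0.0 * C, row-major 2x2 matrices. */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);
        /* Prints: 19 22 / 43 50. */
        printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]);
        return 0;
    }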

opencl/Makefile_params (+4 -2)

@@ -1,3 +1,5 @@
 LIBS := -lm -pthread -lOpenCL -lblas -lclBLAS
-# blas from libatlas-base-dev
-# cblas from libblas-dev
+# Alternative blas implementations with identical interfaces:
+# - cblas from libblas-dev
+# - blas from libatlas-base-dev
+# - openblas from libopenblas-dev
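
Since all of these export the same `cblas_*` / BLAS symbols, switching implementations should just be a matter of swapping the `-lblas` flag above, e.g. for `-lopenblas`; no source changes needed.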

opencl/clinfo.c (+11)

@@ -8,6 +8,10 @@ Full list at:
 
 #include "common.h"
 
+#define PRINT_CHAR(id) \
+    clGetDeviceInfo(device, CL_ ## id, sizeof(buf_char), buf_char, NULL); \
+    printf(#id " = %s\n", buf_char);
+
 #define PRINT_SIZE_T(id) \
     clGetDeviceInfo(device, CL_ ## id, sizeof(buf_size_t), &(buf_size_t), NULL); \
     printf(#id " = %zu\n", buf_size_t);
@@ -21,6 +25,7 @@ Full list at:
     printf(#id " = 0x%lx\n", (uintmax_t)buf_cl_ulong);
 
 int main(void) {
+    char buf_char[256];
     cl_device_id device;
     cl_platform_id platform;
     cl_uint buf_cl_uint;
@@ -33,6 +38,12 @@ int main(void) {
 
     /* Print. */
     puts("#clinfo");
+    PRINT_CHAR(DEVICE_EXTENSIONS);
+    PRINT_CHAR(DEVICE_NAME);
+    PRINT_CHAR(DEVICE_PROFILE);
+    PRINT_CHAR(DEVICE_VENDOR);
+    PRINT_CHAR(DEVICE_VERSION);
+    PRINT_CHAR(DRIVER_VERSION);
     PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE);
     PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS);
     /* TODO this is wrong, it is actually an array of
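
On the truncated TODO above: `CL_DEVICE_MAX_WORK_ITEM_SIZES` indeed returns an array of `size_t` with one entry per dimension, so it needs its own query pattern instead of `PRINT_SIZE_T`. A minimal sketch, not part of the commit:

    cl_uint d, ndims;
    size_t sizes[16]; /* assumes ndims <= 16 */
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
            sizeof(ndims), &ndims, NULL);
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
            ndims * sizeof(size_t), sizes, NULL);
    for (d = 0; d < ndims; ++d)
        printf("DEVICE_MAX_WORK_ITEM_SIZES[%u] = %zu\n", (unsigned)d, sizes[d]);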

opencl/implementations.md (+8)

@@ -2,8 +2,16 @@
 
 <https://en.wikipedia.org/wiki/OpenCL#Implementations>
 
+## ICD
+
+## Installable client driver
+
 There is a certain "installable client driver loader (ICD loader)" which forwards calls to the proprietary implementation.
 
+<https://www.khronos.org/news/permalink/opencl-installable-client-driver-icd-loader>
+
+TODO how to use it.
+
 ## Gallium Compute
 
 <http://www.x.org/wiki/XorgEVoC/GalliumCompute/>
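
Partial answer to that TODO, as an assumption from common Linux setups rather than from this commit: the ICD loader usually discovers vendor implementations through one-line files under `/etc/OpenCL/vendors/*.icd` that name the vendor library, and programs simply link `-lOpenCL` against the loader and enumerate whatever platforms it found:

    #include <stdio.h>
    #include <CL/cl.h>

    int main(void) {
        char name[256];
        cl_platform_id platforms[16];
        cl_uint i, nplatforms;
        /* Each installed ICD typically appears as one platform. */
        clGetPlatformIDs(16, platforms, &nplatforms);
        for (i = 0; i < nplatforms; ++i) {
            clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME,
                    sizeof(name), name, NULL);
            printf("platform %u: %s\n", (unsigned)i, name);
        }
        return 0;
    }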

opencl/matmul.c (+84 -30)

@@ -432,7 +432,7 @@ void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n) {
  * This leads to a thread blockage / memory access tradeoff.
  *
  * We make work groups as large as possible to reload memory less times. */
-void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n) {
+void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n) {
     char options[256];
     cl_mem buf_a, buf_b, buf_c;
     Common common;
@@ -441,13 +441,13 @@ void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n)
 
     /* Setup variables. */
     global_work_size = n;
-    local_work_size = 0;
     mat_sizeof = n * n * sizeof(F);
     ncl = n;
 
     /* Run kernel. */
     snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
     common_init_file_options(&common, "matmul_row_priv_col_local.cl", options);
+    local_work_size = 0;
     clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
     local_work_size = zmin(local_work_size, n);
     buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
@@ -458,7 +458,61 @@ void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n)
     clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
     clSetKernelArg(common.kernel, 3, n * sizeof(F), NULL);
     clSetKernelArg(common.kernel, 4, sizeof(ncl), &ncl);
-    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
+    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
+    clFlush(common.command_queue);
+    clFinish(common.command_queue);
+    clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
+
+    /* Cleanup. */
+    clReleaseMemObject(buf_a);
+    clReleaseMemObject(buf_b);
+    clReleaseMemObject(buf_c);
+    common_deinit(&common);
+}
+
+/* Copy as many cols from B as possible to the local memory, only then start multiplying.
+ * This leads to less memory barrier hits.
+ * How many cols we copy is limited by the local memory size, ideally the entire matrix will fit. */
+void mat_mul_cl_row_priv_cols_local(const F *A, const F *B, F *C, size_t n) {
+    char options[256];
+    cl_mem buf_a, buf_b, buf_c;
+    Common common;
+    cl_uint ncl, n_local_cols;
+    cl_ulong local_mem_size;
+    size_t col_size, global_work_size, local_work_size, mat_sizeof;
+
+    /* Setup variables. */
+    col_size = n * sizeof(F);
+    global_work_size = n;
+    mat_sizeof = n * n * sizeof(F);
+    ncl = n;
+
+    /* Run kernel. */
+    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
+    common_init_file_options(&common, "matmul_row_priv_cols_local.cl", options);
+    local_work_size = 0;
+    clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
+    local_work_size = zmin(local_work_size, n);
+    local_mem_size = 0;
+    clGetDeviceInfo(common.device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
+    /* TODO: can blow up without that - 1. Why?
+     * It only reaches the max without it, not crosses, right?
+     * So bug in the kernel? */
+    n_local_cols = zmin(local_mem_size / col_size, n) - 1;
+    /*puts("");*/
+    /*printf("max memory %llu\n", (unsigned long long)local_mem_size);*/
+    /*printf("n_local_cols %llu\n", (unsigned long long)n_local_cols);*/
+    /*printf("memory %llu\n", (unsigned long long)n_local_cols * n * sizeof(F));*/
+    buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
+    buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL);
+    buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL);
+    clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
+    clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
+    clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
+    clSetKernelArg(common.kernel, 3, n_local_cols * col_size, NULL);
+    clSetKernelArg(common.kernel, 4, sizeof(ncl), &ncl);
+    clSetKernelArg(common.kernel, 5, sizeof(n_local_cols), &n_local_cols);
+    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
     clFlush(common.command_queue);
     clFinish(common.command_queue);
     clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
@@ -477,8 +531,6 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     size_t global_work_size[2], local_work_size[2], mat_sizeof, nblk;
 
     /* Setup variables. */
-    /* Cannot be larger than 1 on this example, otherwise memory conflicts
-     * will happen between work items. */
     global_work_size[0] = n;
     global_work_size[1] = n;
     mat_sizeof = n * n * sizeof(F);
@@ -488,6 +540,7 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     common_init_file(&common, "matmul_block.cl");
     clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nblk), &nblk, NULL);
     nblk = sqrt(zmin(nblk, n));
+    nblk = zmin(nblk, 3);
     nblkcl = nblk;
     local_work_size[0] = nblk;
     local_work_size[1] = nblk;
@@ -498,6 +551,8 @@
     clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
     clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
     clSetKernelArg(common.kernel, 3, nblk * nblk * sizeof(F), NULL);
+    printf("nblk = %llu\n", (unsigned long long)nblk);
+    printf("local memory = %llu\n", (unsigned long long)2 * nblk * nblk * sizeof(F));
     clSetKernelArg(common.kernel, 4, nblk * nblk * sizeof(F), NULL);
     clSetKernelArg(common.kernel, 5, sizeof(ncl), &ncl);
     clSetKernelArg(common.kernel, 6, sizeof(nblkcl), &nblkcl);
@@ -534,17 +589,18 @@ int main(int argc, char **argv) {
     double max_runtime;
     /* Overly slow ones commented out by default. */
     MatMul mat_mul_funcs[] = {
-        mat_mul_cpu_trans,
-        mat_mul_cpu_trans_vec,
-        mat_mul_cpu_block,
+        /*mat_mul_cpu_trans,*/
+        /*mat_mul_cpu_trans_vec,*/
+        /*mat_mul_cpu_block,*/
         mat_mul_cpu_cblas,
-        mat_mul_cl,
-        mat_mul_cl_row_priv,
-        mat_mul_cl_row_local,
-        mat_mul_cl_row_priv_priv_col_local,
-        /* TODO broken. */
-        /*mat_mul_cl_block,*/
-        mat_mul_cl_clblas,
+        /*mat_mul_cl,*/
+        /*mat_mul_cl_row_priv,*/
+        /*mat_mul_cl_row_local,*/
+        /*mat_mul_cl_row_priv_col_local,*/
+        /*mat_mul_cl_row_priv_cols_local,*/
+        /* TODO broken for 32 or up, some cells contain trash. */
+        mat_mul_cl_block,
+        /*mat_mul_cl_clblas,*/
     };
     int first, func_done[NELEMS(mat_mul_funcs)] = {0};
     size_t f, i;
@@ -572,7 +628,6 @@
             19.0, 22.0,
             43.0, 50.0
         };
-
         for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
             mat_zero(C, n);
             mat_mul_funcs[f](A, B, C, n);
@@ -583,26 +638,25 @@
     /* Unit test 4x4. */
     {
         const F A[] = {
-            1.0, 2.0, 3.0, 4.0,
-            5.0, 6.0, 7.0, 8.0,
-            9.0, 10.0, 11.0, 12.0,
-            13.0, 14.0, 15.0, 16.0,
+             1.0,  2.0,  3.0,  4.0,
+             5.0,  6.0,  7.0,  8.0,
+             9.0, 10.0, 11.0, 12.0,
+            13.0, 14.0, 15.0, 16.0,
         };
         const F B[] = {
-            17.0, 18.0, 19.0, 20.0,
-            21.0, 22.0, 23.0, 24.0,
-            25.0, 26.0, 27.0, 28.0,
-            29.0, 30.0, 31.0, 32.0,
+            17.0, 18.0, 19.0, 20.0,
+            21.0, 22.0, 23.0, 24.0,
+            25.0, 26.0, 27.0, 28.0,
+            29.0, 30.0, 31.0, 32.0,
         };
         const F C_ref[] = {
-            250.000000, 260.000000, 270.000000, 280.000000,
-            618.000000, 644.000000, 670.000000, 696.000000,
-            986.000000, 1028.000000, 1070.000000, 1112.000000,
-            1354.000000, 1412.000000, 1470.000000, 1528.000000,
+             250.0,  260.0,  270.0,  280.0,
+             618.0,  644.0,  670.0,  696.0,
+             986.0, 1028.0, 1070.0, 1112.0,
+            1354.0, 1412.0, 1470.0, 1528.0,
         };
         enum N { n = 4 };
         F C[n*n];
-
        for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n);
@@ -615,7 +669,7 @@
     double dt;
     F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL, *dst = NULL, *ref = NULL;
     int done;
-    size_t n = 4, a_sizeof;
+    size_t n = 2, a_sizeof;
 
     done = 0;
     puts("#matmul");
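
`matmul_row_priv_cols_local.cl` itself is not shown in this diff, so the following is only a sketch of a kernel that would match the `clSetKernelArg` calls in `mat_mul_cl_row_priv_cols_local` above: one work item per row of C, a private copy of that row of A sized by the `-DPRIV_ROW_SIZE` option, and chunks of `n_local_cols` columns of B staged in local memory between barriers. All names except the kernel arguments are hypothetical:

    __kernel void main(
        __global const float *A,
        __global const float *B,
        __global float *C,
        __local float *B_local, /* n_local_cols columns of B */
        const uint n,
        const uint n_local_cols
    ) {
        const uint row = get_global_id(0);
        float A_priv[PRIV_ROW_SIZE]; /* private copy of one row of A */
        uint j, jblk, jmax, k;
        for (k = 0; k < n; ++k)
            A_priv[k] = A[row * n + k];
        /* Consume B in column chunks that fit in local memory. */
        for (jblk = 0; jblk < n; jblk += n_local_cols) {
            jmax = min(jblk + n_local_cols, n);
            /* Cooperative load of the chunk by the work group. */
            for (j = jblk + get_local_id(0); j < jmax; j += get_local_size(0))
                for (k = 0; k < n; ++k)
                    B_local[(j - jblk) * n + k] = B[k * n + j];
            barrier(CLK_LOCAL_MEM_FENCE);
            for (j = jblk; j < jmax; ++j) {
                float sum = 0.0f;
                for (k = 0; k < n; ++k)
                    sum += A_priv[k] * B_local[(j - jblk) * n + k];
                C[row * n + j] = sum;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }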

opencl/matmul_block.cl (+29 -26)

@@ -2,34 +2,37 @@ __kernel void main(
     __global const float* restrict A,
     __global const float* restrict B,
     __global float* restrict C,
-    __local float* restrict Awrk,
-    __local float* restrict Bwrk,
+    __local float* restrict Aloc,
+    __local float* restrict Bloc,
     const uint n,
     const uint blksz
 ) {
-    int kloc, Kblk;
-    float Ctmp=0.0f;
-    const uint i = get_global_id(0);
-    const uint j = get_global_id(1);
-    const uint Iblk = get_group_id(0);
-    const uint Jblk = get_group_id(1);
-    const uint iloc = get_local_id(0);
-    const uint jloc = get_local_id(1);
-    const uint nblks = n/blksz;
-    uint Abase = Iblk*n*blksz;
-    const uint Ainc = blksz;
-    uint Bbase = Jblk*blksz;
-    const uint Binc = blksz*n;
-    for (Kblk = 0; Kblk<nblks; Kblk++) {
-        Awrk[jloc*blksz+iloc] = A[Abase+jloc*n+iloc];
-        Bwrk[jloc*blksz+iloc] = B[Bbase+jloc*n+iloc];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (kloc=0; kloc<blksz; kloc++)
-            Ctmp += Awrk[jloc*blksz+kloc] * Bwrk[kloc*blksz+iloc];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        Abase += Ainc;
-        Bbase += Binc;
+    const uint
+        iloc = get_local_id(0),
+        jloc = get_local_id(1),
+        nblks = n / blksz,
+        b_inc = blksz * n
+    ;
+    float c_tmp = 0.0;
+    uint
+        a_base = get_group_id(1) * blksz * n,
+        b_base = get_group_id(0) * blksz,
+        iload,
+        iload_loc,
+        kloc,
+        kblk
+    ;
+    for (kblk = 0; kblk < nblks; kblk++) {
+        iload_loc = jloc * blksz + iloc;
+        iload = jloc * n + iloc;
+        Aloc[iload_loc] = A[a_base + iload];
+        Bloc[iload_loc] = B[b_base + iload];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (kloc = 0; kloc < blksz; kloc++)
+            c_tmp += Aloc[jloc * blksz + kloc] * Bloc[kloc * blksz + iloc];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        a_base += blksz;
+        b_base += b_inc;
     }
-    C[j*n+i] = Ctmp;
-
+    C[get_global_id(1) * n + get_global_id(0)] = c_tmp;
 }
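
For scale: with the new host-side cap `nblk = zmin(nblk, 3)` from `matmul.c` and assuming `F` is `float`, the two `__local` tiles `Aloc` and `Bloc` occupy only 2 * 3 * 3 * sizeof(float) = 72 bytes, far below the tens of KiB that `CL_DEVICE_LOCAL_MEM_SIZE` typically reports, so the "broken for 32 or up" TODO looks more like an indexing bug than local memory exhaustion.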
