wall

cirosantilli · cirosantilli · commit 15b6cb6e38e9 · 2017-03-13T16:42:09.000Z
diff --git a/Makefile_many b/Makefile_many
@@ -13,8 +13,8 @@ I               ?= #-I/usr/include
 O               ?= 0
 STD             ?= c11
 PEDANTIC        ?= -pedantic-errors
-CFLAGS          ?= -g$(G) -O$(O) -pthread -std=$(STD) -Wextra $(PEDANTIC) $(CFLAGS_EXTRA)   #-pg 
-MYCXXFLAGS      ?= -g$(G) -O$(O) -pthread -std=c++14  -Wextra $(PEDANTIC) $(CXXFLAGS_EXTRA) #-pg 
+CFLAGS          ?= -g$(G) -O$(O) -pthread -std=$(STD) -Wall -Wextra $(PEDANTIC) $(CFLAGS_EXTRA)   #-pg 
+MYCXXFLAGS      ?= -g$(G) -O$(O) -pthread -std=c++14  -Wall -Wextra $(PEDANTIC) $(CXXFLAGS_EXTRA) #-pg 
 
 LIBS            ?= -lm -lrt #-lGL -lGLU -lglut
 
diff --git a/opencl/README.md b/opencl/README.md
@@ -9,6 +9,7 @@
     1.  [Work item built-ins](work_item_builtin.c)
     1.  [Vector type](vector_type.c)
     1.  [clinfo](clinfo.c)
+    1.  [Matrix multiplication](matmul.c)
 1.  Tools
     1.  [clinfo](clinfo.md)
     1.  [Benchmarks](benchmarks.md)
diff --git a/opencl/architecture.md b/opencl/architecture.md
@@ -40,17 +40,22 @@ TODO
 
 Contains many work items.
 
+Is basically a completely independent chunk of work.
+
 Work items inside the same work group can share local memory, and can synchronize.
 
-Work groups have a maximum size (otherwise the concept wouldn't even exist).
+So ideally, we would like to have a single work group, with infinitely many work items inside it.
+
+However, the laws of physics are cruel, and the following limits exist:
 
-Ideally we would like to have a single work group for all items, as that would allow us to worry less about the location of memory on the Global / Constant / Local / Private hierarchy.
+- `CL_DEVICE_MAX_WORK_GROUP_SIZE`: maximum number of work items inside the work group
+- `CL_DEVICE_MAX_WORK_ITEM_SIZES[CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS]`: maximum number of work items on each dimension
 
-But memory localization on GPUs is important enough that OpenCL exposes this extra level.
+This is because memory localization on GPUs is important enough that OpenCL exposes this extra level.
 
-Synchronization only works inside a single work groups: http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups
+We can however have as many work groups as we want.
 
-TODO: can a single work group be run in parallel on the GPU?
+Synchronization only works inside a single work group: <http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups>
 
 ### Local size
 
@@ -76,6 +81,8 @@ An work item can be seen as a thread.
 
 Contains private memory, which no other work item can see.
 
+TODO: can a single work item be run in parallel on the GPU?
+
 ## Local and Private memory
 
 TODO: why use those at all instead of global memory?
diff --git a/opencl/clinfo.c b/opencl/clinfo.c
@@ -28,8 +28,10 @@ int main(void) {
 
     /* Print. */
     puts("#clinfo");
-    PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS)
     PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE)
+    PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS)
+    /* TODO this is wrong, it is actually an array.
+     * But yeah, likely the same for all dimensions. */
     PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES)
 
     /* Cleanup. */
diff --git a/opencl/common.h b/opencl/common.h
@@ -23,7 +23,7 @@ typedef struct {
     cl_program program;
 } Common;
 
-static void common_init(
+void common_init(
     Common *common,
     const char *source
 ) {
@@ -49,7 +49,7 @@ static void common_init(
     common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
 }
 
-static char* common_read_file(const char *path) {
+char* common_read_file(const char *path) {
     char *buffer;
     FILE *f;
     long length;
@@ -58,14 +58,16 @@ static char* common_read_file(const char *path) {
     fseek(f, 0, SEEK_END);
     length = ftell(f);
     fseek(f, 0, SEEK_SET);
-    buffer = calloc(1, length + 1);
-    fread(buffer, 1, length, f);
+    buffer = malloc(length + 1);
+    if (fread(buffer, 1, length, f) < (size_t)length) {
+    	return NULL;
+    }
     fclose(f);
     buffer[length] = '\0';
     return buffer;
 }
 
-static void common_init_file(
+void common_init_file(
     Common *common,
     const char *source_path
 ) {
@@ -75,7 +77,7 @@ static void common_init_file(
     free(source);
 }
 
-static void common_deinit(
+void common_deinit(
     Common *common
 ) {
     clReleaseCommandQueue(common->command_queue);
@@ -87,7 +89,7 @@ static void common_deinit(
 #endif
 }
 
-static double common_get_nanos(void) {
+double common_get_nanos(void) {
     struct timespec ts;
     timespec_get(&ts, TIME_UTC);
     return ts.tv_sec + ts.tv_nsec / 1000000000.0;
diff --git a/opencl/getting-started.md b/opencl/getting-started.md
@@ -4,6 +4,10 @@ Tested in Ubuntu 15.10 NVIDIA 352, OpenCL 1.2.
 
 The day we do OpenCL 2.0, it will be put inside a subdirectory and clearly labeled.
 
+For benchmarks, make sure to use `O=3`:
+
+    make O=3
+
 ## NVIDIA
 
 On Ubuntu 15.10 with an NVIDIA NVS 5400M, Lenovo T430: <http://askubuntu.com/questions/541114/how-to-make-opencl-work-on-14-10-nvidia-331-89-drivers/693043#693043>
diff --git a/opencl/matmul.c b/opencl/matmul.c
@@ -8,6 +8,7 @@ TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS C
 - http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c
 - http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
 - https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
+- http://www.netlib.org/utk/papers/autoblock/node2.html
 */
 
 #include "common.h"
@@ -94,7 +95,7 @@ void mat_print(const F *A, size_t n) {
 }
 
 /* Zero a matrix. */
-F * mat_zero(F *A, size_t n) {
+void mat_zero(F *A, size_t n) {
 	size_t i, n2;
 	n2 = n*n;
 	for (i = 0; i < n2; ++i) {
@@ -103,7 +104,7 @@ F * mat_zero(F *A, size_t n) {
 }
 
 /* Initialize a random matrix. */
-F * mat_rand(F *A, size_t n) {
+void mat_rand(F *A, size_t n) {
 	size_t i, n2;
 	n2 = n*n;
 	for (i = 0; i < n2; ++i) {
@@ -144,7 +145,7 @@ int main(void) {
 	{
 		F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
 		double dt, time;
-		size_t i, n = 1, n2, a_sizeof;
+		size_t n = 1, n2, a_sizeof;
 
 		puts("#matmul");
 		puts("n mat_mul_cpu mat_mul_cl");
@@ -174,7 +175,7 @@ int main(void) {
 
 			assert(mat_eq(C, C_ref, n));
 			puts("");
-			if (dt > 4.0)
+			if (dt > 1.0)
 				break;
 			n *= 2;
 		}
diff --git a/opencl/work_item_builtin.c b/opencl/work_item_builtin.c
@@ -13,7 +13,6 @@ static size_t global = 1;
 static size_t local = 1;
 
 int main(void) {
-    cl_int ret;
     cl_mem buffer;
     cl_uint output[NUM_FUNCTIONS];
     Common common;