Skip to content

Commit 15b6cb6

Browse files
committed
wall
1 parent 62b8ece commit 15b6cb6

8 files changed

+36
-20
lines changed

Makefile_many

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ I ?= #-I/usr/include
1313
O ?= 0
1414
STD ?= c11
1515
PEDANTIC ?= -pedantic-errors
16-
CFLAGS ?= -g$(G) -O$(O) -pthread -std=$(STD) -Wextra $(PEDANTIC) $(CFLAGS_EXTRA) #-pg
17-
MYCXXFLAGS ?= -g$(G) -O$(O) -pthread -std=c++14 -Wextra $(PEDANTIC) $(CXXFLAGS_EXTRA) #-pg
16+
CFLAGS ?= -g$(G) -O$(O) -pthread -std=$(STD) -Wall -Wextra $(PEDANTIC) $(CFLAGS_EXTRA) #-pg
17+
MYCXXFLAGS ?= -g$(G) -O$(O) -pthread -std=c++14 -Wall -Wextra $(PEDANTIC) $(CXXFLAGS_EXTRA) #-pg
1818

1919
LIBS ?= -lm -lrt #-lGL -lGLU -lglut
2020

opencl/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
1. [Work item built-ins](work_item_builtin.c)
1010
1. [Vector type](vector_type.c)
1111
1. [clinfo](clinfo.c)
12+
1. [Matrix multiplication](matmul.c)
1213
1. Tools
1314
1. [clinfo](clinfo.md)
1415
1. [Benchmarks](benchmarks.md)

opencl/architecture.md

+12-5
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,22 @@ TODO
4040

4141
Contains many work items.
4242

43+
Is basically a completely independent chunk of work.
44+
4345
Work items inside the same work group can share local memory, and can synchronize.
4446

45-
Work groups have a maximum size (otherwise the concept wouldn't even exist).
47+
So ideally, we would like to have a single work group, with infinitely many work items inside it.
48+
49+
However, the laws of physics are cruel, and the following limits exist:
4650

47-
Ideally we would like to have a single work group for all items, as that would allow us to worry less about the location of memory on the Global / Constant / Local / Private hierarchy.
51+
- `CL_DEVICE_MAX_WORK_GROUP_SIZE`: maximum number of work items inside the work group
52+
- `CL_DEVICE_MAX_WORK_ITEM_SIZES[CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS]`: maximum number of work items on each dimension
4853

49-
But memory localization on GPUs is important enough that OpenCL exposes this extra level.
54+
This is because memory localization on GPUs is important enough that OpenCL exposes this extra level.
5055

51-
Synchronization only works inside a single work groups: http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups
56+
We can, however, have as many work groups as we want.
5257

53-
TODO: can a single work group be run in parallel on the GPU?
58+
Synchronization only works inside a single work group: <http://stackoverflow.com/questions/5895001/opencl-synchronization-between-work-groups>
5459

5560
### Local size
5661

@@ -76,6 +81,8 @@ An work item can be seen as a thread.
7681

7782
Contains private memory, which no other work item can see.
7883

84+
TODO: can a single work item be run in parallel on the GPU?
85+
7986
## Local and Private memory
8087

8188
TODO: why use those at all instead of global memory?

opencl/clinfo.c

+3-1
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@ int main(void) {
2828

2929
/* Print. */
3030
puts("#clinfo");
31-
PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS)
3231
PRINT_SIZE_T(DEVICE_MAX_WORK_GROUP_SIZE)
32+
PRINT_CL_UINT(DEVICE_MAX_WORK_ITEM_DIMENSIONS)
33+
/* TODO this is wrong, it is actually an array.
34+
* But yeah, likely the same for all dimensions. */
3335
PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES)
3436

3537
/* Cleanup. */

opencl/common.h

+9-7
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ typedef struct {
2323
cl_program program;
2424
} Common;
2525

26-
static void common_init(
26+
void common_init(
2727
Common *common,
2828
const char *source
2929
) {
@@ -49,7 +49,7 @@ static void common_init(
4949
common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
5050
}
5151

52-
static char* common_read_file(const char *path) {
52+
char* common_read_file(const char *path) {
5353
char *buffer;
5454
FILE *f;
5555
long length;
@@ -58,14 +58,16 @@ static char* common_read_file(const char *path) {
5858
fseek(f, 0, SEEK_END);
5959
length = ftell(f);
6060
fseek(f, 0, SEEK_SET);
61-
buffer = calloc(1, length + 1);
62-
fread(buffer, 1, length, f);
61+
buffer = malloc(length + 1);
62+
if (fread(buffer, 1, length, f) < (size_t)length) {
63+
return NULL;
64+
}
6365
fclose(f);
6466
buffer[length] = '\0';
6567
return buffer;
6668
}
6769

68-
static void common_init_file(
70+
void common_init_file(
6971
Common *common,
7072
const char *source_path
7173
) {
@@ -75,7 +77,7 @@ static void common_init_file(
7577
free(source);
7678
}
7779

78-
static void common_deinit(
80+
void common_deinit(
7981
Common *common
8082
) {
8183
clReleaseCommandQueue(common->command_queue);
@@ -87,7 +89,7 @@ static void common_deinit(
8789
#endif
8890
}
8991

90-
static double common_get_nanos(void) {
92+
double common_get_nanos(void) {
9193
struct timespec ts;
9294
timespec_get(&ts, TIME_UTC);
9395
return ts.tv_sec + ts.tv_nsec / 1000000000.0;

opencl/getting-started.md

+4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ Tested in Ubuntu 15.10 NVIDIA 352, OpenCL 1.2.
44

55
The day we do OpenCL 2.0, it will be put inside a subdirectory and clearly labeled.
66

7+
For benchmarks, make sure to use `O=3`:
8+
9+
make O=3
10+
711
## NVIDIA
812

913
On Ubuntu 15.10 with an NVIDIA NVS 5400M, Lenovo T430: <http://askubuntu.com/questions/541114/how-to-make-opencl-work-on-14-10-nvidia-331-89-drivers/693043#693043>

opencl/matmul.c

+5-4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS C
88
- http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c
99
- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
1010
- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
11+
- http://www.netlib.org/utk/papers/autoblock/node2.html
1112
*/
1213

1314
#include "common.h"
@@ -94,7 +95,7 @@ void mat_print(const F *A, size_t n) {
9495
}
9596

9697
/* Zero a matrix. */
97-
F * mat_zero(F *A, size_t n) {
98+
void mat_zero(F *A, size_t n) {
9899
size_t i, n2;
99100
n2 = n*n;
100101
for (i = 0; i < n2; ++i) {
@@ -103,7 +104,7 @@ F * mat_zero(F *A, size_t n) {
103104
}
104105

105106
/* Initialize a random matrix. */
106-
F * mat_rand(F *A, size_t n) {
107+
void mat_rand(F *A, size_t n) {
107108
size_t i, n2;
108109
n2 = n*n;
109110
for (i = 0; i < n2; ++i) {
@@ -144,7 +145,7 @@ int main(void) {
144145
{
145146
F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
146147
double dt, time;
147-
size_t i, n = 1, n2, a_sizeof;
148+
size_t n = 1, n2, a_sizeof;
148149

149150
puts("#matmul");
150151
puts("n mat_mul_cpu mat_mul_cl");
@@ -174,7 +175,7 @@ int main(void) {
174175

175176
assert(mat_eq(C, C_ref, n));
176177
puts("");
177-
if (dt > 4.0)
178+
if (dt > 1.0)
178179
break;
179180
n *= 2;
180181
}

opencl/work_item_builtin.c

-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ static size_t global = 1;
1313
static size_t local = 1;
1414

1515
int main(void) {
16-
cl_int ret;
1716
cl_mem buffer;
1817
cl_uint output[NUM_FUNCTIONS];
1918
Common common;

0 commit comments

Comments
 (0)