Skip to content

Commit 426e348

Browse files
committed
Attempt matmul vec but failing
1 parent 3c68f19 commit 426e348

File tree

9 files changed

+297
-58
lines changed

9 files changed

+297
-58
lines changed

gcc/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Cheat on the GNU Compiler Collection (GCC) language extensions and command line u
2626
1. [Attribute](attribute.c)
2727
1. Variable
2828
1. [weak](weak/)
29+
1. [Vector extensions](vector.c)
2930
1. Function
3031
1. [sentinel](sentinel.c)
3132
1. [asm](asm.c)

gcc/vector.c

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
Increase the chance that the compiler will use SIMD instructions.
3+
4+
Basically a more abstract and ISA portable (TODO?) version of intrinsics.
5+
6+
Note however that GCC and other compilers have an auto-vectorization
7+
optimization, which might use SIMD even if you don't use the vector extensions.
8+
*/
9+
10+
#include "common.h"
11+
12+
/* Vector of 4 ints. The GCC vector_size attribute takes the total size in bytes. */
typedef int int4 __attribute__ ((vector_size(4 * sizeof(int))));
13+
14+
/*
15+
TODO: SIMD to scalar operations don't appear possible without intrinsics:
16+
http://stackoverflow.com/questions/31597302/gcc-c-vector-extension-how-to-check-if-result-of-any-element-wise-comparison-is
17+
*/
18+
/*
Return 1 if every lane of i equals the corresponding lane of j, 0 otherwise.

Lanes are read one at a time through vector subscripting,
and the comparison stops at the first mismatch.
*/
int int4_eq(int4 i, int4 j) {
    int lane;

    for (lane = 0; lane < 4; lane++) {
        if (i[lane] != j[lane]) {
            return 0;
        }
    }
    return 1;
}
26+
27+
int main(void) {
28+
/* Hello world. */
29+
{
30+
int4 i = {1, 2, 3, 4};
31+
int4 j = {5, 6, 7, 8};
32+
int4 k = i + j;
33+
assert(int4_eq(k, (int4){6, 8, 10, 12}));
34+
}
35+
36+
/* From / to array. TODO any better way than going over all indices?
37+
* Intrinsic way: _mm_load_pd family.
38+
* */
39+
{
40+
int is[] = {1, 2, 3, 4};
41+
int js[] = {5, 6, 7, 8};
42+
int ks[4];
43+
int ks2[] = {6, 8, 10, 12};
44+
int4 i;
45+
int4 j;
46+
int4 k;
47+
i[0] = is[0];
48+
i[1] = is[1];
49+
i[2] = is[2];
50+
i[3] = is[3];
51+
j[0] = js[0];
52+
j[1] = js[1];
53+
j[2] = js[2];
54+
j[3] = js[3];
55+
k = i + j;
56+
ks[0] = k[0];
57+
ks[1] = k[1];
58+
ks[2] = k[2];
59+
ks[3] = k[3];
60+
assert(memcmp(ks, ks2, sizeof(ks)) == 0);
61+
}
62+
63+
return EXIT_SUCCESS;
64+
}

opencl/README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
1. [Work item built-ins](work_item_builtin.c)
1010
1. [Vector type](vector_type.c)
1111
1. [clinfo](clinfo.c)
12-
1. [Matrix multiplication](matmul.c)
12+
1. [Preprocessor](matmul.c)
13+
1. [Benchmark examples](benchmark-examples.md)
14+
1. [Matrix multiplication](matmul.c)
1315
1. Tools
1416
1. [clinfo](clinfo.md)
1517
1. [Benchmarks](benchmarks.md)

opencl/benchmark-examples.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Benchmark examples
2+
3+
Examples in this section run benchmarks, and thus take longer to finish.
4+
5+
This is the case for all useful examples, i.e. those that actually run a real-ish application faster per dollar than the best CPU implementation we can craft, including one that uses intrinsics.

opencl/common.h

+11-3
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@ typedef struct {
2323
cl_program program;
2424
} Common;
2525

26-
void common_init(
26+
void common_init_options(
2727
Common *common,
28-
const char *source
28+
const char *source,
29+
const char *options
2930
) {
3031
char *err;
3132
size_t err_len;
@@ -36,7 +37,7 @@ void common_init(
3637
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &(common->device), NULL);
3738
common->context = clCreateContext(NULL, 1, &(common->device), NULL, NULL, NULL);
3839
common->program = clCreateProgramWithSource(common->context, 1, &source, NULL, NULL);
39-
ret = clBuildProgram(common->program, 1, &(common->device), "", NULL, NULL);
40+
ret = clBuildProgram(common->program, 1, &(common->device), options, NULL, NULL);
4041
if (CL_SUCCESS != ret) {
4142
clGetProgramBuildInfo(common->program, common->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &err_len);
4243
err = malloc(err_len);
@@ -49,6 +50,13 @@ void common_init(
4950
common->command_queue = clCreateCommandQueue(common->context, common->device, 0, NULL);
5051
}
5152

53+
void common_init(
54+
Common *common,
55+
const char *source
56+
) {
57+
common_init_options(common, source, "");
58+
}
59+
5260
char* common_read_file(const char *path) {
5361
char *buffer;
5462
FILE *f;

opencl/inc.c

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ This is our OpenCL hello world, so we are not doing:
99

1010
#include <assert.h>
1111
#include <stdio.h>
12+
#include <stdlib.h>
1213

1314
/* To prevent deprecation warnings when headers are 2.0. */
1415
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

0 commit comments

Comments
 (0)