@@ -432,7 +432,7 @@ void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n) {
  * This leads to a thread blockage / memory access tradeoff.
  *
  * We make work groups as large as possible so that memory is reloaded fewer times. */
-void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n) {
+void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n) {
     char options[256];
     cl_mem buf_a, buf_b, buf_c;
     Common common;
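The kernel source matmul_row_priv_col_local.cl is not part of this diff, so the following is only a sketch of the scheme the comment above describes, under these assumptions: one work item per output row, that work item's row of A cached in private memory (sized by the -DPRIV_ROW_SIZE option set below), and the work group sharing one column of B at a time through the __local buffer passed as kernel argument 3 ("float" stands in for whatever F maps to):

    /* Illustrative sketch only, not the committed kernel. */
    __kernel void matmul_row_priv_col_local(
        __global const float *A,
        __global const float *B,
        __global float *C,
        __local float *B_col,   /* one column of B: n * sizeof(F) bytes, kernel arg 3 */
        const uint n
    ) {
        size_t row = get_global_id(0);
        size_t lid = get_local_id(0);
        size_t lsize = get_local_size(0);
        float A_row[PRIV_ROW_SIZE];
        float acc;
        size_t col, k;

        /* Private copy of this work item's row of A. */
        for (k = 0; k < n; ++k)
            A_row[k] = A[row * n + k];
        for (col = 0; col < n; ++col) {
            /* The work group cooperatively stages column `col` of B. */
            for (k = lid; k < n; k += lsize)
                B_col[k] = B[k * n + col];
            barrier(CLK_LOCAL_MEM_FENCE);
            acc = 0.0f;
            for (k = 0; k < n; ++k)
                acc += A_row[k] * B_col[k];
            C[row * n + col] = acc;
            /* Do not let the next iteration overwrite B_col while others still read it. */
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

Each column costs a pair of barriers, which is the thread blockage side of the tradeoff; larger work groups amortize each staged column over more rows, so B is reloaded from global memory fewer times.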
@@ -441,13 +441,13 @@ void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n)

     /* Setup variables. */
     global_work_size = n;
-    local_work_size = 0;
     mat_sizeof = n * n * sizeof(F);
     ncl = n;

     /* Run kernel. */
     snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
     common_init_file_options(&common, "matmul_row_priv_col_local.cl", options);
+    local_work_size = 0;
     clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
     local_work_size = zmin(local_work_size, n);
     buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F *)A, NULL);
@@ -458,7 +458,61 @@ void mat_mul_cl_row_priv_priv_col_local(const F *A, const F *B, F *C, size_t n)
     clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
     clSetKernelArg(common.kernel, 3, n * sizeof(F), NULL);
     clSetKernelArg(common.kernel, 4, sizeof(ncl), &ncl);
-    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
+    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
+    clFlush(common.command_queue);
+    clFinish(common.command_queue);
+    clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
+
+    /* Cleanup. */
+    clReleaseMemObject(buf_a);
+    clReleaseMemObject(buf_b);
+    clReleaseMemObject(buf_c);
+    common_deinit(&common);
+}
+
+/* Copy as many columns of B as possible to local memory, and only then start multiplying.
+ * This leads to fewer memory barrier hits.
+ * How many columns we copy is limited by the local memory size; ideally the entire matrix will fit. */
+void mat_mul_cl_row_priv_cols_local(const F *A, const F *B, F *C, size_t n) {
+    char options[256];
+    cl_mem buf_a, buf_b, buf_c;
+    Common common;
+    cl_uint ncl, n_local_cols;
+    cl_ulong local_mem_size;
+    size_t col_size, global_work_size, local_work_size, mat_sizeof;
+
+    /* Setup variables. */
+    col_size = n * sizeof(F);
+    global_work_size = n;
+    mat_sizeof = n * n * sizeof(F);
+    ncl = n;
+
+    /* Run kernel. */
+    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
+    common_init_file_options(&common, "matmul_row_priv_cols_local.cl", options);
+    local_work_size = 0;
+    clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
+    local_work_size = zmin(local_work_size, n);
+    local_mem_size = 0;
+    clGetDeviceInfo(common.device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
+    /* TODO: this can blow up without the "- 1". Why?
+     * Without it we only reach the maximum, we do not cross it, right?
+     * So is the bug in the kernel? */
+    n_local_cols = zmin(local_mem_size / col_size, n) - 1;
+    /*puts("");*/
+    /*printf("max memory %llu\n", (unsigned long long)local_mem_size);*/
+    /*printf("n_local_cols %llu\n", (unsigned long long)n_local_cols);*/
+    /*printf("memory %llu\n", (unsigned long long)n_local_cols * n * sizeof(F));*/
+    buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F *)A, NULL);
+    buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F *)B, NULL);
+    buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL);
+    clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
+    clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
+    clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
+    clSetKernelArg(common.kernel, 3, n_local_cols * col_size, NULL);
+    clSetKernelArg(common.kernel, 4, sizeof(ncl), &ncl);
+    clSetKernelArg(common.kernel, 5, sizeof(n_local_cols), &n_local_cols);
+    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
     clFlush(common.command_queue);
     clFinish(common.command_queue);
     clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
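Likewise, matmul_row_priv_cols_local.cl is not shown here. A sketch of the batched variant that the new comment describes, matching the host arguments set above (the __local buffer of n_local_cols columns as argument 3, n_local_cols itself as argument 5) and making the same assumptions as the previous sketch, might look like:

    /* Illustrative sketch only, not the committed kernel. */
    __kernel void matmul_row_priv_cols_local(
        __global const float *A,
        __global const float *B,
        __global float *C,
        __local float *B_cols,  /* n_local_cols columns of B, kernel arg 3 */
        const uint n,
        const uint n_local_cols
    ) {
        size_t row = get_global_id(0);
        size_t lid = get_local_id(0);
        size_t lsize = get_local_size(0);
        float A_row[PRIV_ROW_SIZE];
        float acc;
        size_t col0, ncols, c, k;

        for (k = 0; k < n; ++k)
            A_row[k] = A[row * n + k];
        /* Walk the columns of B in batches of n_local_cols. */
        for (col0 = 0; col0 < n; col0 += n_local_cols) {
            ncols = (n - col0 < n_local_cols) ? (n - col0) : n_local_cols;
            /* Stage the whole batch cooperatively: one barrier pair per batch
             * instead of one per column. */
            for (k = lid; k < ncols * n; k += lsize)
                B_cols[k] = B[(k % n) * n + (col0 + k / n)];
            barrier(CLK_LOCAL_MEM_FENCE);
            for (c = 0; c < ncols; ++c) {
                acc = 0.0f;
                for (k = 0; k < n; ++k)
                    acc += A_row[k] * B_cols[c * n + k];
                C[row * n + col0 + c] = acc;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

On the TODO about the "- 1": CL_DEVICE_LOCAL_MEM_SIZE is the device-wide maximum, and on some implementations part of it is consumed by kernel arguments or other compiler-managed data, so a request for the full reported amount can fail even though it never "crosses" the limit. Keeping one column of headroom is a plausible host-side workaround and does not necessarily indicate a kernel bug.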
@@ -477,8 +531,6 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     size_t global_work_size[2], local_work_size[2], mat_sizeof, nblk;

     /* Setup variables. */
-    /* Cannot be larger than 1 on this example, otherwise memory conflicts
-     * will happen between work items. */
     global_work_size[0] = n;
     global_work_size[1] = n;
     mat_sizeof = n * n * sizeof(F);
@@ -488,6 +540,7 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     common_init_file(&common, "matmul_block.cl");
     clGetDeviceInfo(common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nblk), &nblk, NULL);
     nblk = sqrt(zmin(nblk, n));
+    nblk = zmin(nblk, 3);
     nblkcl = nblk;
     local_work_size[0] = nblk;
     local_work_size[1] = nblk;
@@ -498,6 +551,8 @@ void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n) {
     clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
     clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
     clSetKernelArg(common.kernel, 3, nblk * nblk * sizeof(F), NULL);
+    printf("nblk = %llu\n", (unsigned long long)nblk);
+    printf("local memory = %llu\n", (unsigned long long)2 * nblk * nblk * sizeof(F));
     clSetKernelArg(common.kernel, 4, nblk * nblk * sizeof(F), NULL);
     clSetKernelArg(common.kernel, 5, sizeof(ncl), &ncl);
     clSetKernelArg(common.kernel, 6, sizeof(nblkcl), &nblkcl);
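matmul_block.cl is also outside this diff. For orientation, a textbook tiled kernel that matches the argument list assembled above (two nblk x nblk __local tiles as arguments 3 and 4, then n and nblk) would look roughly like the sketch below; it is an assumption about the kernel, not a copy of it:

    /* Illustrative sketch only, not the committed kernel. */
    __kernel void matmul_block(
        __global const float *A,
        __global const float *B,
        __global float *C,
        __local float *A_tile,  /* nblk x nblk, kernel arg 3 */
        __local float *B_tile,  /* nblk x nblk, kernel arg 4 */
        const uint n,
        const uint nblk
    ) {
        size_t gi = get_global_id(0);  /* output row */
        size_t gj = get_global_id(1);  /* output column */
        size_t li = get_local_id(0);
        size_t lj = get_local_id(1);
        float acc = 0.0f;
        size_t t, k;

        for (t = 0; t < n; t += nblk) {
            /* Each work item loads one element of each tile. */
            A_tile[li * nblk + lj] = A[gi * n + (t + lj)];
            B_tile[li * nblk + lj] = B[(t + li) * n + gj];
            barrier(CLK_LOCAL_MEM_FENCE);
            for (k = 0; k < nblk; ++k)
                acc += A_tile[li * nblk + k] * B_tile[k * nblk + lj];
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        C[gi * n + gj] = acc;
    }

Two caveats follow from the host code above. First, the local footprint reported by the new printf is 2 * nblk * nblk * sizeof(F) bytes, only 144 bytes for nblk = 3 if F is double, so local memory pressure is unlikely to be the problem. Second, this formulation, and OpenCL 1.x itself, require nblk to divide n: with the new cap nblk = zmin(nblk, 3), an n of 32 is not a multiple of the work group edge, and clEnqueueNDRangeKernel then fails with CL_INVALID_WORK_GROUP_SIZE, leaving C unwritten. That is one possible, unconfirmed explanation for the "broken for n >= 32" TODO in the function table below.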
@@ -534,17 +589,18 @@ int main(int argc, char **argv) {
     double max_runtime;
     /* Overly slow ones commented out by default. */
     MatMul mat_mul_funcs[] = {
-        mat_mul_cpu_trans,
-        mat_mul_cpu_trans_vec,
-        mat_mul_cpu_block,
+        /*mat_mul_cpu_trans,*/
+        /*mat_mul_cpu_trans_vec,*/
+        /*mat_mul_cpu_block,*/
         mat_mul_cpu_cblas,
-        mat_mul_cl,
-        mat_mul_cl_row_priv,
-        mat_mul_cl_row_local,
-        mat_mul_cl_row_priv_priv_col_local,
-        /* TODO broken. */
-        /*mat_mul_cl_block,*/
-        mat_mul_cl_clblas,
+        /*mat_mul_cl,*/
+        /*mat_mul_cl_row_priv,*/
+        /*mat_mul_cl_row_local,*/
+        /*mat_mul_cl_row_priv_col_local,*/
+        /*mat_mul_cl_row_priv_cols_local,*/
+        /* TODO: broken for n >= 32, some cells contain garbage. */
+        mat_mul_cl_block,
+        /*mat_mul_cl_clblas,*/
     };
     int first, func_done[NELEMS(mat_mul_funcs)] = {0};
     size_t f, i;
@@ -572,7 +628,6 @@ int main(int argc, char **argv) {
             19.0, 22.0,
             43.0, 50.0
         };
-
         for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
             mat_zero(C, n);
             mat_mul_funcs[f](A, B, C, n);
@@ -583,26 +638,25 @@ int main(int argc, char **argv) {
     /* Unit test 4x4. */
     {
         const F A[] = {
-            1.0, 2.0, 3.0, 4.0,
-            5.0, 6.0, 7.0, 8.0,
-            9.0, 10.0, 11.0, 12.0,
-            13.0, 14.0, 15.0, 16.0,
+             1.0,  2.0,  3.0,  4.0,
+             5.0,  6.0,  7.0,  8.0,
+             9.0, 10.0, 11.0, 12.0,
+            13.0, 14.0, 15.0, 16.0,
        };
        const F B[] = {
-            17.0, 18.0, 19.0, 20.0,
-            21.0, 22.0, 23.0, 24.0,
-            25.0, 26.0, 27.0, 28.0,
-            29.0, 30.0, 31.0, 32.0,
+            17.0, 18.0, 19.0, 20.0,
+            21.0, 22.0, 23.0, 24.0,
+            25.0, 26.0, 27.0, 28.0,
+            29.0, 30.0, 31.0, 32.0,
        };
        const F C_ref[] = {
-            250.000000, 260.000000, 270.000000, 280.000000,
-            618.000000, 644.000000, 670.000000, 696.000000,
-            986.000000, 1028.000000, 1070.000000, 1112.000000,
-            1354.000000, 1412.000000, 1470.000000, 1528.000000,
+             250.0,  260.0,  270.0,  280.0,
+             618.0,  644.0,  670.0,  696.0,
+             986.0, 1028.0, 1070.0, 1112.0,
+            1354.0, 1412.0, 1470.0, 1528.0,
        };
        enum N { n = 4 };
        F C[n * n];
-
        for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n);
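The reference output can be checked by hand from A and B above: the first entry is 1*17 + 2*21 + 3*25 + 4*29 = 17 + 42 + 75 + 116 = 250 and the last is 13*20 + 14*24 + 15*28 + 16*32 = 260 + 336 + 420 + 512 = 1528, matching C_ref.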
@@ -615,7 +669,7 @@ int main(int argc, char **argv) {
     double dt;
     F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL, *dst = NULL, *ref = NULL;
     int done;
-    size_t n = 4, a_sizeof;
+    size_t n = 2, a_sizeof;

     done = 0;
     puts("#matmul");