 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
     const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/2; // 0...15
     const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q6_K * x = xx + ib0;
 
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;
 
+    x += get_global_offset(0);
+
     tmp[tid] = 0;
 
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
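
A note on the loop rewrite above: switching from the `i`-based form to `for (int col = tid*2; col < ncols; col += col_step)` distributes the same columns across the work-group, but drops the implicit requirement that `ncols` be a multiple of `2*local_size`; only an even `ncols` remains necessary, which is what the new `ne00%2 == 0` condition on `mul_mat_vec` (further down in this diff) enforces. A tiny host-side sketch that checks the coverage of the new loop; the sizes used are arbitrary examples, not values from the patch:

```c
#include <stdio.h>

int main(void) {
    const int local_size = 32;       // CL_DMMV_LOCAL_SIZE in the patch
    const int col_step   = local_size * 2;
    const int ncols      = 4098;     // even, but deliberately NOT a multiple of 2*local_size

    char visited[4098] = {0};

    // emulate one work-group: every "thread" tid runs the new column loop
    for (int tid = 0; tid < local_size; ++tid) {
        for (int col = tid * 2; col < ncols; col += col_step) {
            visited[col] += 1;
        }
    }

    // every even column must be hit exactly once, odd columns never
    int ok = 1;
    for (int c = 0; c < ncols; ++c) {
        const int expect = (c % 2 == 0) ? 1 : 0;
        if (visited[c] != expect) { ok = 0; break; }
    }
    printf("coverage %s\n", ok ? "ok" : "broken");
    return 0;
}
```

With `ncols = 4098` (even but not a multiple of 64), the old loop would leave the last columns of the row uncovered, while the new one visits every even column exactly once.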
@@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
 
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);
 
     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
     size_t ev_idx = 0;
     std::vector<cl_event> events;
@@ -1770,16 +1772,16 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                // compute
-               const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-               const size_t local = CL_DMMV_BLOCK_SIZE;
+               const size_t global = ne01 * local;
+               const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                const cl_int ncols = ne00;
                events.emplace_back();
                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-               CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+               CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                // convert src0 to fp32 on device
                const size_t global = x_ne / global_denom;
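
For context on how the new `&offset` argument and the kernel-side `get_global_offset(0)` calls fit together: the fourth parameter of `clEnqueueNDRangeKernel` is the global work offset, which shifts the global IDs of the launched work-items; group and local IDs are not shifted, which is why each kernel adds `get_global_offset(0)` to its block index (or to `x`) itself. A minimal standalone sketch of that mechanism, with hypothetical names (`show_ids`, the buffer size, the offset value) and no error checking, so it is an illustration rather than ggml code:

```c
#include <CL/cl.h>
#include <stdio.h>

// Kernel records, per work-item: its global id (includes the offset),
// the offset itself, and its group id (unaffected by the offset).
static const char *src =
    "__kernel void show_ids(__global ulong *out) {                \n"
    "    size_t i   = get_local_id(0);                            \n"
    "    out[3*i+0] = get_global_id(0);     /* includes offset */ \n"
    "    out[3*i+1] = get_global_offset(0); /* offset from host */\n"
    "    out[3*i+2] = get_group_id(0);      /* not offset */      \n"
    "}\n";

int main(void) {
    cl_platform_id plat;  cl_device_id dev;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
    cl_command_queue queue = clCreateCommandQueue(ctx, dev, 0, NULL);

    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, NULL);
    clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
    cl_kernel kernel = clCreateKernel(prog, "show_ids", NULL);

    const size_t local  = 4;
    const size_t global = local;   // a single work-group for brevity; the patch launches ne01 of them
    const size_t offset = 1000;    // stands in for (i03 * ne02 + i02) * x_bps

    cl_mem out = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, 3 * local * sizeof(cl_ulong), NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &out);
    clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global, &local, 0, NULL, NULL);

    cl_ulong res[3 * 4];
    clEnqueueReadBuffer(queue, out, CL_TRUE, 0, sizeof(res), res, 0, NULL, NULL);
    for (size_t i = 0; i < local; ++i) {
        // expected: gid = 1000..1003, off = 1000, grp = 0
        printf("gid=%llu off=%llu grp=%llu\n",
               (unsigned long long)res[3*i], (unsigned long long)res[3*i+1],
               (unsigned long long)res[3*i+2]);
    }
    return 0;
}
```

On any OpenCL 1.1+ device this prints global IDs starting at the offset (1000..1003) with a group ID of 0, mirroring how the patch offsets into a GPU-resident src0 slice while the kernels still derive `row` from `get_group_id(0)`.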