@@ -131,6 +131,20 @@ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
     }
 }
 
+static __device__ void cpy_blck_q4_0_f32(const char * cxi, char * cdsti) {
+    const block_q4_0 * xi = (const block_q4_0 *) cxi;
+    float * dsti = (float *) cdsti;
+
+    const float d = (float) xi->d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = (xi->qs[j] & 0x0F) - 8;
+        const float x1 = (xi->qs[j] >> 4)   - 8;
+        dsti[j + 0]       = x0 * d;
+        dsti[j + QK4_0/2] = x1 * d;
+    }
+}
+
 static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q4_1 * dsti = (block_q4_1 *) cdsti;
@@ -446,6 +460,16 @@ static void ggml_cpy_f32_q4_0_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_q4_0_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q4_0_f32, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_q4_1_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -556,6 +580,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
@@ -598,6 +624,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
+    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q4_0_f32, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
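For reference, the Q4_0 layout that the new cpy_blck_q4_0_f32 kernel decodes can be sanity-checked with a plain CPU round trip. The sketch below is illustrative only: block_q4_0_ref, quantize_ref, and dequantize_ref are hypothetical stand-ins (ggml's real block_q4_0 stores the scale d as a half, not a float, and its quantizer lives in the reference quantization routines), but the dequantize loop is the same arithmetic as the kernel in this patch, and the quantizer follows the usual Q4_0 scheme of choosing d so that the max-magnitude value maps to -8.

// Minimal CPU sketch of the Q4_0 round trip (illustrative, not the ggml code).
// Assumption: the scale is a plain float here; ggml stores it as a half.
#include <cstdint>
#include <cstdio>
#include <cmath>

#define QK4_0 32

struct block_q4_0_ref {        // hypothetical mirror of ggml's block_q4_0
    float   d;                 // block scale
    uint8_t qs[QK4_0/2];       // 32 quants, two 4-bit values per byte
};

// quantize one block: d maps the max-magnitude input to -8; byte j packs
// x[j] in its low nibble and x[j + QK4_0/2] in its high nibble
static void quantize_ref(const float * x, block_q4_0_ref * b) {
    float amax = 0.0f, max = 0.0f;
    for (int j = 0; j < QK4_0; ++j) {
        if (fabsf(x[j]) > amax) { amax = fabsf(x[j]); max = x[j]; }
    }
    const float d  = max / -8.0f;
    const float id = d != 0.0f ? 1.0f/d : 0.0f;
    b->d = d;
    for (int j = 0; j < QK4_0/2; ++j) {
        const uint8_t q0 = (uint8_t) fminf(15.0f, x[j]           *id + 8.5f);
        const uint8_t q1 = (uint8_t) fminf(15.0f, x[j + QK4_0/2] *id + 8.5f);
        b->qs[j] = (uint8_t)(q0 | (q1 << 4));
    }
}

// dequantize one block: the same arithmetic as cpy_blck_q4_0_f32 above
static void dequantize_ref(const block_q4_0_ref * b, float * y) {
    for (int j = 0; j < QK4_0/2; ++j) {
        y[j]           = ((b->qs[j] & 0x0F) - 8) * b->d;
        y[j + QK4_0/2] = ((b->qs[j] >> 4)   - 8) * b->d;
    }
}

int main() {
    float x[QK4_0], y[QK4_0];
    for (int j = 0; j < QK4_0; ++j) {
        x[j] = sinf(0.3f*j);   // arbitrary test data in [-1, 1]
    }
    block_q4_0_ref b;
    quantize_ref(x, &b);
    dequantize_ref(&b, y);
    for (int j = 0; j < QK4_0; ++j) {
        printf("%2d: % .4f -> % .4f\n", j, x[j], y[j]);
    }
    return 0;
}

The nibble split is why the kernel issues two stores per iteration: one pass over the 16 bytes of qs reconstructs all 32 floats of a block, writing element j from the low nibble and element j + QK4_0/2 from the high nibble.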