Skip to content

Commit 9ac84e1

Browse files
committed
Add support for quantized models
1 parent 5f9fad1 commit 9ac84e1

File tree

2 files changed

+201
-29
lines changed

2 files changed

+201
-29
lines changed

ggml.c

+190-27
Original file line numberDiff line numberDiff line change
@@ -2336,6 +2336,30 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
23362336
*s = sumf;
23372337
}
23382338

2339+
// TODO: move this to a more sensible place
// Function-pointer signatures for the per-type quantization kernels:
// dequantize a row of k elements to F32, quantize a row of k F32 values,
// and compute the dot product of two quantized rows of length n.
typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);

// Bundle of the three kernels needed to operate on one quantized type.
typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t   quantize_row_q;
    vec_dot_q_t        vec_dot_q;
} quantize_fns_t;

// Dispatch table indexed by ggml_type. Only the quantized types are
// populated; all other entries are zero-initialized, so their function
// pointers are NULL and must not be called.
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = {
        .dequantize_row_q = dequantize_row_q4_0,
        .quantize_row_q   = quantize_row_q4_0,
        .vec_dot_q        = ggml_vec_dot_q4_0,
    },
    [GGML_TYPE_Q4_1] = {
        .dequantize_row_q = dequantize_row_q4_1,
        .quantize_row_q   = quantize_row_q4_1,
        .vec_dot_q        = ggml_vec_dot_q4_1,
    },
};
2362+
23392363
// compute GGML_VEC_DOT_UNROLL dot products at once
23402364
// xs - x row stride in bytes
23412365
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
@@ -5184,13 +5208,13 @@ static void ggml_compute_forward_add_f16_f32(
51845208
const int n = ggml_nrows(src0);
51855209
const int nc = src0->ne[0];
51865210

5187-
const size_t nb00 = src0->nb[0];
5211+
//const size_t nb00 = src0->nb[0];
51885212
const size_t nb01 = src0->nb[1];
51895213

51905214
const size_t nb10 = src1->nb[0];
51915215
const size_t nb11 = src1->nb[1];
51925216

5193-
const size_t nb0 = dst->nb[0];
5217+
//const size_t nb0 = dst->nb[0];
51945218
const size_t nb1 = dst->nb[1];
51955219

51965220
GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -5202,12 +5226,163 @@ static void ggml_compute_forward_add_f16_f32(
52025226
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
52035227
for (int i = 0; i < nc; i++) {
52045228
float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
5205-
52065229
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
52075230
}
52085231
}
52095232
}
52105233

5234+
static void ggml_compute_forward_add_f16_f16(
5235+
const struct ggml_compute_params * params,
5236+
const struct ggml_tensor * src0,
5237+
const struct ggml_tensor * src1,
5238+
struct ggml_tensor * dst) {
5239+
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
5240+
5241+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
5242+
return;
5243+
}
5244+
5245+
const int ith = params->ith;
5246+
const int nth = params->nth;
5247+
5248+
const int n = ggml_nrows(src0);
5249+
const int nc = src0->ne[0];
5250+
5251+
//const size_t nb00 = src0->nb[0];
5252+
const size_t nb01 = src0->nb[1];
5253+
5254+
const size_t nb10 = src1->nb[0];
5255+
const size_t nb11 = src1->nb[1];
5256+
5257+
//const size_t nb0 = dst->nb[0];
5258+
const size_t nb1 = dst->nb[1];
5259+
5260+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
5261+
GGML_ASSERT(src1->type == GGML_TYPE_F16);
5262+
GGML_ASSERT(dst->type == GGML_TYPE_F16);
5263+
5264+
for (int j = ith; j < n; j += nth) {
5265+
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
5266+
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
5267+
for (int i = 0; i < nc; i++) {
5268+
ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
5269+
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
5270+
}
5271+
}
5272+
}
5273+
5274+
static void ggml_compute_forward_add_q_f32(
5275+
const struct ggml_compute_params * params,
5276+
const struct ggml_tensor * src0,
5277+
const struct ggml_tensor * src1,
5278+
struct ggml_tensor * dst) {
5279+
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
5280+
5281+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
5282+
return;
5283+
}
5284+
5285+
const int64_t ne00 = src0->ne[0];
5286+
const int64_t ne01 = src0->ne[1];
5287+
const int64_t ne02 = src0->ne[2];
5288+
const int64_t ne03 = src0->ne[3];
5289+
5290+
//const int64_t ne10 = src1->ne[0];
5291+
const int64_t ne11 = src1->ne[1];
5292+
const int64_t ne12 = src1->ne[2];
5293+
const int64_t ne13 = src1->ne[3];
5294+
5295+
const int64_t ne0 = dst->ne[0];
5296+
const int64_t ne1 = dst->ne[1];
5297+
const int64_t ne2 = dst->ne[2];
5298+
const int64_t ne3 = dst->ne[3];
5299+
5300+
const int nb00 = src0->nb[0];
5301+
const int nb01 = src0->nb[1];
5302+
const int nb02 = src0->nb[2];
5303+
const int nb03 = src0->nb[3];
5304+
5305+
const int nb10 = src1->nb[0];
5306+
const int nb11 = src1->nb[1];
5307+
const int nb12 = src1->nb[2];
5308+
const int nb13 = src1->nb[3];
5309+
5310+
const int nb0 = dst->nb[0];
5311+
const int nb1 = dst->nb[1];
5312+
const int nb2 = dst->nb[2];
5313+
const int nb3 = dst->nb[3];
5314+
5315+
const int ith = params->ith;
5316+
const int nth = params->nth;
5317+
5318+
GGML_ASSERT(ne02 == ne12);
5319+
GGML_ASSERT(ne03 == ne13);
5320+
GGML_ASSERT(ne2 == ne12);
5321+
GGML_ASSERT(ne3 == ne13);
5322+
5323+
const enum ggml_type type = src0->type;
5324+
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
5325+
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
5326+
5327+
// we don't support permuted src0 or src1
5328+
GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
5329+
GGML_ASSERT(nb10 == sizeof(float));
5330+
5331+
// dst cannot be transposed or permuted
5332+
GGML_ASSERT(nb0 <= nb1);
5333+
GGML_ASSERT(nb1 <= nb2);
5334+
GGML_ASSERT(nb2 <= nb3);
5335+
5336+
GGML_ASSERT(ne0 == ne01);
5337+
GGML_ASSERT(ne1 == ne11);
5338+
GGML_ASSERT(ne2 == ne02);
5339+
GGML_ASSERT(ne3 == ne03);
5340+
5341+
GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
5342+
GGML_ASSERT(dst->type == src0->type);
5343+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
5344+
5345+
// total rows in src0
5346+
const int nr = ne01*ne02*ne03;
5347+
5348+
// rows per thread
5349+
const int dr = (nr + nth - 1)/nth;
5350+
5351+
// row range for this thread
5352+
const int ir0 = dr*ith;
5353+
const int ir1 = MIN(ir0 + dr, nr);
5354+
5355+
for (int ir = ir0; ir < ir1; ++ir) {
5356+
// src0 indices
5357+
const int i03 = ir/(ne02*ne01);
5358+
const int i02 = (ir - i03*ne02*ne01)/ne01;
5359+
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
5360+
5361+
// src1 and dst are same shape as src0 => same indices
5362+
const int i13 = i03;
5363+
const int i12 = i02;
5364+
const int i11 = i01;
5365+
5366+
const int i3 = i03;
5367+
const int i2 = i02;
5368+
const int i1 = i01;
5369+
5370+
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
5371+
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
5372+
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
5373+
5374+
assert(ne00 % 32 == 0);
5375+
5376+
// unquantize row from src0 to temp buffer
5377+
float tmp[ne00];
5378+
dequantize_row_q(src0_row, tmp, ne00);
5379+
// add src1
5380+
ggml_vec_acc_f32(ne00, tmp, src1_row);
5381+
// quantize row to dst
5382+
quantize_row_q(tmp, dst_row, ne00);
5383+
}
5384+
}
5385+
52115386
static void ggml_compute_forward_add(
52125387
const struct ggml_compute_params * params,
52135388
const struct ggml_tensor * src0,
@@ -5220,10 +5395,21 @@ static void ggml_compute_forward_add(
52205395
} break;
52215396
case GGML_TYPE_F16:
52225397
{
5223-
ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
5398+
if (src1->type == GGML_TYPE_F16) {
5399+
ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
5400+
}
5401+
else if (src1->type == GGML_TYPE_F32) {
5402+
ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
5403+
}
5404+
else {
5405+
GGML_ASSERT(false);
5406+
}
52245407
} break;
52255408
case GGML_TYPE_Q4_0:
52265409
case GGML_TYPE_Q4_1:
5410+
{
5411+
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
5412+
} break;
52275413
case GGML_TYPE_I8:
52285414
case GGML_TYPE_I16:
52295415
case GGML_TYPE_I32:
@@ -6608,29 +6794,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
66086794
//}
66096795
}
66106796

6611-
typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
6612-
typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
6613-
typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
6614-
6615-
typedef struct {
6616-
dequantize_row_q_t dequantize_row_q;
6617-
quantize_row_q_t quantize_row_q;
6618-
vec_dot_q_t vec_dot_q;
6619-
} quantize_fns_t;
6620-
6621-
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
6622-
[GGML_TYPE_Q4_0] = {
6623-
.dequantize_row_q = dequantize_row_q4_0,
6624-
.quantize_row_q = quantize_row_q4_0,
6625-
.vec_dot_q = ggml_vec_dot_q4_0,
6626-
},
6627-
[GGML_TYPE_Q4_1] = {
6628-
.dequantize_row_q = dequantize_row_q4_1,
6629-
.quantize_row_q = quantize_row_q4_1,
6630-
.vec_dot_q = ggml_vec_dot_q4_1,
6631-
},
6632-
};
6633-
66346797
static void ggml_compute_forward_mul_mat_q_f32(
66356798
const struct ggml_compute_params * params,
66366799
const struct ggml_tensor * src0,

llama.cpp

+11-2
Original file line numberDiff line numberDiff line change
@@ -1823,14 +1823,23 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
18231823
return 1;
18241824
}
18251825

1826-
// w = w + BA
1826+
// w = w + BA*s
18271827
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
1828-
ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
1828+
1829+
//if (true) {
1830+
// ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, 1.0f);
1831+
// BA = ggml_scale(lora_ctx, BA, scale_tensor);
1832+
//}
1833+
ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
1834+
//r = ggml_cpy(lora_ctx, r, tensor);
18291835

18301836
struct ggml_cgraph gf = ggml_build_forward(r);
18311837
gf.n_threads = n_threads;
18321838
ggml_graph_compute(lora_ctx, &gf);
18331839

1840+
// hack until ggml_cpy supports quantized tensors
1841+
memcpy(tensor->data, r->data, ggml_nbytes(tensor));
1842+
18341843
// we won't need these tensors again, reset the context to save memory
18351844
ggml_free(lora_ctx);
18361845
lora_ctx = ggml_init(params);

0 commit comments

Comments
 (0)