@@ -54,6 +54,8 @@ enum e_model {
     MODEL_65B,
 };
 
+static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "gptq" };
+
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -100,7 +102,7 @@ struct llama_hparams {
     int32_t n_head  = 32;
     int32_t n_layer = 32;
     int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    int32_t f16     = LLAMA_FTYPE_MOSTLY_F16;
 };
 
 struct llama_layer {
@@ -435,7 +437,7 @@ static bool llama_model_load(
     }
 
     // temp warning to tell the user to use "--n_parts"
-    if (hparams.f16 == 4 && n_parts != 1) {
+    if (hparams.f16 == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
         fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
         fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
     }
@@ -508,11 +510,14 @@ static bool llama_model_load(
     // wtype is for per-layer weights, while vtype is for other weights
     ggml_type wtype, vtype;
     switch (model.hparams.f16) {
-        case 0: wtype = vtype = GGML_TYPE_F32;  break;
-        case 1: wtype = vtype = GGML_TYPE_F16;  break;
-        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32:     wtype = vtype = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_F16:  wtype = vtype = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
+            wtype = GGML_TYPE_Q4_1;
+            vtype = GGML_TYPE_F16;
+            break;
         default:
                 {
                     fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
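
Note: the LLAMA_FTYPE_* constants used throughout this diff are declared in llama.h, which is not shown here. A minimal sketch of what that declaration presumably looks like, with numeric values inferred from the old f16 cases replaced above; names, values, and comments in this sketch are assumptions, not part of the diff:

    // Sketch only: llama_ftype as implied by the old numeric f16 values (0..4) above.
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32           = 0,
        LLAMA_FTYPE_MOSTLY_F16        = 1,
        LLAMA_FTYPE_MOSTLY_Q4_0       = 2,
        LLAMA_FTYPE_MOSTLY_Q4_1       = 3,
        LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // per-layer weights q4_1, other weights f16 (GPTQ-style), per the case above
    };
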
@@ -684,16 +689,15 @@ static bool llama_model_load(
                 return false;
             }
             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                 fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }
 
             switch (ftype) {
-                case 0:  // f32
-                case 1:  // f16
+                case LLAMA_FTYPE_ALL_F32:
+                case LLAMA_FTYPE_MOSTLY_F16:
                     break;
-                case 2:  // q4_0
-                case 3:  // q4_1
+                case LLAMA_FTYPE_MOSTLY_Q4_0:
+                case LLAMA_FTYPE_MOSTLY_Q4_1:
                     assert(ne[0] % 64 == 0);
                     break;
                 default:
@@ -1273,20 +1277,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
 //
 
 // TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype itype) {
+    ggml_type type;
 
     switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: type = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
-        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
-        return false;
-    }
-
     llama_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1438,7 +1437,6 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         {
-            static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
             printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
         }
 
@@ -1459,12 +1457,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (n_dims == 2);
 
         if (quantize) {
-            if (ftype != 0 && ftype != 1) {
+            if (ftype != LLAMA_FTYPE_ALL_F32 && ftype != LLAMA_FTYPE_MOSTLY_F16) {
                 fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
                 return false;
             }
 
-            if (ftype == 1) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_F16) {
                 data_f16.resize(nelements);
                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                 data_f32.resize(nelements);
@@ -1478,7 +1476,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
 
             ftype = itype;
         } else {
-            const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+            const int bpe = (ftype == LLAMA_FTYPE_ALL_F32) ? sizeof(float) : sizeof(uint16_t);
 
             data_u8.resize(nelements*bpe);
             finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
@@ -1659,7 +1657,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        int itype) {
+        enum llama_ftype itype) {
     if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
         fprintf(stderr, "%s: failed to quantize\n", __func__);
         return 1;
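
With the public entry point now taking the enum, a caller would presumably look like the sketch below. The model paths are hypothetical placeholders, and the 0/1 return convention is inferred from the failure path shown above:

    // Sketch only: quantizing an f16 model to q4_0 through the updated API.
    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        // Hypothetical input/output paths; not files referenced by this PR.
        const char * fname_inp = "models/7B/ggml-model-f16.bin";
        const char * fname_out = "models/7B/ggml-model-q4_0.bin";

        // LLAMA_FTYPE_MOSTLY_Q4_0 replaces the old magic number 2.
        if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }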