@@ -10,7 +10,7 @@
 #include "llama.h"

 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
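Note: the "defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS" condition above is repeated at every CUDA-specific guard touched by this diff, since the HIP (ROCm) build reuses the ggml-cuda code path. A minimal sketch of how that repetition could be collapsed, assuming a hypothetical umbrella macro that neither this patch nor ggml actually defines:

// Hypothetical helper, defined once near the top of the file:
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
#define LLAMA_CUDA_LIKE 1   // hypothetical name; covers both the CUDA and the HIP builds
#endif

// Later guards would then only need to test one symbol:
#ifdef LLAMA_CUDA_LIKE
#include "ggml-cuda.h"      // the same header serves both backends in this port
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif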
@@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_3B,    256ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
+        { MODEL_30B,   640ull * MB },
         { MODEL_65B,  1024ull * MB },
     };
     return k_sizes;
@@ -92,7 +92,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_3B,    256ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
+        { MODEL_30B,   640ull * MB },
         { MODEL_65B,  1024ull * MB },
     };
     return k_sizes;
@@ -105,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_3B,    682ull * MB },
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
-        { MODEL_30B,  3124ull * MB },
+        { MODEL_30B,  3224ull * MB },
         { MODEL_65B,  5120ull * MB },
     };
     return k_sizes;
@@ -117,9 +117,9 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   768ull * MB },
+        { MODEL_7B,   800ull * MB },
         { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
+        { MODEL_30B, 1380ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
     return k_sizes;
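The four MEM_REQ_* tables above only have their per-model constants bumped; how they are consumed does not change. As a rough sketch of such a lookup (simplified, not the actual loader code; it assumes the e_model enum and the tables as defined in llama.cpp):

// Sketch: fetch the scratch-buffer requirement for a model size,
// falling back to the largest known entry for unknown types.
static size_t scratch0_size_for(e_model type) {
    const std::map<e_model, size_t> & k_sizes = MEM_REQ_SCRATCH0();
    const auto it = k_sizes.find(type);
    return it != k_sizes.end() ? it->second : k_sizes.rbegin()->second;
}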
@@ -175,7 +175,7 @@ struct llama_kv_cache {
             ggml_free(ctx);
         }

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
         ggml_cuda_free_data(k);
         ggml_cuda_free_data(v);
 #endif // GGML_USE_CUBLAS
@@ -234,7 +234,7 @@ struct llama_model {
             ggml_free(ctx);
         }

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
@@ -800,7 +800,7 @@ struct llama_model_loader {
                     lmlock->grow_to(lock_size);
                 }
                 break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT:
                 ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -920,7 +920,7 @@ static bool kv_cache_init(
     ggml_set_name(cache.v, "cache_v");

     (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
     }
@@ -1106,15 +1106,15 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
+            printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
         }
     }

     if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
+            printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }

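The two hunks above change behavior, not just formatting: an unsupported file format no longer throws (which aborted model loading) but only prints a notice and continues, which can yield garbage output if the file really is incompatible. A small sketch of an alternative, not part of this diff, that still keeps loading going but sends the notice to stderr with the usual function prefix:

// Sketch only: warn on stderr instead of stdout so the notice is not
// interleaved with generated text.
fprintf(stderr,
        "%s: warning: this format is no longer supported "
        "(see https://github.com/ggerganov/llama.cpp/pull/1405)\n",
        __func__);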
@@ -1150,7 +1150,7 @@ static void llama_model_load_internal(
     }

     (void) main_gpu;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1261,7 +1261,7 @@ static void llama_model_load_internal(

     (void) vram_scratch;
     (void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (low_vram) {
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
@@ -1274,7 +1274,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

     fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1314,7 +1314,7 @@ static void llama_model_load_internal(
     }

     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -1375,11 +1375,11 @@ static bool llama_eval_internal(
         const int   n_threads,
         const char * cgraph_fname) {

-    // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
-        fprintf(stderr, "%s: first token must be BOS\n", __func__);
-        return false;
-    }
+    // // enforce that the first token is BOS
+    // if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    //     fprintf(stderr, "%s: first token must be BOS\n", __func__);
+    //     return false;
+    // }

     const int64_t t_start_us = ggml_time_us();

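With the BOS check above commented out, llama_eval_internal no longer rejects a fresh evaluation whose first token is not BOS, so that responsibility shifts to the caller. A minimal caller-side sketch, assuming the llama.h API of this vintage (llama_token_bos(), llama_eval()) and caller-provided ctx, prompt_tokens, n_past, and n_threads:

// Sketch: prepend BOS yourself when starting from an empty context,
// so tokenization still matches the model's training setup.
std::vector<llama_token> toks;
if (n_past == 0) {
    toks.push_back(llama_token_bos());
}
toks.insert(toks.end(), prompt_tokens.begin(), prompt_tokens.end());
llama_eval(ctx, toks.data(), (int) toks.size(), n_past, n_threads);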
@@ -1435,7 +1435,7 @@ static bool llama_eval_internal(
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
@@ -1450,7 +1450,7 @@ static bool llama_eval_internal(
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
         if (il >= i_gpu_start) {
             offload_func = ggml_cuda_assign_buffers;
         }