@@ -1274,18 +1274,32 @@ static void llama_model_load_internal(
 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
+#if defined(GGML_USE_HIPBLAS)
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
+#else
         const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#endif
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += hparams.kv_size() / 2;
+#else
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+#endif
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                 vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += hparams.kv_size() / 2;
+#else
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+#endif
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                 vram_kv_cache += hparams.kv_size() / 2;
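
As far as this hunk shows, the effect on GGML_USE_HIPBLAS builds is that the low_vram flag no longer reduces the offloadable layer count (both ternary branches give n_layer + 3) and no longer blocks V/K cache offload. A minimal standalone sketch of that layer-count logic, assuming the same n_layer/low_vram inputs as the diff; the helper name and example values are illustrative and not part of llama.cpp:

    #include <cstdio>

    // Illustrative helper (not a llama.cpp function): mirrors the ternary from the
    // hunk above. On the HIPBLAS path both branches evaluate to n_layer + 3, so
    // low_vram no longer caps offloading at n_layer + 1.
    static int max_offloadable_layers(int n_layer, bool low_vram, bool hipblas) {
        if (hipblas) {
            return low_vram ? n_layer + 3 : n_layer + 3;  // low_vram has no effect here
        }
        return low_vram ? n_layer + 1 : n_layer + 3;      // original CUBLAS behaviour
    }

    int main() {
        const int n_layer = 32;  // example layer count
        std::printf("cublas,  low_vram: %d\n", max_offloadable_layers(n_layer, true, false)); // 33
        std::printf("hipblas, low_vram: %d\n", max_offloadable_layers(n_layer, true, true));  // 35
    }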