@@ -1224,18 +1224,32 @@ static void llama_model_load_internal(
 
 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
+#if defined(GGML_USE_HIPBLAS)
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
+#else
         const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#endif
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+#else
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+#endif
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+#else
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+#endif
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;