@@ -1224,32 +1224,18 @@ static void llama_model_load_internal(

#ifdef GGML_USE_CUBLAS
        const int max_backend_supported_layers = hparams.n_layer + 3;
-#if defined(GGML_USE_HIPBLAS)
-        const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
-#else
        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
-#endif
        if (n_gpu_layers > (int) hparams.n_layer + 1) {
            if (low_vram) {
-#if defined(GGML_USE_HIPBLAS)
-                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
-#else
                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-#endif
            } else {
                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
            }
        }
        if (n_gpu_layers > (int) hparams.n_layer + 2) {
            if (low_vram) {
-#if defined(GGML_USE_HIPBLAS)
-                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
-#else
                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-#endif
            } else {
                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
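
After this hunk, CUDA and HIP builds share the same low_vram behaviour: with low_vram set, neither half of the KV cache is offloaded. Below is a minimal stand-alone sketch (not part of the patch) of how the n_layer + 1 / n_layer + 2 thresholds translate into planned KV cache VRAM; planned_vram_kv_cache and kv_self_size are illustrative names, with kv_self_size standing in for MEM_REQ_KV_SELF().at(model.type).

// Minimal sketch (not the project's code): how the thresholds above map
// n_gpu_layers onto KV cache offloading after the HIPBLAS special case
// is removed.
#include <cstddef>
#include <cstdio>

static size_t planned_vram_kv_cache(int n_gpu_layers, int n_layer,
                                    bool low_vram, size_t kv_self_size) {
    size_t vram_kv_cache = 0;
    // n_gpu_layers > n_layer + 1 requests the v cache on the GPU,
    // n_gpu_layers > n_layer + 2 the k cache as well; low_vram keeps
    // both halves of the KV cache in host memory.
    if (!low_vram && n_gpu_layers > n_layer + 1) {
        vram_kv_cache += kv_self_size / 2; // v cache
    }
    if (!low_vram && n_gpu_layers > n_layer + 2) {
        vram_kv_cache += kv_self_size / 2; // k cache
    }
    return vram_kv_cache;
}

int main() {
    // Example: a 32-layer model with n_gpu_layers = 35 and low_vram
    // disabled plans VRAM for both halves of the KV cache.
    std::printf("%zu bytes\n", planned_vram_kv_cache(35, 32, false, 1024u * 1024u));
    return 0;
}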