
Commit 9adfc8e

amd multigpu full layer offload w/o vram scratch

1 parent: 05c792e

1 file changed: +14 −0 lines

llama.cpp (+14 lines)
@@ -1224,18 +1224,32 @@ static void llama_model_load_internal(
 
 #ifdef GGML_USE_CUBLAS
     const int max_backend_supported_layers = hparams.n_layer + 3;
+#if defined(GGML_USE_HIPBLAS)
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
+#else
     const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#endif
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+            fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+#else
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+#endif
         } else {
             fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
     if (n_gpu_layers > (int) hparams.n_layer + 2) {
         if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+            fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+#else
             fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+#endif
         } else {
             fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
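
The guard added above means that, under GGML_USE_HIPBLAS, the v and k caches are offloaded even when the low-VRAM option is set, whereas the CUDA-only build keeps them on the host in that case; this is what the commit title refers to as full layer offload without a VRAM scratch. What follows is a minimal, standalone sketch of that compile-time decision, not the actual llama.cpp code: n_layer, low_vram, and the "+ 1" / "+ 3" layer counts mirror the diff, while the hard-coded layer count and the main() driver are illustrative only.

// Sketch of the offload limit this commit changes. The two extra slots
// beyond n_layer + 1 correspond to the v and k caches checked in the diff.
#include <cstdio>

static const int n_layer = 32; // hypothetical layer count, for illustration

static int max_offloadable_layers(bool low_vram) {
#if defined(GGML_USE_HIPBLAS)
    // HIP/ROCm build: the k/v cache is offloaded even in low-VRAM mode,
    // so both branches resolve to n_layer + 3.
    (void) low_vram;
    return n_layer + 3;
#else
    // CUDA build: low-VRAM mode keeps the k/v cache on the host,
    // capping offload at n_layer + 1.
    return low_vram ? n_layer + 1 : n_layer + 3;
#endif
}

int main() {
    const bool cases[] = { false, true };
    for (bool low_vram : cases) {
        std::printf("low_vram=%d -> max offloadable layers: %d\n",
                    (int) low_vram, max_offloadable_layers(low_vram));
    }
    return 0;
}

Compiled with -DGGML_USE_HIPBLAS the sketch prints 35 for both cases; without it, the low-VRAM case prints 33, which matches the branches added in the diff.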
