Commit 5b838d4

amd multigpu full layer offload w/o vram scratch
1 parent 9bfb2fd commit 5b838d4

File tree

1 file changed: +14 -0 lines changed


llama.cpp

Lines changed: 14 additions & 0 deletions
@@ -1274,18 +1274,32 @@ static void llama_model_load_internal(
 
 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
+#if defined(GGML_USE_HIPBLAS)
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
+#else
         const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#endif
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += hparams.kv_size() / 2;
+#else
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+#endif
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                 vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
+#if defined(GGML_USE_HIPBLAS)
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += hparams.kv_size() / 2;
+#else
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+#endif
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                 vram_kv_cache += hparams.kv_size() / 2;
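
Effect of the change: on the HIP/ROCm path (GGML_USE_HIPBLAS), the low-VRAM option no longer caps the offloadable layer count at n_layer + 1 and no longer skips offloading the V and K halves of the KV cache; both branches now allow up to n_layer + 3. The sketch below is not part of llama.cpp; it is a minimal, self-contained C++ illustration of that accounting, with hypothetical names (offload_plan, plan_offload) standing in for logic that the real code inlines in llama_model_load_internal.

// Minimal sketch (not llama.cpp code) of the layer/KV-cache accounting this diff changes.
// The thresholds mirror the diff above: the extra +1/+2 "layers" stand for the V and K
// halves of the KV cache, and +3 is the maximum the backend supports.
#include <cstdio>

struct offload_plan {
    int  max_offloadable_layers; // how many layers a -ngl request can usefully cover
    bool offload_v_cache;        // n_gpu_layers > n_layer + 1
    bool offload_k_cache;        // n_gpu_layers > n_layer + 2
};

static offload_plan plan_offload(int n_layer, int n_gpu_layers, bool low_vram, bool hipblas) {
    offload_plan p{};
    // Before this commit, low_vram limited offload to n_layer + 1 on the CUDA-style
    // backends; the commit keeps the full n_layer + 3 on the HIP (AMD) path.
    p.max_offloadable_layers = (hipblas || !low_vram) ? n_layer + 3 : n_layer + 1;
    // With HIPBLAS, low_vram no longer blocks offloading the two KV-cache halves.
    p.offload_v_cache = (n_gpu_layers > n_layer + 1) && (hipblas || !low_vram);
    p.offload_k_cache = (n_gpu_layers > n_layer + 2) && (hipblas || !low_vram);
    return p;
}

int main() {
    // Example: 32-layer model, -ngl 35, low-VRAM mode enabled, HIP backend.
    offload_plan p = plan_offload(/*n_layer=*/32, /*n_gpu_layers=*/35,
                                  /*low_vram=*/true, /*hipblas=*/true);
    std::printf("max offloadable: %d, v cache: %d, k cache: %d\n",
                p.max_offloadable_layers, p.offload_v_cache, p.offload_k_cache);
    return 0;
}

With those example inputs the sketch reports all 35 "layers" (32 model layers plus V cache, K cache, and output) as offloadable, which corresponds to the "offloading v cache" / "offloading k cache" messages the diff enables on the HIP path even when low_vram is set.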
