From bc04508666f2b59b4217b5e59fe6e67676d19269 Mon Sep 17 00:00:00 2001 From: Kilty McGowan Date: Sun, 4 Jun 2023 21:30:54 -0700 Subject: [PATCH 1/5] Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU --- ggml-metal.m | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 3cb423a01f550..82c65963b989d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -195,14 +195,25 @@ bool ggml_metal_add_buffer( } } + size_t page_size = getpagesize(); + size_t aligned_size = size; + if ((aligned_size % page_size) != 0) { + aligned_size += (page_size - (aligned_size % page_size)); + } + ctx->buffers[ctx->n_buffers].name = name; ctx->buffers[ctx->n_buffers].data = data; ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; + ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil]; - ++ctx->n_buffers; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); + return false; + } else { + fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); + } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0); + ++ctx->n_buffers; } return true; From a9d0bea047272d9dda3d69cebf3c7f86b6776493 Mon Sep 17 00:00:00 2001 From: Kilty McGowan Date: Sun, 4 Jun 2023 23:16:45 -0700 Subject: [PATCH 2/5] Page-align buffers used by Metal --- ggml.c | 5 +++++ llama-util.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/ggml.c b/ggml.c index 00bbee503f52a..785f4b91fddd7 100644 --- a/ggml.c +++ b/ggml.c @@ -20,6 +20,7 @@ #include #include #include +#include // if C99 - 
static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 @@ -121,7 +122,11 @@ typedef void* thread_ret_t; #else inline static void* ggml_aligned_malloc(size_t size) { void* aligned_memory = NULL; +#ifdef GGML_USE_METAL + int result = posix_memalign(&aligned_memory, getpagesize(), size); +#else int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); +#endif if (result != 0) { // Handle allocation failure return NULL; diff --git a/llama-util.h b/llama-util.h index 3cac9f681800b..195cb46ea6b09 100644 --- a/llama-util.h +++ b/llama-util.h @@ -405,13 +405,29 @@ struct llama_buffer { llama_buffer() = default; void resize(size_t len) { +#ifdef GGML_USE_METAL + free(addr); + int result = posix_memalign((void **) &addr, getpagesize(), len); + if (result == 0) { + memset(addr, 0, len); + } + else { + addr = NULL; + } +#else delete[] addr; addr = new uint8_t[len]; +#endif size = len; } ~llama_buffer() { +#ifdef GGML_USE_METAL + free(addr); +#else delete[] addr; +#endif + addr = NULL; } // disable copy and move From c1b44240d731e89c38e172b9ea30560e8dd338aa Mon Sep 17 00:00:00 2001 From: Kilty McGowan Date: Mon, 5 Jun 2023 05:02:38 -0700 Subject: [PATCH 3/5] Remove trailing whitespace --- llama-util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-util.h b/llama-util.h index 195cb46ea6b09..4f8a4296adc4e 100644 --- a/llama-util.h +++ b/llama-util.h @@ -410,7 +410,7 @@ struct llama_buffer { int result = posix_memalign((void **) &addr, getpagesize(), len); if (result == 0) { memset(addr, 0, len); - } + } else { addr = NULL; } From c38b0bbf82291d447727d7741964b127db6ff00f Mon Sep 17 00:00:00 2001 From: Kilty McGowan Date: Mon, 5 Jun 2023 09:02:14 -0700 Subject: [PATCH 4/5] Only import unistd.h for Metal builds --- ggml.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml.c b/ggml.c index 785f4b91fddd7..c066b5f477ef2 100644 --- a/ggml.c +++ b/ggml.c @@ -20,7 +20,10 @@ #include #include #include + +#ifdef GGML_USE_METAL 
#include <unistd.h> +#endif // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 From e129f0bd767385bf4de015056a71f0649cd592b9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 5 Jun 2023 23:23:00 +0300 Subject: [PATCH 5/5] metal : remove unnecessary copies --- llama.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index bc58ad960c139..b26931f8b31d3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -53,7 +53,6 @@ enum e_model { MODEL_65B, }; - static const size_t MB = 1024*1024; // computed for n_ctx == 2048 @@ -1261,12 +1260,6 @@ static bool llama_eval_internal( ggml_set_name(embd, "embd"); memcpy(embd->data, tokens, N*ggml_element_size(embd)); -#ifdef GGML_USE_METAL - if (lctx.ctx_metal && N == 1) { - ggml_metal_set_tensor(lctx.ctx_metal, embd); - } -#endif - struct ggml_tensor * cur; struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); @@ -1456,12 +1449,6 @@ static bool llama_eval_internal( // But for now, we have focused only on Matrix x Vector Metal multiplication. // ggml_graph_compute(ctx0, &gf); - - if (lctx.ctx_metal) { - // We need to sync the CPU KV cache with the GPU KV cache - ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k); - ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v); - } } #else ggml_graph_compute(ctx0, &gf);