From 32dd2ef133e8a680653fd33c4fac7741c9a352fc Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 12 Jun 2024 10:53:25 +0200 Subject: [PATCH 1/6] Implement non-mapped async IO for CUDA on Windows. On a fast Gen5 NVMe drive this change improves model load time by >3x while it should be the same (or slightly faster) on any other drive. --- llama.cpp | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/llama.cpp b/llama.cpp index 8b675ea993a38..71f5b53bacd0d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1274,6 +1274,122 @@ struct no_init { }; struct llama_file { + +#if defined(_WIN32) + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + HANDLE fp_win32; + size_t size; + +private: + std::string GetErrorMessageWin32(DWORD error_code) const { + std::string ret; + LPSTR lpMsgBuf = NULL; + DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); + if (!bufLen) { + std::stringstream ss; + ret = format("Win32 error code: %s", error_code); + } else { + ret = lpMsgBuf; + LocalFree(lpMsgBuf); + } + + return ret; + } + +public: + + llama_file(const char * fname, const char * mode) { + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp)); + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { + // SetFilePointerEx returns the current position when seeking relative 0 bytes + LARGE_INTEGER li; + li.QuadPart = 0; + BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); + + GGML_ASSERT(ret); + + return li.QuadPart; + } + + void seek(size_t offset, int whence) const { + // no need to convert SEEK_* to FILE_*. The enums are the same. + // Still, keep static asserts to avoid failures in the future. + static_assert(SEEK_SET == FILE_BEGIN); + static_assert(SEEK_CUR == FILE_CURRENT); + static_assert(SEEK_END == FILE_END); + + LARGE_INTEGER li; + li.QuadPart = offset; + BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); + + GGML_ASSERT(ret); + } + + void read_raw(void * ptr, size_t len) const { + // There are conditions under which ReadFile cannot read chunks >64MB. + // Thus split the operation into smaller chunks if len exceeds this limit. + size_t bytes_read = 0; + while (bytes_read < len) { + size_t chunk_size = std::min(len - bytes_read, 64*1024*1024); + DWORD chunk_read = 0; + BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, NULL); + if (!result) { + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + } + if (chunk_read < chunk_size || chunk_read == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + bytes_read += chunk_read; + } ; + } + + uint32_t read_u32() const { + uint32_t val; + read_raw(&val, sizeof(val)); + return val; + } + + void write_raw(const void * ptr, size_t len) const { + // There are conditions under which WriteFile cannot write chunks >64MB. + // Thus split the operation into smaller chunks if len exceeds this limit. 
+ size_t bytes_written = 0; + while (bytes_written < len) { + size_t chunk_size = std::min(len - bytes_written, 64*1024*1024); + DWORD chunk_written = 0; + BOOL result = WriteFile(fp_win32, reinterpret_cast(ptr) + bytes_written, chunk_size, &chunk_written, NULL); + if (!result) { + throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str())); + } + if (chunk_written < chunk_size || chunk_written == 0) { + throw std::runtime_error("unexpectedly failed to write bytes"); + } + + bytes_written += chunk_written; + } + } + + void write_u32(std::uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +#else // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; @@ -1347,6 +1463,7 @@ struct llama_file { std::fclose(fp); } } +#endif }; using llama_files = std::vector>; @@ -3713,6 +3830,23 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; +#if defined(GGML_USE_CUDA) + std::vector host_buffers; + std::vector host_ptrs; + std::vector events; + size_t buffer_idx = 0; // buffer to use for async loads + + ggml_backend_t backend = ggml_backend_cuda_init(0); // TODO how to get the CUDA device / backend here? + + constexpr size_t num_buffers = 4; + constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + for (size_t idx = 0; idx < num_buffers; ++idx) { + host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); + events.emplace_back(ggml_backend_event_new(backend)); + } +#endif + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -3768,6 +3902,25 @@ struct llama_model_loader { })); } } else { +#if defined(GGML_USE_CUDA) + file->seek(weight->offs, SEEK_SET); + + size_t bytes_read = 0; + + while (bytes_read < n_size) + { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx]); + + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= num_buffers; + } +#else read_buf.resize(n_size); file->seek(weight->offs, SEEK_SET); file->read_raw(read_buf.data(), n_size); @@ -3775,12 +3928,20 @@ struct llama_model_loader { if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } +#endif } } size_done += n_size; } +#if defined(GGML_USE_CUDA) + for (auto const& event : events) { + ggml_backend_event_synchronize(event); + } +#endif + + // check validation results bool validation_failed = false; for (auto & future : validation_result) { From 1ebe20789bd6ec52672f4b1a2c1b1260ec9c342a Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 12 Jun 2024 13:45:41 +0200 Subject: [PATCH 2/6] Free resources except for backend. 
--- llama.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 71f5b53bacd0d..ac45828646cdb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3936,8 +3936,12 @@ struct llama_model_loader { } #if defined(GGML_USE_CUDA) - for (auto const& event : events) { - ggml_backend_event_synchronize(event); + for (size_t idx = 0; idx < num_buffers;++idx) { + ggml_backend_event_synchronize(events[idx]); + ggml_backend_event_free(events[idx]); + ggml_backend_buffer_free(host_buffers[idx]); + + //ggml_backend_free(backend); } #endif From 86869fbdab7f81c0569ae02788a587e17f167bde Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Thu, 13 Jun 2024 14:32:03 +0200 Subject: [PATCH 3/6] Change assertions to exceptions in llama_file, find correct cuda backend to create CUDA resources and respect the use_mmap flag again for CUDA. --- llama.cpp | 117 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 40 deletions(-) diff --git a/llama.cpp b/llama.cpp index ac45828646cdb..a6eb79c99b682 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1288,7 +1288,6 @@ struct llama_file { DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); if (!bufLen) { - std::stringstream ss; ret = format("Win32 error code: %s", error_code); } else { ret = lpMsgBuf; @@ -1316,8 +1315,9 @@ struct llama_file { LARGE_INTEGER li; li.QuadPart = 0; BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); - - GGML_ASSERT(ret); + if (!ret) { + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + } return li.QuadPart; } @@ -1332,11 +1332,15 @@ struct llama_file { LARGE_INTEGER li; li.QuadPart = offset; BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); - - GGML_ASSERT(ret); + if (!ret) { + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + } } void read_raw(void * ptr, size_t len) const { + // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus + // use the Win32 API to do file io instead of the C/C++ library functions. + // There are conditions under which ReadFile cannot read chunks >64MB. // Thus split the operation into smaller chunks if len exceeds this limit. size_t bytes_read = 0; @@ -1410,7 +1414,10 @@ struct llama_file { #else long ret = std::ftell(fp); #endif - GGML_ASSERT(ret != -1); // this really shouldn't fail + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + return (size_t) ret; } @@ -1420,7 +1427,9 @@ struct llama_file { #else int ret = std::fseek(fp, (long) offset, whence); #endif - GGML_ASSERT(ret == 0); // same + if (ret != 0) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } } void read_raw(void * ptr, size_t len) const { @@ -3831,19 +3840,40 @@ struct llama_model_loader { std::vector>> validation_result; #if defined(GGML_USE_CUDA) + // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. + // NVMe raid configurations might require more / larger buffers. 
+ constexpr size_t num_buffers = 4; + constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + std::vector host_buffers; std::vector host_ptrs; std::vector events; size_t buffer_idx = 0; // buffer to use for async loads - ggml_backend_t backend = ggml_backend_cuda_init(0); // TODO how to get the CUDA device / backend here? + ggml_backend_t cuda_backend = nullptr; + if (!use_mmap) { + // When not using mmaped io use async uploads from pinned memory to GPU memory. + // First determine if the CUDA backend is active, and if so, determine the device ID. + ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr; + if (buf) { + ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf); + for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { + auto cuda_buffer_type = ggml_backend_cuda_buffer_type(i); + if (buffer_type == ggml_backend_cuda_buffer_type(i)) { + cuda_backend = ggml_backend_cuda_init(i); + break; + } + } + } - constexpr size_t num_buffers = 4; - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB - for (size_t idx = 0; idx < num_buffers; ++idx) { - host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); - events.emplace_back(ggml_backend_event_new(backend)); + // If the cuda backend is active create pinned memory buffers and events for synchronisation. + if (cuda_backend) { + for (size_t idx = 0; idx < num_buffers; ++idx) { + host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); + events.emplace_back(ggml_backend_event_new(cuda_backend)); + } + } } #endif @@ -3903,32 +3933,37 @@ struct llama_model_loader { } } else { #if defined(GGML_USE_CUDA) - file->seek(weight->offs, SEEK_SET); + // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
+ if (cuda_backend) { + file->seek(weight->offs, SEEK_SET); - size_t bytes_read = 0; + size_t bytes_read = 0; - while (bytes_read < n_size) - { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < n_size) + { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx]); + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx]); - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= num_buffers; - } -#else - read_buf.resize(n_size); - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); - if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { - throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= num_buffers; + } } + else #endif + { + read_buf.resize(n_size); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } + } } } @@ -3936,12 +3971,14 @@ struct llama_model_loader { } #if defined(GGML_USE_CUDA) - for (size_t idx = 0; idx < num_buffers;++idx) { - ggml_backend_event_synchronize(events[idx]); - ggml_backend_event_free(events[idx]); - ggml_backend_buffer_free(host_buffers[idx]); - - //ggml_backend_free(backend); + // free temporary resources used for async cuda uploads + if (cuda_backend) { + for (size_t idx = 0; idx < num_buffers;++idx) { + ggml_backend_event_synchronize(events[idx]); + ggml_backend_event_free(events[idx]); + ggml_backend_buffer_free(host_buffers[idx]); + } + ggml_backend_free(cuda_backend); } #endif From c39d5ecd2bd88e5b2e660eb56994b38c1edc666c Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Thu, 13 Jun 2024 15:55:23 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: slaren --- llama.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index a6eb79c99b682..df4944595d711 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3851,7 +3851,7 @@ struct llama_model_loader { size_t buffer_idx = 0; // buffer to use for async loads ggml_backend_t cuda_backend = nullptr; - if (!use_mmap) { + if (!use_mmap && !check_tensors) { // When not using mmaped io use async uploads from pinned memory to GPU memory. // First determine if the CUDA backend is active, and if so, determine the device ID. ggml_backend_buffer_t buf = bufs_mmap.count(0) ? 
bufs_mmap.at(0) : nullptr; @@ -3939,8 +3939,7 @@ struct llama_model_loader { size_t bytes_read = 0; - while (bytes_read < n_size) - { + while (bytes_read < n_size) { size_t read_iteration = std::min(buffer_size, n_size - bytes_read); ggml_backend_event_synchronize(events[buffer_idx]); @@ -3982,7 +3981,6 @@ struct llama_model_loader { } #endif - // check validation results bool validation_failed = false; for (auto & future : validation_result) { From d3131ce56529ad9feb8e60ddb9c561a8dee8a9a1 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 18:06:41 +0200 Subject: [PATCH 5/6] Fix editorconfig and unused variable --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index df4944595d711..b6f2a264c0857 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3840,8 +3840,8 @@ struct llama_model_loader { std::vector>> validation_result; #if defined(GGML_USE_CUDA) - // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. - // NVMe raid configurations might require more / larger buffers. + // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. + // NVMe raid configurations might require more / larger buffers. constexpr size_t num_buffers = 4; constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB @@ -3858,8 +3858,8 @@ struct llama_model_loader { if (buf) { ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf); for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - auto cuda_buffer_type = ggml_backend_cuda_buffer_type(i); - if (buffer_type == ggml_backend_cuda_buffer_type(i)) { + auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i); + if (buffer_type == cuda_buffer_type) { cuda_backend = ggml_backend_cuda_init(i); break; } From f4d33f87f88c18e7755e6682df9dbdbae52e36d5 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 18:52:41 +0200 Subject: [PATCH 6/6] Fix issues with Windows build --- llama.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index e34a4c4814a62..c87d3261db3ad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1320,7 +1320,7 @@ struct llama_file { li.QuadPart = 0; BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); if (!ret) { - throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } return li.QuadPart; @@ -1329,15 +1329,15 @@ struct llama_file { void seek(size_t offset, int whence) const { // no need to convert SEEK_* to FILE_*. The enums are the same. // Still, keep static asserts to avoid failures in the future. 
- static_assert(SEEK_SET == FILE_BEGIN); - static_assert(SEEK_CUR == FILE_CURRENT); - static_assert(SEEK_END == FILE_END); + static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN"); + static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT"); + static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END"); LARGE_INTEGER li; li.QuadPart = offset; BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); if (!ret) { - throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } } @@ -1353,7 +1353,7 @@ struct llama_file { DWORD chunk_read = 0; BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, NULL); if (!result) { - throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } if (chunk_read < chunk_size || chunk_read == 0) { throw std::runtime_error("unexpectedly reached end of file");
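
Taken together, the series replaces the synchronous read_buf path with a small upload pipeline: each tensor is read from disk in 1 MB chunks into one of four pinned staging buffers, handed to the CUDA backend with ggml_backend_tensor_set_async, and guarded by a per-buffer event so that a buffer is only refilled once its previous copy has completed. For reference, here is a minimal standalone sketch of that same pattern written against the raw CUDA runtime API rather than the ggml backend wrappers used in the diffs above; the function name, the plain FILE * source, and the omitted error checking are illustrative only and not taken from the patch.

// Sketch of the 4 x 1 MiB staging-buffer upload loop (buffer count and size
// mirror the defaults chosen in the patch; error handling omitted for brevity).
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <vector>

static void upload_tensor_async(std::FILE * f, size_t file_offset, void * device_dst, size_t n_size) {
    constexpr size_t num_buffers = 4;
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1 MiB staging chunks

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    std::vector<void *>      host_ptrs(num_buffers);
    std::vector<cudaEvent_t> events(num_buffers);
    for (size_t i = 0; i < num_buffers; ++i) {
        cudaMallocHost(&host_ptrs[i], buffer_size); // pinned memory enables truly async H2D copies
        cudaEventCreate(&events[i]);
        cudaEventRecord(events[i], stream);         // nothing queued yet, so each buffer starts out "free"
    }

    std::fseek(f, (long) file_offset, SEEK_SET);

    size_t bytes_done = 0;
    size_t buffer_idx = 0;
    while (bytes_done < n_size) {
        const size_t chunk = std::min(buffer_size, n_size - bytes_done);

        cudaEventSynchronize(events[buffer_idx]);   // wait until this staging buffer may be reused
        std::fread(host_ptrs[buffer_idx], 1, chunk, f);
        cudaMemcpyAsync((char *) device_dst + bytes_done, host_ptrs[buffer_idx],
                        chunk, cudaMemcpyHostToDevice, stream);
        cudaEventRecord(events[buffer_idx], stream); // completes once the copy has consumed the buffer

        bytes_done += chunk;
        buffer_idx  = (buffer_idx + 1) % num_buffers;
    }

    // drain outstanding copies before the staging buffers are released
    for (size_t i = 0; i < num_buffers; ++i) {
        cudaEventSynchronize(events[i]);
        cudaEventDestroy(events[i]);
        cudaFreeHost(host_ptrs[i]);
    }
    cudaStreamDestroy(stream);
}

Pinned (page-locked) host memory is what allows cudaMemcpyAsync to overlap with the next fread; with four buffers in flight, the disk read, the PCIe transfer, and the event bookkeeping stay pipelined, which is where the reported >3x load-time improvement on fast NVMe drives comes from.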