From 32dd2ef133e8a680653fd33c4fac7741c9a352fc Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 12 Jun 2024 10:53:25 +0200 Subject: [PATCH 1/6] Implement non-mapped async IO for CUDA on Windows. On a fast Gen5 NVMe drive this change improves model load time by >3x while it should be the same (or slightly faster) on any other drive. --- llama.cpp | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/llama.cpp b/llama.cpp index 8b675ea993a38..71f5b53bacd0d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1274,6 +1274,122 @@ struct no_init { }; struct llama_file { + +#if defined(_WIN32) + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + HANDLE fp_win32; + size_t size; + +private: + std::string GetErrorMessageWin32(DWORD error_code) const { + std::string ret; + LPSTR lpMsgBuf = NULL; + DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); + if (!bufLen) { + std::stringstream ss; + ret = format("Win32 error code: %s", error_code); + } else { + ret = lpMsgBuf; + LocalFree(lpMsgBuf); + } + + return ret; + } + +public: + + llama_file(const char * fname, const char * mode) { + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp)); + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { + // SetFilePointerEx returns the current position when seeking relative 0 bytes + LARGE_INTEGER li; + li.QuadPart = 0; + BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); + + GGML_ASSERT(ret); + + return li.QuadPart; + } + + void seek(size_t offset, int whence) const { + // no need to convert SEEK_* to FILE_*. The enums are the same. + // Still, keep static asserts to avoid failures in the future. + static_assert(SEEK_SET == FILE_BEGIN); + static_assert(SEEK_CUR == FILE_CURRENT); + static_assert(SEEK_END == FILE_END); + + LARGE_INTEGER li; + li.QuadPart = offset; + BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); + + GGML_ASSERT(ret); + } + + void read_raw(void * ptr, size_t len) const { + // There are conditions under which ReadFile cannot read chunks >64MB. + // Thus split the operation into smaller chunks if len exceeds this limit. + size_t bytes_read = 0; + while (bytes_read < len) { + size_t chunk_size = std::min(len - bytes_read, 64*1024*1024); + DWORD chunk_read = 0; + BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, NULL); + if (!result) { + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + } + if (chunk_read < chunk_size || chunk_read == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + bytes_read += chunk_read; + } ; + } + + uint32_t read_u32() const { + uint32_t val; + read_raw(&val, sizeof(val)); + return val; + } + + void write_raw(const void * ptr, size_t len) const { + // There are conditions under which WriteFile cannot write chunks >64MB. + // Thus split the operation into smaller chunks if len exceeds this limit. 
+ size_t bytes_written = 0; + while (bytes_written < len) { + size_t chunk_size = std::min(len - bytes_written, 64*1024*1024); + DWORD chunk_written = 0; + BOOL result = WriteFile(fp_win32, reinterpret_cast(ptr) + bytes_written, chunk_size, &chunk_written, NULL); + if (!result) { + throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str())); + } + if (chunk_written < chunk_size || chunk_written == 0) { + throw std::runtime_error("unexpectedly failed to write bytes"); + } + + bytes_written += chunk_written; + } + } + + void write_u32(std::uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +#else // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; @@ -1347,6 +1463,7 @@ struct llama_file { std::fclose(fp); } } +#endif }; using llama_files = std::vector>; @@ -3713,6 +3830,23 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; +#if defined(GGML_USE_CUDA) + std::vector host_buffers; + std::vector host_ptrs; + std::vector events; + size_t buffer_idx = 0; // buffer to use for async loads + + ggml_backend_t backend = ggml_backend_cuda_init(0); // TODO how to get the CUDA device / backend here? + + constexpr size_t num_buffers = 4; + constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + for (size_t idx = 0; idx < num_buffers; ++idx) { + host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); + events.emplace_back(ggml_backend_event_new(backend)); + } +#endif + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -3768,6 +3902,25 @@ struct llama_model_loader { })); } } else { +#if defined(GGML_USE_CUDA) + file->seek(weight->offs, SEEK_SET); + + size_t bytes_read = 0; + + while (bytes_read < n_size) + { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx]); + + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= num_buffers; + } +#else read_buf.resize(n_size); file->seek(weight->offs, SEEK_SET); file->read_raw(read_buf.data(), n_size); @@ -3775,12 +3928,20 @@ struct llama_model_loader { if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } +#endif } } size_done += n_size; } +#if defined(GGML_USE_CUDA) + for (auto const& event : events) { + ggml_backend_event_synchronize(event); + } +#endif + + // check validation results bool validation_failed = false; for (auto & future : validation_result) { From 1ebe20789bd6ec52672f4b1a2c1b1260ec9c342a Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 12 Jun 2024 13:45:41 +0200 Subject: [PATCH 2/6] Free resources except for backend. 
--- llama.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 71f5b53bacd0d..ac45828646cdb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3936,8 +3936,12 @@ struct llama_model_loader { } #if defined(GGML_USE_CUDA) - for (auto const& event : events) { - ggml_backend_event_synchronize(event); + for (size_t idx = 0; idx < num_buffers;++idx) { + ggml_backend_event_synchronize(events[idx]); + ggml_backend_event_free(events[idx]); + ggml_backend_buffer_free(host_buffers[idx]); + + //ggml_backend_free(backend); } #endif From 86869fbdab7f81c0569ae02788a587e17f167bde Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Thu, 13 Jun 2024 14:32:03 +0200 Subject: [PATCH 3/6] Change assertions to exceptions in llama_file, find correct cuda backend to create CUDA resources and respect the use_mmap flag again for CUDA. --- llama.cpp | 117 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 40 deletions(-) diff --git a/llama.cpp b/llama.cpp index ac45828646cdb..a6eb79c99b682 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1288,7 +1288,6 @@ struct llama_file { DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); if (!bufLen) { - std::stringstream ss; ret = format("Win32 error code: %s", error_code); } else { ret = lpMsgBuf; @@ -1316,8 +1315,9 @@ struct llama_file { LARGE_INTEGER li; li.QuadPart = 0; BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); - - GGML_ASSERT(ret); + if (!ret) { + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + } return li.QuadPart; } @@ -1332,11 +1332,15 @@ struct llama_file { LARGE_INTEGER li; li.QuadPart = offset; BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); - - GGML_ASSERT(ret); + if (!ret) { + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + } } void read_raw(void * ptr, size_t len) const { + // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus + // use the Win32 API to do file io instead of the C/C++ library functions. + // There are conditions under which ReadFile cannot read chunks >64MB. // Thus split the operation into smaller chunks if len exceeds this limit. size_t bytes_read = 0; @@ -1410,7 +1414,10 @@ struct llama_file { #else long ret = std::ftell(fp); #endif - GGML_ASSERT(ret != -1); // this really shouldn't fail + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + return (size_t) ret; } @@ -1420,7 +1427,9 @@ struct llama_file { #else int ret = std::fseek(fp, (long) offset, whence); #endif - GGML_ASSERT(ret == 0); // same + if (ret != 0) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } } void read_raw(void * ptr, size_t len) const { @@ -3831,19 +3840,40 @@ struct llama_model_loader { std::vector>> validation_result; #if defined(GGML_USE_CUDA) + // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. + // NVMe raid configurations might require more / larger buffers. 
+ constexpr size_t num_buffers = 4; + constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + std::vector host_buffers; std::vector host_ptrs; std::vector events; size_t buffer_idx = 0; // buffer to use for async loads - ggml_backend_t backend = ggml_backend_cuda_init(0); // TODO how to get the CUDA device / backend here? + ggml_backend_t cuda_backend = nullptr; + if (!use_mmap) { + // When not using mmaped io use async uploads from pinned memory to GPU memory. + // First determine if the CUDA backend is active, and if so, determine the device ID. + ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr; + if (buf) { + ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf); + for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { + auto cuda_buffer_type = ggml_backend_cuda_buffer_type(i); + if (buffer_type == ggml_backend_cuda_buffer_type(i)) { + cuda_backend = ggml_backend_cuda_init(i); + break; + } + } + } - constexpr size_t num_buffers = 4; - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB - for (size_t idx = 0; idx < num_buffers; ++idx) { - host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); - events.emplace_back(ggml_backend_event_new(backend)); + // If the cuda backend is active create pinned memory buffers and events for synchronisation. + if (cuda_backend) { + for (size_t idx = 0; idx < num_buffers; ++idx) { + host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); + events.emplace_back(ggml_backend_event_new(cuda_backend)); + } + } } #endif @@ -3903,32 +3933,37 @@ struct llama_model_loader { } } else { #if defined(GGML_USE_CUDA) - file->seek(weight->offs, SEEK_SET); + // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
+ if (cuda_backend) { + file->seek(weight->offs, SEEK_SET); - size_t bytes_read = 0; + size_t bytes_read = 0; - while (bytes_read < n_size) - { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < n_size) + { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx]); + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx]); - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= num_buffers; - } -#else - read_buf.resize(n_size); - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); - if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { - throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= num_buffers; + } } + else #endif + { + read_buf.resize(n_size); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } + } } } @@ -3936,12 +3971,14 @@ struct llama_model_loader { } #if defined(GGML_USE_CUDA) - for (size_t idx = 0; idx < num_buffers;++idx) { - ggml_backend_event_synchronize(events[idx]); - ggml_backend_event_free(events[idx]); - ggml_backend_buffer_free(host_buffers[idx]); - - //ggml_backend_free(backend); + // free temporary resources used for async cuda uploads + if (cuda_backend) { + for (size_t idx = 0; idx < num_buffers;++idx) { + ggml_backend_event_synchronize(events[idx]); + ggml_backend_event_free(events[idx]); + ggml_backend_buffer_free(host_buffers[idx]); + } + ggml_backend_free(cuda_backend); } #endif From c39d5ecd2bd88e5b2e660eb56994b38c1edc666c Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Thu, 13 Jun 2024 15:55:23 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: slaren --- llama.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index a6eb79c99b682..df4944595d711 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3851,7 +3851,7 @@ struct llama_model_loader { size_t buffer_idx = 0; // buffer to use for async loads ggml_backend_t cuda_backend = nullptr; - if (!use_mmap) { + if (!use_mmap && !check_tensors) { // When not using mmaped io use async uploads from pinned memory to GPU memory. // First determine if the CUDA backend is active, and if so, determine the device ID. ggml_backend_buffer_t buf = bufs_mmap.count(0) ? 
bufs_mmap.at(0) : nullptr; @@ -3939,8 +3939,7 @@ struct llama_model_loader { size_t bytes_read = 0; - while (bytes_read < n_size) - { + while (bytes_read < n_size) { size_t read_iteration = std::min(buffer_size, n_size - bytes_read); ggml_backend_event_synchronize(events[buffer_idx]); @@ -3982,7 +3981,6 @@ struct llama_model_loader { } #endif - // check validation results bool validation_failed = false; for (auto & future : validation_result) { From d3131ce56529ad9feb8e60ddb9c561a8dee8a9a1 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 18:06:41 +0200 Subject: [PATCH 5/6] Fix editorconfig and unused variable --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index df4944595d711..b6f2a264c0857 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3840,8 +3840,8 @@ struct llama_model_loader { std::vector>> validation_result; #if defined(GGML_USE_CUDA) - // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. - // NVMe raid configurations might require more / larger buffers. + // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. + // NVMe raid configurations might require more / larger buffers. constexpr size_t num_buffers = 4; constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB @@ -3858,8 +3858,8 @@ struct llama_model_loader { if (buf) { ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf); for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - auto cuda_buffer_type = ggml_backend_cuda_buffer_type(i); - if (buffer_type == ggml_backend_cuda_buffer_type(i)) { + auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i); + if (buffer_type == cuda_buffer_type) { cuda_backend = ggml_backend_cuda_init(i); break; } From f4d33f87f88c18e7755e6682df9dbdbae52e36d5 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 18:52:41 +0200 Subject: [PATCH 6/6] Fix issues with Windows build --- llama.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index e34a4c4814a62..c87d3261db3ad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1320,7 +1320,7 @@ struct llama_file { li.QuadPart = 0; BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); if (!ret) { - throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } return li.QuadPart; @@ -1329,15 +1329,15 @@ struct llama_file { void seek(size_t offset, int whence) const { // no need to convert SEEK_* to FILE_*. The enums are the same. // Still, keep static asserts to avoid failures in the future. 
- static_assert(SEEK_SET == FILE_BEGIN); - static_assert(SEEK_CUR == FILE_CURRENT); - static_assert(SEEK_END == FILE_END); + static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN"); + static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT"); + static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END"); LARGE_INTEGER li; li.QuadPart = offset; BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); if (!ret) { - throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } } @@ -1353,7 +1353,7 @@ struct llama_file { DWORD chunk_read = 0; BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, NULL); if (!result) { - throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()))); + throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } if (chunk_read < chunk_size || chunk_read == 0) { throw std::runtime_error("unexpectedly reached end of file");
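
Taken together, the series replaces the synchronous read_buf path with a small upload pipeline: each tensor is read from disk in 1 MB chunks into one of four pinned staging buffers, handed to the CUDA backend with ggml_backend_tensor_set_async, and guarded by a per-buffer event so that a buffer is only refilled once its previous copy has completed. For reference, here is a minimal standalone sketch of that same pattern written against the raw CUDA runtime API rather than the ggml backend wrappers used in the diffs above; the function name, the plain FILE * source, and the omitted error checking are illustrative only and not taken from the patch.

// Sketch of the 4 x 1 MiB staging-buffer upload loop (buffer count and size
// mirror the defaults chosen in the patch; error handling omitted for brevity).
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <vector>

static void upload_tensor_async(std::FILE * f, size_t file_offset, void * device_dst, size_t n_size) {
    constexpr size_t num_buffers = 4;
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1 MiB staging chunks

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    std::vector<void *>      host_ptrs(num_buffers);
    std::vector<cudaEvent_t> events(num_buffers);
    for (size_t i = 0; i < num_buffers; ++i) {
        cudaMallocHost(&host_ptrs[i], buffer_size); // pinned memory enables truly async H2D copies
        cudaEventCreate(&events[i]);
        cudaEventRecord(events[i], stream);         // nothing queued yet, so each buffer starts out "free"
    }

    std::fseek(f, (long) file_offset, SEEK_SET);

    size_t bytes_done = 0;
    size_t buffer_idx = 0;
    while (bytes_done < n_size) {
        const size_t chunk = std::min(buffer_size, n_size - bytes_done);

        cudaEventSynchronize(events[buffer_idx]);   // wait until this staging buffer may be reused
        std::fread(host_ptrs[buffer_idx], 1, chunk, f);
        cudaMemcpyAsync((char *) device_dst + bytes_done, host_ptrs[buffer_idx],
                        chunk, cudaMemcpyHostToDevice, stream);
        cudaEventRecord(events[buffer_idx], stream); // completes once the copy has consumed the buffer

        bytes_done += chunk;
        buffer_idx  = (buffer_idx + 1) % num_buffers;
    }

    // drain outstanding copies before the staging buffers are released
    for (size_t i = 0; i < num_buffers; ++i) {
        cudaEventSynchronize(events[i]);
        cudaEventDestroy(events[i]);
        cudaFreeHost(host_ptrs[i]);
    }
    cudaStreamDestroy(stream);
}

Pinned (page-locked) host memory is what allows cudaMemcpyAsync to overlap with the next fread; with four buffers in flight, the disk read, the PCIe transfer, and the event bookkeeping stay pipelined, which is where the reported >3x load-time improvement on fast NVMe drives comes from.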