fallback to CPU buffer if host buffer alloc fails #4610

Merged (1 commit), Dec 23, 2023
11 changes: 6 additions & 5 deletions ggml-cuda.cu
@@ -6729,8 +6729,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
-        // The allocation error can be bypassed. A null ptr will assigned out of this function.
-        // This can fixed the OOM error in WSL.
+        // clear the error
         cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
@@ -9674,12 +9673,14 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 // host buffer type

 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    CUDA_CHECK(cudaFreeHost(buffer->context));
+    ggml_cuda_host_free(buffer->context);
 }

 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost(&ptr, size));
+    void * ptr = ggml_cuda_host_malloc(size);
+    if (ptr == nullptr) {
+        return nullptr;
+    }

     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
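For context, the pinned-allocation fallback that ggml_cuda_host_malloc performs can be sketched in isolation roughly as follows. This is a minimal sketch only; try_pinned_alloc is a hypothetical helper, not part of ggml.

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Hypothetical helper mirroring the pattern in the hunk above: try to get
    // pinned (page-locked) host memory; on failure, clear CUDA's sticky error
    // state and fall back to ordinary malloc'd (unpinned) memory.
    static void * try_pinned_alloc(size_t size, int * is_pinned) {
        void * ptr = NULL;
        cudaError_t err = cudaMallocHost(&ptr, size);
        if (err == cudaSuccess) {
            *is_pinned = 1;
            return ptr;
        }
        // clear the error so later CUDA calls do not observe a stale failure
        cudaGetLastError();
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
            size/1024.0/1024.0, cudaGetErrorString(err));
        *is_pinned = 0;
        return malloc(size); // unpinned fallback
    }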
16 changes: 11 additions & 5 deletions llama.cpp
@@ -1177,21 +1177,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 }

 static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        return ggml_backend_metal_buffer_type();
+        buft = ggml_backend_metal_buffer_type();
     }
 #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     if (n_gpu_layers > 0) {
-        return ggml_backend_cuda_buffer_type(0);
+        buft = ggml_backend_cuda_buffer_type(0);
     }
 #elif defined(GGML_USE_CUBLAS)
-    return ggml_backend_cuda_host_buffer_type();
+    buft = ggml_backend_cuda_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
-    return ggml_backend_cpu_hbm_buffer_type();
+    buft = ggml_backend_cpu_hbm_buffer_type();
 #endif

-    return ggml_backend_cpu_buffer_type();
+    if (buft == nullptr) {
+        buft = ggml_backend_cpu_buffer_type();
+    }
+
+    return buft;

     GGML_UNUSED(n_gpu_layers);
 }
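The net effect on callers is that a failed pinned-host allocation now surfaces as a nullptr buffer instead of the hard exit that CUDA_CHECK would trigger, so the caller can retry with the plain CPU buffer type. A minimal sketch of that caller-side pattern, assuming the ggml-backend API of this period; pick_buffer is a hypothetical helper, not part of llama.cpp.

    #include "ggml-backend.h"
    #include <stddef.h>
    #include <stdio.h>

    // Hypothetical helper: try the preferred (e.g. pinned host) buffer type
    // first, and fall back to the plain CPU buffer type if it returns NULL.
    static ggml_backend_buffer_t pick_buffer(ggml_backend_buffer_type_t preferred, size_t size) {
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(preferred, size);
        if (buf == NULL) {
            fprintf(stderr, "warning: preferred buffer type failed, falling back to CPU buffer\n");
            buf = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
        }
        return buf;
    }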