From 56eaa9ef937bf4aa8d1eaaf0b0d3035f4860082f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 9 Feb 2025 16:47:32 +0800
Subject: [PATCH 1/6] add error handling

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 csrc/cumem_allocator.cpp | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index e8555d853b7..c74472fb3f6 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -12,15 +12,20 @@ extern "C" {
 #include <cuda_runtime_api.h>
 #include <cuda.h>
 
-#define CUDA_CHECK(condition)                                                  \
-  do {                                                                         \
-    CUresult error = condition;                                                \
-    if (error != 0) {                                                          \
-      char* error_string;                                                      \
-      cuGetErrorString(error, (const char**)&error_string);                    \
-      std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \
-                << __LINE__ << std::endl;                                      \
-    }                                                                          \
+char error_msg[10240];    // 10KB buffer to store error messages
+CUresult error_code = 0;  // store error code
+
+#define CUDA_CHECK(condition)                                           \
+  do {                                                                  \
+    if (CUresult error = (condition)) {                                 \
+      error_code = error; /* polled by the Python wrappers */           \
+      const char* error_string = nullptr;                               \
+      cuGetErrorString(error, &error_string); /* NULL if unknown */     \
+      if (!error_string) error_string = "unknown error";                \
+      snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
+               error_string, __FILE__, __LINE__);                       \
+      std::cerr << error_msg << std::endl;                              \
+    }                                                                   \
   } while (0)
 
 // Global references to Python callables
@@ -258,6 +263,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
 
   unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
 
+  if (error_code != 0) {
+    error_code = 0;
+    PyErr_SetString(PyExc_RuntimeError, error_msg);
+    return nullptr;
+  }
+
   Py_RETURN_NONE;
 }
 
@@ -282,6 +293,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
 
   create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
 
+  if (error_code != 0) {
+    error_code = 0;
+    PyErr_SetString(PyExc_RuntimeError, error_msg);
+    return nullptr;
+  }
+
   Py_RETURN_NONE;
 }
 

From b3f2341bf32e63e52d5d741713b6c2cff328ca58 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 9 Feb 2025 16:57:20 +0800
Subject: [PATCH 2/6] add tests

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/basic_correctness/test_cumem.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index da9239b0940..4e9f1bf1cf8 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import pytest
 import torch
 
 from vllm import LLM, SamplingParams
@@ -9,6 +10,32 @@
 from ..utils import fork_new_process_for_each_test
 
 
+@fork_new_process_for_each_test
+def test_python_error():
+    """
+    Check that a low-level error raised on the C++ side
+    surfaces in Python as a RuntimeError.
+    """
+    allocator = CuMemAllocator.get_instance()
+    total_bytes = torch.cuda.mem_get_info()[1]
+    alloc_bytes = int(total_bytes * 0.7)
+    tensors = []
+    with allocator.use_memory_pool():
+        # allocate 70% of the total memory inside the allocator's pool
+        x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+        tensors.append(x)
+    # put the allocator to sleep so the pool's memory is released
+    allocator.sleep()
+
+    # take another 70% outside the pool, so both can no longer fit at once
+    y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+    tensors.append(y)
+    with pytest.raises(RuntimeError):
+        # when the allocator is woken up, it should raise an error
+        # because there is not enough free memory to restore the pool
+        allocator.wake_up()
+
+
 @fork_new_process_for_each_test
 def test_basic_cumem():
     # some tensors from default memory pool

From de3b6ba269a6cd815f0dcb7b3aa7188783e2caf7 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 9 Feb 2025 17:06:19 +0800
Subject: [PATCH 3/6] fix assignment

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 csrc/cumem_allocator.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index c74472fb3f6..a9aaeb1ef82 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -12,8 +12,9 @@ extern "C" {
 #include <cuda_runtime_api.h>
 #include <cuda.h>
 
-char error_msg[10240];    // 10KB buffer to store error messages
-CUresult error_code = 0;  // store error code
+char error_msg[10240];  // 10KB buffer to store error messages
+CUresult no_error = CUresult(0);
+CUresult error_code = no_error;  // store error code
 
 #define CUDA_CHECK(condition)                                           \
   do {                                                                  \
@@ -264,7 +265,7 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
   unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
 
   if (error_code != 0) {
-    error_code = 0;
+    error_code = no_error;
     PyErr_SetString(PyExc_RuntimeError, error_msg);
     return nullptr;
   }
@@ -294,7 +295,7 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
   create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
 
   if (error_code != 0) {
-    error_code = 0;
+    error_code = no_error;
     PyErr_SetString(PyExc_RuntimeError, error_msg);
     return nullptr;
   }

From 585c98021bb0e9fdf45893f68676d4ea9e8ada97 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 9 Feb 2025 17:25:08 +0800
Subject: [PATCH 4/6] early stop

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 csrc/cumem_allocator.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index a9aaeb1ef82..3ac9fceb28a 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -26,6 +26,7 @@ CUresult error_code = no_error;  // store error code
       snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
                error_string, __FILE__, __LINE__);                       \
       std::cerr << error_msg << std::endl;                              \
+      return;                                                           \
     }                                                                   \
   } while (0)
 

From 23fb99c016279178b60d66f8692b8f05543c87bf Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 9 Feb 2025 17:35:14 +0800
Subject: [PATCH 5/6] fix return type

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 csrc/cumem_allocator.cpp | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index 3ac9fceb28a..f739ed92138 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -26,7 +26,6 @@ CUresult error_code = no_error;  // store error code
       snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
                error_string, __FILE__, __LINE__);                       \
       std::cerr << error_msg << std::endl;                              \
-      return;                                                           \
     }                                                                   \
   } while (0)
 
@@ -61,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
 
   // Allocate memory using cuMemCreate
   CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+  if (error_code != 0) {
+    return;
+  }
   CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));
-
+  if (error_code != 0) {
+    return;
+  }
   CUmemAccessDesc accessDesc = {};
   accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
   accessDesc.location.id = device;
   accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 
   CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
+  if (error_code != 0) {
+    return;
+  }
   // std::cout << "create_and_map: device=" << device << ", size=" << size << ",
   // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
 }
@@ -80,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
   // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
   ensure_context(device);
   CUDA_CHECK(cuMemUnmap(d_mem, size));
+  if (error_code != 0) {
+    return;
+  }
   CUDA_CHECK(cuMemRelease(*p_memHandle));
+  if (error_code != 0) {
+    return;
+  }
 }
 
 PyObject* create_tuple_from_c_integers(unsigned long long a,
@@ -128,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
   size_t granularity;
   CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
                                            CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
+  if (error_code != 0) {
+    return nullptr;
+  }
   size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
 
   CUdeviceptr d_mem;
   CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
-
+  if (error_code != 0) {
+    return nullptr;
+  }
   // allocate the CUmemGenericAllocationHandle
   CUmemGenericAllocationHandle* p_memHandle =
       (CUmemGenericAllocationHandle*)malloc(
@@ -215,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
 
   // free address and the handle
   CUDA_CHECK(cuMemAddressFree(d_mem, size));
+  if (error_code != 0) {
+    return nullptr;
+  }
   free(p_memHandle);
 }
 

From 3038c19631add1a976c87bb7f6e3d39176254aa9 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 9 Feb 2025 17:40:04 +0800
Subject: [PATCH 6/6] fix return type

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 csrc/cumem_allocator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index f739ed92138..fab6ca36d42 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -233,7 +233,7 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
   // free address and the handle
   CUDA_CHECK(cuMemAddressFree(d_mem, size));
   if (error_code != 0) {
-    return nullptr;
+    return;
   }
   free(p_memHandle);
 }