@@ -52,13 +52,15 @@ def __init__(self,
                  group: ProcessGroup,
                  device: Union[int, str, torch.device],
                  max_size=512 * 1024 * 1024,
-                 min_size=32 * 1024) -> None:
+                 min_size=128 * 1024) -> None:
         """
         Args:
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the QuickAllreduce to. If None,
                 it will be bind to f"cuda:{local_rank}".
+            max_size: the maximum supported input size.
+            min_size: below this size, custom_allreduce performs better.
         It is the caller's responsibility to make sure each communicator
         is bind to a unique device, and all communicators in this group
         are in the same node.
@@ -168,40 +170,20 @@ def should_quick_ar(self, inp: torch.Tensor):
             return inp_size < self.max_size  # and inp_size > self.min_size
         return False
 
-    def all_reduce(self,
-                   inp: torch.Tensor,
-                   *,
-                   out: torch.Tensor = None,
-                   registered: bool = False):
-        """Performs an out-of-place all reduce.
-
-        If registered is True, this assumes inp's pointer is already
-        IPC-registered. Otherwise, inp is first copied into a pre-registered
-        buffer.
-        """
+    def all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None):
+        """Performs an out-of-place all reduce."""
         if out is None:
             out = torch.empty_like(inp)
-        if registered:
-            ops.all_reduce(self._ptr, inp, out, 0, 0)
-        else:
-            # print("qr")
-            ops.qr_all_reduce(self._ptr, envs.VLLM_QUICK_ALLREDUCE, inp, out)
+        ops.qr_all_reduce(self._ptr, envs.VLLM_QUICK_ALLREDUCE, inp, out)
         return out
 
     def quick_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
         """The main allreduce API that provides support for cuda graph."""
         # When quick allreduce is disabled, this will be None.
         if self.disabled or not self.should_quick_ar(input):
             return None
-        if self._IS_CAPTURING:
-            if torch.cuda.is_current_stream_capturing():
-                return self.all_reduce(input, registered=True)
-            else:
-                # If warm up, mimic the allocation pattern since quick
-                # allreduce is out-of-place.
-                return torch.empty_like(input)
-        else:
-            return self.all_reduce(input, registered=False)
+
+        return self.all_reduce(input)
 
     def close(self):
         '''del self._ptr and del buffer'''
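
Since quick_all_reduce returns None whenever it is disabled or the size gate fails, callers need a fallback path. A minimal usage sketch, assuming qr_comm is an initialized QuickAllReduce instance and using a plain torch.distributed fallback (the wrapper name is hypothetical):

    import torch
    import torch.distributed as dist

    def reduce_with_fallback(qr_comm, tensor: torch.Tensor) -> torch.Tensor:
        out = qr_comm.quick_all_reduce(tensor)
        if out is not None:
            return out  # out-of-place result from quick allreduce
        # Fallback: dist.all_reduce reduces in place on the default group.
        dist.all_reduce(tensor)
        return tensor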