Revert "fix some typos" (#6244)
This commit is contained in:
@@ -172,7 +172,7 @@ class CustomAllreduce:
|
||||
|
||||
if not custom_ar:
|
||||
# disable because of missing custom allreduce library
|
||||
# e.g. in a non-CUDA environment
|
||||
# e.g. in a non-cuda environment
|
||||
return
|
||||
|
||||
self.group = group
|
||||
@@ -389,11 +389,11 @@ class CustomAllreduce:
|
||||
if _is_hip:
|
||||
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
|
||||
handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
|
||||
logger.info("Registering %d CUDA graph addresses", len(offset))
|
||||
logger.info("Registering %d cuda graph addresses", len(offset))
|
||||
ops.register_graph_buffers(self._ptr, handles, offsets)
|
||||
else:
|
||||
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
|
||||
logger.info("Registering %d CUDA graph addresses", len(offset))
|
||||
logger.info("Registering %d cuda graph addresses", len(offset))
|
||||
# We cannot directly use `dist.all_gather_object` here
|
||||
# because it is incompatible with `gloo` backend under inference mode.
|
||||
# see https://github.com/pytorch/pytorch/issues/126032 for details.
|
||||
@@ -435,7 +435,7 @@ class CustomAllreduce:
|
||||
return False
|
||||
|
||||
# all reduce, assuming inp tensor is IPC registered with register_buffer,
|
||||
# or, in the context of CUDA graphs, register_graph_buffers
|
||||
# or, in the context of cuda graphs, register_graph_buffers
|
||||
def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
|
||||
if out is None:
|
||||
out = torch.empty_like(inp)
|
||||
@@ -473,7 +473,7 @@ class CustomAllreduce:
|
||||
return out
|
||||
|
||||
def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
"""The main allreduce API that provides support for CUDA graph."""
|
||||
"""The main allreduce API that provides support for cuda graph."""
|
||||
# When custom allreduce is disabled, this will be None.
|
||||
if self.disabled or not self.should_custom_ar(input):
|
||||
return None
|
||||
@@ -489,7 +489,7 @@ class CustomAllreduce:
|
||||
return torch.empty_like(input)
|
||||
else:
|
||||
if _is_hip:
|
||||
# note: outside of CUDA graph context,
|
||||
# note: outside of cuda graph context,
|
||||
# custom allreduce incurs a cost of cudaMemcpy, which should
|
||||
# be small(<=1% of overall latency) compared to the performance
|
||||
# gains of using custom kernels
|
||||
|
||||
@@ -121,14 +121,14 @@ def can_actually_p2p(
|
||||
Therefore, we have to perform a real P2P access to check if it is actually
|
||||
possible.
|
||||
|
||||
Note on p2p and CUDA IPC:
|
||||
Note on p2p and cuda IPC:
|
||||
Usually, one process uses one GPU:
|
||||
GPU src --> CUDA context src --> tensor src --> process src
|
||||
GPU src --> cuda context src --> tensor src --> process src
|
||||
|
||||
We need to combine p2p and CUDA IPC, so that:
|
||||
GPU src --> CUDA context src --> tensor src --> process src
|
||||
We need to combine p2p and cuda IPC, so that:
|
||||
GPU src --> cuda context src --> tensor src --> process src
|
||||
|shared|
|
||||
GPU tgt --> CUDA context tgt --> tensor tgt --> process tgt
|
||||
GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
|
||||
That is to say, process src creates a tensor in GPU src, passes IPC handle to
|
||||
process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
|
||||
tensor in process tgt will be reflected in the tensor in process src, because
|
||||
@@ -201,9 +201,9 @@ def can_actually_p2p(
|
||||
# then all the processes can read the cache file to check the p2p access status.
|
||||
# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
|
||||
# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
|
||||
# e.g. used by different vLLM engines. The device id in the cache file is a
|
||||
# e.g. used by different vllm engines. The device id in the cache file is a
|
||||
# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
|
||||
# of visible devices in the vLLM engine.
|
||||
# of visible devices in the vllm engine.
|
||||
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
|
||||
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ class PyNcclCommunicator:
|
||||
self.device = device
|
||||
# nccl communicator and stream will use this device
|
||||
# `torch.cuda.device` is a context manager that changes the
|
||||
# current CUDA device to the specified one
|
||||
# current cuda device to the specified one
|
||||
with torch.cuda.device(device):
|
||||
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
|
||||
self.world_size, self.unique_id, self.rank
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
|
||||
# often gets stuck when initializing the NCCL communicator.
|
||||
# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
|
||||
# contains many other potential CUDA APIs, that are not allowed during
|
||||
# contains many other potential cuda APIs, that are not allowed during
|
||||
# capturing the CUDA graph. For further details, please check
|
||||
# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
|
||||
#
|
||||
|
||||
@@ -170,7 +170,7 @@ class GroupCoordinator:
|
||||
GroupCoordinator takes charge of all the communication operations among
|
||||
the processes in the group. It can route the communication to
|
||||
a specific implementation (e.g. switch allreduce implementation
|
||||
based on the tensor size and CUDA graph mode).
|
||||
based on the tensor size and cuda graph mode).
|
||||
"""
|
||||
|
||||
# available attributes:
|
||||
|
||||
Reference in New Issue
Block a user