fix some typos (#6209)

Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
This commit is contained in:
applesaucethebun
2025-05-12 13:42:38 -04:00
committed by GitHub
parent 3ee40ff919
commit d738ab52f8
95 changed files with 276 additions and 276 deletions

View File

@@ -172,7 +172,7 @@ class CustomAllreduce:
if not custom_ar:
# disable because of missing custom allreduce library
# e.g. in a non-cuda environment
# e.g. in a non-CUDA environment
return
self.group = group
@@ -389,11 +389,11 @@ class CustomAllreduce:
if _is_hip:
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
logger.info("Registering %d cuda graph addresses", len(offset))
logger.info("Registering %d CUDA graph addresses", len(offset))
ops.register_graph_buffers(self._ptr, handles, offsets)
else:
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
logger.info("Registering %d cuda graph addresses", len(offset))
logger.info("Registering %d CUDA graph addresses", len(offset))
# We cannot directly use `dist.all_gather_object` here
# because it is incompatible with `gloo` backend under inference mode.
# see https://github.com/pytorch/pytorch/issues/126032 for details.
@@ -435,7 +435,7 @@ class CustomAllreduce:
return False
# all reduce, assuming inp tensor is IPC registered with register_buffer,
# or, in the context of cuda graphs, register_graph_buffers
# or, in the context of CUDA graphs, register_graph_buffers
def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
if out is None:
out = torch.empty_like(inp)
@@ -473,7 +473,7 @@ class CustomAllreduce:
return out
def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
"""The main allreduce API that provides support for cuda graph."""
"""The main allreduce API that provides support for CUDA graph."""
# When custom allreduce is disabled, this will be None.
if self.disabled or not self.should_custom_ar(input):
return None
@@ -489,7 +489,7 @@ class CustomAllreduce:
return torch.empty_like(input)
else:
if _is_hip:
# note: outside of cuda graph context,
# note: outside of CUDA graph context,
# custom allreduce incurs a cost of cudaMemcpy, which should
# be small (<=1% of overall latency) compared to the performance
# gains of using custom kernels

View File

@@ -121,14 +121,14 @@ def can_actually_p2p(
Therefore, we have to perform a real P2P access to check if it is actually
possible.
Note on p2p and cuda IPC:
Note on p2p and CUDA IPC:
Usually, one process uses one GPU:
GPU src --> cuda context src --> tensor src --> process src
GPU src --> CUDA context src --> tensor src --> process src
We need to combine p2p and cuda IPC, so that:
GPU src --> cuda context src --> tensor src --> process src
We need to combine p2p and CUDA IPC, so that:
GPU src --> CUDA context src --> tensor src --> process src
|shared|
GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
GPU tgt --> CUDA context tgt --> tensor tgt --> process tgt
That is to say, process src creates a tensor in GPU src, passes IPC handle to
process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
tensor in process tgt will be reflected in the tensor in process src, because
@@ -201,9 +201,9 @@ def can_actually_p2p(
# then all the processes can read the cache file to check the p2p access status.
# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
# e.g. used by different vllm engines. The device id in the cache file is a
# e.g. used by different vLLM engines. The device id in the cache file is a
# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
# of visible devices in the vllm engine.
# of visible devices in the vLLM engine.
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None

View File

@@ -104,7 +104,7 @@ class PyNcclCommunicator:
self.device = device
# nccl communicator and stream will use this device
# `torch.cuda.device` is a context manager that changes the
# current cuda device to the specified one
# current CUDA device to the specified one
with torch.cuda.device(device):
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
self.world_size, self.unique_id, self.rank

View File

@@ -6,7 +6,7 @@
# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
# often gets stuck when initializing the NCCL communicator.
# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
# contains many other potential cuda APIs, that are not allowed during
# contains many other potential CUDA APIs that are not allowed during
# capturing the CUDA graph. For further details, please check
# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
#

View File

@@ -170,7 +170,7 @@ class GroupCoordinator:
GroupCoordinator takes charge of all the communication operations among
the processes in the group. It can route the communication to
a specific implementation (e.g. switch allreduce implementation
based on the tensor size and cuda graph mode).
based on the tensor size and CUDA graph mode).
"""
# available attributes: