fix some typos (#6209)

Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-05-12 13:42:38 -04:00
parent 3ee40ff919
commit d738ab52f8
95 changed files with 276 additions and 276 deletions
--- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
+++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
@@ -172,7 +172,7 @@ class CustomAllreduce:

        if not custom_ar:
            # disable because of missing custom allreduce library
-            # e.g. in a non-cuda environment
+            # e.g. in a non-CUDA environment
            return

        self.group = group
@@ -389,11 +389,11 @@ class CustomAllreduce:
        if _is_hip:
            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
            handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
-            logger.info("Registering %d cuda graph addresses", len(offset))
+            logger.info("Registering %d CUDA graph addresses", len(offset))
            ops.register_graph_buffers(self._ptr, handles, offsets)
        else:
            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
-            logger.info("Registering %d cuda graph addresses", len(offset))
+            logger.info("Registering %d CUDA graph addresses", len(offset))
            # We cannot directly use `dist.all_gather_object` here
            # because it is incompatible with `gloo` backend under inference mode.
            # see https://github.com/pytorch/pytorch/issues/126032 for details.
@@ -435,7 +435,7 @@ class CustomAllreduce:
        return False

    # all reduce, assuming inp tensor is IPC registered with register_buffer,
-    # or, in the context of cuda graphs, register_graph_buffers
+    # or, in the context of CUDA graphs, register_graph_buffers
    def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
        if out is None:
            out = torch.empty_like(inp)
@@ -473,7 +473,7 @@ class CustomAllreduce:
        return out

    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
-        """The main allreduce API that provides support for cuda graph."""
+        """The main allreduce API that provides support for CUDA graph."""
        # When custom allreduce is disabled, this will be None.
        if self.disabled or not self.should_custom_ar(input):
            return None
@@ -489,7 +489,7 @@ class CustomAllreduce:
                return torch.empty_like(input)
        else:
            if _is_hip:
-                # note: outside of cuda graph context,
+                # note: outside of CUDA graph context,
                # custom allreduce incurs a cost of cudaMemcpy, which should
                # be small(<=1% of overall latency) compared to the performance
                # gains of using custom kernels
--- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
@@ -121,14 +121,14 @@ def can_actually_p2p(
    Therefore, we have to perform a real P2P access to check if it is actually
    possible.

-    Note on p2p and cuda IPC:
+    Note on p2p and CUDA IPC:
    Usually, one process uses one GPU:
-    GPU src --> cuda context src --> tensor src --> process src
+    GPU src --> CUDA context src --> tensor src --> process src

-    We need to combine p2p and cuda IPC, so that:
-    GPU src --> cuda context src --> tensor src --> process src
+    We need to combine p2p and CUDA IPC, so that:
+    GPU src --> CUDA context src --> tensor src --> process src
                                      |shared|
-    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    GPU tgt --> CUDA context tgt --> tensor tgt --> process tgt
    That is to say, process src creates a tensor in GPU src, passes IPC handle to
    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
    tensor in process tgt will be reflected in the tensor in process src, because
@@ -201,9 +201,9 @@ def can_actually_p2p(
 # then all the processes can read the cache file to check the p2p access status.
 # Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
 #  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
-#  e.g. used by different vllm engines. The device id in the cache file is a
+#  e.g. used by different vLLM engines. The device id in the cache file is a
 #  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
-#  of visible devices in the vllm engine.
+#  of visible devices in the vLLM engine.
 _gpu_p2p_access_cache: Optional[Dict[str, bool]] = None


--- a/python/sglang/srt/distributed/device_communicators/pynccl.py
+++ b/python/sglang/srt/distributed/device_communicators/pynccl.py
@@ -104,7 +104,7 @@ class PyNcclCommunicator:
        self.device = device
        # nccl communicator and stream will use this device
        # `torch.cuda.device` is a context manager that changes the
-        # current cuda device to the specified one
+        # current CUDA device to the specified one
        with torch.cuda.device(device):
            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
                self.world_size, self.unique_id, self.rank
--- a/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py
+++ b/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py
@@ -6,7 +6,7 @@
 # 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
 #  often gets stuck when initializing the NCCL communicator.
 # 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
-#  contains many other potential cuda APIs, that are not allowed during
+#  contains many other potential CUDA APIs that are not allowed during
 #  capturing the CUDA graph. For further details, please check
 # https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
 #
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -170,7 +170,7 @@ class GroupCoordinator:
    GroupCoordinator takes charge of all the communication operations among
        the processes in the group. It can route the communication to
        a specific implementation (e.g. switch allreduce implementation
-        based on the tensor size and cuda graph mode).
+        based on the tensor size and CUDA graph mode).
    """

    # available attributes: