Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -0,0 +1,490 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import torch
+import torch.distributed as dist
+
+import vllm.envs as envs
+from vllm.distributed import get_dp_group, get_ep_group
+from vllm.forward_context import get_forward_context
+from vllm.logger import init_logger
+from vllm.utils.flashinfer import has_flashinfer_all2all
+from vllm.utils.import_utils import has_deep_ep, has_pplx
+
+from .base_device_communicator import All2AllManagerBase, Cache
+
+if has_flashinfer_all2all():
+    from flashinfer.comm import Mapping  # type: ignore[import-not-found]
+    from flashinfer.comm.mnnvl import MnnvlConfig  # type: ignore[import-not-found]
+    from flashinfer.comm.trtllm_alltoall import (
+        MnnvlMoe,  # type: ignore[import-not-found]
+    )
+
+logger = init_logger(__name__)
+
+
+class NaiveAll2AllManager(All2AllManagerBase):
+    """
+    A naive implementation of all2all communication.
+    It uses all-reduce under the hood, which is not
+    efficient at all. The main purpose is for testing and
+    debugging.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def naive_multicast(
+        self,
+        x: torch.Tensor,
+        cu_tokens_across_sp_cpu: torch.Tensor,
+        is_sequence_parallel: bool,
+    ) -> torch.Tensor:
+        assert len(x.shape) == 2
+        buffer = torch.empty(
+            (cu_tokens_across_sp_cpu[-1], x.size(1)), device=x.device, dtype=x.dtype
+        )
+
+        rank = self.rank if is_sequence_parallel else self.dp_rank
+        world_size = self.world_size if is_sequence_parallel else self.dp_world_size
+
+        start = 0 if rank == 0 else cu_tokens_across_sp_cpu[rank - 1]
+        end = cu_tokens_across_sp_cpu[rank]
+        buffer[start:end, :].copy_(x)
+        for idx in range(world_size):
+            start = 0 if idx == 0 else cu_tokens_across_sp_cpu[idx - 1]
+            end = cu_tokens_across_sp_cpu[idx]
+            get_ep_group().broadcast(buffer[start:end, :], idx)
+
+        return buffer
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        sp_size = self.tp_group.world_size if is_sequence_parallel else 1
+        dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
+        cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)
+
+        hidden_states = self.naive_multicast(
+            hidden_states, cu_tokens_across_sp_cpu, is_sequence_parallel
+        )
+        router_logits = self.naive_multicast(
+            router_logits, cu_tokens_across_sp_cpu, is_sequence_parallel
+        )
+        return hidden_states, router_logits
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        ep_rank = self.rank if is_sequence_parallel else self.dp_rank
+
+        dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
+        sp_size = self.tp_group.world_size if is_sequence_parallel else 1
+        cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)
+
+        start = 0 if ep_rank == 0 else cu_tokens_across_sp_cpu[ep_rank - 1]
+        end = cu_tokens_across_sp_cpu[ep_rank]
+
+        all_hidden_states = get_ep_group().all_reduce(hidden_states)
+        hidden_states = all_hidden_states[start:end, :]
+        return hidden_states
+
+    def destroy(self):
+        pass
+
+
+class AgRsAll2AllManager(All2AllManagerBase):
+    """
+    An implementation of all2all communication based on
+    all-gather (dispatch) and reduce-scatter (combine).
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Gather hidden_states and router_logits from all dp ranks.
+        """
+        dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
+        sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+        assert sizes is not None
+
+        dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
+        assert sizes[dist_group.rank_in_group] == hidden_states.shape[0]
+        hidden_states, router_logits = dist_group.all_gatherv(
+            [hidden_states, router_logits],
+            dim=0,
+            sizes=sizes,
+        )
+        return hidden_states, router_logits
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        """
+        Reduce-scatter hidden_states across all dp ranks.
+        """
+        dp_metadata = get_forward_context().dp_metadata
+        assert dp_metadata is not None
+        sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
+        assert sizes is not None
+
+        dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
+        hidden_states = dist_group.reduce_scatterv(hidden_states, dim=0, sizes=sizes)
+        return hidden_states
+
+    def destroy(self):
+        pass
+
+
+class PPLXAll2AllManager(All2AllManagerBase):
+    """
+    All2All communication based on PPLX kernels.
+    """
+
+    def __init__(self, cpu_group):
+        assert has_pplx(), (
+            "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
+            " to install pplx_kernels."
+        )
+        super().__init__(cpu_group)
+
+        if self.internode:
+            # inter-node communication needs nvshmem,
+            # intra-node communication uses p2p mapping directly
+            from pplx_kernels.nvshmem import (  # type: ignore[import-not-found]
+                nvshmem_alloc_empty_unique_id,
+                nvshmem_get_unique_id,
+                nvshmem_init,
+            )
+
+            logger.debug(
+                "Initialize NVSHMEM for pplx_kernels: rank=%d, world size=%d",
+                self.rank,
+                self.world_size,
+            )
+            uid = (
+                nvshmem_get_unique_id()
+                if self.rank == 0
+                else nvshmem_alloc_empty_unique_id()
+            )
+            dist.broadcast(
+                uid,
+                src=dist.get_process_group_ranks(self.cpu_group)[0],
+                group=self.cpu_group,
+            )
+            logger.debug("PPLX NVSHMEM UID = %s", uid)
+            nvshmem_init(uid, self.rank, self.world_size)
+
+        self.handle_cache = Cache()
+
+    def get_handle(self, kwargs):
+        import pplx_kernels as pplx  # type: ignore[import-not-found]
+
+        return self.handle_cache.get_or_create(
+            kwargs,
+            pplx.AllToAll.internode if self.internode else pplx.AllToAll.intranode,
+        )
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def destroy(self):
+        with self.handle_cache._lock:
+            for _, handle in self.handle_cache._cache.items():
+                handle.destroy()
+
+        if self.internode:
+            from pplx_kernels.nvshmem import (
+                nvshmem_finalize,  # type: ignore[import-not-found]
+            )
+
+            logger.debug("PPLX NVSHMEM finalize")
+            nvshmem_finalize()
+
+
+class DeepEPAll2AllManagerBase(All2AllManagerBase):
+    """
+    All2All communication based on DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self, cpu_group):
+        assert has_deep_ep(), (
+            "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
+            " to install DeepEP kernels."
+        )  # noqa
+        super().__init__(cpu_group)
+        self.handle_cache = Cache()
+
+        # This is the DeepEP default. Stick to it till we can establish
+        # reasonable defaults based on profiling.
+        self.num_sms = 20
+
+    def get_handle(self, kwargs):
+        raise NotImplementedError
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def destroy(self):
+        pass
+
+
+class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
+    """
+    All2All communication based on DeepEP High-Throughput kernels.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def _make_all2all_kwargs(self) -> dict[Any, Any]:
+        # Defaults for internode and intranode are taken from DeepEP tests.
+        num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
+        num_rdma_bytes = None
+        num_qps_per_rank = None
+
+        if self.internode and not envs.VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE:
+            num_rdma_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
+            num_qps_per_rank = self.num_sms // 2
+        else:
+            num_rdma_bytes = 0
+            num_qps_per_rank = 1
+
+        assert num_rdma_bytes is not None
+        assert num_qps_per_rank is not None
+        return dict(
+            group=self.cpu_group,
+            num_nvl_bytes=num_nvl_bytes,
+            num_rdma_bytes=num_rdma_bytes,
+            low_latency_mode=False,
+            num_qps_per_rank=num_qps_per_rank,
+        )
+
+    def get_handle(self, kwargs):
+        assert len(kwargs) == 0, (
+            "DeepEPHTAll2AllManager expects no arguments. All the required "
+            "args are computed in the Manager itself."
+        )
+
+        import deep_ep  # type: ignore[import-not-found]
+
+        buffer_kwargs = self._make_all2all_kwargs()
+        logger.debug("DeepEP all2all args %s", buffer_kwargs)
+        handle: deep_ep.Buffer = self.handle_cache.get_or_create(
+            buffer_kwargs, deep_ep.Buffer
+        )
+        return handle
+
+    def set_num_sms(self, num_sms: int):
+        import deep_ep  # type: ignore[import-not-found]
+
+        # Right now the buffers are sized for only what the kernels were
+        # created with. So we can only reduce the number of SMS used
+        # but not increase it.
+        if num_sms > self.num_sms:
+            num_sms = self.num_sms
+        deep_ep.Buffer.set_num_sms(num_sms)
+
+
+class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
+    """
+    All2All communication based on DeepEP Low-Latency kernels.
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def _make_all2all_kwargs(
+        self,
+        max_num_tokens_per_dp_rank: int,
+        token_hidden_size: int,
+        num_ep_ranks: int,
+        num_global_experts: int,
+        num_local_experts: int,
+    ) -> dict[Any, Any]:
+        """
+        max_num_tokens_per_dp_rank : the maximum number of tokens a DP rank
+          can dispatch all the ranks must hold the same value.
+        token_hidden_size: the hidden dimension of each token.
+        num_ep_ranks: the number of EP group ranks.
+        num_global_experts: Number of experts in the model.
+        num_local_experts: Number of experts in an EP rank.
+        """
+        import deep_ep  # type: ignore[import-not-found]
+
+        # Defaults for internode and intranode are taken from DeepEP tests.
+        num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
+        num_qps_per_rank = num_local_experts
+        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+            hidden=token_hidden_size,
+            num_ranks=num_ep_ranks,
+            num_experts=num_global_experts,
+        )
+
+        assert num_rdma_bytes is not None
+        return dict(
+            group=self.cpu_group,
+            num_nvl_bytes=num_nvl_bytes,
+            num_rdma_bytes=num_rdma_bytes,
+            low_latency_mode=True,
+            num_qps_per_rank=num_qps_per_rank,
+            allow_nvlink_for_low_latency_mode=True,
+            allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
+        )
+
+    def get_handle(self, kwargs):
+        """
+        The kwargs for DeepEPLLAll2AllManager is dictated by
+        _make_all2all_kwargs.
+        """
+        import deep_ep  # type: ignore[import-not-found]
+
+        buffer_kwargs = self._make_all2all_kwargs(**kwargs)
+        logger.debug("DeepEP all2all args %s", buffer_kwargs)
+        handle: deep_ep.Buffer = self.handle_cache.get_or_create(
+            buffer_kwargs, deep_ep.Buffer
+        )
+        return handle
+
+    # DeepEP LL uses RDMA so no SMs are used for communication
+    def max_sms_used(self) -> int | None:
+        return 0
+
+
+class FlashInferAllToAllManager(All2AllManagerBase):
+    """
+    All2All communication based on flashinfer kernels.
+    """
+
+    # This type lint could be removed after all of the work in
+    # https://github.com/vllm-project/vllm/issues/26533 done.
+    rank: int
+    world_size: int
+
+    def __init__(self, cpu_group):
+        assert has_flashinfer_all2all(), (
+            "flashinfer all2all module not found. Please install/check flashinfer"
+        )  # noqa
+        super().__init__(cpu_group)
+        logger.debug(
+            "Initialize for flashinfer All2All rank=%d, world size=%d",
+            self.rank,
+            self.world_size,
+        )
+        self.initialized = False
+        self.alltoall_info = None
+
+    def initialize(
+        self,
+        world_size: int,
+        rank: int,
+        gpus_per_node: int,
+    ):
+        """Initialize workspace"""
+        if self.initialized:
+            return
+
+        self.cleanup()
+        logger.debug("making map: rank=%d, world size=%d", rank, world_size)
+        self.mapping = Mapping(
+            world_size,
+            rank,
+            gpus_per_node,
+            tp_size=world_size,
+        )
+
+        from vllm.distributed.device_communicators.mnnvl_compat import (
+            CustomCommunicator,
+        )
+
+        dp_config = MnnvlConfig(
+            comm_backend=CustomCommunicator(get_dp_group().cpu_group),
+            fabric_page_size=1 << 29,  # 512MB
+            allocation_granularity=0,  # Auto-detect
+        )
+
+        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(self.mapping, dp_config)
+        self.prepare_workspace_tensor = MnnvlMoe.get_moe_prepare_workspace(
+            self.mapping, dp_config
+        )
+
+        self.world_size = world_size
+        self.rank = rank
+        self.gpus_per_node = gpus_per_node
+        self.initialized = True
+
+        logger.info(
+            "FlashInfer All2All initialized for rank %s, size %s", rank, world_size
+        )
+
+    def ensure_alltoall_workspace_initialized(self):
+        """Ensure workspace is initialized"""
+        if not has_flashinfer_all2all():
+            return False
+
+        if self.world_size <= 1:
+            return False
+
+        if not self.initialized:
+            self.initialize(
+                world_size=self.world_size,
+                rank=self.rank,
+                gpus_per_node=torch.cuda.device_count,
+            )
+        return self.initialized
+
+    def get_handle(self, kwargs):
+        return self
+
+    def cleanup(self):
+        """Clean up workspace"""
+        if (
+            self.initialized
+            and self.workspace_tensor is not None
+            and self.prepare_workspace_tensor is not None
+        ):
+            try:
+                del self.workspace_tensor
+                del self.prepare_workspace_tensor
+            except Exception as e:
+                logger.warning("Failed to cleanup FlashInfer workspace: %s", e)
+            finally:
+                self.workspace_tensor = None
+                self.prepare_workspace_tensor = None
+                self.mapping = None
+                self.initialized = False
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -0,0 +1,344 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import ctypes
+import json
+import os
+import pickle
+import subprocess
+import sys
+import tempfile
+from collections.abc import Sequence
+from itertools import product
+from typing import Any
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+import vllm.envs as envs
+from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import cuda_device_count_stateless
+
+logger = init_logger(__name__)
+
+MiB = 1024 * 1024
+# Max size for each world size in case symmetric memory is available
+# For different SM architectures
+CUSTOM_ALL_REDUCE_MAX_SIZES = {
+    "9.0": {
+        2: 64 * MiB,  # 64 MB
+        4: 32 * MiB,  # 32 MB
+        6: MiB // 2,  # 512 KB
+        8: MiB // 4,  # 256 KB
+    },
+    "10.0": {
+        2: 2 * MiB,  # 2 MB
+        4: 2 * MiB,  # 2 MB
+        6: 1 * MiB,  # 1 MB
+        8: 1 * MiB,  # 1 MB
+    },
+}
+
+SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
+    "9.0": {
+        2: 64 * MiB,  # 64 MB
+        4: 32 * MiB,  # 32 MB
+        6: 64 * MiB,  # 64 MB
+        8: 64 * MiB,  # 64 MB
+    },
+    "10.0": {
+        2: 8 * MiB,  # 8 MB
+        4: 32 * MiB,  # 32 MB
+        6: 128 * MiB,  # 128 MB
+        8: 128 * MiB,  # 128 MB
+    },
+}
+
+NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = {
+    "min_world_size": 4,
+    "thresholds": {
+        4: 2 * MiB,  # 2 MB
+        8: 1 * MiB,  # 1 MB
+    },
+    "always_use_above_world_size": 8,  # Always use symm mem for world_size > 8
+}
+
+
+def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor) -> bool:
+    from vllm.distributed.device_communicators.pynccl_allocator import (
+        is_symmetric_memory_enabled,
+    )
+
+    if vllm_is_batch_invariant():
+        return False
+
+    if not is_symmetric_memory_enabled():
+        return False
+    if world_size < NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["min_world_size"]:
+        return False
+    threshold = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["thresholds"].get(world_size)
+    if threshold is not None and input_tensor.nbytes >= threshold:
+        return True
+    return world_size > NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"]
+
+
+def producer(
+    batch_src: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: str | None = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for i in batch_src:
+        lib.cudaSetDevice(i)
+        pointer = lib.cudaMalloc(1024)
+        lib.cudaMemset(pointer, 1, 1024)
+        lib.cudaDeviceSynchronize()
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        producer_queue.put(handle)
+        open_success = consumer_queue.get()
+        if open_success:
+            # use two queues to simulate barrier
+            producer_queue.put(0)
+            consumer_queue.get()
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def consumer(
+    batch_tgt: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: str | None = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for j in batch_tgt:
+        lib.cudaSetDevice(j)
+        handle = producer_queue.get()
+        open_success = False
+        try:
+            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
+            open_success = True
+        except RuntimeError:
+            # cannot error out here, because the producer process
+            # is still waiting for the response.
+            pass
+        consumer_queue.put(open_success)
+        if open_success:
+            # modify the memory
+            lib.cudaMemset(pointer, 2, 1024)
+            lib.cudaDeviceSynchronize()
+            # use two queues to simulate barrier
+            producer_queue.get()
+            consumer_queue.put(0)
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def can_actually_p2p(
+    batch_src: Sequence[int],
+    batch_tgt: Sequence[int],
+) -> Sequence[bool]:
+    """
+    Usually, checking if P2P access is enabled can be done by
+    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
+    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
+    returns `True` even if P2P access is not actually possible.
+    See https://github.com/vllm-project/vllm/issues/2728 and
+    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
+    Therefore, we have to perform a real P2P access to check if it is actually
+    possible.
+
+    Note on p2p and cuda IPC:
+    Usually, one process uses one GPU:
+    GPU src --> cuda context src --> tensor src --> process src
+
+    We need to combine p2p and cuda IPC, so that:
+    GPU src --> cuda context src --> tensor src --> process src
+                                      |shared|
+    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    That is to say, process src creates a tensor in GPU src, passes IPC handle to
+    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
+    tensor in process tgt will be reflected in the tensor in process src, because
+    they are the same memory segment.
+    It is important to note that process tgt accesses the tensor in GPU tgt, not
+    GPU src. That's why we need p2p access.
+
+    The most time-consuming part is the process creation. To avoid creating
+    processes for every pair of GPUs, we use batched testing. We create two
+    processes for testing all pairs of GPUs in batch. The trick is to reset
+    the device after each test (which is not available in PyTorch).
+    """  # noqa
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+    # pass the CUDA_VISIBLE_DEVICES to the child process
+    # to make sure they see the same set of GPUs
+
+    # make sure the processes are spawned
+    smp = mp.get_context("spawn")
+    producer_queue = smp.Queue()
+    consumer_queue = smp.Queue()
+    result_queue = smp.Queue()
+    p_src = smp.Process(
+        target=producer,
+        args=(
+            batch_src,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_tgt = smp.Process(
+        target=consumer,
+        args=(
+            batch_tgt,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_src.start()
+    p_tgt.start()
+    p_src.join()
+    p_tgt.join()
+    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
+    result: list[bool] = []
+    for src, tgt in zip(batch_src, batch_tgt):
+        a = result_queue.get()
+        b = result_queue.get()
+        if a != b:
+            logger.warning(
+                "Two processes do not agree on the P2P access"
+                " status on %d -> %d, treat as disabled.",
+                src,
+                tgt,
+            )
+            result.append(False)
+        else:
+            result.append(a)
+    return result
+
+
+# why do we need this cache?
+# we are testing peer-to-peer (p2p) access between GPUs,across processes.
+# if we test it every time, it will be very slow, because we need to create
+#  N * N * 2 processes, where N is the world size. This is very slow.
+# to reduce the time, we use a cache file to store the p2p access status.
+# the cache file is generated by the master process if it does not exist.
+# then all the processes can read the cache file to check the p2p access status.
+# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
+#  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+#  e.g. used by different vllm engines. The device id in the cache file is a
+#  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+#  of visible devices in the vllm engine.
+_gpu_p2p_access_cache: dict[str, bool] | None = None
+
+
+def gpu_p2p_access_check(src: int, tgt: int) -> bool:
+    """Check if GPU src can access GPU tgt."""
+
+    # if the cache variable is already calculated,
+    # read from the cache instead of checking it again
+    global _gpu_p2p_access_cache
+    if _gpu_p2p_access_cache is not None:
+        return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+    is_distributed = dist.is_initialized()
+
+    num_dev = cuda_device_count_stateless()
+    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+    if cuda_visible_devices is None:
+        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+
+    path = os.path.join(
+        envs.VLLM_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    from vllm.distributed.parallel_state import get_world_group
+
+    if (not is_distributed or get_world_group().local_rank == 0) and (
+        not os.path.exists(path)
+    ):
+        # only the local master process (with local_rank == 0) can
+        #  enter this block to calculate the cache
+        logger.info("generating GPU P2P access cache in %s", path)
+        cache: dict[str, bool] = {}
+        ids = list(range(num_dev))
+        # batch of all pairs of GPUs
+        batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+
+        # use a temporary file to store the result
+        # we don't use the output of the subprocess directly,
+        # because the subprocess might produce logging output
+        with tempfile.NamedTemporaryFile() as output_file:
+            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
+            returned = subprocess.run(
+                [sys.executable, __file__], input=input_bytes, capture_output=True
+            )
+            # check if the subprocess is successful
+            try:
+                returned.check_returncode()
+            except Exception as e:
+                # wrap raised exception to provide more information
+                raise RuntimeError(
+                    f"Error happened when batch testing "
+                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+                    f"{returned.stderr.decode()}"
+                ) from e
+            with open(output_file.name, "rb") as f:
+                result = pickle.load(f)
+        for _i, _j, r in zip(batch_src, batch_tgt, result):
+            cache[f"{_i}->{_j}"] = r
+        with open(path, "w") as f:
+            json.dump(cache, f, indent=4)
+    if is_distributed:
+        get_world_group().barrier()
+    logger.info("reading GPU P2P access cache from %s", path)
+    with open(path) as f:
+        cache = json.load(f)
+    _gpu_p2p_access_cache = cache
+    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+
+__all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    with open(output_file, "wb") as f:
+        f.write(pickle.dumps(result))
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -0,0 +1,297 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import threading
+from weakref import WeakValueDictionary
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+
+class Cache:
+    def __init__(self):
+        self._cache: WeakValueDictionary = WeakValueDictionary()
+        self._lock = threading.RLock()  # Reentrant lock for thread safety
+
+    def get_or_create(self, kwargs, func):
+        # Create a hashable key from the kwargs
+        key = tuple(sorted((k, v) for k, v in kwargs.items()))
+
+        with self._lock:
+            instance = self._cache.get(key)
+            if instance is None:
+                instance = func(**kwargs)
+                self._cache[key] = instance
+            return instance
+
+
+class All2AllManagerBase:
+    rank: int
+    world_size: int
+
+    def __init__(self, cpu_group):
+        self.cpu_group = cpu_group
+
+        # compute some common properties
+        from vllm.distributed.parallel_state import (
+            get_dp_group,
+            get_tp_group,
+            in_the_same_node_as,
+        )
+
+        # all2all lives in ep group, which is merged from dp and tp group
+        self.dp_group = get_dp_group()
+        self.tp_group = get_tp_group()
+
+        # no self.ep_group since self.ep_group is still in construction
+        # when we create this object
+        self.dp_rank = self.dp_group.rank_in_group
+        self.dp_world_size = self.dp_group.world_size
+        self.rank = dist.get_rank(cpu_group)
+        self.world_size = dist.get_world_size(cpu_group)
+
+        # all2all communication often has separate implementations for
+        # intra-node and inter-node communication
+        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+
+    def get_handle(self, kwargs):
+        # get a handle for the all2all communication,
+        # based on the kwargs.
+        # different layers can have different configs,
+        # e.g. one layer has hidden size 1024, another has 2048.
+        # usually the underlying implementation caches the handle
+        # and reuse it for the same config.
+        raise NotImplementedError
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ):
+        raise NotImplementedError
+
+    def set_num_sms(self, num_sms: int):
+        pass
+
+    def max_sms_used(self) -> int | None:
+        return None  # None means it could use the whole GPU
+
+    def combine(self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False):
+        raise NotImplementedError
+
+    def destroy(self):
+        pass
+
+
+class DeviceCommunicatorBase:
+    """
+    Base class for device-specific communicator.
+    It can use the `cpu_group` to initialize the communicator.
+    If the device has PyTorch integration (PyTorch can recognize its
+    communication backend), the `device_group` will also be given.
+    """
+
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
+        self.device = device or torch.device("cpu")
+        self.cpu_group = cpu_group
+        self.device_group = device_group
+        self.unique_name = unique_name
+        self.rank = dist.get_rank(cpu_group)
+        self.world_size = dist.get_world_size(cpu_group)
+        self.ranks = dist.get_process_group_ranks(cpu_group)
+        self.global_rank = dist.get_rank()
+        self.global_world_size = dist.get_world_size()
+        self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
+
+        use_ep = False
+        all2all_backend = None
+        from vllm.config import get_current_vllm_config
+
+        config = get_current_vllm_config()
+        if config is not None:
+            # as long as we use data parallel (coupled data parallel
+            # where all data parallel ranks execute forward together),
+            # we initialize the all2all manager used in expert parallel.
+            use_ep = config.parallel_config.data_parallel_size > 1
+            all2all_backend = config.parallel_config.all2all_backend
+
+        self.is_ep_communicator = "ep" in unique_name
+        self.use_all2all = self.is_ep_communicator and use_ep
+        self.all2all_backend = all2all_backend
+        self.all2all_manager: All2AllManagerBase | None = None
+
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        dist.all_reduce(input_, group=self.device_group)
+        return input_
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # NOTE: we have to use concat-style all-gather here,
+        # stack-style all-gather has compatibility issues with
+        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
+        output_size = (input_size[0] * self.world_size,) + input_size[1:]
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            output_size, dtype=input_.dtype, device=input_.device
+        )
+        # All-gather.
+        dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group)
+        # Reshape
+        output_tensor = output_tensor.reshape((self.world_size,) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim]
+            + (self.world_size * input_size[dim],)
+            + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    def all_gatherv(
+        self,
+        input_: torch.Tensor | list[torch.Tensor],
+        dim: int = 0,
+        sizes: list[int] | None = None,
+    ) -> torch.Tensor | list[torch.Tensor]:
+        raise NotImplementedError
+
+    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
+
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+
+        # Note: This will produce an incorrect answer if we don't make
+        # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
+        input_tensor = input_.movedim(0, dim).contiguous()
+
+        assert input_tensor.shape[0] % world_size == 0
+        chunk_size = input_tensor.shape[0] // world_size
+        output_shape = (chunk_size,) + input_tensor.shape[1:]
+
+        output_tensor = torch.empty(
+            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
+        )
+
+        # Perform reduce-scatter operation
+        torch.distributed.reduce_scatter_tensor(
+            output_tensor, input_tensor, group=self.device_group
+        )
+
+        # Reshape before returning
+        return output_tensor.movedim(0, dim).contiguous()
+
+    def reduce_scatterv(
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
+        """
+        NOTE: We assume that the input tensor is on the same device across
+        all the ranks.
+        NOTE: `dst` is the local rank of the destination rank.
+        """
+        world_size = self.world_size
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+
+        # Allocate output tensor.
+        if self.rank_in_group == dst:
+            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+        else:
+            gather_list = None
+        # Gather.
+        torch.distributed.gather(
+            input_, gather_list, dst=self.ranks[dst], group=self.device_group
+        )
+        if self.rank_in_group == dst:
+            output_tensor = torch.cat(gather_list, dim=dim)
+        else:
+            output_tensor = None
+        return output_tensor
+
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
+        """Sends a tensor to the destination rank in a blocking way"""
+        """NOTE: `dst` is the local rank of the destination rank."""
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+        torch.distributed.send(tensor, self.ranks[dst], self.device_group)
+
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
+    ) -> torch.Tensor:
+        """Receives a tensor from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+
+        tensor = torch.empty(size, dtype=dtype, device=self.device)
+        torch.distributed.recv(tensor, self.ranks[src], self.device_group)
+        return tensor
+
+    def destroy(self):
+        pass
+
+    def prepare_communication_buffer_for_model(self, model: torch.nn.Module) -> None:
+        """
+        Prepare the communication buffer for the model.
+        """
+        if not self.is_ep_communicator:
+            return
+
+        moe_modules = [
+            module
+            for module in model.modules()
+            # TODO(bnell): Should use isinstance but can't.  Maybe search for
+            # presence of quant_method.maybe_init_modular_kernel?
+            if (
+                module.__class__.__name__ == "FusedMoE"
+                or module.__class__.__name__ == "SharedFusedMoE"
+            )
+        ]
+        for module in moe_modules:
+            module.maybe_init_modular_kernel()
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Dispatch the hidden states and router logits to the appropriate device.
+        This is a no-op in the base class.
+        """
+        return hidden_states, router_logits
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        """
+        Combine the hidden states and router logits from the appropriate device.
+        This is a no-op in the base class.
+        """
+        return hidden_states
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -0,0 +1,209 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from typing import Any
+
+import torch
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.utils import pickle
+from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
+
+from .base_device_communicator import DeviceCommunicatorBase
+
+
+class CpuCommunicator(DeviceCommunicatorBase):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
+        super().__init__(cpu_group, device, device_group, unique_name)
+        self.dist_module = torch.distributed
+
+        if (
+            (current_platform.get_cpu_architecture() == CpuArchEnum.X86)
+            and hasattr(torch.ops._C, "init_shm_manager")
+            and (unique_name.startswith("tp") or unique_name.startswith("pp"))
+        ):
+            self.dist_module = _CPUSHMDistributed(self)
+
+    def all_reduce(self, input_):
+        self.dist_module.all_reduce(input_, group=self.device_group)
+        return input_
+
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
+        """
+        NOTE: We assume that the input tensor is on the same device across
+        all the ranks.
+        NOTE: `dst` is the local rank of the destination rank.
+        """
+        world_size = self.world_size
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+
+        # Allocate output tensor.
+        if self.rank_in_group == dst:
+            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+        else:
+            gather_list = None
+
+        # Gather.
+        self.dist_module.gather(
+            input_, gather_list, dst=self.ranks[dst], group=self.device_group
+        )
+
+        if self.rank_in_group == dst:
+            output_tensor = torch.cat(gather_list, dim=dim)
+        else:
+            output_tensor = None
+        return output_tensor
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # NOTE: we have to use concat-style all-gather here,
+        # stack-style all-gather has compatibility issues with
+        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
+        output_size = (input_size[0] * self.world_size,) + input_size[1:]
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            output_size, dtype=input_.dtype, device=input_.device
+        )
+        # All-gather.
+        self.dist_module.all_gather_into_tensor(
+            output_tensor, input_, group=self.device_group
+        )
+
+        # Reshape
+        output_tensor = output_tensor.reshape((self.world_size,) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim]
+            + (self.world_size * input_size[dim],)
+            + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    def send_tensor_dict(
+        self,
+        tensor_dict: dict[str, torch.Tensor | Any],
+        dst: int,
+    ) -> None:
+        return self.dist_module.send_tensor_dict(tensor_dict, dst)
+
+    def recv_tensor_dict(
+        self,
+        src: int,
+    ) -> dict[str, torch.Tensor | Any]:
+        return self.dist_module.recv_tensor_dict(src)
+
+
+class _CPUSHMDistributed:
+    def __init__(self, communicator: CpuCommunicator):
+        instance_identifier = os.environ["VLLM_DIST_IDENT"]
+        unique_name = communicator.unique_name
+        instance_identifier = f"{instance_identifier}-{unique_name}"
+        self.communicator = communicator
+
+        group_ranks = [str(rank) for rank in self.communicator.ranks]
+        shm_group_identifier = f"[{'-'.join(group_ranks)}]"
+        self.group_name = f"{instance_identifier}-{shm_group_identifier}-cpushm"
+
+        self.handle = self._init_cpu_shm()
+
+    def _init_cpu_shm(self) -> int:
+        handle = torch.ops._C.init_shm_manager(
+            self.group_name,
+            self.communicator.world_size,
+            self.communicator.rank,
+        )
+        torch.distributed.barrier(self.communicator.device_group)
+        torch.ops._C.join_shm_manager(
+            handle,
+            self.group_name,
+        )
+        torch.distributed.barrier(self.communicator.device_group)
+
+        return handle
+
+    def all_reduce(
+        self, input: torch.Tensor, group: ProcessGroup | None = None
+    ) -> None:
+        torch.ops._C.shm_allreduce(self.handle, input)
+
+    def gather(
+        self,
+        input: torch.Tensor,
+        gather_list: list[torch.Tensor] | None,
+        dst: int = -1,
+        group: ProcessGroup | None = None,
+    ) -> None:
+        # Note: different from the torch gather, here we use local dst rank.
+        torch.ops._C.shm_gather(
+            self.handle,
+            input,
+            gather_list,
+            torch.distributed.get_group_rank(group, dst),
+        )
+
+    def all_gather_into_tensor(
+        self,
+        output: torch.Tensor,
+        input: torch.Tensor,
+        group: ProcessGroup | None = None,
+    ) -> None:
+        torch.ops._C.shm_all_gather(self.handle, input, output)
+
+    def send_tensor_dict(
+        self,
+        tensor_dict: dict[str, torch.Tensor | Any],
+        dst: int,
+    ) -> None:
+        key_list = list(tensor_dict.keys())
+        value_list = list(tensor_dict.values())
+        size_list = []
+        for v in value_list:
+            if not isinstance(v, torch.Tensor):
+                raise RuntimeError("CpuCommunicator only supports sending tensors.")
+            size_list.append(v.size())
+        key_size_tensor = torch.frombuffer(
+            pickle.dumps([key_list, size_list]), dtype=torch.uint8
+        )
+        value_list.append(key_size_tensor)
+
+        torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst)
+
+        return None
+
+    def recv_tensor_dict(
+        self,
+        src: int,
+    ) -> dict[str, torch.Tensor | Any]:
+        tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src)
+
+        value_list: list[torch.Tensor] = tensor_list[:-1]
+        key_size_tensor = tensor_list[-1]
+
+        key_size = pickle.loads(key_size_tensor.numpy().tobytes())
+        key_list = key_size[0]
+        size_list = key_size[1]
+        assert len(key_list) == len(size_list)
+        assert len(key_list) == len(value_list)
+
+        tensor_dict: dict[str, torch.Tensor] = {}
+        for key, size, t in zip(key_list, size_list, value_list):
+            tensor_dict[key] = t.view(size)
+        return tensor_dict
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -0,0 +1,340 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+from torch.distributed import ProcessGroup
+
+import vllm.envs as envs
+from vllm.distributed.device_communicators.all_reduce_utils import (
+    should_nccl_symm_mem_allreduce,
+)
+from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
+from vllm.distributed.device_communicators.pynccl_allocator import (
+    is_symmetric_memory_enabled,
+)
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+from .base_device_communicator import DeviceCommunicatorBase
+
+logger = init_logger(__name__)
+
+
+class CudaCommunicator(DeviceCommunicatorBase):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
+        super().__init__(cpu_group, device, device_group, unique_name)
+        if "tp" not in unique_name:
+            # custom allreduce or torch symm mem can be used only by tp
+            use_custom_allreduce = False
+            use_torch_symm_mem = False
+        else:
+            from vllm.distributed.parallel_state import _ENABLE_CUSTOM_ALL_REDUCE
+
+            use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+            use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM
+
+        self.use_custom_allreduce = use_custom_allreduce
+        self.use_torch_symm_mem = use_torch_symm_mem
+
+        # lazy import to avoid documentation build error
+        from vllm.distributed.device_communicators.custom_all_reduce import (
+            CustomAllreduce,
+        )
+        from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+        from vllm.distributed.device_communicators.quick_all_reduce import (
+            QuickAllReduce,
+        )
+        from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+
+        self.pynccl_comm: PyNcclCommunicator | None = None
+        if self.world_size > 1:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+            if is_symmetric_memory_enabled():
+                register_nccl_symmetric_ops(self.pynccl_comm)
+
+        self.ca_comm: CustomAllreduce | None = None
+        self.qr_comm: QuickAllReduce | None = None
+        self.symm_mem_comm: SymmMemCommunicator | None = None
+        if use_torch_symm_mem and current_platform.is_cuda():
+            self.symm_mem_comm = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
+        if use_custom_allreduce and self.world_size > 1:
+            # Initialize a custom fast all-reduce implementation.
+            self.ca_comm = CustomAllreduce(
+                group=self.cpu_group,
+                device=self.device,
+                symm_mem_enabled=(
+                    self.symm_mem_comm is not None and not self.symm_mem_comm.disabled
+                ),
+            )
+
+            if current_platform.is_rocm():
+                # Initialize a custom quick all-reduce implementation for AMD.
+                # Quick reduce is designed as a complement to custom allreduce.
+                # Based on quickreduce (https://github.com/mk1-project/quickreduce).
+                # If it's a rocm, 'use_custom_allreduce==True' means it must
+                # currently be an MI300 series.
+                self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device)
+
+        if self.use_all2all:
+            if self.all2all_backend == "naive":
+                from .all2all import NaiveAll2AllManager
+
+                self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
+            elif self.all2all_backend == "allgather_reducescatter":
+                from .all2all import AgRsAll2AllManager
+
+                self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
+            elif self.all2all_backend == "pplx":
+                from .all2all import PPLXAll2AllManager
+
+                self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
+            elif self.all2all_backend == "deepep_high_throughput":
+                from .all2all import DeepEPHTAll2AllManager
+
+                self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
+            elif self.all2all_backend == "deepep_low_latency":
+                from .all2all import DeepEPLLAll2AllManager
+
+                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
+            elif self.all2all_backend == "flashinfer_all2allv":
+                from .all2all import FlashInferAllToAllManager
+
+                self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
+            else:
+                raise ValueError(f"Unknown all2all backend: {self.all2all_backend}")
+
+            logger.info_once(
+                "Using %s all2all manager.",
+                self.all2all_manager.__class__.__name__,
+                scope="global",
+            )
+
+    def all_reduce(self, input_):
+        # since currently we perform copy input -> symm_input -> out-of-place AR
+        # return symm_output, we don't need to check if input is symmetric
+        if self.pynccl_comm is not None and should_nccl_symm_mem_allreduce(
+            self.pynccl_comm.world_size, input_
+        ):
+            out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_)
+            if out is not None:
+                return out
+        # always try quick reduce first, then custom allreduce,
+        # and then pynccl. (quick reduce just for ROCM MI3*)
+        qr_comm = self.qr_comm
+        if (
+            qr_comm is not None
+            and not qr_comm.disabled
+            and qr_comm.should_quick_allreduce(input_)
+        ):
+            out = qr_comm.quick_all_reduce(input_)
+            assert out is not None
+            return out
+        ca_comm = self.ca_comm
+        if (
+            ca_comm is not None
+            and not ca_comm.disabled
+            and ca_comm.should_custom_ar(input_)
+        ):
+            out = ca_comm.custom_all_reduce(input_)
+            assert out is not None
+            return out
+        symm_mem_comm = self.symm_mem_comm
+        if symm_mem_comm is not None and symm_mem_comm.should_use_symm_mem(input_):
+            out = symm_mem_comm.all_reduce(input_)
+            assert out is not None
+            return out
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is None or pynccl_comm.disabled:
+            out = input_.clone()
+            torch.distributed.all_reduce(out, group=self.device_group)
+            return out
+        assert pynccl_comm is not None
+        out = pynccl_comm.all_reduce(input_)
+        if out is None:
+            # fall back to the default all-reduce using PyTorch.
+            # this usually happens during testing.
+            # when we run the model, allreduce only happens for the TP
+            # group, where we always have either custom allreduce or pynccl.
+            out = input_.clone()
+            torch.distributed.all_reduce(out, group=self.device_group)
+        return out
+
+    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+        assert pynccl_comm is not None
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+
+        # Note: This will produce an incorrect answer if we don't make
+        # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
+        input_tensor = input_.movedim(0, dim).contiguous()
+
+        assert input_tensor.shape[0] % world_size == 0
+        chunk_size = input_tensor.shape[0] // world_size
+        output_shape = (chunk_size,) + input_tensor.shape[1:]
+
+        output = torch.empty(
+            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
+        )
+
+        pynccl_comm.reduce_scatter(output, input_tensor)
+
+        # Reshape before returning
+        return output.movedim(0, dim).contiguous()
+
+    def reduce_scatterv(
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
+    ):
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+        assert pynccl_comm is not None
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+
+        # Note: This will produce an incorrect answer if we don't make
+        # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
+        input_tensor = input_.movedim(0, dim).contiguous()
+
+        if sizes is not None:
+            assert len(sizes) == world_size
+            assert input_tensor.shape[0] == sum(sizes)
+            chunk_size = sizes[self.rank_in_group]
+        else:
+            assert input_tensor.shape[0] % world_size == 0
+            chunk_size = input_tensor.shape[0] // world_size
+        output_shape = (chunk_size,) + input_tensor.shape[1:]
+
+        output = torch.empty(
+            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
+        )
+
+        if sizes is not None and sizes.count(sizes[0]) != len(sizes):
+            pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes)
+        else:
+            pynccl_comm.reduce_scatter(output, input_tensor)
+
+        # Reshape before returning
+        return output.movedim(0, dim).contiguous()
+
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
+        """Sends a tensor to the destination rank in a blocking way"""
+        """NOTE: `dst` is the local rank of the destination rank."""
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.send(tensor, dst)
+        else:
+            torch.distributed.send(tensor, self.ranks[dst], self.device_group)
+
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
+    ) -> torch.Tensor:
+        """Receives a tensor from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+
+        tensor = torch.empty(size, dtype=dtype, device=self.device)
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.recv(tensor, src)
+        else:
+            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
+        return tensor
+
+    def destroy(self):
+        if self.pynccl_comm is not None:
+            self.pynccl_comm = None
+        if self.ca_comm is not None:
+            self.ca_comm = None
+        if self.all2all_manager is not None:
+            self.all2all_manager.destroy()
+            self.all2all_manager = None
+
+    def all_gatherv(
+        self,
+        input_: torch.Tensor | list[torch.Tensor],
+        dim: int = 0,
+        sizes: list[int] | None = None,
+    ):
+        if dim != 0:
+            raise NotImplementedError("only dim 0 all-gatherv is supported")
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+        assert pynccl_comm is not None and not pynccl_comm.disabled
+
+        # 'sizes' is not needed if all inputs in the same group have the same
+        # shape
+        if sizes is not None and all(s == sizes[0] for s in sizes):
+            sizes = None
+
+        def _all_gather_single(input_: torch.Tensor, sizes: list[int] | None = None):
+            input_size = input_.size()
+            if sizes is not None:
+                assert len(sizes) == world_size
+                assert input_.shape[dim] == sizes[self.rank_in_group], (
+                    f"{input_.shape[dim]} != {sizes[self.rank_in_group]}"
+                )
+                output_size = (sum(sizes),) + input_size[1:]
+            else:
+                output_size = (input_size[0] * world_size,) + input_size[1:]
+            # Allocate output tensor.
+            output_tensor = torch.empty(
+                output_size, dtype=input_.dtype, device=input_.device
+            )
+            if sizes is not None:
+                pynccl_comm.all_gatherv(output_tensor, input_, sizes=sizes)
+            else:
+                pynccl_comm.all_gather(output_tensor, input_)
+            return output_tensor
+
+        if isinstance(input_, torch.Tensor):
+            return _all_gather_single(input_, sizes)
+
+        output_list = []
+        pynccl_comm.group_start()
+        for inp in input_:
+            output_list.append(_all_gather_single(inp, sizes=sizes))
+        pynccl_comm.group_end()
+
+        return output_list
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert self.all2all_manager is not None
+        hidden_states, router_logits = self.all2all_manager.dispatch(
+            hidden_states, router_logits, is_sequence_parallel
+        )
+        return hidden_states, router_logits
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        assert self.all2all_manager is not None
+        hidden_states = self.all2all_manager.combine(
+            hidden_states, is_sequence_parallel
+        )
+        return hidden_states
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""This file is a pure Python wrapper for the cudart library.
+It avoids the need to compile a separate shared library, and is
+convenient for use when we just need to call a few functions.
+"""
+
+import ctypes
+from dataclasses import dataclass
+from typing import Any
+
+# this line makes it possible to directly load `libcudart.so` using `ctypes`
+import torch  # noqa
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+# === export types and functions from cudart to Python ===
+# for the original cudart definition, please check
+# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html
+
+cudaError_t = ctypes.c_int
+cudaMemcpyKind = ctypes.c_int
+
+
+class cudaIpcMemHandle_t(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: list[Any]
+
+
+def find_loaded_library(lib_name) -> str | None:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """  # noqa
+    found = False
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found = True
+                break
+    if not found:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = line.index("/")
+    path = line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), (
+        f"Unexpected filename: {filename} for library {lib_name}"
+    )
+    return path
+
+
+class CudaRTLibrary:
+    exported_functions = [
+        # cudaError_t cudaSetDevice ( int  device )
+        Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
+        # cudaError_t 	cudaDeviceSynchronize ( void )
+        Function("cudaDeviceSynchronize", cudaError_t, []),
+        # cudaError_t cudaDeviceReset ( void )
+        Function("cudaDeviceReset", cudaError_t, []),
+        # const char* 	cudaGetErrorString ( cudaError_t error )
+        Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
+        # cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
+        Function(
+            "cudaMalloc",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t],
+        ),
+        # cudaError_t 	cudaFree ( void* devPtr )
+        Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
+        # cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
+        Function(
+            "cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+        ),
+        # cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
+        Function(
+            "cudaMemcpy",
+            cudaError_t,
+            [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind],
+        ),
+        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
+        Function(
+            "cudaIpcGetMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p],
+        ),
+        # cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
+        Function(
+            "cudaIpcOpenMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint],
+        ),
+    ]
+
+    # https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Runtime_API_functions_supported_by_HIP.html # noqa
+    cuda_to_hip_mapping = {
+        "cudaSetDevice": "hipSetDevice",
+        "cudaDeviceSynchronize": "hipDeviceSynchronize",
+        "cudaDeviceReset": "hipDeviceReset",
+        "cudaGetErrorString": "hipGetErrorString",
+        "cudaMalloc": "hipMalloc",
+        "cudaFree": "hipFree",
+        "cudaMemset": "hipMemset",
+        "cudaMemcpy": "hipMemcpy",
+        "cudaIpcGetMemHandle": "hipIpcGetMemHandle",
+        "cudaIpcOpenMemHandle": "hipIpcOpenMemHandle",
+    }
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: dict[str, dict[str, Any]] = {}
+
+    def __init__(self, so_file: str | None = None):
+        if so_file is None:
+            so_file = find_loaded_library("libcudart")
+            if so_file is None:
+                # libcudart is not loaded in the current process, try hip
+                so_file = find_loaded_library("libamdhip64")
+                # should be safe to assume now that we are using ROCm
+                # as the following assertion should error out if the
+                # libhiprtc library is also not loaded
+                if so_file is None:
+                    so_file = envs.VLLM_CUDART_SO_PATH  # fallback to env var
+            assert so_file is not None, (
+                "libcudart is not loaded in the current process, "
+                "try setting VLLM_CUDART_SO_PATH"
+            )
+        if so_file not in CudaRTLibrary.path_to_library_cache:
+            lib = ctypes.CDLL(so_file)
+            CudaRTLibrary.path_to_library_cache[so_file] = lib
+        self.lib = CudaRTLibrary.path_to_library_cache[so_file]
+
+        if so_file not in CudaRTLibrary.path_to_dict_mapping:
+            _funcs = {}
+            for func in CudaRTLibrary.exported_functions:
+                f = getattr(
+                    self.lib,
+                    CudaRTLibrary.cuda_to_hip_mapping[func.name]
+                    if current_platform.is_rocm()
+                    else func.name,
+                )
+                f.restype = func.restype
+                f.argtypes = func.argtypes
+                _funcs[func.name] = f
+            CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
+        self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]
+
+    def CUDART_CHECK(self, result: cudaError_t) -> None:
+        if result != 0:
+            error_str = self.cudaGetErrorString(result)
+            raise RuntimeError(f"CUDART error: {error_str}")
+
+    def cudaGetErrorString(self, error: cudaError_t) -> str:
+        return self.funcs["cudaGetErrorString"](error).decode("utf-8")
+
+    def cudaSetDevice(self, device: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))
+
+    def cudaDeviceSynchronize(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())
+
+    def cudaDeviceReset(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())
+
+    def cudaMalloc(self, size: int) -> ctypes.c_void_p:
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
+        return devPtr
+
+    def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
+        self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))
+
+    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, count: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))
+
+    def cudaMemcpy(
+        self, dst: ctypes.c_void_p, src: ctypes.c_void_p, count: int
+    ) -> None:
+        cudaMemcpyDefault = 4
+        kind = cudaMemcpyDefault
+        self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))
+
+    def cudaIpcGetMemHandle(self, devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
+        handle = cudaIpcMemHandle_t()
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcGetMemHandle"](ctypes.byref(handle), devPtr)
+        )
+        return handle
+
+    def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
+        cudaIpcMemLazyEnablePeerAccess = 1
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcOpenMemHandle"](
+                ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess
+            )
+        )
+        return devPtr
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,274 +1,326 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from contextlib import contextmanager
-from typing import Any, List, Optional
+from typing import cast

 import torch
 import torch.distributed as dist
+from torch.distributed import ProcessGroup

 import vllm.envs as envs
+from vllm import _custom_ops as ops
+from vllm.distributed.device_communicators.all_reduce_utils import (
+    CUSTOM_ALL_REDUCE_MAX_SIZES,
+    gpu_p2p_access_check,
+)
+from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import cuda_device_count_stateless

 try:
-    import pynvml
-
-    from vllm_C import custom_ar
-except ImportError:
-    # For AMD GPUs
-    custom_ar = None
-    pynvml = None
+    ops.meta_size()
+    custom_ar = True
+except Exception:
+    # For CPUs
+    custom_ar = False

 logger = init_logger(__name__)

-_CA_HANDLE: Optional["CustomAllreduce"] = None
-_IS_CAPTURING = False
-_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
-
-
-def init_custom_ar() -> None:
-    from vllm.distributed import (get_tensor_model_parallel_rank,
-                                  get_tensor_model_parallel_world_size)
-
-    global _CA_HANDLE
-    if _CA_HANDLE is not None:
-        return
-    rank = get_tensor_model_parallel_rank()
-    world_size = get_tensor_model_parallel_world_size()
-    if world_size == 1:
-        # No need to initialize custom allreduce for single GPU case.
-        return
-
-    if world_size not in _SUPPORTED_WORLD_SIZES:
-        logger.warning(
-            "Custom allreduce is disabled due to an unsupported world size: "
-            "%d. Supported world sizes: %s. To silence this warning, specify"
-            " disable_custom_all_reduce=True explicitly.", world_size,
-            str(_SUPPORTED_WORLD_SIZES))
-        return
-    num_dev = torch.musa.device_count()
-    # note: num dev can be larger than world_size if we're only using
-    # first few GPUs
-    if num_dev < world_size:
-        logger.warning(
-            "Cannot test GPU P2P because not all GPUs are visible to the "
-            "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
-            " is set.")
-        return
-    # test nvlink first, this will filter out most of the cases
-    # where custom allreduce is not supported
-    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
-    if cuda_visible_devices:
-        device_ids = list(map(int, cuda_visible_devices.split(",")))
-    else:
-        device_ids = list(range(num_dev))
-    # this checks hardware and driver support for NVLink
-    full_nvlink = _is_full_nvlink(device_ids)
-    if world_size > 2 and not full_nvlink:
-        logger.warning(
-            "Custom allreduce is disabled because it's not supported on more"
-            " than two PCIe-only GPUs. To silence this warning, specify"
-            " disable_custom_all_reduce=True explicitly.")
-        return
-    # test P2P capability, this checks software/cudaruntime support
-    # this is expensive to compute at the first time
-    # then we cache the result
-    if not _can_p2p(rank, world_size):
-        logger.warning(
-            "Custom allreduce is disabled because your platform lacks GPU P2P"
-            " capability or P2P test failed. To silence this warning, specify"
-            " disable_custom_all_reduce=True explicitly.")
-        return
-    _CA_HANDLE = CustomAllreduce(rank, world_size, full_nvlink)
-
-
-def begin_capture() -> None:
-    global _IS_CAPTURING
-    _IS_CAPTURING = True
-
-
-def end_capture() -> None:
-    global _IS_CAPTURING
-    _IS_CAPTURING = False
-
-
-def is_capturing() -> bool:
-    return _IS_CAPTURING and _CA_HANDLE is not None
-
-
-def get_handle() -> Optional["CustomAllreduce"]:
-    return _CA_HANDLE
-
-
-def is_initialized() -> bool:
-    return _CA_HANDLE is not None
-
-
-@contextmanager
-def capture():
-    try:
-        begin_capture()
-        yield
-    finally:
-        end_capture()
-        handle = get_handle()
-        if handle is not None:
-            handle.register_graph_buffers()
-
-
-def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
-    ca_handle = get_handle()
-    # when custom allreduce is disabled, this will be None
-    if ca_handle is None:
-        return None
-    if is_capturing():
-        if torch.cuda.is_current_stream_capturing():
-            if ca_handle.should_custom_ar(input):
-                return ca_handle.all_reduce_reg(input)
-        else:
-            if ca_handle.should_custom_ar(input):
-                # if warm up, mimic the allocation pattern
-                # since custom allreduce is out-of-place
-                return torch.empty_like(input)
-    else:
-        # note: outside of cuda graph context,
-        # custom allreduce incurs a cost of cudaMemcpy, which should
-        # be small(<=1% of overall latency) compared to the performance
-        # gains of using custom kernels
-        if ca_handle.should_custom_ar(input):
-            return ca_handle.all_reduce_unreg(input)
-
-    return None
-
-
-@contextmanager
-def _nvml():
-    try:
-        pynvml.nvmlInit()
-        yield
-    finally:
-        pynvml.nvmlShutdown()
-
-
-@_nvml()
-def _is_full_nvlink(device_ids: List[int]) -> bool:
-    """
-    query if the set of gpus are fully connected by nvlink (1 hop)
-    Note that `pynvml` is not affected by `CUDA_VISIBLE_DEVICES`,
-    so it works on real physical device ids.
-    """
-    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
-    for i, handle in enumerate(handles):
-        for j, peer_handle in enumerate(handles):
-            if i < j:
-                try:
-                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
-                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
-                        return False
-                except pynvml.NVMLError as error:
-                    logger.error(
-                        "NVLink detection failed. This is normal if your"
-                        " machine has no NVLink equipped.",
-                        exc_info=error)
-                    return False
-    return True
-

 def _can_p2p(rank: int, world_size: int) -> bool:
-    from vllm.distributed.utils import gpu_p2p_access_check
    for i in range(world_size):
        if i == rank:
            continue
+        if envs.VLLM_SKIP_P2P_CHECK:
+            logger.debug("Skipping P2P check and trusting the driver's P2P report.")
+            return torch.cuda.can_device_access_peer(rank, i)
        if not gpu_p2p_access_check(rank, i):
            return False
    return True


+def is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
 class CustomAllreduce:
+    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]

    # max_size: max supported allreduce size
-    def __init__(self,
-                 rank,
-                 world_size,
-                 full_nvlink,
-                 max_size=8192 * 1024) -> None:
-        # buffers memory are owned by this Python class and passed to C++
-        # meta data composes of two parts: meta data for synchronization
-        # (256 bytes) and a temporary buffer for storing intermediate
-        # allreduce results.
-        self.meta = torch.zeros(custom_ar.meta_size() + max_size,
-                                dtype=torch.uint8,
-                                device="musa")
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: int | str | torch.device,
+        max_size=8192 * 1024,
+        symm_mem_enabled=False,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bound to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not custom_ar:
+            # disable because of missing custom allreduce library
+            # e.g. in a non-GPU environment
+            logger.info(
+                "Custom allreduce is disabled because "
+                "of missing custom allreduce library"
+            )
+            return
+
+        self.group = group
+
+        assert dist.get_backend(group) != dist.Backend.NCCL, (
+            "CustomAllreduce should be attached to a non-NCCL group."
+        )
+
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes."
+            )
+            return
+
+        rank = dist.get_rank(group=self.group)
+        self.rank = rank
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize custom allreduce for single GPU case.
+            return
+
+        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom allreduce is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly.",
+                world_size,
+                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+        device_capability = current_platform.get_device_capability()
+        if (
+            current_platform.is_cuda()
+            and symm_mem_enabled
+            and device_capability is not None
+        ):
+            device_capability_str = device_capability.as_version_str()
+            if device_capability_str in CUSTOM_ALL_REDUCE_MAX_SIZES:
+                max_size = min(
+                    CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability_str][world_size],
+                    max_size,
+                )
+        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(cuda_device_count_stateless()))
+
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        assert current_platform.is_cuda_alike()
+        fully_connected = current_platform.is_fully_connected(physical_device_ids)
+        if world_size > 2 and not fully_connected:
+            logger.warning(
+                "Custom allreduce is disabled because it's not supported on"
+                " more than two PCIe-only GPUs. To silence this warning, "
+                "specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+        # test P2P capability, this checks software/cudaruntime support
+        # this is expensive to compute at the first time
+        # then we cache the result
+        # On AMD GPU, p2p is always enabled between XGMI connected GPUs
+        if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
+            logger.warning(
+                "Custom allreduce is disabled because your platform lacks "
+                "GPU P2P capability or P2P test failed. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+
+        self.disabled = False
+        # Buffers memory are owned by this Python class and passed to C++.
+        # Metadata composes of two parts: metadata for synchronization and a
+        # temporary buffer for storing intermediate allreduce results.
+        self.meta_ptrs = self.create_shared_buffer(
+            ops.meta_size() + max_size, group=group, uncached=True
+        )
        # This is a pre-registered IPC buffer. In eager mode, input tensors
        # are first copied into this buffer before allreduce is performed
-        self.buffer = torch.empty(max_size, dtype=torch.uint8, device="musa")
+        self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
        # This is a buffer for storing the tuples of pointers pointing to
        # IPC buffers from all ranks. Each registered tuple has size of
        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
        # is enough for 131072 such tuples. The largest model I've seen only
        # needs less than 10000 of registered tuples.
-        self.rank_data = torch.empty(8 * 1024 * 1024,
-                                     dtype=torch.uint8,
-                                     device="musa")
-        self.max_size = max_size
-        self.world_size = world_size
-        handles, offsets = self._get_ipc_meta(self.meta)
-        self.full_nvlink = full_nvlink
-        self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data,
-                                             handles, offsets, rank,
-                                             self.full_nvlink)
-        self.register_buffer(self.buffer)
-
-    def _get_ipc_meta(self, inp: torch.Tensor):
-        data = inp.untyped_storage()._share_cuda_()
-        shard_data = (
-            data[1],  # ipc handle to base ptr
-            data[3],  # offset of base ptr
+        self.rank_data = torch.empty(
+            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
        )
-        return self._gather_ipc_meta(shard_data)
+        self.max_size = max_size
+        self.rank = rank
+        self.world_size = world_size
+        self.fully_connected = fully_connected
+        self._ptr = ops.init_custom_ar(
+            self.meta_ptrs, self.rank_data, rank, self.fully_connected
+        )
+        ops.register_buffer(self._ptr, self.buffer_ptrs)

-    def _gather_ipc_meta(self, shard_data):
-        all_data: List[Optional[Any]] = [None] * self.world_size
-        dist.all_gather_object(all_data, shard_data)
-
-        handles = []
-        offsets = []
-        for i in range(len(all_data)):
-            handles.append(all_data[i][0])  # type: ignore
-            offsets.append(all_data[i][1])  # type: ignore
-        return handles, offsets
-
-    def register_buffer(self, inp: torch.Tensor):
-        handles, offsets = self._get_ipc_meta(inp)
-        custom_ar.register_buffer(self._ptr, inp, handles, offsets)
+    @contextmanager
+    def capture(self):
+        """
+        The main responsibility of this context manager is the
+        `register_graph_buffers` call at the end of the context.
+        It records all the buffer addresses used in the CUDA graph.
+        """
+        try:
+            self._IS_CAPTURING = True
+            yield
+        finally:
+            self._IS_CAPTURING = False
+            if not self.disabled:
+                self.register_graph_buffers()

    def register_graph_buffers(self):
-        handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr)
-        handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
+        handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
        logger.info("Registering %d cuda graph addresses", len(offset))
-        custom_ar.register_graph_buffers(self._ptr, handles, offsets)
+        # We cannot directly use `dist.all_gather_object` here
+        # because it is incompatible with `gloo` backend under inference mode.
+        # see https://github.com/pytorch/pytorch/issues/126032 for details.
+        all_data: list[list[list[int] | None]]
+        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
+        all_data[self.rank] = [handle, offset]
+        ranks = sorted(dist.get_process_group_ranks(group=self.group))
+        for i, rank in enumerate(ranks):
+            dist.broadcast_object_list(
+                all_data[i], src=rank, group=self.group, device="cpu"
+            )
+        # Unpack list of tuples to tuple of lists.
+        handles = cast(list[list[int]], [d[0] for d in all_data])
+        offsets = cast(list[list[int]], [d[1] for d in all_data])
+        ops.register_graph_buffers(self._ptr, handles, offsets)

    def should_custom_ar(self, inp: torch.Tensor):
-        return custom_ar.should_custom_ar(inp, self.max_size, self.world_size,
-                                          self.full_nvlink)
+        if self.disabled:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom allreduce requires input byte size to be multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
+        # little performance improvement over NCCL.
+        if self.world_size == 2 or self.fully_connected:
+            return inp_size < self.max_size
+        return False

-    # all reduce, assuming inp tensor is IPC registered with register_buffer,
-    # or, in the context of cuda graphs, register_graph_buffers
-    def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: torch.Tensor = None, registered: bool = False
+    ):
+        """Performs an out-of-place all reduce.
+
+        If registered is True, this assumes inp's pointer is already
+        IPC-registered. Otherwise, inp is first copied into a pre-registered
+        buffer.
+        """
        if out is None:
            out = torch.empty_like(inp)
-        custom_ar.all_reduce_reg(self._ptr, inp, out)
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
+        else:
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
        return out

-    # all reduce, assuming inp tensor is NOT IPC registered
-    def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
-        if out is None:
-            out = torch.empty_like(inp)
-        custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out)
-        return out
+    def custom_all_reduce(self, input: torch.Tensor) -> torch.Tensor | None:
+        """The main allreduce API that provides support for cuda graph."""
+        # When custom allreduce is disabled, this will be None.
+        if self.disabled or not self.should_custom_ar(input):
+            return None
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                return self.all_reduce(input, registered=True)
+            else:
+                # If warm up, mimic the allocation pattern since custom
+                # allreduce is out-of-place.
+                return torch.empty_like(input)
+        else:
+            # Note: outside of cuda graph context, custom allreduce incurs a
+            # cost of cudaMemcpy, which should be small (<=1% of overall
+            # latency) compared to the performance gain of using custom kernels
+            return self.all_reduce(input, registered=False)

    def close(self):
-        if self._ptr:
-            custom_ar.dispose(self._ptr)
+        if not self.disabled and self._ptr:
+            if ops is not None:
+                ops.dispose(self._ptr)
            self._ptr = 0
+            self.free_shared_buffer(self.meta_ptrs, rank=self.rank)
+            self.free_shared_buffer(self.buffer_ptrs, rank=self.rank)

    def __del__(self):
        self.close()
+
+    @staticmethod
+    def create_shared_buffer(
+        size_in_bytes: int,
+        group: ProcessGroup | None = None,
+        uncached: bool | None = False,
+    ) -> list[int]:
+        pointer, handle = ops.allocate_shared_buffer_and_handle(size_in_bytes)
+
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: list[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer)  # type: ignore
+            else:
+                pointers.append(ops.open_mem_handle(h))
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(
+        pointers: list[int],
+        group: ProcessGroup | None = None,
+        rank: int | None = None,
+    ) -> None:
+        if rank is None:
+            rank = dist.get_rank(group=group)
+        if ops is not None:
+            ops.free_shared_buffer(pointers[rank])
--- a/vllm/distributed/device_communicators/mnnvl_compat.py
+++ b/vllm/distributed/device_communicators/mnnvl_compat.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch.distributed as dist
+from flashinfer.comm.mnnvl import CommBackend as CommBackend
+
+from vllm.utils.flashinfer import has_flashinfer_all2all
+
+assert has_flashinfer_all2all(), "Flashinfer alltoallv module cannot be found"
+
+
+class CustomCommunicator(CommBackend):
+    def __init__(self, group):
+        self._group = group
+
+    def Get_rank(self) -> int:
+        return self._group.rank()
+
+    def Get_size(self) -> int:
+        return self._group.size()
+
+    def allgather(self, data: int):
+        gathered = [None] * self.Get_size()
+        dist.all_gather_object(gathered, data, group=self._group)
+        return gathered
+
+    def Split(self, color: int, key: int) -> "CustomCommunicator":
+        return self
--- a/vllm/distributed/device_communicators/pymccl.py
+++ b/vllm/distributed/device_communicators/pymccl.py
@@ -1,284 +0,0 @@
-# This file is a pure Python wrapper for the MCCL library.
-# The main purpose is to use MCCL combined with MUSA graph.
-# Before writing this script, we tried the following approach:
-# 1. We tried to use `cupy`, it calls MCCL correctly, but `cupy` itself
-#  often gets stuck when initializing the MCCL communicator.
-# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
-#  contains many other potential musa APIs, that are not allowed during
-#  capturing the MUSA graph. For further details, please check
-# https://discuss.pytorch.org/t/pytorch-musagraph-with-mccl-operation-failed/ .
-#
-# Another rejected idea is to write a C/C++ binding for MCCL. It is usually
-# doable, but we often encounter issues related with mccl versions, and need
-# to switch between different versions of MCCL. See
-# https://github.com/NVIDIA/mccl/issues/1234 for more details.
-# A C/C++ binding is not flexible enough to handle this. It requires
-# recompilation of the code every time we want to switch between different
-# versions. This current implementation, with a **pure** Python wrapper, is
-# more flexible. We can easily switch between different versions of MCCL by
-# changing the environment variable `VLLM_MCCL_SO_PATH`, or the `so_file`
-# variable in the code.
-
-import ctypes
-import platform
-from typing import Optional, Union
-
-# ===================== import region =====================
-import torch
-import torch_musa
-import torch.distributed as dist
-from torch.distributed import ProcessGroup, ReduceOp
-
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
-from vllm.logger import init_logger
-from vllm.utils import find_mccl_library, mccl_integrity_check
-
-logger = init_logger(__name__)
-
-so_file = find_mccl_library()
-
-try:
-    # load the library in another process.
-    # if it core dumps, it will not crash the current process
-    mccl_integrity_check(so_file)
-    mccl = ctypes.CDLL(so_file)
-except Exception as e:
-    logger.error(
-        "Failed to load MCCL library from %s ."
-        "It is expected if you are not running on NVIDIA/AMD GPUs."
-        "Otherwise, the mccl library might not exist, be corrupted "
-        "or it does not support the current platform %s."
-        "One solution is to download libmccl2 version 2.18 from "
-        "https://developer.download.nvidia.com/compute/musa/repos/ "
-        "and extract the libmccl.so.2 file. If you already have the "
-        "library, please set the environment variable VLLM_MCCL_SO_PATH"
-        " to point to the correct mccl library path.", so_file,
-        platform.platform())
-    raise e
-
-# === export types and functions from mccl to Python ===
-# for the original mccl definition, please check
-# https://github.com/NVIDIA/mccl/blob/master/src/mccl.h.in
-
-mcclResult_t = ctypes.c_int
-
-_c_mcclGetErrorString = mccl.mcclGetErrorString
-_c_mcclGetErrorString.restype = ctypes.c_char_p
-_c_mcclGetErrorString.argtypes = [mcclResult_t]
-
-
-def MCCL_CHECK(result: mcclResult_t) -> None:
-    if result != 0:
-        error_str = _c_mcclGetErrorString(result)
-        error_str = error_str.decode("utf-8")
-        raise RuntimeError(f"MCCL error: {error_str}")
-
-
-# equivalent to c declaration:
-# mcclResult_t  mcclGetVersion(int *version);
-_c_mcclGetVersion = mccl.mcclGetVersion
-_c_mcclGetVersion.restype = ctypes.c_int
-_c_mcclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
-
-
-def mcclGetVersion() -> str:
-    version = ctypes.c_int()
-    MCCL_CHECK(_c_mcclGetVersion(ctypes.byref(version)))
-    version_str = str(version.value)
-    return version_str
-
-
-class McclUniqueId(ctypes.Structure):
-    _fields_ = [("internal", ctypes.c_byte * 128)]
-
-
-# equivalent to c declaration:
-# mcclResult_t mcclGetUniqueId(mcclUniqueId* uniqueId);
-_c_mcclGetUniqueId = mccl.mcclGetUniqueId
-_c_mcclGetUniqueId.restype = ctypes.c_int
-_c_mcclGetUniqueId.argtypes = [ctypes.POINTER(McclUniqueId)]
-
-
-def mcclGetUniqueId() -> McclUniqueId:
-    unique_id = McclUniqueId()
-    MCCL_CHECK(_c_mcclGetUniqueId(ctypes.byref(unique_id)))
-    return unique_id
-
-
-# equivalent to c declaration:
-# mcclResult_t  mcclCommInitRank(
-#   mcclComm_t* comm, int nranks, mcclUniqueId commId, int rank);
-# note that mcclComm_t is a pointer type, so the first argument
-# is a pointer to a pointer
-_c_mcclCommInitRank = mccl.mcclCommInitRank
-_c_mcclCommInitRank.restype = ctypes.c_int
-_c_mcclCommInitRank.argtypes = [
-    ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, McclUniqueId, ctypes.c_int
-]
-
-mcclDataType_t = ctypes.c_int
-
-
-class mcclDataTypeEnum:
-    mcclInt8 = 0
-    mcclChar = 0
-    mcclUint8 = 1
-    mcclInt32 = 2
-    mcclInt = 2
-    mcclUint32 = 3
-    mcclInt64 = 4
-    mcclUint64 = 5
-    mcclFloat16 = 6
-    mcclHalf = 6
-    mcclFloat32 = 7
-    mcclFloat = 7
-    mcclFloat64 = 8
-    mcclDouble = 8
-    mcclBfloat16 = 9
-    mcclNumTypes = 10
-
-    @classmethod
-    def from_torch(cls, dtype: torch.dtype) -> int:
-        if dtype == torch.int8:
-            return cls.mcclInt8
-        if dtype == torch.uint8:
-            return cls.mcclUint8
-        if dtype == torch.int32:
-            return cls.mcclInt32
-        if dtype == torch.int64:
-            return cls.mcclInt64
-        if dtype == torch.float16:
-            return cls.mcclFloat16
-        if dtype == torch.float32:
-            return cls.mcclFloat32
-        if dtype == torch.float64:
-            return cls.mcclFloat64
-        if dtype == torch.bfloat16:
-            return cls.mcclBfloat16
-        raise ValueError(f"Unsupported dtype: {dtype}")
-
-
-mcclRedOp_t = ctypes.c_int
-
-
-class mcclRedOpTypeEnum:
-    mcclSum = 0
-    mcclProd = 1
-    mcclMax = 2
-    mcclMin = 3
-    mcclAvg = 4
-    mcclNumOps = 5
-
-    @classmethod
-    def from_torch(cls, op: ReduceOp) -> int:
-        if op == ReduceOp.SUM:
-            return cls.mcclSum
-        if op == ReduceOp.PRODUCT:
-            return cls.mcclProd
-        if op == ReduceOp.MAX:
-            return cls.mcclMax
-        if op == ReduceOp.MIN:
-            return cls.mcclMin
-        if op == ReduceOp.AVG:
-            return cls.mcclAvg
-        raise ValueError(f"Unsupported op: {op}")
-
-
-# equivalent to c declaration:
-# mcclResult_t  mcclAllReduce(
-#   const void* sendbuff, void* recvbuff, size_t count,
-#   mcclDataType_t datatype, mcclRedOp_t op, mcclComm_t comm,
-#   udaStream_t stream);
-# note that musaStream_t is a pointer type, so the last argument is a pointer
-_c_mcclAllReduce = mccl.mcclAllReduce
-_c_mcclAllReduce.restype = ctypes.c_int
-_c_mcclAllReduce.argtypes = [
-    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, mcclRedOp_t,
-    mcclDataType_t, ctypes.c_void_p, ctypes.c_void_p
-]
-
-# be cautious! this is a collective call, it will block until all
-# processes in the communicator have called this function.
-# because Python object destruction can happen in random order,
-# it is better not to call it at all.
-# equivalent to c declaration:
-# mcclResult_t  mcclCommDestroy(mcclComm_t comm);
-_c_mcclCommDestroy = mccl.mcclCommDestroy
-_c_mcclCommDestroy.restype = ctypes.c_int
-_c_mcclCommDestroy.argtypes = [ctypes.c_void_p]
-
-
-class MCCLCommunicator:
-
-    def __init__(
-        self,
-        group: Optional[ProcessGroup] = None,
-        device: Optional[Union[int, str, torch.device]] = None,
-    ):
-        """
-        Args:
-            group: the process group to work on. If None, it will use the
-                default process group.
-            device: the device to bind the MCCLCommunicator to. If None,
-                it will be bind to f"musa:{local_rank}".
-        It is the caller's responsibility to make sure each communicator
-        is bind to a unique device.
-        """
-        assert dist.is_initialized()
-        group = get_cpu_world_group() if group is None else group
-        assert dist.get_backend(group) != dist.Backend.MCCL, (
-            "MCCLCommunicator should be attached to a non-MCCL group.")
-        self.group = group
-        # note: this rank is the rank in the group
-        self.rank = dist.get_rank(group)
-        self.world_size = dist.get_world_size(group)
-        if self.rank == 0:
-            self.unique_id = mcclGetUniqueId()
-        else:
-            self.unique_id = McclUniqueId()
-        tensor = torch.ByteTensor(list(self.unique_id.internal))
-        ranks = dist.get_process_group_ranks(group)
-        # arg `src` in `broadcast` is the global rank
-        dist.broadcast(tensor, src=ranks[0], group=group)
-        byte_list = tensor.tolist()
-        for i, byte in enumerate(byte_list):
-            self.unique_id.internal[i] = byte
-        self.comm = ctypes.c_void_p()
-        if device is None:
-            local_rank = get_local_rank()
-            device = torch.device(f"musa:{local_rank}")
-        elif isinstance(device, int):
-            device = torch.device(f"musa:{device}")
-        elif isinstance(device, str):
-            device = torch.device(device)
-        # now `device` is a `torch.device` object
-        assert isinstance(device, torch.device)
-        self.device = device
-        # mccl communicator and stream will use this device
-        # `torch.musa.device` is a context manager that changes the
-        # current musa device to the specified one
-        with torch.musa.device(device):
-            MCCL_CHECK(
-                _c_mcclCommInitRank(ctypes.byref(self.comm), self.world_size,
-                                    self.unique_id, self.rank))
-            self.stream = torch.musa.Stream()
-
-    def all_reduce(self,
-                   tensor: torch.Tensor,
-                   op: ReduceOp = ReduceOp.SUM,
-                   stream=None):
-        # mccl communicator created on a specific device
-        # will only work on tensors on the same device
-        # otherwise it will cause "illegal memory access"
-        assert tensor.device == self.device, (
-            f"this mccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
-        if stream is None:
-            stream = self.stream
-        MCCL_CHECK(
-            _c_mcclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
-                             ctypes.c_void_p(tensor.data_ptr()),
-                             tensor.numel(),
-                             mcclDataTypeEnum.from_torch(tensor.dtype),
-                             mcclRedOpTypeEnum.from_torch(op), self.comm,
-                             ctypes.c_void_p(stream.musa_stream)))
--- a/vllm/distributed/device_communicators/pymccl_utils.py
+++ b/vllm/distributed/device_communicators/pymccl_utils.py
@@ -1,66 +0,0 @@
-import contextlib
-from typing import Optional
-
-import torch
-from torch.distributed import ProcessGroup, ReduceOp
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    from vllm.distributed.device_communicators.pymccl import (MCCLCommunicator,
-                                                              mcclGetVersion)
-except Exception as e:
-    # in non-MTHREADS environments, we can't import the mccl module
-    # e.g. when running on machines with AMD GPUs
-    logger.info("Failed to import MCCL library: %s", e)
-    logger.info("It is expected if you are not running on Mthreads GPUs.")
-    pass
-
-comm: Optional["MCCLCommunicator"] = None
-
-
-def is_initialized() -> bool:
-    """Returns whether the NCCL backend is initialized."""
-    return comm is not None
-
-
-@contextlib.contextmanager
-def set_pymccl_stream(stream: torch.cuda.Stream):
-    """Set the cuda stream for communication"""
-    try:
-        assert comm is not None
-        comm.stream = stream
-        yield
-    finally:
-        pass
-
-
-def init_process_group(group: Optional[ProcessGroup] = None) -> None:
-    assert not is_initialized()
-    global comm
-    logger.info("vLLM is using nccl==%s", mcclGetVersion())
-    comm = MCCLCommunicator(group=group)
-
-
-def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
-    """All-reduces the input tensor across the process group."""
-    assert input_.is_musa, f"{input_} should be a musa tensor"
-    assert comm is not None
-    comm.all_reduce(input_, op)
-
-
-def destroy_process_group() -> None:
-    global comm
-    comm = None
-
-
-def get_world_size() -> int:
-    """Returns the world size."""
-    assert comm is not None
-    return comm.world_size
-
-
-def get_nccl_backend() -> Optional["MCCLCommunicator"]:
-    return comm
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,256 +1,131 @@
-# This file is a pure Python wrapper for the NCCL library.
-# The main purpose is to use NCCL combined with CUDA graph.
-# Before writing this script, we tried the following approach:
-# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
-#  often gets stuck when initializing the NCCL communicator.
-# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
-#  contains many other potential cuda APIs, that are not allowed during
-#  capturing the CUDA graph. For further details, please check
-# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
-#
-# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
-# doable, but we often encounter issues related with nccl versions, and need
-# to switch between different versions of NCCL. See
-# https://github.com/NVIDIA/nccl/issues/1234 for more details.
-# A C/C++ binding is not flexible enough to handle this. It requires
-# recompilation of the code every time we want to switch between different
-# versions. This current implementation, with a **pure** Python wrapper, is
-# more flexible. We can easily switch between different versions of NCCL by
-# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file`
-# variable in the code.
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import ctypes
-import platform
-from typing import Optional, Union

 # ===================== import region =====================
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup, ReduceOp

-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
+import vllm.envs as envs
+from vllm.distributed.device_communicators.pynccl_wrapper import (
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+    ncclRedOpTypeEnum,
+    ncclUniqueId,
+)
+from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.utils import find_nccl_library, nccl_integrity_check
+from vllm.utils.torch_utils import current_stream

 logger = init_logger(__name__)

-so_file = find_nccl_library()
-
-try:
-    # load the library in another process.
-    # if it core dumps, it will not crash the current process
-    nccl_integrity_check(so_file)
-    nccl = ctypes.CDLL(so_file)
-except Exception as e:
-    logger.error(
-        "Failed to load NCCL library from %s ."
-        "It is expected if you are not running on NVIDIA/AMD GPUs."
-        "Otherwise, the nccl library might not exist, be corrupted "
-        "or it does not support the current platform %s."
-        "One solution is to download libnccl2 version 2.18 from "
-        "https://developer.download.nvidia.com/compute/cuda/repos/ "
-        "and extract the libnccl.so.2 file. If you already have the "
-        "library, please set the environment variable VLLM_NCCL_SO_PATH"
-        " to point to the correct nccl library path.", so_file,
-        platform.platform())
-    raise e
-
-# === export types and functions from nccl to Python ===
-# for the original nccl definition, please check
-# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in
-
-ncclResult_t = ctypes.c_int
-
-_c_ncclGetErrorString = nccl.ncclGetErrorString
-_c_ncclGetErrorString.restype = ctypes.c_char_p
-_c_ncclGetErrorString.argtypes = [ncclResult_t]
+_NCCL_SYMM_OPS_REGISTERED = False


-def NCCL_CHECK(result: ncclResult_t) -> None:
-    if result != 0:
-        error_str = _c_ncclGetErrorString(result)
-        error_str = error_str.decode("utf-8")
-        raise RuntimeError(f"NCCL error: {error_str}")
+def register_nccl_symmetric_ops(pynccl_comm):
+    from vllm.distributed.device_communicators.pynccl_allocator import (
+        nccl_symm_mem_context,
+    )
+    from vllm.utils.torch_utils import direct_register_custom_op
+
+    global _NCCL_SYMM_OPS_REGISTERED
+    if _NCCL_SYMM_OPS_REGISTERED:
+        return
+    _NCCL_SYMM_OPS_REGISTERED = True
+
+    def all_reduce_symmetric_with_copy_impl(input_tensor: torch.Tensor) -> torch.Tensor:
+        with nccl_symm_mem_context(pynccl_comm):
+            symm_input = torch.empty_like(input_tensor)
+            symm_output = torch.empty_like(input_tensor)
+        symm_input.copy_(input_tensor)
+        symm_output = pynccl_comm.all_reduce(symm_input, symm_output)
+        return symm_output
+
+    def all_reduce_symmetric_with_copy_fake(input_tensor: torch.Tensor) -> torch.Tensor:
+        return torch.empty_like(input_tensor)
+
+    direct_register_custom_op(
+        op_name="all_reduce_symmetric_with_copy",
+        op_func=all_reduce_symmetric_with_copy_impl,
+        fake_impl=all_reduce_symmetric_with_copy_fake,
+    )


-# equivalent to c declaration:
-# ncclResult_t  ncclGetVersion(int *version);
-_c_ncclGetVersion = nccl.ncclGetVersion
-_c_ncclGetVersion.restype = ctypes.c_int
-_c_ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
-
-
-def ncclGetVersion() -> str:
-    version = ctypes.c_int()
-    NCCL_CHECK(_c_ncclGetVersion(ctypes.byref(version)))
-    # something like 21903 --> "2.19.3"
-    version_str = str(version.value)
-    major = version_str[0].lstrip("0")
-    minor = version_str[1:3].lstrip("0")
-    patch = version_str[3:].lstrip("0")
-    return f"{major}.{minor}.{patch}"
-
-
-class NcclUniqueId(ctypes.Structure):
-    _fields_ = [("internal", ctypes.c_byte * 128)]
-
-
-# equivalent to c declaration:
-# ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
-_c_ncclGetUniqueId = nccl.ncclGetUniqueId
-_c_ncclGetUniqueId.restype = ctypes.c_int
-_c_ncclGetUniqueId.argtypes = [ctypes.POINTER(NcclUniqueId)]
-
-
-def ncclGetUniqueId() -> NcclUniqueId:
-    unique_id = NcclUniqueId()
-    NCCL_CHECK(_c_ncclGetUniqueId(ctypes.byref(unique_id)))
-    return unique_id
-
-
-# equivalent to c declaration:
-# ncclResult_t  ncclCommInitRank(
-#   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
-# note that ncclComm_t is a pointer type, so the first argument
-# is a pointer to a pointer
-_c_ncclCommInitRank = nccl.ncclCommInitRank
-_c_ncclCommInitRank.restype = ctypes.c_int
-_c_ncclCommInitRank.argtypes = [
-    ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int
-]
-
-ncclDataType_t = ctypes.c_int
-
-
-class ncclDataTypeEnum:
-    ncclInt8 = 0
-    ncclChar = 0
-    ncclUint8 = 1
-    ncclInt32 = 2
-    ncclInt = 2
-    ncclUint32 = 3
-    ncclInt64 = 4
-    ncclUint64 = 5
-    ncclFloat16 = 6
-    ncclHalf = 6
-    ncclFloat32 = 7
-    ncclFloat = 7
-    ncclFloat64 = 8
-    ncclDouble = 8
-    ncclBfloat16 = 9
-    ncclNumTypes = 10
-
-    @classmethod
-    def from_torch(cls, dtype: torch.dtype) -> int:
-        if dtype == torch.int8:
-            return cls.ncclInt8
-        if dtype == torch.uint8:
-            return cls.ncclUint8
-        if dtype == torch.int32:
-            return cls.ncclInt32
-        if dtype == torch.int64:
-            return cls.ncclInt64
-        if dtype == torch.float16:
-            return cls.ncclFloat16
-        if dtype == torch.float32:
-            return cls.ncclFloat32
-        if dtype == torch.float64:
-            return cls.ncclFloat64
-        if dtype == torch.bfloat16:
-            return cls.ncclBfloat16
-        raise ValueError(f"Unsupported dtype: {dtype}")
-
-
-ncclRedOp_t = ctypes.c_int
-
-
-class ncclRedOpTypeEnum:
-    ncclSum = 0
-    ncclProd = 1
-    ncclMax = 2
-    ncclMin = 3
-    ncclAvg = 4
-    ncclNumOps = 5
-
-    @classmethod
-    def from_torch(cls, op: ReduceOp) -> int:
-        if op == ReduceOp.SUM:
-            return cls.ncclSum
-        if op == ReduceOp.PRODUCT:
-            return cls.ncclProd
-        if op == ReduceOp.MAX:
-            return cls.ncclMax
-        if op == ReduceOp.MIN:
-            return cls.ncclMin
-        if op == ReduceOp.AVG:
-            return cls.ncclAvg
-        raise ValueError(f"Unsupported op: {op}")
-
-
-# equivalent to c declaration:
-# ncclResult_t  ncclAllReduce(
-#   const void* sendbuff, void* recvbuff, size_t count,
-#   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-#   udaStream_t stream);
-# note that cudaStream_t is a pointer type, so the last argument is a pointer
-_c_ncclAllReduce = nccl.ncclAllReduce
-_c_ncclAllReduce.restype = ctypes.c_int
-_c_ncclAllReduce.argtypes = [
-    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclRedOp_t,
-    ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
-]
-
-# be cautious! this is a collective call, it will block until all
-# processes in the communicator have called this function.
-# because Python object destruction can happen in random order,
-# it is better not to call it at all.
-# equivalent to c declaration:
-# ncclResult_t  ncclCommDestroy(ncclComm_t comm);
-_c_ncclCommDestroy = nccl.ncclCommDestroy
-_c_ncclCommDestroy.restype = ctypes.c_int
-_c_ncclCommDestroy.argtypes = [ctypes.c_void_p]
-
-
-class NCCLCommunicator:
-
+class PyNcclCommunicator:
    def __init__(
        self,
-        group: Optional[ProcessGroup] = None,
-        device: Optional[Union[int, str, torch.device]] = None,
+        group: ProcessGroup | StatelessProcessGroup,
+        device: int | str | torch.device,
+        library_path: str | None = None,
    ):
        """
        Args:
            group: the process group to work on. If None, it will use the
                default process group.
-            device: the device to bind the NCCLCommunicator to. If None,
-                it will be bind to f"cuda:{local_rank}".
+            device: the device to bind the PyNcclCommunicator to. If None,
+                it will be bound to f"cuda:{local_rank}".
+            library_path: the path to the NCCL library. If None, it will
+                use the default library path.
        It is the caller's responsibility to make sure each communicator
        is bind to a unique device.
        """
-        assert dist.is_initialized()
-        group = get_cpu_world_group() if group is None else group
-        assert dist.get_backend(group) != dist.Backend.NCCL, (
-            "NCCLCommunicator should be attached to a non-NCCL group.")
-        self.group = group
-        # note: this rank is the rank in the group
-        self.rank = dist.get_rank(group)
-        self.world_size = dist.get_world_size(group)
-        if self.rank == 0:
-            self.unique_id = ncclGetUniqueId()
+        if not isinstance(group, StatelessProcessGroup):
+            assert dist.is_initialized()
+            assert dist.get_backend(group) != dist.Backend.NCCL, (
+                "PyNcclCommunicator should be attached to a non-NCCL group."
+            )
+            # note: this rank is the rank in the group
+            self.rank = dist.get_rank(group)
+            self.world_size = dist.get_world_size(group)
        else:
-            self.unique_id = NcclUniqueId()
-        tensor = torch.ByteTensor(list(self.unique_id.internal))
-        ranks = dist.get_process_group_ranks(group)
-        # arg `src` in `broadcast` is the global rank
-        dist.broadcast(tensor, src=ranks[0], group=group)
-        byte_list = tensor.tolist()
-        for i, byte in enumerate(byte_list):
-            self.unique_id.internal[i] = byte
-        self.comm = ctypes.c_void_p()
-        if device is None:
-            local_rank = get_local_rank()
-            device = torch.device(f"cuda:{local_rank}")
-        elif isinstance(device, int):
+            self.rank = group.rank
+            self.world_size = group.world_size
+
+        self.group = group
+
+        # if world_size == 1, no need to create communicator
+        if self.world_size == 1 or envs.VLLM_DISABLE_PYNCCL:
+            self.available = False
+            self.disabled = True
+            return
+        try:
+            self.nccl = NCCLLibrary(library_path)
+        except Exception:
+            # disable because of missing NCCL library
+            # e.g. in a non-GPU environment
+            self.available = False
+            self.disabled = True
+            return
+
+        self.available = True
+        self.disabled = False
+
+        self.nccl_version = self.nccl.ncclGetRawVersion()
+        if self.rank == 0:
+            # get the unique id from NCCL
+            self.unique_id = self.nccl.ncclGetUniqueId()
+            logger.info_once(
+                "vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
+            )
+        else:
+            # construct an empty unique id
+            self.unique_id = ncclUniqueId()
+
+        if not isinstance(group, StatelessProcessGroup):
+            tensor = torch.ByteTensor(list(self.unique_id.internal))
+            ranks = dist.get_process_group_ranks(group)
+            # arg `src` in `broadcast` is the global rank
+            dist.broadcast(tensor, src=ranks[0], group=group)
+            byte_list = tensor.tolist()
+            for i, byte in enumerate(byte_list):
+                self.unique_id.internal[i] = byte
+        else:
+            self.unique_id = group.broadcast_obj(self.unique_id, src=0)
+        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
@@ -261,27 +136,251 @@ class NCCLCommunicator:
        # `torch.cuda.device` is a context manager that changes the
        # current cuda device to the specified one
        with torch.cuda.device(device):
-            NCCL_CHECK(
-                _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size,
-                                    self.unique_id, self.rank))
-            self.stream = torch.cuda.Stream()
+            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
+                self.world_size, self.unique_id, self.rank
+            )

-    def all_reduce(self,
-                   tensor: torch.Tensor,
-                   op: ReduceOp = ReduceOp.SUM,
-                   stream=None):
+            stream = current_stream()
+            # A small all_reduce for warmup.
+            data = torch.zeros(1, device=device)
+            self.all_reduce(data)
+            stream.synchronize()
+            del data
+
+    def all_reduce(
+        self,
+        in_tensor: torch.Tensor,
+        out_tensor: torch.Tensor = None,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ) -> torch.Tensor:
+        if self.disabled:
+            return None
        # nccl communicator created on a specific device
        # will only work on tensors on the same device
        # otherwise it will cause "illegal memory access"
+        assert in_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {in_tensor.device}"
+        )
+
+        if out_tensor is None:
+            out_tensor = torch.empty_like(in_tensor)
+
+        if stream is None:
+            stream = current_stream()
+        self.nccl.ncclAllReduce(
+            buffer_type(in_tensor.data_ptr()),
+            buffer_type(out_tensor.data_ptr()),
+            in_tensor.numel(),
+            ncclDataTypeEnum.from_torch(in_tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+        return out_tensor
+
+    def all_gather(
+        self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, stream=None
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = current_stream()
+        self.nccl.ncclAllGather(
+            buffer_type(input_tensor.data_ptr()),
+            buffer_type(output_tensor.data_ptr()),
+            input_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def all_gatherv(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        sizes: list[int],
+        stream=None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = current_stream()
+        assert output_tensor.shape[0] == sum(sizes)
+        split_offset = 0
+        self.nccl.ncclGroupStart()
+        for root, split_size in enumerate(sizes):
+            dst_slice = output_tensor[split_offset : split_offset + split_size]
+            self.nccl.ncclBroadcast(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(dst_slice.data_ptr()),
+                dst_slice.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                root,
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
+            split_offset += split_size
+        self.nccl.ncclGroupEnd()
+
+    def reduce_scatter(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = current_stream()
+        self.nccl.ncclReduceScatter(
+            buffer_type(input_tensor.data_ptr()),
+            buffer_type(output_tensor.data_ptr()),
+            output_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def reduce_scatterv(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        sizes: list[int],
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = current_stream()
+
+        split_offset = 0
+        self.nccl.ncclGroupStart()
+        for root, split_size in enumerate(sizes):
+            chunk = input_tensor[split_offset : split_offset + split_size, ...]
+            self.nccl.ncclReduce(
+                buffer_type(chunk.data_ptr()),
+                buffer_type(output_tensor.data_ptr()),
+                chunk.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                ncclRedOpTypeEnum.from_torch(op),
+                root,
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
+            split_offset += split_size
+        self.nccl.ncclGroupEnd()
+
+    def send(self, tensor: torch.Tensor, dst: int, stream=None):
+        if self.disabled:
+            return
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
        if stream is None:
-            stream = self.stream
-        NCCL_CHECK(
-            _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
-                             ctypes.c_void_p(tensor.data_ptr()),
-                             tensor.numel(),
-                             ncclDataTypeEnum.from_torch(tensor.dtype),
-                             ncclRedOpTypeEnum.from_torch(op), self.comm,
-                             ctypes.c_void_p(stream.cuda_stream)))
+            stream = current_stream()
+        self.nccl.ncclSend(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            dst,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def recv(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = current_stream()
+        self.nccl.ncclRecv(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = current_stream()
+        if src == self.rank:
+            sendbuff = buffer_type(tensor.data_ptr())
+            # NCCL requires the sender also to have a receive buffer
+            recvbuff = buffer_type(tensor.data_ptr())
+        else:
+            sendbuff = buffer_type()
+            recvbuff = buffer_type(tensor.data_ptr())
+        self.nccl.ncclBroadcast(
+            sendbuff,
+            recvbuff,
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def group_start(self):
+        self.nccl.ncclGroupStart()
+
+    def group_end(self):
+        self.nccl.ncclGroupEnd()
+
+    def register_comm_window(self, tensor: torch.Tensor):
+        return self.nccl.ncclCommWindowRegister(
+            self.comm,
+            buffer_type(tensor.data_ptr()),
+            tensor.numel() * tensor.element_size(),
+            1,
+        )
+
+    def register_comm_window_raw(self, ptr: int, size: int):
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
+
+    def deregister_comm_window(self, window):
+        return self.nccl.ncclCommWindowDeregister(self.comm, window)
--- a/vllm/distributed/device_communicators/pynccl_allocator.py
+++ b/vllm/distributed/device_communicators/pynccl_allocator.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import atexit
+import contextlib
+import tempfile
+from typing import Any
+
+import torch
+from packaging import version
+from torch.cuda.memory import CUDAPluggableAllocator
+from torch.utils.cpp_extension import load_inline
+
+from vllm import envs
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.nccl import find_nccl_include_paths
+
+logger = init_logger(__name__)
+
+nccl_allocator_source = """
+#include <nccl.h>
+extern "C" {
+
+void* nccl_alloc_plug(size_t size, int device, void* stream) {
+  void* ptr;
+  ncclResult_t err = ncclMemAlloc(&ptr, size);
+  return ptr;
+
+}
+
+void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
+  ncclResult_t err = ncclMemFree(ptr);
+}
+
+}
+"""
+
+_allocator = None
+_allocator_wrapper = None
+_mem_pool = None
+_registered_base_addrs = set()
+_graph_pool_id = None
+_nccl_allocator_failed_to_compile = False
+_cached_pool_snapshot = None
+
+
+def is_symmetric_memory_enabled():
+    global _nccl_allocator_failed_to_compile
+    return envs.VLLM_USE_NCCL_SYMM_MEM and not _nccl_allocator_failed_to_compile
+
+
+def is_symmetric_memory_tensor(tensor: torch.Tensor):
+    if not is_symmetric_memory_enabled() or _cached_pool_snapshot is None:
+        return False
+    for segment in _cached_pool_snapshot:
+        for block in segment["blocks"]:
+            if block["address"] == tensor.untyped_storage().data_ptr():
+                return True
+    return False
+
+
+def set_graph_pool_id(graph_pool_id):
+    global _graph_pool_id
+    _graph_pool_id = graph_pool_id
+
+
+def compile_nccl_allocator():
+    global _allocator, _allocator_wrapper, _nccl_allocator_failed_to_compile
+    if not current_platform.is_cuda():
+        _nccl_allocator_failed_to_compile = True
+        return
+    try:
+        out_dir = tempfile.gettempdir()
+        nccl_allocator_libname = "nccl_allocator"
+        nccl_include_paths = find_nccl_include_paths()
+        load_inline(
+            name=nccl_allocator_libname,
+            cpp_sources=nccl_allocator_source,
+            with_cuda=True,
+            extra_ldflags=["-lnccl"],
+            verbose=envs.VLLM_LOGGING_LEVEL == "DEBUG",
+            is_python_module=False,
+            build_directory=out_dir,
+            extra_include_paths=nccl_include_paths,
+        )
+        _allocator_wrapper = CUDAPluggableAllocator(
+            f"{out_dir}/{nccl_allocator_libname}.so",
+            "nccl_alloc_plug",
+            "nccl_free_plug",
+        )
+        _allocator = _allocator_wrapper.allocator()
+    except Exception as e:
+        _nccl_allocator_failed_to_compile = True
+        logger.warning(
+            "Failed to compile NCCL memory allocator. "
+            "Symmetric memory will be disabled. "
+            "This is expected if NCCL headers are not available. "
+            "optionally set VLLM_NCCL_INCLUDE_PATH to point to a directory "
+            "containing the NCCL header. "
+            "Error: %s",
+            str(e),
+        )
+
+
+def get_nccl_mem_pool():
+    global _mem_pool, _nccl_allocator_failed_to_compile
+    if _mem_pool is None and not _nccl_allocator_failed_to_compile:
+        compile_nccl_allocator()
+        if _allocator is not None:
+            _mem_pool = torch.cuda.MemPool(_allocator)
+    return _mem_pool
+
+
+def _cleanup_nccl_mem_pool():
+    global _mem_pool
+    _mem_pool = None
+
+
+def _cleanup_nccl_allocator_wrapper():
+    global _allocator_wrapper
+    _allocator_wrapper = None
+
+
+atexit.register(_cleanup_nccl_mem_pool)
+atexit.register(_cleanup_nccl_allocator_wrapper)
+
+
+class nccl_symm_mem_context:
+    def __init__(
+        self,
+        pynccl_comm: PyNcclCommunicator,
+        disabled: bool = False,
+    ):
+        self.disabled = (
+            disabled
+            or not is_symmetric_memory_enabled()
+            or pynccl_comm.world_size == 1
+            or not current_platform.is_cuda()
+            or get_nccl_mem_pool() is None
+            or version.parse(torch.__version__) < version.parse("2.8.0.a0")
+        )
+        if self.disabled:
+            self.pynccl_comm: PyNcclCommunicator | None = None
+            self._mem_pool_ctx: contextlib.AbstractContextManager[Any] = (
+                contextlib.nullcontext()
+            )
+            self.is_graph_capture = None
+            self.device = None
+        else:
+            self.pynccl_comm = pynccl_comm
+            self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+            self.is_graph_capture = torch.cuda.is_current_stream_capturing()
+            self.device = torch.cuda.current_device()
+
+    def __enter__(self):
+        if self.disabled:
+            return self
+        assert self.pynccl_comm is not None, (
+            "Symmetric memory requires pynccl to be initialized"
+        )
+        assert self.pynccl_comm.nccl_version >= 22703, (
+            "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        )
+        if self.is_graph_capture:
+            assert _graph_pool_id is not None, (
+                "graph_pool_id is not set under graph capture"
+            )
+            # Pause graph memory pool to use symmetric memory with cuda graph
+            torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
+        self._mem_pool_ctx.__enter__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.disabled:
+            return
+        global _cached_pool_snapshot
+        global _registered_base_addrs
+        self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
+        _pool = get_nccl_mem_pool()
+        assert _pool is not None
+        _cached_pool_snapshot = _pool.snapshot()
+        assert self.pynccl_comm is not None
+        for segment in _cached_pool_snapshot:
+            if segment["address"] not in _registered_base_addrs:
+                self.pynccl_comm.register_comm_window_raw(
+                    segment["address"], segment["total_size"]
+                )
+                _registered_base_addrs.add(segment["address"])
+        if self.is_graph_capture:
+            torch._C._cuda_beginAllocateCurrentThreadToPool(self.device, _graph_pool_id)
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -0,0 +1,564 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# This file is a pure Python wrapper for the NCCL library.
+# The main purpose is to use NCCL combined with CUDA graph.
+# Before writing this script, we tried the following approach:
+# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
+#  often gets stuck when initializing the NCCL communicator.
+# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
+#  contains many other potential cuda APIs, that are not allowed during
+#  capturing the CUDA graph. For further details, please check
+# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
+#
+# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
+# doable, but we often encounter issues related with nccl versions, and need
+# to switch between different versions of NCCL. See
+# https://github.com/NVIDIA/nccl/issues/1234 for more details.
+# A C/C++ binding is not flexible enough to handle this. It requires
+# recompilation of the code every time we want to switch between different
+# versions. This current implementation, with a **pure** Python wrapper, is
+# more flexible. We can easily switch between different versions of NCCL by
+# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file`
+# variable in the code.
+
+import ctypes
+import platform
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+from torch.distributed import ReduceOp
+
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.nccl import find_nccl_library
+
+logger = init_logger(__name__)
+
+# === export types and functions from nccl to Python ===
+# for the original nccl definition, please check
+# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in
+
+ncclResult_t = ctypes.c_int
+ncclComm_t = ctypes.c_void_p
+ncclWindow_t = ctypes.c_void_p
+
+
+class ncclUniqueId(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+cudaStream_t = ctypes.c_void_p
+buffer_type = ctypes.c_void_p
+
+ncclDataType_t = ctypes.c_int
+
+
+class ncclDataTypeEnum:
+    ncclInt8 = 0
+    ncclChar = 0
+    ncclUint8 = 1
+    ncclInt32 = 2
+    ncclInt = 2
+    ncclUint32 = 3
+    ncclInt64 = 4
+    ncclUint64 = 5
+    ncclFloat16 = 6
+    ncclHalf = 6
+    ncclFloat32 = 7
+    ncclFloat = 7
+    ncclFloat64 = 8
+    ncclDouble = 8
+    ncclBfloat16 = 9
+    ncclNumTypes = 10
+
+    @classmethod
+    def from_torch(cls, dtype: torch.dtype) -> int:
+        if dtype == torch.int8:
+            return cls.ncclInt8
+        if dtype == torch.uint8:
+            return cls.ncclUint8
+        if dtype == torch.int32:
+            return cls.ncclInt32
+        if dtype == torch.int64:
+            return cls.ncclInt64
+        if dtype == torch.float16:
+            return cls.ncclFloat16
+        if dtype == torch.float32:
+            return cls.ncclFloat32
+        if dtype == torch.float64:
+            return cls.ncclFloat64
+        if dtype == torch.bfloat16:
+            return cls.ncclBfloat16
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+ncclRedOp_t = ctypes.c_int
+
+
+class ncclRedOpTypeEnum:
+    ncclSum = 0
+    ncclProd = 1
+    ncclMax = 2
+    ncclMin = 3
+    ncclAvg = 4
+    ncclNumOps = 5
+
+    @classmethod
+    def from_torch(cls, op: ReduceOp) -> int:
+        if op == ReduceOp.SUM:
+            return cls.ncclSum
+        if op == ReduceOp.PRODUCT:
+            return cls.ncclProd
+        if op == ReduceOp.MAX:
+            return cls.ncclMax
+        if op == ReduceOp.MIN:
+            return cls.ncclMin
+        if op == ReduceOp.AVG:
+            return cls.ncclAvg
+        raise ValueError(f"Unsupported op: {op}")
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: list[Any]
+
+
+class NCCLLibrary:
+    exported_functions = [
+        # const char* ncclGetErrorString(ncclResult_t result)
+        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
+        # ncclResult_t  ncclGetVersion(int *version);
+        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
+        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
+        # ncclResult_t  ncclCommInitRank(
+        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+        # note that ncclComm_t is a pointer type, so the first argument
+        # is a pointer to a pointer
+        Function(
+            "ncclCommInitRank",
+            ncclResult_t,
+            [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int],
+        ),
+        # ncclResult_t  ncclAllReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
+        #   ncclComm_t comm,  cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclAllGather(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllGather",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclReduceScatter(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduceScatter",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclSend(
+        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
+        #   int dest, ncclComm_t comm, cudaStream_t stream);
+        Function(
+            "ncclSend",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclRecv(
+        #   void* recvbuff, size_t count, ncclDataType_t datatype,
+        #   int src, ncclComm_t comm, cudaStream_t stream);
+        Function(
+            "ncclRecv",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t ncclBroadcast(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, int root, ncclComm_t comm,
+        #   cudaStream_t stream);
+        Function(
+            "ncclBroadcast",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # be cautious! this is a collective call, it will block until all
+        # processes in the communicator have called this function.
+        # because Python object destruction can happen in random order,
+        # it is better not to call it at all.
+        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
+        # ncclResult_t ncclGroupStart();
+        Function("ncclGroupStart", ncclResult_t, []),
+        # ncclResult_t ncclGroupEnd();
+        Function("ncclGroupEnd", ncclResult_t, []),
+        # ncclResult_t ncclCommWindowRegister(
+        #   ncclComm_t comm, void* buff, size_t size,
+        #   ncclWindow_t* win, int winFlags);
+        Function(
+            "ncclCommWindowRegister",
+            ncclResult_t,
+            [
+                ncclComm_t,
+                buffer_type,
+                ctypes.c_size_t,
+                ctypes.POINTER(ncclWindow_t),
+                ctypes.c_int,
+            ],
+        ),
+        # ncclResult_t ncclCommWindowDeregister(
+        #   ncclComm_t comm, ncclWindow_t win);
+        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
+    ]
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: dict[str, dict[str, Any]] = {}
+
+    def __init__(self, so_file: str | None = None):
+        so_file = so_file or find_nccl_library()
+
+        try:
+            if so_file not in NCCLLibrary.path_to_dict_mapping:
+                lib = ctypes.CDLL(so_file)
+                NCCLLibrary.path_to_library_cache[so_file] = lib
+            self.lib = NCCLLibrary.path_to_library_cache[so_file]
+        except Exception as e:
+            logger.error(
+                "Failed to load NCCL library from %s. "
+                "It is expected if you are not running on NVIDIA/AMD GPUs."
+                "Otherwise, the nccl library might not exist, be corrupted "
+                "or it does not support the current platform %s. "
+                "If you already have the library, please set the "
+                "environment variable VLLM_NCCL_SO_PATH"
+                " to point to the correct nccl library path.",
+                so_file,
+                platform.platform(),
+            )
+            raise e
+
+        if so_file not in NCCLLibrary.path_to_dict_mapping:
+            _funcs: dict[str, Any] = {}
+            for func in NCCLLibrary.exported_functions:
+                try:
+                    f = getattr(self.lib, func.name)
+                    f.restype = func.restype
+                    f.argtypes = func.argtypes
+                    _funcs[func.name] = f
+                except AttributeError:
+                    if func.name in [
+                        "ncclCommWindowRegister",
+                        "ncclCommWindowDeregister",
+                    ]:
+                        if envs.VLLM_USE_NCCL_SYMM_MEM:
+                            logger.warning_once(
+                                "The symbol %s is not found in the NCCL "
+                                "library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
+                                " please update your NCCL version to >= "
+                                "2.27.03.",
+                                func.name,
+                                so_file,
+                            )
+                        if current_platform.is_rocm():
+                            # Having an exception here on ROCm platform is
+                            # not allowed during graph capturing
+                            continue
+                    raise
+            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
+        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]
+
+    def ncclGetErrorString(self, result: ncclResult_t) -> str:
+        return self._funcs["ncclGetErrorString"](result).decode("utf-8")
+
+    def NCCL_CHECK(self, result: ncclResult_t) -> None:
+        if result != 0:
+            error_str = self.ncclGetErrorString(result)
+            raise RuntimeError(f"NCCL error: {error_str}")
+
+    def ncclGetRawVersion(self) -> int:
+        version = ctypes.c_int()
+        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
+        # something like 21903
+        return version.value
+
+    def ncclGetVersion(self) -> str:
+        version_str = str(self.ncclGetRawVersion())
+        # something like 21903 --> "2.19.3"
+        major = version_str[0].lstrip("0")
+        minor = version_str[1:3].lstrip("0")
+        patch = version_str[3:].lstrip("0")
+        return f"{major}.{minor}.{patch}"
+
+    def ncclGetUniqueId(self) -> ncclUniqueId:
+        unique_id = ncclUniqueId()
+        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
+        return unique_id
+
+    def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
+        if len(data) != 128:
+            raise ValueError(
+                f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes"
+            )
+        unique_id = ncclUniqueId()
+        ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
+        return unique_id
+
+    def ncclCommInitRank(
+        self, world_size: int, unique_id: ncclUniqueId, rank: int
+    ) -> ncclComm_t:
+        comm = ncclComm_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommInitRank"](
+                ctypes.byref(comm), world_size, unique_id, rank
+            )
+        )
+        return comm
+
+    def ncclAllReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllReduce"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduce"](
+                sendbuff, recvbuff, count, datatype, op, root, comm, stream
+            )
+        )
+
+    def ncclReduceScatter(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduceScatter"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclAllGather(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # which is an aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllGather"](
+                sendbuff, recvbuff, count, datatype, comm, stream
+            )
+        )
+
+    def ncclSend(
+        self,
+        sendbuff: buffer_type,
+        count: int,
+        datatype: int,
+        dest: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
+        )
+
+    def ncclRecv(
+        self,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        src: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
+        )
+
+    def ncclBroadcast(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclBroadcast"](
+                sendbuff, recvbuff, count, datatype, root, comm, stream
+            )
+        )
+
+    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
+
+    def ncclGroupStart(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupStart"]())
+
+    def ncclGroupEnd(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())
+
+    def ncclCommWindowRegister(
+        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
+    ) -> ncclWindow_t:
+        window = ncclWindow_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommWindowRegister"](
+                comm, buff, size, ctypes.byref(window), win_flags
+            )
+        )
+        return window
+
+    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
+
+
+__all__ = [
+    "NCCLLibrary",
+    "ncclDataTypeEnum",
+    "ncclRedOpTypeEnum",
+    "ncclUniqueId",
+    "ncclComm_t",
+    "cudaStream_t",
+    "buffer_type",
+]
--- a/vllm/distributed/device_communicators/quick_all_reduce.py
+++ b/vllm/distributed/device_communicators/quick_all_reduce.py
@@ -0,0 +1,290 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from enum import Enum
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+import vllm.envs as envs
+from vllm import _custom_ops as ops
+from vllm.config import get_current_vllm_config
+from vllm.distributed.parallel_state import in_the_same_node_as
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import cuda_device_count_stateless
+
+logger = init_logger(__name__)
+
+try:
+    ops.qr_max_size()
+    quick_ar = True
+except Exception:
+    # For CPUs and CUDA
+    quick_ar = False
+
+
+def is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+class QuickReduceRegime(Enum):
+    FP = 0
+    INT8 = 1
+    INT6 = 2
+    INT4 = 3
+    NONE = 4
+
+
+MB = 1024 * 1024
+
+
+class QuickAllReduce:
+    _SUPPORTED_WORLD_SIZES = [2, 4, 8]
+    _SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
+    # The following data is based on kernel tests.
+    # In this order [FP, INT8, INT6, INT4].
+    _QR_MIN_SIZE = {
+        (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB],
+        (torch.float16, 4): [1 * MB, 16 * MB, 4 * MB, 2 * MB],
+        (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB],
+        (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB],
+        (torch.bfloat16, 4): [8 * MB, 64 * MB, 64 * MB, 16 * MB],
+        (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
+    }
+
+    def __init__(self, group: ProcessGroup, device: int | str | torch.device) -> None:
+        """
+        Custom allreduce provides non-destructive acceleration and is
+        available for CUDA and ROCm MI300 series.
+
+        Custom quick allreduce leverages quantization for further
+        acceleration on ROCm. It currently supports Q8, Q6, and Q4
+        quantization formats and FP(float16, bfloat16).
+
+        Quick allreduce is designed as a complement to custom allreduce.
+        Its initialization requires even stricter conditions.
+
+        Only the ROCm MI300 series is supported for quick allreduce at
+        this time.
+
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bound to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self.disabled = True
+        if not self._rocm_arch_available():
+            logger.debug(
+                "Custom quick allreduce is only supported on ROCm MI300 series."
+            )
+            return
+
+        if not quick_ar:
+            # disable because of missing quick reduce library
+            # e.g. in a cuda environment
+            logger.info(
+                "Custom quick allreduce is disabled because "
+                "of missing custom quick allreduce library"
+            )
+            return
+
+        self.group = group
+        assert dist.get_backend(group) != dist.Backend.NCCL, (
+            "Custom quick allreduce should be attached to a non-NCCL group."
+        )
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom quick allreduce for
+            # multi-node case.
+            logger.warning(
+                "Custom quick allreduce is disabled because this "
+                "process group spans across nodes."
+            )
+            return
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        self.rank = rank
+        self.world_size = world_size
+        if world_size == 1:
+            # No need to initialize QuickReduce for single GPU case.
+            return
+
+        if world_size not in QuickAllReduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom quick allreduce is disabled due to an "
+                "unsupported world size: %d. Supported world sizes: %s.",
+                world_size,
+                str(QuickAllReduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(cuda_device_count_stateless()))
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu")
+            for _ in range(self.world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom quick allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        assert current_platform.is_cuda_alike()
+        self.fully_connected = current_platform.is_fully_connected(physical_device_ids)
+        if self.world_size > 2 and not self.fully_connected:
+            logger.debug(
+                "Custom quick allreduce is disabled because it's not supported "
+                "on more than two PCIe-only GPUs. "
+            )
+            return
+
+        self.init_quick_all_reduce()
+
+    def init_quick_all_reduce(self):
+        # On RocM, bfloat16 kernels are slower than fp16
+        # due to slower match operations
+        # If environment variable is set to 1, we convert input to fp16
+        self.use_fp16_kernels = envs.VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16
+        regime_str = envs.VLLM_ROCM_QUICK_REDUCE_QUANTIZATION
+        if regime_str not in QuickReduceRegime.__members__:
+            logger.warning(
+                "Custom quick allreduce:",
+                f"Invalid quantization level: {regime_str}. "
+                "Supported levels: "
+                f"{list(QuickReduceRegime.__members__.keys())}",
+            )
+            return
+
+        if regime_str == "NONE":
+            logger.debug(
+                "Custom quick allreduce is disabled based "
+                "on env variable "
+                "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION='NONE'"
+            )
+            return
+        self.qr_quant_level = QuickReduceRegime[regime_str]
+        vllm_config = get_current_vllm_config()
+        if (
+            vllm_config is not None
+            and hasattr(vllm_config, "model_config")
+            and hasattr(vllm_config.model_config, "dtype")
+        ):
+            dtype = vllm_config.model_config.dtype
+            if dtype not in [torch.float16, torch.bfloat16]:
+                logger.debug(
+                    "Custom quick allreduce disabled: only supports "
+                    "float16 and float16, but get %s.",
+                    dtype,
+                )
+                return
+
+            if dtype == torch.bfloat16 and self.use_fp16_kernels:
+                logger.info(
+                    "Custom quick allreduce: BF16 inputs will be converted "
+                    "to FP16 to improve performance. set "
+                    "envs.VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0 "
+                    "to turn off."
+                )
+
+        # VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB is specified in MB
+        qr_max_size = envs.VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB
+        if qr_max_size is not None:
+            if qr_max_size < 1:
+                logger.info(
+                    "You should not set a max_size smaller than 1MB, which can "
+                    "lead to error or degradation to custom allreduce or rccl."
+                )
+            qr_max_size = qr_max_size * MB
+        self._ptr = ops.init_custom_qr(self.rank, self.world_size, qr_max_size)
+        self.qr_max_size = qr_max_size if qr_max_size is not None else ops.qr_max_size()
+        self.create_shared_buffer()
+        self.disabled = False
+
+    def _rocm_arch_available(self):
+        if not current_platform.is_rocm():
+            return False
+        try:
+            props = torch.cuda.get_device_properties(0)
+            gcn_arch = getattr(props, "gcnArchName", "")
+            supported_archs = ["gfx94", "gfx95"]
+            return any(gfx in gcn_arch for gfx in supported_archs)
+        except Exception as e:
+            logger.warning("Failed to determine ROCm for quick allreduce: %s", e)
+            return False
+
+    def create_shared_buffer(self):
+        """
+        Creates a shared buffer for quickreduce.
+        Has to be called after init_custom_qr
+        """
+        handle = ops.qr_get_handle(self._ptr)
+        world_size = dist.get_world_size(group=self.group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=self.group)
+        ops.qr_open_handles(self._ptr, handles)
+
+    def should_quick_allreduce(self, inp: torch.Tensor):
+        """
+        Check if quickreduce is available
+        """
+        if self.disabled:
+            return False
+        if inp.dtype not in self._SUPPORTED_DTYPES:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom quick allreduce requires input byte size to be
+        # multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        dtype = inp.dtype
+        if self.use_fp16_kernels:
+            dtype = torch.float16
+        return (
+            inp_size <= self.qr_max_size
+            and inp_size
+            >= self._QR_MIN_SIZE[(dtype, self.world_size)][self.qr_quant_level.value]
+        )
+
+    def quick_all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None):
+        """Performs an out-of-place custom quick all reduce."""
+        # quick allreduce doesn't require a separate graph mode,
+        # as QR uses static IPC buffer.
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.qr_all_reduce(
+            self._ptr, inp, out, self.qr_quant_level.value, self.use_fp16_kernels
+        )
+        return out
+
+    def close(self):
+        if not self.disabled and getattr(self, "_ptr", None):
+            if ops is not None:
+                ops.qr_destroy(self._ptr)
+            self._ptr = 0
+            self.disabled = True
+
+    def __del__(self):
+        self.close()
--- a/vllm/distributed/device_communicators/ray_communicator.py
+++ b/vllm/distributed/device_communicators/ray_communicator.py
@@ -0,0 +1,259 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import uuid
+from typing import Any
+
+import ray
+import torch
+from ray.exceptions import RayChannelError
+from ray.experimental.channel.communicator import Communicator, TorchTensorAllocator
+from torch.distributed import ReduceOp
+
+from vllm.distributed.device_communicators.base_device_communicator import (
+    DeviceCommunicatorBase,
+)
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.logger import init_logger
+from vllm.utils.torch_utils import current_stream
+
+logger = init_logger(__name__)
+
+
+class RayPPCommunicator(Communicator):
+    """
+    Communicator to be used for pipeline parallelism in Ray Compiled Graph.
+    This is wraps around the vLLM _PP GroupCoordinator.
+
+    This class is not thread-safe.
+    """
+
+    _comm: DeviceCommunicatorBase | None
+
+    def __init__(
+        self,
+        world_size: int,
+        comm_id: Any,
+        rank: int | None,
+        actor_handles: list["ray.actor.ActorHandle"],
+        cuda_stream: torch.cuda.Stream | None,
+        use_communication_streams: bool = False,
+    ):
+        """
+        Initialize a RayPPCommunicator that can be used to communicate with
+        other Ray Compiled Graph actors for pipeline parallelism.
+
+        Args:
+            world_size: The number of participating actors.
+            comm_id: A unique communicator ID. This is just to conform with
+                the Ray Communicator API and is not used.
+            rank: The rank of this actor. If None, then the caller is not a
+                participant of the RayPPCommunicator group (e.g., the Ray
+                driver).
+            actor_handles: A list of actor handles.
+            cuda_stream: A CUDA stream to dispatch communication ops to. This
+                is not supported.
+            use_communication_streams: Whether to use communication streams.
+                This is not supported.
+        """
+        self._world_size = world_size
+        self._rank: int | None = None
+        self._actor_handles = actor_handles
+        if use_communication_streams:
+            raise NotImplementedError("use_communication_streams is not supported")
+        if cuda_stream is not None and cuda_stream != current_stream():
+            raise ValueError(
+                "cuda_stream other than the current stream is not supported"
+            )
+
+        if rank is not None:
+            # Rank is not None, this is Ray worker
+            assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned"
+
+            self._comm = get_pp_group().device_communicator
+            assert self._comm is not None
+
+            # Since we wrap around the vLLM _PP communicator, we use
+            # the rank from the vLLM communicator, and ignore the rank
+            # passed in from Ray.
+            # TODO(rui): refactor the Ray Communicator API so that
+            # it also supports no rank passed in.
+            self._rank = self._comm.rank_in_group
+
+            self._build_actor_rank_mapping()
+        else:
+            # Rank is None, this is Ray driver
+            self._comm = None
+
+        self._closed = False
+
+    def _build_actor_rank_mapping(self):
+        """
+        Use collective communication to build a mapping from actor IDs to ranks.
+        This should be called once during initialization.
+        """
+        if self._comm is None:
+            return {}
+
+        current_actor = ray.get_runtime_context().current_actor
+        actor_id_str = current_actor._actor_id.hex()
+
+        # Ray actor IDs are 32-character hex strings (128 bits)
+        ACTOR_ID_LEN = 32
+        actor_id_bytes = bytearray(actor_id_str.encode("utf-8"))
+        assert len(actor_id_bytes) == ACTOR_ID_LEN, (
+            f"Unexpected actor ID length: {len(actor_id_bytes)}"
+        )
+
+        actor_id_tensor = torch.frombuffer(actor_id_bytes, dtype=torch.uint8).to(
+            self._comm.device
+        )
+
+        # All-gather full actor IDs from all actors
+        gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0)
+
+        # Build mapping: actor_id -> device_comm_rank
+        self._actor_id_to_rank = {}
+        for rank in range(self._world_size):
+            start_idx = rank * ACTOR_ID_LEN
+            end_idx = (rank + 1) * ACTOR_ID_LEN
+            actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy().tobytes()
+            actor_id = actor_bytes.decode("utf-8")
+            self._actor_id_to_rank[actor_id] = rank
+
+    def initialize(self, rank: int) -> None:
+        # No additional initialization is needed.
+        pass
+
+    def get_actor_handles(self) -> list["ray.actor.ActorHandle"]:
+        return self._actor_handles
+
+    def get_rank(self, actor: ray.actor.ActorHandle) -> int:
+        """
+        Return the given actor's rank using device communicator collective ops.
+        """
+        assert hasattr(self, "_actor_id_to_rank"), (
+            "Actor rank mapping not built. "
+            "This should have been done during initialization."
+        )
+
+        actor_id_str = actor._actor_id.hex()
+
+        if actor_id_str in self._actor_id_to_rank:
+            return self._actor_id_to_rank[actor_id_str]  # type: ignore
+        else:
+            raise ValueError(f"Actor {actor} not found in communicator group")
+
+    def get_self_rank(self) -> int | None:
+        """
+        Return this actor's rank.
+        """
+        return self._rank
+
+    def get_world_size(self) -> int:
+        """
+        Return the number of ranks in the RayPPCommunicator group.
+        """
+        return self._world_size
+
+    def send(self, buf: "torch.Tensor", peer_rank: int) -> None:
+        """
+        Send a torch.Tensor to a peer.
+
+        This returns when the send kernel has been queued, but the kernel may
+        not have completed. Therefore, the caller should ensure that there are
+        no concurrent writes to the sent `buf` until the send has finished.
+        That is, either all writes should be submitted on the current stream
+        (self._cuda_stream) or, if on a different stream, that stream should
+        synchronize with the current stream.
+
+        Args:
+            buf: The torch.Tensor to send. It should already be on this
+                actor's default device.
+            peer_rank: The rank of the actor to send to.
+        """
+        if self._closed:
+            raise RayChannelError("RayPPCommunicator has been destroyed.")
+
+        assert self._comm is not None
+        self._comm.send(buf, peer_rank)
+
+    def recv(
+        self,
+        shape: tuple[int, ...],
+        dtype: "torch.dtype",
+        peer_rank: int,
+        allocator: TorchTensorAllocator,
+    ) -> "torch.Tensor":
+        """
+        Receive a torch.Tensor from a peer and synchronize the current stream.
+
+        After this call returns, the receive buffer is safe to read from
+        any stream. An RayChannelError will be raised if an error occurred
+        (e.g., remote actor died), and the buffer is not safe to read.
+
+        Args:
+            shape: The shape of the tensor to receive.
+            dtype: The dtype of the tensor to receive.
+            peer_rank: The rank of the actor to receive from.
+            allocator: The allocator to use to create the received tensor.
+                This is ignored for this implementation.
+        """
+        if self._closed:
+            raise RayChannelError("RayPPCommunicator has been destroyed.")
+
+        assert self._comm is not None
+        size = torch.Size(shape)
+        buf = self._comm.recv(size, dtype, src=peer_rank)
+
+        # Buffer values are undefined if NCCL ops are aborted. Therefore, we
+        # need to synchronize here and check that the channel is still
+        # open to ensure that the receive buffer is valid.
+        # TODO(swang): Avoid CUDA synchronization.
+        current_stream().synchronize()
+
+        if self._closed:
+            raise RayChannelError("RayPPCommunicator has been destroyed.")
+        return buf
+
+    def allgather(
+        self,
+        send_buf: "torch.Tensor",
+        recv_buf: "torch.Tensor",
+    ):
+        raise NotImplementedError("allgather is not supported")
+
+    def allreduce(
+        self,
+        send_buf: "torch.Tensor",
+        recv_buf: "torch.Tensor",
+        op: ReduceOp = ReduceOp.SUM,
+    ):
+        raise NotImplementedError("allreduce is not supported")
+
+    def reducescatter(
+        self,
+        send_buf: "torch.Tensor",
+        recv_buf: "torch.Tensor",
+        op: ReduceOp = ReduceOp.SUM,
+    ):
+        raise NotImplementedError("reducescatter is not supported")
+
+    @property
+    def recv_stream(self):
+        return torch.cuda.StreamContext(current_stream())
+
+    @property
+    def send_stream(self):
+        return torch.cuda.StreamContext(current_stream())
+
+    def destroy(self) -> None:
+        # Just sets a flag, vLLM manages the lifecycle of the underlying
+        # _PP GroupCoordinator.
+        self._closed = True
+
+    def get_transport_name(self) -> str:
+        return "nccl"
+
+    @classmethod
+    def generate_communicator_id(cls) -> Any:
+        return uuid.uuid4()
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -0,0 +1,778 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+import pickle
+import threading
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from multiprocessing import shared_memory
+from pickle import PickleBuffer
+from threading import Event
+from typing import TYPE_CHECKING, Any, cast
+from unittest.mock import patch
+
+import torch
+import torch.distributed as dist
+import zmq
+from torch.distributed import ProcessGroup
+from zmq import (  # type: ignore
+    IPV6,  # type: ignore
+    SUB,
+    SUBSCRIBE,
+    XPUB,
+    XPUB_VERBOSE,
+    Context,
+)
+
+import vllm.envs as envs
+from vllm.distributed.utils import StatelessProcessGroup, sched_yield
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils.network_utils import (
+    get_ip,
+    get_open_port,
+    get_open_zmq_ipc_path,
+    is_valid_ipv6_address,
+)
+
+if TYPE_CHECKING:
+    from _typeshed import SizedBuffer
+
+VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
+
+from_bytes_big = functools.partial(int.from_bytes, byteorder="big")
+
+
+# Memory fence for cross-process shared memory visibility.
+# Required for correct producer-consumer synchronization when using
+# shared memory without locks.
+_memory_fence_lock = threading.Lock()
+
+
+def memory_fence():
+    """
+    Full memory barrier for shared memory synchronization.
+
+    Ensures all prior memory writes are visible to other processes before
+    any subsequent reads. This is critical for lock-free producer-consumer
+    patterns using shared memory.
+
+    Implementation acquires and immediately releases a lock. Python's
+    threading.Lock provides sequentially consistent memory barrier semantics
+    across all major platforms (POSIX, Windows). This is a lightweight
+    operation (~20ns) that guarantees:
+    - All stores before the barrier are visible to other threads/processes
+    - All loads after the barrier see the latest values
+    """
+    # Lock acquire/release provides full memory barrier semantics.
+    # Using context manager ensures lock release even on exceptions.
+    with _memory_fence_lock:
+        pass
+
+
+def to_bytes_big(value: int, size: int) -> bytes:
+    return value.to_bytes(size, byteorder="big")
+
+
+logger = init_logger(__name__)
+
+
+def long_wait_time_msg(threshold: int) -> str:
+    return (
+        "No available shared memory broadcast block found "
+        f"in {threshold} seconds. This typically happens "
+        "when some processes are hanging or doing some "
+        "time-consuming work (e.g. compilation, "
+        "weight/kv cache quantization)."
+    )
+
+
+class SpinTimer:
+    def record_activity(self):
+        pass
+
+    def spin(self):
+        sched_yield()
+
+
+class SpinSleepTimer(SpinTimer):
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when vllm does nothing. This would lead to more
+    CPU thermal headroom when a request eventually comes, especially when
+    multiple GPUs are connected as each GPU would otherwise pin one thread at
+    100% CPU usage.
+
+    The simplest solution is to reduce polling frequency when there is no
+    activity for a certain period of time.
+    """
+
+    def __init__(self, busy_loop_s: float = 3.0, wait_sleep_s: float = 0.1):
+        self.last_activity = time.monotonic()
+        self.busy_loop_s = busy_loop_s
+        self.wait_sleep_s = wait_sleep_s
+
+    def record_activity(self):
+        self.last_activity = time.monotonic()
+
+    def spin(self):
+        curr_time = time.monotonic()
+        if curr_time >= self.last_activity + self.busy_loop_s:
+            time.sleep(self.wait_sleep_s)
+        else:
+            sched_yield()
+
+
+class ShmRingBuffer:
+    def __init__(
+        self,
+        n_reader: int,
+        max_chunk_bytes: int,
+        max_chunks: int,
+        name: str | None = None,
+    ):
+        """
+        A shared memory ring buffer implementation for broadcast communication.
+        Essentially, it is a queue where only one will `enqueue` and multiple
+        will `dequeue`. The max size of each item, together with the max number
+        of items that can be stored in the buffer are known in advance.
+        In this case, we don't need to synchronize the access to
+         the buffer.
+
+        Buffer memory layout:
+                  data                                 metadata
+                    |                                      |
+                    | (current_idx)                        | (current_idx)
+                    v                                      v
+        +-------------------------------+----------------------------------------+
+        | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata |
+        +-------------------------------+----------------------------------------+
+        | max_chunks x max_chunk_bytes  | max_chunks x (1 + n_reader) bytes      |
+
+        metadata memory layout: each byte is a flag, the first byte is the written
+        flag, and the rest are reader flags. The flags are set to 0 by default.
+        +--------------+--------------+--------------+-----+--------------+
+        | written_flag | reader0_flag | reader1_flag | ... | readerN_flag |
+        +--------------+--------------+--------------+-----+--------------+
+
+        The state of metadata is as follows:
+
+        (case 1) 0???...???: the block is not written yet, cannot read, can write
+        (case 2) 1000...000: the block is just written, can read, cannot write
+        (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write
+        (case 4) 1111...111: the block is written and read by all readers, cannot read, can write
+
+        State transition for readers:
+
+        When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read.
+        Only after the caller finishes reading the block, the reader can mark the block as read.
+        Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0).
+
+        State transition for writer:
+
+        When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case
+        to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer
+        can reset the reader flags to 0, and mark the block as written (from 0 to 1).
+        NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct.
+
+        During creation, `name` is None and the buffer is created. We can pass the
+        created object to other processes by pickling it. The other processes will
+        get the name of the shared memory and open it, so that they can access the
+        same shared memory buffer.
+        """  # noqa
+        self.n_reader = n_reader
+        self.metadata_size = 1 + n_reader
+        self.max_chunk_bytes = max_chunk_bytes
+        self.max_chunks = max_chunks
+        self.total_bytes_of_buffer = (
+            self.max_chunk_bytes + self.metadata_size
+        ) * self.max_chunks
+        self.data_offset = 0
+        self.metadata_offset = self.max_chunk_bytes * self.max_chunks
+
+        if name is None:
+            # we are creating a buffer
+            self.is_creator = True
+            self.shared_memory = shared_memory.SharedMemory(
+                create=True, size=self.total_bytes_of_buffer
+            )
+            # initialize the metadata section to 0
+            with self.shared_memory.buf[self.metadata_offset :] as metadata_buffer:
+                torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0)
+        else:
+            # we are opening an existing buffer
+            self.is_creator = False
+            # fix to https://stackoverflow.com/q/62748654/9191338
+            # Python incorrectly tracks shared memory even if it is not
+            # created by the process. The following patch is a workaround.
+            with patch(
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
+            ):
+                try:
+                    self.shared_memory = shared_memory.SharedMemory(name=name)
+                    # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa
+                    # Some platforms allocate memory based on page size,
+                    # so the shared memory block size may be larger or equal
+                    # to the requested size. The size parameter is ignored
+                    # when attaching to an existing block.
+                    assert self.shared_memory.size >= self.total_bytes_of_buffer
+                except FileNotFoundError:
+                    # we might deserialize the object in a different node
+                    # in this case, this object is not used,
+                    # and we should suppress the error
+                    pass
+
+    def handle(self):
+        return (
+            self.n_reader,
+            self.max_chunk_bytes,
+            self.max_chunks,
+            self.shared_memory.name,
+        )
+
+    def __reduce__(self):
+        return (
+            self.__class__,
+            self.handle(),
+        )
+
+    def __del__(self):
+        if hasattr(self, "shared_memory"):
+            self.shared_memory.close()
+            if self.is_creator:
+                self.shared_memory.unlink()
+
+    @contextmanager
+    def get_data(self, current_idx: int):
+        start = self.data_offset + current_idx * self.max_chunk_bytes
+        end = start + self.max_chunk_bytes
+        with self.shared_memory.buf[start:end] as buf:
+            yield buf
+
+    @contextmanager
+    def get_metadata(self, current_idx: int):
+        start = self.metadata_offset + current_idx * self.metadata_size
+        end = start + self.metadata_size
+        with self.shared_memory.buf[start:end] as buf:
+            yield buf
+
+
+@dataclass
+class Handle:
+    local_reader_ranks: list[int] = field(default_factory=list)
+
+    buffer_handle: tuple[int, int, int, str] | None = None
+    local_subscribe_addr: str | None = None
+    remote_subscribe_addr: str | None = None
+    remote_addr_ipv6: bool = False
+
+
+class MessageQueue:
+    def __init__(
+        self,
+        n_reader,  # number of all readers
+        n_local_reader,  # number of local readers through shared memory
+        local_reader_ranks: list[int] | None = None,
+        # Default of 24MiB chosen to be large enough to accommodate grammar
+        # bitmask tensors for large batches (1024 requests).
+        max_chunk_bytes: int = 1024 * 1024 * 24,
+        max_chunks: int = 10,
+        connect_ip: str | None = None,
+    ):
+        if local_reader_ranks is None:
+            local_reader_ranks = list(range(n_local_reader))
+        else:
+            assert len(local_reader_ranks) == n_local_reader
+        self.n_local_reader = n_local_reader
+        n_remote_reader = n_reader - n_local_reader
+        self.n_remote_reader = n_remote_reader
+
+        context = Context()
+
+        if n_local_reader > 0:
+            # for local readers, we will:
+            # 1. create a shared memory ring buffer to communicate small data
+            # 2. create a publish-subscribe socket to communicate large data
+            self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes, max_chunks)
+
+            # XPUB is very similar to PUB,
+            # except that it can receive subscription messages
+            # to confirm the number of subscribers
+            self.local_socket = context.socket(XPUB)
+            # set the verbose option so that we can receive every subscription
+            # message. otherwise, we will only receive the first subscription
+            # see http://api.zeromq.org/3-3:zmq-setsockopt for more details
+            self.local_socket.setsockopt(XPUB_VERBOSE, True)
+            local_subscribe_addr = get_open_zmq_ipc_path()
+            logger.debug("Binding to %s", local_subscribe_addr)
+            self.local_socket.bind(local_subscribe_addr)
+
+            self.current_idx = 0
+        else:
+            self.buffer = None  # type: ignore
+            local_subscribe_addr = None
+            self.local_socket = None
+            self.current_idx = -1
+
+        remote_addr_ipv6 = False
+        if n_remote_reader > 0:
+            # for remote readers, we will:
+            # create a publish-subscribe socket to communicate large data
+            if not connect_ip:
+                connect_ip = get_ip()
+            self.remote_socket = context.socket(XPUB)
+            self.remote_socket.setsockopt(XPUB_VERBOSE, True)
+            remote_subscribe_port = get_open_port()
+            if is_valid_ipv6_address(connect_ip):
+                self.remote_socket.setsockopt(IPV6, 1)
+                remote_addr_ipv6 = True
+                connect_ip = f"[{connect_ip}]"
+            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
+            self.remote_socket.bind(socket_addr)
+            remote_subscribe_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
+        else:
+            remote_subscribe_addr = None
+            self.remote_socket = None
+
+        self._is_writer = True
+        self._is_local_reader = False
+        self.local_reader_rank = -1
+        # rank does not matter for remote readers
+        self._is_remote_reader = False
+        self._read_spin_timer = SpinTimer()
+
+        self.handle = Handle(
+            local_reader_ranks=local_reader_ranks,
+            buffer_handle=self.buffer.handle() if self.buffer is not None else None,
+            local_subscribe_addr=local_subscribe_addr,
+            remote_subscribe_addr=remote_subscribe_addr,
+            remote_addr_ipv6=remote_addr_ipv6,
+        )
+
+        logger.debug("vLLM message queue communication handle: %s", self.handle)
+
+    def export_handle(self) -> Handle:
+        return self.handle
+
+    @staticmethod
+    def create_from_handle(handle: Handle, rank) -> "MessageQueue":
+        self = MessageQueue.__new__(MessageQueue)
+        self.handle = handle
+        self._is_writer = False
+
+        context = Context()
+
+        if rank in handle.local_reader_ranks:
+            assert handle.buffer_handle is not None
+            self.buffer = ShmRingBuffer(*handle.buffer_handle)
+            self.current_idx = 0
+            self.local_reader_rank = handle.local_reader_ranks.index(rank)
+            self._is_local_reader = True
+            self._is_remote_reader = False
+
+            self.local_socket = context.socket(SUB)
+            self.local_socket.setsockopt_string(SUBSCRIBE, "")
+            socket_addr = handle.local_subscribe_addr
+            logger.debug("Connecting to %s", socket_addr)
+            self.local_socket.connect(socket_addr)
+
+            self.remote_socket = None
+
+            self._read_spin_timer = (
+                SpinSleepTimer() if envs.VLLM_SLEEP_WHEN_IDLE else SpinTimer()
+            )
+        else:
+            self.buffer = None  # type: ignore
+            self.current_idx = -1
+            self.local_reader_rank = -1
+            self._is_local_reader = False
+            self._is_remote_reader = True
+
+            self.local_socket = None
+
+            self.remote_socket = context.socket(SUB)
+            self.remote_socket.setsockopt_string(SUBSCRIBE, "")
+            if handle.remote_addr_ipv6:
+                self.remote_socket.setsockopt(IPV6, 1)
+            socket_addr = handle.remote_subscribe_addr
+            logger.debug("Connecting to %s", socket_addr)
+            self.remote_socket.connect(socket_addr)
+
+        return self
+
+    def wait_until_ready(self):
+        """This is a collective operation. All processes (including the
+        readers and the writer) should call this function.
+        """
+        if self._is_writer:
+            # wait for all readers to connect
+
+            # local readers
+            for i in range(self.n_local_reader):
+                # wait for subscription messages from all local readers
+                self.local_socket.recv()
+            if self.n_local_reader > 0:
+                # send a message to all local readers
+                # to make sure the publish channel is working
+                self.local_socket.send(b"READY")
+
+            # remote readers
+            for i in range(self.n_remote_reader):
+                # wait for subscription messages from all remote readers
+                self.remote_socket.recv()
+            if self.n_remote_reader > 0:
+                # send a message to all remote readers
+                # to make sure the publish channel is working
+                self.remote_socket.send(b"READY")
+        elif self._is_local_reader:
+            # wait for the writer to send a message
+            recv = self.local_socket.recv()
+            assert recv == b"READY"
+        elif self._is_remote_reader:
+            # wait for the writer to send a message
+            recv = self.remote_socket.recv()
+            assert recv == b"READY"
+
+    @contextmanager
+    def acquire_write(self, timeout: float | None = None):
+        assert self._is_writer, "Only writers can acquire write"
+        start_time = time.monotonic()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                # Memory fence ensures we see the latest read flags from readers.
+                # Without this, we may read stale flags from our CPU cache and
+                # spin indefinitely even though readers have completed.
+                memory_fence()
+                read_count = sum(metadata_buffer[1:])
+                written_flag = metadata_buffer[0]
+                if written_flag and read_count != self.buffer.n_reader:
+                    # this block is written and not read by all readers
+                    # for writers, `self.current_idx` is the next block to write
+                    # if this block is not ready to write,
+                    # we need to wait until it is read by all readers
+
+                    # Release the processor to other threads
+                    sched_yield()
+
+                    # if we time out, raise an exception
+                    elapsed = time.monotonic() - start_time
+                    if timeout is not None and elapsed > timeout:
+                        raise TimeoutError
+
+                    # if we wait for a long time, log a message
+                    if elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:
+                        logger.info(
+                            long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
+                        )
+                        n_warning += 1
+
+                    continue
+                # found a block that is either
+                # (1) not written
+                # (2) read by all readers
+
+                # mark the block as not written
+                metadata_buffer[0] = 0
+                # let caller write to the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has written to the buffer
+                # NOTE: order is important here
+                # first set the read flags to 0
+                # then set the written flag to 1
+                # otherwise, the readers may think they already read the block
+                for i in range(1, self.buffer.n_reader + 1):
+                    # set read flag to 0, meaning it is not read yet
+                    metadata_buffer[i] = 0
+                # mark the block as written
+                metadata_buffer[0] = 1
+                # Memory fence ensures the write is visible to readers on other cores
+                # before we proceed. Without this, readers may spin indefinitely
+                # waiting for a write that's stuck in our CPU's store buffer.
+                memory_fence()
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
+                break
+
+    @contextmanager
+    def acquire_read(
+        self,
+        timeout: float | None = None,
+        cancel: Event | None = None,
+        indefinite: bool = False,
+    ):
+        assert self._is_local_reader, "Only readers can acquire read"
+        start_time = time.monotonic()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                # Memory fence ensures we see the latest writes from the writer.
+                # Without this, we may read stale flags from our CPU cache
+                # and spin indefinitely even though writer has updated them.
+                memory_fence()
+                read_flag = metadata_buffer[self.local_reader_rank + 1]
+                written_flag = metadata_buffer[0]
+                if not written_flag or read_flag:
+                    # this block is either
+                    # (1) not written
+                    # (2) already read by this reader
+
+                    # for readers, `self.current_idx` is the next block to read
+                    # if this block is not ready,
+                    # we need to wait until it is written
+
+                    # Release the processor to other threads
+                    self._read_spin_timer.spin()
+
+                    if cancel is not None and cancel.is_set():
+                        raise RuntimeError("cancelled")
+
+                    # if we time out, raise an exception
+                    elapsed = time.monotonic() - start_time
+                    if timeout is not None and elapsed > timeout:
+                        raise TimeoutError
+
+                    # if we wait for a long time, log a message
+                    if not indefinite and (
+                        elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning
+                    ):
+                        logger.info(
+                            long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
+                        )
+                        n_warning += 1
+
+                    continue
+                # found a block that is not read by this reader
+                # let caller read from the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has read from the buffer
+                # set the read flag
+                metadata_buffer[self.local_reader_rank + 1] = 1
+                # Memory fence ensures the read flag is visible to the writer.
+                # Without this, writer may not see our read completion and
+                # could wait indefinitely for all readers to finish.
+                memory_fence()
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
+
+                self._read_spin_timer.record_activity()
+                break
+
+    def enqueue(self, obj, timeout: float | None = None):
+        """Write to message queue with optional timeout (in seconds)"""
+        assert self._is_writer, "Only writers can enqueue"
+        all_buffers: list[SizedBuffer] = [b""]
+        total_bytes = 6  # 2 bytes for oob buffer count, 4 for main buffer size
+
+        def oob_callback(buf: PickleBuffer) -> bool:
+            raw_buf = buf.raw()
+            if len(raw_buf) < 1024 * 1024:
+                # In-line buffers smaller than 1MiB.
+                return True
+            all_buffers.append(raw_buf)
+            nonlocal total_bytes
+            total_bytes += len(raw_buf) + 4
+            return False
+
+        all_buffers[0] = pickle.dumps(
+            obj, protocol=pickle.HIGHEST_PROTOCOL, buffer_callback=oob_callback
+        )
+        if self.n_local_reader > 0:
+            if total_bytes + len(all_buffers[0]) >= self.buffer.max_chunk_bytes:
+                with self.acquire_write(timeout) as buf:
+                    buf[0] = 1  # overflow
+                self.local_socket.send_multipart(all_buffers, copy=False)
+            else:
+                # Byte 0: 0
+                # Bytes 1-2: Count of buffers
+                # Then each buffer follows, preceded by 4 bytes containing its length:
+                # [4 byte int L][L bytes of buffer content] ...
+                with self.acquire_write(timeout) as buf:
+                    buf[0] = 0  # not overflow
+                    offset = 3
+                    buf[1:offset] = to_bytes_big(len(all_buffers), 2)  # oob buf count
+                    for buffer in all_buffers:
+                        buf_len = len(buffer)
+                        # prepend each buffer with 4 bytes containing its size.
+                        buf_offset = offset + 4
+                        buf[offset:buf_offset] = to_bytes_big(buf_len, 4)
+                        buf[buf_offset : (offset := buf_offset + buf_len)] = buffer
+
+        if self.n_remote_reader > 0:
+            self.remote_socket.send_multipart(all_buffers, copy=False)
+
+    def dequeue(
+        self,
+        timeout: float | None = None,
+        cancel: Event | None = None,
+        indefinite: bool = False,
+    ):
+        """Read from message queue with optional timeout (in seconds)"""
+        if self._is_local_reader:
+            with self.acquire_read(timeout, cancel, indefinite) as buf:
+                overflow = buf[0] == 1
+                if not overflow:
+                    offset = 3
+                    buf_count = from_bytes_big(buf[1:offset])
+                    all_buffers = []
+                    for i in range(buf_count):
+                        buf_offset = offset + 4
+                        buf_len = from_bytes_big(buf[offset:buf_offset])
+                        offset = buf_offset + buf_len
+                        all_buffers.append(buf[buf_offset:offset])
+                    obj = pickle.loads(all_buffers[0], buffers=all_buffers[1:])
+            if overflow:
+                obj = MessageQueue.recv(self.local_socket, timeout)
+        elif self._is_remote_reader:
+            obj = MessageQueue.recv(self.remote_socket, timeout)
+        else:
+            raise RuntimeError("Only readers can dequeue")
+        return obj
+
+    @staticmethod
+    def recv(socket: zmq.Socket, timeout: float | None) -> Any:
+        timeout_ms = None if timeout is None else int(timeout * 1000)
+        if not socket.poll(timeout=timeout_ms):
+            raise TimeoutError
+        recv, *recv_oob = socket.recv_multipart(copy=False)
+        return pickle.loads(recv, buffers=recv_oob)
+
+    def broadcast_object(self, obj=None):
+        if self._is_writer:
+            self.enqueue(obj)
+            return obj
+        return self.dequeue()
+
+    @staticmethod
+    def create_from_process_group_single_reader(
+        pg: ProcessGroup,
+        max_chunk_bytes,
+        max_chunks,
+        reader_rank: int = 0,
+        blocking: bool = False,
+    ) -> tuple["MessageQueue", list[Handle]]:
+        """
+        Creates a MessageQueue for a process group with a single reader.
+
+        This method is designed for scenarios where only one process (the reader)
+        will consume messages, and all other processes are writers. It sets up
+        the shared memory buffer and communication handles accordingly, and
+        gathers the handles from all processes to the reader.
+
+        Args:
+            pg (ProcessGroup): The torch distributed process group.
+            max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer.
+            max_chunks (int): Maximum number of chunks in the buffer.
+            reader_rank (int, optional): The global rank that will act as the reader.
+                Defaults to 0.
+            blocking (bool, optional): If True, blocks until all processes are ready.
+                Defaults to False.
+
+        Returns:
+            tuple[MessageQueue, list[Handle]]:
+            The MessageQueue instance for the calling process,
+            and a list of handles (only non-empty for the reader process).
+        """
+        local_size = current_platform.device_count()
+        rank = dist.get_rank()
+        same_node = rank // local_size == reader_rank // local_size
+        buffer_io = MessageQueue(
+            n_reader=1,
+            n_local_reader=1 if same_node else 0,
+            max_chunk_bytes=max_chunk_bytes,
+            max_chunks=max_chunks,
+        )
+        handle = buffer_io.export_handle()
+        handles = [None] * dist.get_world_size(pg) if rank == reader_rank else None
+        dist.gather_object(handle, handles, dst=reader_rank, group=pg)
+        if blocking:
+            buffer_io.wait_until_ready()
+        return buffer_io, cast(list[Handle], handles or [])
+
+    @staticmethod
+    def create_from_process_group(
+        pg: ProcessGroup | StatelessProcessGroup,
+        max_chunk_bytes,
+        max_chunks,
+        writer_rank: int = 0,
+        external_writer_handle=None,
+        blocking: bool = True,
+    ) -> "MessageQueue":
+        """
+        Creates a MessageQueue for a distributed process group with one writer and
+        multiple readers.
+
+        This method is designed for scenarios where one process (the writer) sends
+        messages, and all other processes (the readers) receive messages. It sets up
+        the shared memory buffer and socket communication handles accordingly, and
+        broadcasts the handle from the writer to all readers.
+
+        Args:
+            pg (ProcessGroup | StatelessProcessGroup): The torch distributed process
+                group.
+            max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer.
+            max_chunks (int): Maximum number of chunks in the buffer.
+            writer_rank (int, optional): The global rank that will act as the writer.
+                Defaults to 0.
+            external_writer_handle (Handle, optional): Used when there is a handle
+                from an external Message Queue. If provided, use this handle to init
+                PG writer message queue instead of creating a new one. Defaults to None.
+            blocking (bool, optional): If True, blocks until all processes are ready.
+                Defaults to True.
+
+        Returns:
+            MessageQueue: The MessageQueue instance for the calling process.
+
+        """
+        if isinstance(pg, ProcessGroup):
+            group_rank = dist.get_rank(pg)
+            group_world_size = dist.get_world_size(pg)
+            global_ranks = dist.get_process_group_ranks(pg)
+        else:
+            group_rank = pg.rank
+            group_world_size = pg.world_size
+            global_ranks = list(range(pg.world_size))
+        from vllm.distributed.parallel_state import in_the_same_node_as
+
+        status = in_the_same_node_as(pg, source_rank=writer_rank)
+        if group_rank == writer_rank:
+            if external_writer_handle is not None:
+                buffer_io = MessageQueue.create_from_handle(
+                    external_writer_handle, group_rank
+                )
+            else:
+                same_node_ranks = [i for i, s in enumerate(status) if s]
+                n_reader = group_world_size - 1
+                n_local_reader = len(same_node_ranks) - 1
+                local_reader_ranks = [i for i in same_node_ranks if i != writer_rank]
+                buffer_io = MessageQueue(
+                    n_reader=n_reader,
+                    n_local_reader=n_local_reader,
+                    local_reader_ranks=local_reader_ranks,
+                    max_chunk_bytes=max_chunk_bytes,
+                    max_chunks=max_chunks,
+                )
+            handle = buffer_io.export_handle()
+            if isinstance(pg, ProcessGroup):
+                dist.broadcast_object_list(
+                    [handle], src=global_ranks[writer_rank], group=pg
+                )
+            else:
+                pg.broadcast_obj(handle, writer_rank)
+        else:
+            if isinstance(pg, ProcessGroup):
+                recv = [None]
+                dist.broadcast_object_list(
+                    recv, src=global_ranks[writer_rank], group=pg
+                )
+                handle = recv[0]  # type: ignore
+            else:
+                handle = pg.broadcast_obj(None, writer_rank)
+            buffer_io = MessageQueue.create_from_handle(handle, group_rank)
+        if blocking:
+            buffer_io.wait_until_ready()
+        return buffer_io
--- a/vllm/distributed/device_communicators/shm_object_storage.py
+++ b/vllm/distributed/device_communicators/shm_object_storage.py
@@ -0,0 +1,697 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pickle
+from abc import ABC, abstractmethod
+from collections.abc import Callable, Iterable
+from contextlib import contextmanager
+from dataclasses import dataclass
+from itertools import chain
+from multiprocessing import shared_memory
+from multiprocessing.synchronize import Lock as LockType
+from typing import Any
+from unittest.mock import patch
+
+import torch
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class SingleWriterShmRingBuffer:
+    """
+    A single-writer, multiple-reader ring buffer implementation using shared
+    memory. This class provides a thread-safe ring buffer where one process
+    can write data while multiple processes/threads can read from it.
+
+    Architecture:
+    - Uses shared memory for cross-process communication
+    - Maintains metadata for each allocated buffer chunk in the writer process
+    - Supports custom "is_free_fn" functions to determine when buffers can be
+      reused
+    - Each buffer chunk contains: `[4-byte id][4-byte size][actual_data]`
+
+    Key Concepts:
+    - monotonic_id_start/end: Track the range of active buffer IDs
+    - data_buffer_start/end: Track the physical memory range in use
+    - Automatic wraparound when reaching buffer end
+    - Lazy garbage collection based on is_free_fn checks
+
+    Example Usage Scenarios:
+
+    Scenario 1: Simple Linear Allocation
+    ```
+    Buffer size: 100 bytes
+    Initial state: [................................................. ]
+                   ^start=end(0)
+
+    After allocating 20 bytes (id=0):
+    [id:0|size:20|data........][...................................]
+    ^start(0)                  ^end(28)
+
+    After allocating 30 bytes (id=1):
+    [id:0|size:20|data........][id:1|size:30|data..............][..]
+    ^start(0)                                                   ^end(66)
+    ```
+
+    Scenario 2: Memory Reclamation
+    ```
+    Before freeing (both buffers still in use):
+    [id:0|size:20|data........][id:1|size:30|data..............][..]
+    ^start(0)                                                   ^end(66)
+
+    After id:0 is marked free by readers:
+    [FREED.................... ][id:1|size:30|data..............][..]
+                                ^start(28)                       ^end(66)
+
+    After both are freed:
+    [FREED..............................................][..]
+                                                         ^start=end(66)
+    ```
+
+    Scenario 3: Wraparound Allocation (continuing from Scenario 2)
+    ```
+    Starting from after memory reclamation in Scenario 2:
+    [FREED..............................................][..]
+                                                         ^start=end(66)
+
+    Allocate 40 bytes (id=2) - only 34 bytes available at end, so wraparound:
+    [id:2|size:40|data........................][FREED.............][..]
+                                              ^end(148)            ^start(66)
+    ```
+
+    Scenario 4: Error Handling - Out of Space
+    ```
+    Starting from after wraparound allocation in Scenario 3:
+    [id:2|size:40|data........................][FREED.............][..]
+                                              ^end(148)            ^start(66)
+
+    Trying to allocate 20 more bytes:
+    occupied_size_new = end + size - start = 148 + 28 - 66 > buffer_size(100)
+    -> Raises MemoryError: "Not enough space in the data buffer"
+    ```
+
+    Thread Safety:
+    - Single writer: Only one process/thread should write (allocate_buf)
+    - Multiple readers: Multiple processes/threads can read (access_buf)
+    - Reader synchronization handled by is_free_fn callback
+    - Writer handles garbage collection (free_buf) based on reader feedback
+
+    Memory Layout per Buffer Chunk:
+    `[4-byte monotonic_id][4-byte chunk_size][actual_data...]`
+    ^metadata_start                         ^data_start
+
+    The monotonic_id ensures data integrity - readers can verify they're
+    accessing the correct data even after buffer wraparound or reuse.
+    """
+
+    def __init__(
+        self,
+        data_buffer_size: int,
+        name: str | None = None,
+        create: bool = False,
+    ):
+        self.data_buffer_size = data_buffer_size
+        self.is_writer = create
+
+        self.ID_NBYTES = 4
+        self.ID_MAX = 2**31  # exclusive, so 2**31 - 1 is the max value
+        self.SIZE_NBYTES = 4
+        # 4 bytes for id, 4 bytes for buffer size
+        self.MD_SIZE = self.ID_NBYTES + self.SIZE_NBYTES
+        self.monotonic_id_end = 0
+        self.monotonic_id_start = 0
+        self.data_buffer_start = 0
+        self.data_buffer_end = 0
+
+        if create:
+            # we are creating a buffer
+            self.metadata: dict[int, int] = {}  # monotonic_id -> start address
+            self.shared_memory = shared_memory.SharedMemory(
+                create=True, size=self.data_buffer_size, name=name
+            )
+        else:
+            # we are opening an existing buffer
+            # fix to https://stackoverflow.com/q/62748654/9191338
+            # Python incorrectly tracks shared memory even if it is not
+            # created by the process. The following patch is a workaround.
+            with patch(
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
+            ):
+                self.shared_memory = shared_memory.SharedMemory(name=name)
+                # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa
+                # Some platforms allocate memory based on page size,
+                # so the shared memory block size may be larger or equal
+                # to the requested size. The size parameter is ignored
+                # when attaching to an existing block.
+                assert self.shared_memory.size >= self.data_buffer_size
+
+        logger.debug(
+            "Shared memory created/opened with name: %s, size: %d",
+            self.shared_memory.name,
+            self.data_buffer_size,
+        )
+
+    def handle(self):
+        return (
+            self.data_buffer_size,
+            self.shared_memory.name,
+        )
+
+    def clear(self) -> None:
+        """Clear the ring buffer."""
+        assert self.is_writer, "Only the writer can clear the buffer."
+        self.metadata.clear()
+        self.monotonic_id_end = 0
+        self.monotonic_id_start = 0
+        self.data_buffer_start = 0
+        self.data_buffer_end = 0
+
+    def __del__(self):
+        if hasattr(self, "shared_memory"):
+            self.shared_memory.close()
+            if self.is_writer:
+                self.shared_memory.unlink()
+
+    def int2byte(self, integer: int) -> bytes:
+        """Convert an integer to bytes."""
+        return integer.to_bytes(self.ID_NBYTES, "little", signed=True)
+
+    def byte2int(self, byte_data: bytes) -> int:
+        """Convert bytes back to an integer."""
+        return int.from_bytes(byte_data, "little", signed=True)
+
+    def allocate_buf(self, size: int) -> tuple[int, int]:
+        """
+        Allocate a buffer `MD_SIZE` + `size` bytes in the shared memory.
+        Memory layout:
+        `[4-byte monotonic_id][4-byte size][buffer data...]`
+        """
+        assert self.is_writer, "Only the writer can allocate buffers."
+        assert size > 0, "Size must be greater than 0"
+        size += self.MD_SIZE  # add metadata size to the buffer size
+        # reset to beginning if the buffer does have enough contiguous space
+        buffer_end_reset = self.data_buffer_end % self.data_buffer_size
+        if buffer_end_reset + size > self.data_buffer_size:
+            buffer_end_reset = (
+                self.data_buffer_end // self.data_buffer_size + 1
+            ) * self.data_buffer_size
+        else:  # no reset needed
+            buffer_end_reset = self.data_buffer_end
+
+        # check if we have enough space in the data buffer
+        # i.e. if the new end (self.data_buffer_end + size)
+        # exceeds the start of the data buffer
+        occupied_size_new = buffer_end_reset + size - self.data_buffer_start
+        if occupied_size_new > self.data_buffer_size:
+            raise MemoryError(
+                "Not enough space in the data buffer, "
+                "try calling free_buf() to free up space"
+            )
+        self.data_buffer_end = buffer_end_reset
+
+        # first 4 bytes as the monotonic id
+        buf_idx = self.data_buffer_end % self.data_buffer_size
+        self.shared_memory.buf[buf_idx : buf_idx + self.ID_NBYTES] = self.int2byte(
+            self.monotonic_id_end
+        )
+        # next 4 bytes as the size of the data buffer
+        self.shared_memory.buf[buf_idx + self.ID_NBYTES : buf_idx + self.MD_SIZE] = (
+            self.int2byte(size)
+        )
+
+        # record metadata
+        self.metadata[self.monotonic_id_end % self.ID_MAX] = self.data_buffer_end
+        # update buffer and monotonic id indices
+        current_buffer_end = self.data_buffer_end
+        current_id_end = self.monotonic_id_end
+        self.data_buffer_end += size
+        self.monotonic_id_end = (self.monotonic_id_end + 1) % self.ID_MAX
+        return current_buffer_end, current_id_end
+
+    @contextmanager
+    def access_buf(self, address: int):
+        buf_idx = address % self.data_buffer_size
+
+        # read metadata
+        metadata_buff = self.shared_memory.buf[buf_idx : buf_idx + self.MD_SIZE]
+        id = self.byte2int(metadata_buff[: self.ID_NBYTES])
+        size = self.byte2int(metadata_buff[self.ID_NBYTES : self.MD_SIZE])
+
+        # yield the data buffer and metadata
+        data_buff = self.shared_memory.buf[buf_idx + self.MD_SIZE : buf_idx + size]
+        with (
+            memoryview(data_buff) as data_view,
+        ):
+            yield data_view, (id, size)
+
+    def free_buf(
+        self,
+        is_free_fn: Callable[[int, memoryview], bool],
+        nbytes: int | None = None,
+    ) -> Iterable[int]:
+        """
+        Free a buffer of the given size. This is a no-op in shared memory,
+        but we need to keep track of the metadata.
+
+        If freed memory spreads across the end and start of the ring buffer,
+        the actual freed memory will be in two segments. In this case there
+        still might not be a contiguous space of `nbytes` available.
+
+        Args:
+            nbytes (int, optional): The size of the buffer to free. If None,
+                frees the maximum size of the ring buffer.
+        """
+
+        assert self.is_writer, "Only the writer can free buffers."
+        logger.debug(
+            "Freeing up space in the ring buffer, "
+            "monotonic_id_start: %d, monotonic_id_end: %d",
+            self.monotonic_id_start,
+            self.monotonic_id_end,
+        )
+        monotonic_id_before = self.monotonic_id_start
+        # if nbytes is None, free up the maximum size of the ring buffer
+        if nbytes is None:
+            nbytes = self.data_buffer_size
+        freed_bytes = 0
+        while self.monotonic_id_start in self.metadata and freed_bytes < nbytes:
+            address = self.metadata[self.monotonic_id_start]
+            with self.access_buf(address) as (data_buff, metadata):
+                if is_free_fn(self.monotonic_id_start, data_buff):
+                    # check passed, we can free the buffer
+                    del self.metadata[self.monotonic_id_start]
+                    self.monotonic_id_start = (
+                        self.monotonic_id_start + 1
+                    ) % self.ID_MAX
+                    if self.monotonic_id_start in self.metadata:
+                        # pointing to the start addr of next allocation
+                        self.data_buffer_start += (
+                            self.metadata[self.monotonic_id_start]
+                            - self.data_buffer_start
+                        ) % self.data_buffer_size
+                    else:
+                        # no remaining allocation, reset to zero
+                        self.data_buffer_start = self.data_buffer_end = 0
+                    freed_bytes += metadata[1]
+                else:
+                    # there are still readers, we cannot free the buffer
+                    break
+
+        logger.debug(
+            "Freed %d bytes from the ring buffer, "
+            "monotonic_id_start: %d, monotonic_id_end: %d",
+            freed_bytes,
+            self.monotonic_id_start,
+            self.monotonic_id_end,
+        )
+
+        # buffer wrap around
+        if self.data_buffer_start >= self.data_buffer_size:
+            self.data_buffer_start -= self.data_buffer_size
+            self.data_buffer_end -= self.data_buffer_size
+
+        monotonic_id_after = self.monotonic_id_start
+        # id wrap around
+        if monotonic_id_after >= monotonic_id_before:
+            return range(monotonic_id_before, monotonic_id_after)
+        else:
+            return chain(
+                range(monotonic_id_before, self.ID_MAX), range(0, monotonic_id_after)
+            )
+
+
+class ObjectSerde(ABC):
+    @abstractmethod
+    def serialize(self, value: Any) -> tuple[Any, int, bytes, int]:
+        """Serialize an object to bytes."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def deserialize(self, data: memoryview) -> Any:
+        """Deserialize bytes back to an object."""
+        raise NotImplementedError
+
+
+class MsgpackSerde(ObjectSerde):
+    def __init__(self):
+        # Delayed import to avoid circular dependency
+        from vllm.multimodal.inputs import MultiModalKwargsItem
+        from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
+
+        self.encoder = MsgpackEncoder()
+        self.tensor_decoder = MsgpackDecoder(torch.Tensor, share_mem=False)
+        self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem, share_mem=False)
+        self._mm_kwargs_item_cls = MultiModalKwargsItem
+
+    def serialize(self, value: Any) -> tuple[bytes | list[bytes], int, bytes, int]:
+        len_arr = None
+        if isinstance(value, (torch.Tensor, self._mm_kwargs_item_cls)):
+            type_name = type(value).__name__
+            value = self.encoder.encode(value)
+            len_arr = [len(s) for s in value]
+            nbytes = sum(len_arr)
+        else:
+            value = pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL)
+            type_name = type(value).__name__
+            nbytes = len(value)
+
+        object_metadata = (type_name, nbytes, len_arr)
+        serialized_metadata = pickle.dumps(
+            object_metadata, protocol=pickle.HIGHEST_PROTOCOL
+        )
+        return value, nbytes, serialized_metadata, len(serialized_metadata)
+
+    def deserialize(self, data_view: memoryview) -> Any:
+        # pickle.loads do not read past the end of a pickled object
+        # within a large buffer, so we can skip storing the metadata size
+        type_name, nbytes, len_arr = pickle.loads(data_view)
+        serialized_data = data_view[-nbytes:]
+
+        if type_name == torch.Tensor.__name__:
+            obj = []
+            start_idx = 0
+            for length in len_arr:
+                item_bytes = serialized_data[start_idx : start_idx + length]
+                obj.append(item_bytes)
+                start_idx += length
+            obj = self.tensor_decoder.decode(obj)
+        elif type_name == self._mm_kwargs_item_cls.__name__:
+            obj = []
+            start_idx = 0
+            for length in len_arr:
+                item_bytes = serialized_data[start_idx : start_idx + length]
+                obj.append(item_bytes)
+                start_idx += length
+            obj = self.mm_decoder.decode(obj)
+        elif type_name == bytes.__name__:
+            obj = pickle.loads(serialized_data)
+        else:
+            raise ValueError(f"Unsupported object type '{type_name}' in metadata")
+
+        return obj
+
+
+@dataclass
+class ShmObjectStorageHandle:
+    max_object_size: int
+    n_readers: int
+    ring_buffer_handle: tuple[int, str]
+    serde_class: type[ObjectSerde]
+    reader_lock: LockType | None
+
+
+class SingleWriterShmObjectStorage:
+    """
+    A single-writer, multiple-reader object storage system built on top of a
+    shared memory ring buffer. Provides key-value storage with automatic memory
+    management and cross-process serialization support.
+
+    This storage system follows a FIFO (First-In-First-Out) eviction policy
+    where the oldest objects are automatically freed when memory runs low.
+    Memory is reclaimed based on reader reference counting - objects are only
+    freed when all readers have finished accessing them.
+
+    Architecture:
+    - Single writer process can put(key, value) objects
+    - Multiple reader processes can get(address, monotonic_id) objects
+    - Built on SingleWriterShmRingBuffer for efficient shared memory management
+    - Thread-safe operations with reader synchronization via locks
+
+    Key Features:
+    - FIFO Eviction: Oldest objects are evicted first when memory is full
+    - Reference Counting: Objects are only freed when no readers are
+      accessing them
+    - Duplicate Key Handling: Existing keys are not overwritten, just
+      re-referenced
+    - Customized Serialization: By default uses Msgpack for efficient
+      serialization of Python objects, but can be extended for custom types
+    - Cross-Process Safety: Uses shared memory with proper synchronization
+    - Automatic Cleanup: Garbage collection happens transparently during
+      allocation
+
+    Memory Layout per Object:
+    `[4-byte reference_count][metadata_size][serialized_object_data]`
+
+    Thread Safety:
+    - Writer operations (put, clear) are single-threaded by design
+    - Reader operations (get) are thread-safe with lock-based reference
+      counting
+    - Memory reclamation is handled exclusively by the writer process
+    """
+
+    def __init__(
+        self,
+        max_object_size: int,
+        n_readers: int,
+        ring_buffer: SingleWriterShmRingBuffer,
+        serde_class: type[ObjectSerde] = MsgpackSerde,
+        reader_lock: LockType | None = None,
+    ):
+        """
+        Initialize the object storage.
+
+        Args:
+            max_object_size: Maximum size for a single object in bytes.
+            n_readers: Number of reader processes that can access the storage.
+            ring_buffer: The shared memory ring buffer for storing objects.
+            serde_class: Serializer/deserializer for objects.
+            reader_lock: Optional lock for synchronizing reader access.
+        Raises:
+            ValueError: If reader_lock is None for readers.
+        """
+
+        self.max_object_size = max_object_size
+        self.n_readers = n_readers
+        self.serde_class = serde_class
+        self.ser_de = serde_class()
+        self.ring_buffer = ring_buffer
+        self.is_writer = self.ring_buffer.is_writer
+
+        self.flag_bytes = 4  # for in-use flag
+
+        if self.is_writer:
+            # Key-value mapping: key -> (address, monotonic_id)
+            self.key_index: dict[str, tuple[int, int]] = {}
+            # Reverse mapping: monotonic_id -> key
+            self.id_index: dict[int, str] = {}
+            # Writer flag to track in-use status: monotonic_id -> count
+            self.writer_flag: dict[int, int] = {}
+        else:
+            if reader_lock is None:
+                raise ValueError("Lock must be provided for readers.")
+
+        self._reader_lock = reader_lock
+
+    def clear(self) -> None:
+        """Clear the object storage."""
+        if self.is_writer:
+            self.ring_buffer.clear()
+            self.key_index.clear()
+            self.id_index.clear()
+            self.writer_flag.clear()
+            logger.debug("Object storage cleared and reinitialized.")
+
+    def copy_to_buffer(
+        self,
+        data: bytes | list[bytes],
+        data_bytes: int,
+        metadata: bytes,
+        md_bytes: int,
+        data_view: memoryview,
+    ) -> None:
+        data_view[self.flag_bytes : self.flag_bytes + md_bytes] = metadata
+        if isinstance(data, bytes):
+            data_view[-data_bytes:] = data
+        elif isinstance(data, list):
+            start_idx = self.flag_bytes + md_bytes
+            for item_bytes in data:
+                item_size = len(item_bytes)
+                data_view[start_idx : start_idx + item_size] = item_bytes
+                start_idx += item_size
+        else:
+            raise ValueError(f"Unsupported data type for serialization: {type(data)}")
+
+    def increment_writer_flag(self, id: int) -> None:
+        """Set the in-use flag for the writer."""
+        self.writer_flag[id] = self.writer_flag.get(id, 0) + 1
+
+    def increment_reader_flag(self, data_view: memoryview) -> None:
+        """Set the in-use flag for the reader."""
+        # >0 for in-use flag
+        reader_count = self.ring_buffer.byte2int(data_view)
+        data_view[:] = self.ring_buffer.int2byte(reader_count + 1)
+
+    def free_unused(self) -> None:
+        """Free unused buffers in the ring buffer."""
+        # try to free up 2*max_object_size bytes of space in the ring buffer,
+        # since the buffer might be fragmented
+        freed_ids = self.ring_buffer.free_buf(
+            self.default_is_free_check, 2 * self.max_object_size
+        )
+        # update the metadata after freeing up space
+        for freed_id in freed_ids:
+            key_to_free = self.id_index[freed_id]
+            del self.key_index[key_to_free]
+            del self.id_index[freed_id]
+            del self.writer_flag[freed_id]
+
+    def is_cached(self, key: str) -> bool:
+        """
+        Check if the object with the given key is cached.
+        """
+        return key in self.key_index
+
+    def get_cached(self, key: str) -> tuple[int, int]:
+        """
+        Get the cached object by key if it exists.
+        """
+        address, monotonic_id = self.key_index[key]
+        self.increment_writer_flag(monotonic_id)
+        return address, monotonic_id
+
+    def put(self, key: str, value: Any) -> tuple[int, int]:
+        """
+        Store a key-value pair in the object storage.
+        Attempts to free max_object_size bytes using FIFO order
+        when the ring buffer runs out of space during a put() operation.
+
+        Args:
+            key: String key to identify the object
+            value: Any serializable Python object
+
+        Raises:
+            MemoryError: If there's not enough space in the buffer
+            ValueError: If the serialized object is too large
+            ValueError: If the key already exists in the storage
+        """
+        if key in self.key_index:
+            raise ValueError(f"Key '{key}' already exists in the storage.")
+
+        object_data, data_bytes, object_metadata, md_bytes = self.ser_de.serialize(
+            value
+        )
+        buffer_size = self.flag_bytes + data_bytes + md_bytes
+        # Sanity checks
+        if buffer_size > self.max_object_size:
+            raise ValueError(
+                f"Serialized object size ({buffer_size} bytes) exceeds "
+                f"max object size ({self.max_object_size} bytes)"
+            )
+
+        # Allocate new buffer
+        try:
+            address, monotonic_id = self.ring_buffer.allocate_buf(buffer_size)
+        except MemoryError:
+            self.free_unused()
+            # try again after freeing up space
+            address, monotonic_id = self.ring_buffer.allocate_buf(buffer_size)
+
+        # Write data to buffer
+        with self.ring_buffer.access_buf(address) as (data_view, metadata):
+            data_view[: self.flag_bytes] = self.ring_buffer.int2byte(0)
+            self.copy_to_buffer(
+                object_data, data_bytes, object_metadata, md_bytes, data_view
+            )
+        self.increment_writer_flag(monotonic_id)
+
+        # Update key index
+        self.key_index[key] = (address, monotonic_id)
+        self.id_index[monotonic_id] = key
+        return address, monotonic_id
+
+    def get(self, address: int, monotonic_id: int) -> Any:
+        # Read data from buffer
+        with self.ring_buffer.access_buf(address) as (data_view, buf_metadata):
+            # check id from metadata
+            if buf_metadata[0] != monotonic_id:
+                raise ValueError(
+                    f"Data for address:id '{address}:{monotonic_id}'"
+                    " has been modified or is invalid."
+                )
+
+            obj = self.ser_de.deserialize(data_view[self.flag_bytes :])
+
+            # decrease the in-use flag for reader reads
+            if self._reader_lock is not None:
+                with self._reader_lock:
+                    self.increment_reader_flag(data_view[: self.flag_bytes])
+            else:
+                # if self._reader_lock is None, it means we are the writer
+                # in this case, we do not need to decrease the reader count
+                assert self.is_writer
+
+        return obj
+
+    def touch(
+        self,
+        key: str,
+        address: int = 0,
+        monotonic_id: int = 0,
+    ) -> None:
+        """
+        Touch an existing cached item to update its eviction status.
+
+        For writers (ShmObjectStoreSenderCache): Increment writer_flag
+        For readers (ShmObjectStoreReceiverCache): Increment reader_count
+
+        Args:
+            key: String key of the object to touch
+            address: Address of the object (only for readers)
+            monotonic_id: Monotonic ID of the object (only for readers)
+
+        """
+        if self._reader_lock is None:
+            if key not in self.key_index:
+                return None
+            address, monotonic_id = self.key_index[key]
+            # Writer side: increment writer_flag to raise eviction threshold
+            self.increment_writer_flag(monotonic_id)
+        else:
+            with (
+                self._reader_lock,
+                self.ring_buffer.access_buf(address) as (data_view, _),
+            ):
+                reader_count = self.ring_buffer.byte2int(data_view[: self.flag_bytes])
+
+                # NOTE(Long):
+                # Avoid increasing flag on newly added item (sync with sender)
+                # Since when a new item is added
+                # pre-touch has no effect on writer side
+                if reader_count >= self.n_readers:
+                    self.increment_reader_flag(data_view[: self.flag_bytes])
+
+    def handle(self):
+        """Get handle for sharing across processes."""
+        return ShmObjectStorageHandle(
+            max_object_size=self.max_object_size,
+            n_readers=self.n_readers,
+            ring_buffer_handle=self.ring_buffer.handle(),
+            serde_class=self.serde_class,
+            reader_lock=self._reader_lock,
+        )
+
+    @staticmethod
+    def create_from_handle(
+        handle: ShmObjectStorageHandle,
+    ) -> "SingleWriterShmObjectStorage":
+        logger.debug("Creating storage from handle: %s", handle)
+        ring_buffer = SingleWriterShmRingBuffer(*handle.ring_buffer_handle)
+        return SingleWriterShmObjectStorage(
+            max_object_size=handle.max_object_size,
+            n_readers=handle.n_readers,
+            ring_buffer=ring_buffer,
+            serde_class=handle.serde_class,
+            reader_lock=handle.reader_lock,
+        )
+
+    def default_is_free_check(self, id: int, buf: memoryview) -> bool:
+        """
+        Default is_free function that checks if the first 4 bytes are zero.
+        This indicates that the buffer is free.
+        """
+        reader_count = int.from_bytes(buf[0:4], "little", signed=True)
+        writer_count = self.writer_flag[id]
+        return reader_count >= writer_count * self.n_readers
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.all_reduce_utils import (
+    SYMM_MEM_ALL_REDUCE_MAX_SIZES,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms import current_platform
+
+try:
+    import torch.distributed._symmetric_memory as torch_symm_mem
+
+    symm_mem_available = True
+except ImportError:
+    symm_mem_available = False
+
+logger = init_logger(__name__)
+
+
+class SymmMemCommunicator:
+    _WORLD_SIZES_MULTIMEM = {
+        "9.0": [4, 6, 8],
+        "10.0": [6, 8],
+    }
+
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: int | str | torch.device,
+        # add options for testing
+        force_multimem: bool | None = None,
+        max_size_override: int | None = None,
+    ):
+        self.disabled = True
+
+        if not symm_mem_available:
+            return
+
+        if not current_platform.is_cuda():
+            logger.warning("SymmMemCommunicator: symmetric memory is not available.")
+            return
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        torch.cuda.set_device(device)
+        self.dtype = torch.bfloat16
+        self.device = device
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+        capability = current_platform.get_device_capability()
+        if capability is None:
+            logger.warning(
+                "SymmMemCommunicator: device capability is unknown, "
+                "communicator is not available."
+            )
+            return
+        self.device_capability = capability.as_version_str()
+        if self.device_capability not in SYMM_MEM_ALL_REDUCE_MAX_SIZES:
+            logger.warning(
+                "SymmMemCommunicator: Device capability %s not supported, "
+                "communicator is not available.",
+                self.device_capability,
+            )
+            return
+        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]:
+            logger.warning(
+                "SymmMemCommunicator: World size %d not supported, "
+                "communicator is not available.",
+                self.world_size,
+            )
+            return
+        # Use override max_size if provided, otherwise use default
+        if max_size_override is not None:
+            self.max_size = max_size_override
+            logger.info(
+                "SymmMemCommunicator: Using override max_size: %s bytes",
+                self.max_size,
+            )
+        else:
+            self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
+                self.world_size
+            ]
+        try:
+            self.buffer = torch_symm_mem.empty(
+                self.max_size // self.dtype.itemsize,
+                device=self.device,
+                dtype=self.dtype,
+            )
+            handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+        except RuntimeError as e:
+            logger.warning_once(
+                "SymmMemCommunicator: symmetric memory initialization failed: %s "
+                "Communicator is not available. To suppress this warning set "
+                "VLLM_ALLREDUCE_USE_SYMM_MEM=0",
+                str(e),
+            )
+            return
+        if handle.multicast_ptr == 0:
+            logger.warning(
+                "SymmMemCommunicator: symmetric memory "
+                "multicast operations are not supported."
+            )
+            return
+        self.force_multimem = force_multimem
+        self.disabled = False
+        if vllm_is_batch_invariant():
+            self.disabled = True
+
+    def should_use_symm_mem(self, inp: torch.Tensor):
+        if self.disabled:
+            return False
+        if inp.dtype != self.dtype:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        if inp_size % 4 != 0:
+            return False
+        return inp_size < self.max_size
+
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: torch.Tensor | None = None
+    ) -> torch.Tensor | None:
+        if not self.should_use_symm_mem(inp):
+            return None
+        if out is None:
+            out = torch.empty_like(inp)
+        self.buffer[: inp.numel()].copy_(inp.view(-1))
+
+        # Determine which algorithm to use
+        use_multimem = False
+        if self.force_multimem is not None:
+            # Test override: use forced setting
+            use_multimem = self.force_multimem
+        else:
+            # Normal logic: use multimem for supported world sizes
+            use_multimem = (
+                self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability]
+            )
+
+        if use_multimem:
+            torch.ops.symm_mem.multimem_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        else:
+            torch.ops.symm_mem.two_shot_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        out.copy_(self.buffer[: inp.numel()].view(out.shape))
+        return out
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+import torch
+from torch.distributed import ProcessGroup
+
+from vllm.config import get_current_vllm_config
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.platforms.tpu import USE_TPU_INFERENCE
+
+from .base_device_communicator import DeviceCommunicatorBase
+
+USE_RAY = parallel_config = (
+    get_current_vllm_config().parallel_config.distributed_executor_backend == "ray"
+)
+
+logger = init_logger(__name__)
+
+if not USE_TPU_INFERENCE:
+    logger.info("tpu_inference not found, using vLLM's TpuCommunicator")
+    if current_platform.is_tpu():
+        import torch_xla
+        import torch_xla.core.xla_model as xm
+        import torch_xla.runtime as xr
+        from torch_xla._internal import pjrt
+        from torch_xla.distributed.xla_multiprocessing import (
+            create_optimized_replica_groups,
+        )
+
+        if USE_RAY:
+            from vllm.v1.executor import ray_utils
+
+
+class TpuCommunicator(DeviceCommunicatorBase):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
+        super().__init__(cpu_group, device, device_group, unique_name)
+
+        # NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node
+        # must be used together. Therefore, the local rank and world size can
+        # be simply calculated as follows.
+        global_rank = self.global_rank
+        global_world_size = self.global_world_size
+
+        if USE_RAY:
+            logger.info("TpuCommunicator initialized with RAY")
+            # Calculate how many TPU nodes are in the current deployment. This
+            # is the Ray placement group if it is deployed with Ray. Default
+            # to the number of TPU nodes in the Ray cluster. The number of TPU
+            # nodes is computed by the total number of TPUs divided by the
+            # number of TPU accelerators per node, to account for clusters
+            # with both CPUs and TPUs.
+            num_nodes = ray_utils.get_num_tpu_nodes()
+            num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group()
+            if num_nodes_in_pg > 0:
+                num_nodes = num_nodes_in_pg
+
+            local_world_size = global_world_size // num_nodes
+            local_rank = global_rank % local_world_size
+        else:
+            logger.info("TpuCommunicator initialized with MP")
+            # Sanity: Verify we run on a single host
+            num_hosts = torch_xla.tpu.num_tpu_workers()
+            assert num_hosts == 1
+
+            # Get the current number of TPUs (we have locally)
+            local_world_size = torch_xla.tpu.num_available_chips()
+
+            # Get current rank
+            local_rank = global_rank % local_world_size
+
+        # Ensure environment variables are set for multihost deployments.
+        # On GKE, this is needed for libtpu and TPU driver to know which TPU
+        # chip is actually visible. Otherwise the TPU driver will fail to
+        # initialize because the number of devices would be different from
+        # the number of visible worker addresses.
+        os.environ["CLOUD_TPU_TASK_ID"] = str(global_rank)
+        os.environ["TPU_VISIBLE_CHIPS"] = str(local_rank)
+
+        pjrt.initialize_multiprocess(local_rank, local_world_size)
+        xr._init_world_size_ordinal()
+        self.groups = create_optimized_replica_groups()
+
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        # TODO: Remove the groups specification after XLA compiler can support
+        # auto-reordering the ring order for all-reduce.
+        return xm.all_reduce(xm.REDUCE_SUM, input_, groups=self.groups)
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        assert dim == -1, "TPUs only support dim=-1 for all-gather."
+        return xm.all_gather(input_, dim=dim)
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.logger import init_logger
+
+from .base_device_communicator import DeviceCommunicatorBase
+
+logger = init_logger(__name__)
+
+
+class XpuCommunicator(DeviceCommunicatorBase):
+    def __init__(
+        self,
+        cpu_group: ProcessGroup,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
+        unique_name: str = "",
+    ):
+        super().__init__(cpu_group, device, device_group, unique_name)
+        if self.use_all2all:
+            if self.all2all_backend != "naive":
+                logger.warning(
+                    "`%s` all2all manager is not supported on XPU. "
+                    "Falling back to `naive` all2all manager for XPU.",
+                    self.all2all_backend,
+                )
+                self.all2all_backend = "naive"
+            if self.all2all_backend == "naive":
+                from .all2all import NaiveAll2AllManager
+
+                self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
+                logger.info("Using naive all2all manager.")
+
+    def all_reduce(self, input_) -> torch.Tensor:
+        dist.all_reduce(input_, group=self.device_group)
+        return input_
+
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> torch.Tensor | None:
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        )
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        # For xpu path, gather doesn't work properly together with ray
+        # cluster so we use all_gather instead for now.
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (self.world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+        # All-gather.
+        dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group)
+        if self.rank_in_group == dst:
+            # Reshape
+            output_tensor = output_tensor.movedim(0, dim)
+            output_tensor = output_tensor.reshape(
+                input_size[:dim]
+                + (self.world_size * input_size[dim],)
+                + input_size[dim + 1 :]
+            )
+        else:
+            output_tensor = None
+        return output_tensor
+
+    def broadcast(self, input_: torch.Tensor, src: int = 0) -> None:
+        dist.broadcast(input_, src=src, group=self.device_group)
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_sequence_parallel: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert self.all2all_manager is not None
+        hidden_states, router_logits = self.all2all_manager.dispatch(
+            hidden_states, router_logits, is_sequence_parallel
+        )
+        return hidden_states, router_logits
+
+    def combine(
+        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
+    ) -> torch.Tensor:
+        assert self.all2all_manager is not None
+        hidden_states = self.all2all_manager.combine(
+            hidden_states, is_sequence_parallel
+        )
+        return hidden_states