Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/vllm/distributed/ec_transfer/init.py
+++ b/vllm/distributed/ec_transfer/init.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.distributed.ec_transfer.ec_transfer_state import (
+    ensure_ec_transfer_initialized,
+    get_ec_transfer,
+    has_ec_transfer,
+)
+
+# Public API of this package: the names below are re-exported from
+# vllm.distributed.ec_transfer.ec_transfer_state (imported above).
+__all__ = [
+    "get_ec_transfer",
+    "ensure_ec_transfer_initialized",
+    "has_ec_transfer",
+]
--- a/vllm/distributed/ec_transfer/ec_connector/init.py
+++ b/vllm/distributed/ec_transfer/ec_connector/init.py
--- a/vllm/distributed/ec_transfer/ec_connector/base.py
+++ b/vllm/distributed/ec_transfer/ec_connector/base.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ECConnectorBase Class for Distributed Encoder Cache &
+P2P Encoder cache communication in V1
+
+The class provides the following primitives:
+    Scheduler-side: runs in the scheduler and builds the metadata that
+    is used by the worker-side to load/save the encoder cache.
+        has_caches() - check, for each multimodal item of a request,
+            whether its encoder cache exists
+        update_state_after_alloc() - update the ECConnector state after
+            allocation; this decides whether the cache is loaded or not
+        build_connector_meta() - build the connector metadata for the
+            current step
+        request_finished() - called when a request has finished, before
+            its encoder cache is freed
+
+    Worker-side: runs in each worker, loads/saves encoder cache to/from
+    the connector based on the metadata.
+        start_load_caches() - starts loading the encoder caches
+            (maybe async)
+        save_caches() - saves an encoder cache to the connector
+
+        get_finished() - called with ids of finished requests, returns
+            ids of requests that have completed async sending/recving.
+"""
+
+import enum
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import ECConnectorOutput
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+class ECConnectorRole(enum.Enum):
+    """Identifies the process in which an EC connector instance runs."""
+
+    # Connector running in the scheduler process
+    SCHEDULER = 0
+
+    # Connector running in the worker process
+    WORKER = 1
+
+
+class ECConnectorMetadata(ABC):  # noqa: B024
+    """
+    Base type for the metadata passed between the scheduler-side
+    ECConnector and the worker-side ECConnector.
+
+    The class itself declares no members; it exists so that concrete
+    connectors can subclass it with their own payload.
+    """
+
+
+class ECConnectorBase(ABC):
+    """
+    Abstract base class for encoder cache (EC) connectors.
+
+    A connector is constructed with a role (see `ECConnectorRole`).
+    Subclasses must implement the abstract worker-side methods
+    (`start_load_caches`, `save_caches`) and scheduler-side methods
+    (`has_caches`, `update_state_after_alloc`, `build_connector_meta`);
+    the remaining methods have no-op defaults.
+    """
+
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        """
+        Args:
+            vllm_config (VllmConfig): the vLLM config. Its
+                `ec_transfer_config` must not be None.
+            role (ECConnectorRole): the role of this connector instance.
+
+        Raises:
+            ValueError: if `vllm_config.ec_transfer_config` is None.
+        """
+        ec_transfer_config = vllm_config.ec_transfer_config
+        if ec_transfer_config is None:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+
+        # Metadata bound by bind_connector_metadata(); None when unbound.
+        self._connector_metadata: ECConnectorMetadata | None = None
+        self._vllm_config = vllm_config
+        self._role = role
+        self._is_producer = ec_transfer_config.is_ec_producer
+
+    @property
+    def role(self) -> ECConnectorRole:
+        """The role this connector was created with."""
+        return self._role
+
+    @property
+    def is_producer(self) -> bool:
+        """Value of `ec_transfer_config.is_ec_producer` at construction."""
+        return self._is_producer
+
+    # ==============================
+    # Worker-side methods
+    # ==============================
+
+    def bind_connector_metadata(self, connector_metadata: ECConnectorMetadata) -> None:
+        """Set the connector metadata from the scheduler.
+
+        This function should be called by the model runner every time
+        before the model execution. The metadata will be used for runtime
+        EC cache loading.
+
+        Args:
+            connector_metadata (ECConnectorMetadata): the connector metadata.
+        """
+        self._connector_metadata = connector_metadata
+
+    def clear_connector_metadata(self) -> None:
+        """Clear the connector metadata.
+
+        This function should be called by the model runner every time
+        after the model execution.
+        """
+        self._connector_metadata = None
+
+    def _get_connector_metadata(self) -> ECConnectorMetadata:
+        """Get the connector metadata.
+
+        This function should only be called inside the connector, and only
+        while metadata is bound (between `bind_connector_metadata()` and
+        `clear_connector_metadata()`).
+
+        Returns:
+            ECConnectorMetadata: the currently bound connector metadata.
+        """
+        metadata = self._connector_metadata
+        # Internal invariant: callers must have bound metadata first.
+        assert metadata is not None, (
+            "connector metadata is not bound; "
+            "call bind_connector_metadata() before using it"
+        )
+        return metadata
+
+    def register_caches(
+        self,
+        ec_caches: dict[str, torch.Tensor],
+    ) -> None:
+        """
+        Initialize with the EC caches.
+
+        The base implementation does nothing.
+
+        Args:
+            ec_caches: dictionary of encoder cache
+        """
+        # TODO: Implement this later for P2P feature
+        return
+
+    @abstractmethod
+    def start_load_caches(
+        self, encoder_cache: dict[str, torch.Tensor], **kwargs
+    ) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        This method loads the encoder cache based on metadata provided by the
+        scheduler. It is called before `_gather_mm_embeddings` for the
+        EC Connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+
+    @abstractmethod
+    def save_caches(
+        self, encoder_cache: dict[str, torch.Tensor], mm_hash: str, **kwargs
+    ) -> None:
+        """
+        Save the encoder cache to the connector.
+
+        This method saves the encoder cache from the worker's local storage
+        to shared storage or another external connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            mm_hash (str): The hash of the multimodal data whose cache is being saved.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        """
+        Notifies worker-side connector ids of requests that have
+        finished generating tokens on the worker.
+        The scheduler process (via the Executors) will use this output
+        to track which workers are done.
+
+        The base implementation reports nothing and returns `(None, None)`.
+
+        Args:
+            finished_req_ids (set[str]): ids of the finished requests.
+
+        Returns:
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
+            tuple of (sending/saving ids, recving/loading ids).
+            The finished saves/sends req ids must belong to a set provided in a
+            call to this method (this call or a prior one).
+        """
+        return None, None
+
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+
+    @abstractmethod
+    def has_caches(
+        self,
+        request: "Request",
+    ) -> list[bool]:
+        """
+        Check if encoder cache exists for each mm data of the request.
+
+        Args:
+            request (Request): the request object.
+
+        Returns:
+            A list of bool where the ith value is True if the cache exists
+            for the ith mm_data of the request.
+        """
+
+    @abstractmethod
+    def update_state_after_alloc(self, request: "Request", index: int) -> None:
+        """
+        Update ECConnector state after encoder cache allocation.
+
+        Args:
+            request (Request): the request object.
+            index (int): index of the multimodal item within the request
+                that the update refers to.
+        """
+
+    @abstractmethod
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> ECConnectorMetadata:
+        """
+        Build the connector metadata for this step.
+
+        This function should NOT modify fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+
+        Returns:
+            ECConnectorMetadata: the metadata for this step.
+        """
+
+    def update_connector_output(self, connector_output: ECConnectorOutput) -> None:
+        """
+        Update ECConnector state from worker-side connectors output.
+
+        The base implementation does nothing.
+
+        Args:
+            connector_output (ECConnectorOutput): the worker-side
+                connectors output.
+        """
+        return
+
+    def request_finished(
+        self, request: "Request"
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Called when a request has finished, before its encoder cache is freed.
+
+        The base implementation returns `(False, None)`.
+
+        Args:
+            request (Request): the finished request.
+
+        Returns:
+            A tuple whose first element is True if the request is being
+            saved/sent asynchronously and the cache should not be freed
+            until the request_id is returned from get_finished(). The
+            second element is an optional dict (None by default).
+        """
+        return False, None
--- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py
+++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import safetensors
+
+from vllm.config import VllmConfig
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorMetadata,
+    ECConnectorRole,
+)
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+
+if TYPE_CHECKING:
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class MMMeta:
+    """Record describing one multimodal item: its hash and a token count."""
+
+    # Hash identifying the multimodal data.
+    mm_hash: str
+    # Token count associated with the item.
+    num_token: int
+
+    @staticmethod
+    def make_meta(mm_hash, num_token) -> "MMMeta":
+        """Build an `MMMeta` from a hash and a token count."""
+        return MMMeta(mm_hash, num_token)
+
+
+@dataclass
+class ECExampleConnectorMetadata(ECConnectorMetadata):
+    """Connector metadata holding a list of `MMMeta` entries."""
+
+    mm_datas: list[MMMeta]
+
+    def __init__(self):
+        # Explicit constructor: each instance starts with its own empty list.
+        self.mm_datas = []
+
+    def add_mm_data(self, mm_data: MMMeta):
+        """Append one `MMMeta` entry to `mm_datas`."""
+        entries = self.mm_datas
+        entries.append(mm_data)
+
+
+class ECExampleConnector(ECConnectorBase):
+    """
+    Simple debug implementation of the EC connector.
+
+    It saves / loads the encoder cache to / from disk as safetensors
+    files. Files live under the `shared_storage_path` entry of the EC
+    transfer extra config (default "/tmp"), in one folder per `mm_hash`.
+    """
+
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        """
+        Args:
+            vllm_config (VllmConfig): the vLLM config; its
+                `ec_transfer_config` must not be None.
+            role (ECConnectorRole): the role of this connector instance.
+
+        Raises:
+            ValueError: if `vllm_config.ec_transfer_config` is None.
+        """
+        super().__init__(vllm_config=vllm_config, role=role)
+        # mm_hash -> number of encoder tokens, for the mm data to be loaded
+        self._mm_datas_need_loads: dict[str, int] = {}
+        transfer_config = vllm_config.ec_transfer_config
+        if transfer_config is None:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+        self._storage_path = transfer_config.get_from_extra_config(
+            "shared_storage_path", "/tmp"
+        )
+        logger.debug(transfer_config)
+        logger.debug("Shared storage path is %s", self._storage_path)
+
+    def start_load_caches(self, encoder_cache, **kwargs) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        For every `MMMeta` in the bound connector metadata whose hash is not
+        already a key of `encoder_cache`, the tensor stored under the key
+        "ec_cache" of the corresponding safetensors file is loaded, moved to
+        CUDA and inserted into `encoder_cache`.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        # Import the submodule explicitly: a bare `import safetensors` does
+        # not by itself guarantee that `safetensors.torch` is loaded.
+        from safetensors.torch import load_file
+
+        # Get the metadata
+        metadata: ECConnectorMetadata = self._get_connector_metadata()
+        if metadata is None:
+            # Only reachable when assertions are disabled.
+            logger.warning(
+                "In connector.start_load_caches, "
+                "but the connector metadata is None"
+            )
+            return
+        assert isinstance(metadata, ECExampleConnectorMetadata)
+        assert encoder_cache is not None
+        # Load the EC for each mm data
+        for mm_data in metadata.mm_datas:
+            if mm_data.mm_hash in encoder_cache:
+                continue
+            # Reading must not create folders as a side effect.
+            filename = self._generate_filename_debug(
+                mm_data.mm_hash, create_folder=False
+            )
+            ec_cache = load_file(filename)["ec_cache"].cuda()
+            encoder_cache[mm_data.mm_hash] = ec_cache
+            logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash)
+
+    def save_caches(self, encoder_cache, mm_hash, **kwargs) -> None:
+        """
+        Save the encoder cache to the connector.
+
+        The tensor `encoder_cache[mm_hash]` is detached, copied to CPU and
+        written under the key "ec_cache" of the safetensors file for
+        `mm_hash`. Nothing is written when this instance is not a producer.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            mm_hash (str): The hash of the multimodal data whose cache is being saved.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        # Only producer instances write caches; others (e.g. a PD instance)
+        # return immediately.
+        if not self.is_producer:
+            return
+
+        # See start_load_caches() for why the submodule is imported here.
+        from safetensors.torch import save_file
+
+        # Writing needs the per-hash folder, so it is created here.
+        filename = self._generate_filename_debug(mm_hash)
+        tensors = {"ec_cache": encoder_cache[mm_hash].detach().cpu()}
+        save_file(tensors, filename)
+        logger.debug("Save cache successful for mm_hash %s", mm_hash)
+
+    def has_caches(
+        self,
+        request: "Request",
+    ) -> list[bool]:
+        """
+        Check if cache exist externally for each mm_data of request
+
+        Args:
+            request (Request): the request object.
+
+        Returns:
+            List of bool; the ith value tells whether a cache file exists
+            for the ith entry of `request.mm_features`.
+        """
+        return [
+            self._found_match_for_mm_data(feature.identifier)
+            for feature in request.mm_features
+        ]
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        index: int,
+    ) -> None:
+        """
+        Update ECConnector state after encoder cache allocation.
+
+        Args:
+            request (Request): the request object.
+            index (int): index into `request.mm_features` of the item that
+                was allocated.
+        """
+        mm_hash = request.mm_features[index].identifier
+        num_encoder_token = request.get_num_encoder_embeds(index)
+        # Record (or overwrite) the entry for this mm_hash; it is consumed
+        # and cleared by build_connector_meta().
+        self._mm_datas_need_loads[mm_hash] = num_encoder_token
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> ECConnectorMetadata:
+        """Build the connector metadata for this step.
+
+        This function should NOT modify any fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+        The metadata only describes the mm data that needs to be loaded.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+
+        Returns:
+            ECConnectorMetadata: one `MMMeta` per entry recorded since the
+            previous call.
+        """
+        meta = ECExampleConnectorMetadata()
+        for mm_hash, num_encoder_token in self._mm_datas_need_loads.items():
+            meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token))
+        self._mm_datas_need_loads.clear()
+        return meta
+
+    # ==============================
+    # Helper functions
+    # ==============================
+
+    def _found_match_for_mm_data(self, mm_hash) -> bool:
+        """Return True if the cache file for `mm_hash` exists on disk."""
+        # A pure existence check must not create the folder it probes.
+        filename = self._generate_filename_debug(mm_hash, create_folder=False)
+        return os.path.exists(filename)
+
+    def _generate_foldername_debug(
+        self,
+        mm_hash: str,
+        create_folder: bool = True,
+    ) -> str:
+        """
+        Return the folder in which the cache for this mm_hash lives.
+        If `create_folder` is True (default) the directory is created
+        recursively when it does not exist yet.
+        """
+        foldername = os.path.join(self._storage_path, mm_hash)
+        if create_folder:
+            os.makedirs(foldername, exist_ok=True)
+        return foldername
+
+    def _generate_filename_debug(
+        self,
+        mm_hash: str,
+        create_folder: bool = True,
+    ) -> str:
+        """
+        Return the full path of the safetensors file for this mm_hash.
+
+        Args:
+            mm_hash (str): hash of the multimodal data.
+            create_folder (bool): when True (default) the parent folder is
+                created if missing; pass False for read-only lookups.
+        """
+        foldername = self._generate_foldername_debug(
+            mm_hash, create_folder=create_folder
+        )
+        return os.path.join(foldername, "encoder_cache.safetensors")
--- a/vllm/distributed/ec_transfer/ec_connector/factory.py
+++ b/vllm/distributed/ec_transfer/ec_connector/factory.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorRole,
+)
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.config import ECTransferConfig, VllmConfig
+
+logger = init_logger(__name__)
+
+
+class ECConnectorFactory:
+    """
+    Registry and factory for EC connector classes.
+
+    Connectors are registered by name together with a module path and a
+    class name; the module is only imported when the connector class is
+    actually requested.
+    """
+
+    # connector name -> zero-argument loader returning the connector class
+    _registry: dict[str, Callable[[], type[ECConnectorBase]]] = {}
+
+    @classmethod
+    def register_connector(cls, name: str, module_path: str, class_name: str) -> None:
+        """Register a connector with a lazy-loading module and class name.
+
+        Args:
+            name (str): the name under which the connector is registered.
+            module_path (str): the module to import when the class is needed.
+            class_name (str): the attribute of that module to return.
+
+        Raises:
+            ValueError: if `name` is already registered.
+        """
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+
+        def loader() -> type[ECConnectorBase]:
+            # Deferred import: runs only when the connector is requested.
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+
+        cls._registry[name] = loader
+
+    @classmethod
+    def create_connector(
+        cls,
+        config: "VllmConfig",
+        role: ECConnectorRole,
+    ) -> ECConnectorBase:
+        """Instantiate the connector selected by `config.ec_transfer_config`.
+
+        Args:
+            config (VllmConfig): the vLLM config.
+            role (ECConnectorRole): the role passed to the connector.
+
+        Returns:
+            ECConnectorBase: the connector instance, built as
+            `connector_cls(config, role)`.
+
+        Raises:
+            ValueError: if `config.ec_transfer_config` is None, or if the
+                connector class cannot be resolved
+                (see `get_connector_class`).
+        """
+        ec_transfer_config = config.ec_transfer_config
+        if ec_transfer_config is None:
+            raise ValueError("ec_transfer_config must be set to create a connector")
+        connector_cls = cls.get_connector_class(ec_transfer_config)
+        logger.info(
+            "Creating connector with name: %s and engine_id: %s",
+            connector_cls.__name__,
+            ec_transfer_config.engine_id,
+        )
+        # Connector is explicitly separated into two roles.
+        # Scheduler connector:
+        # - Co-locate with scheduler process
+        # - Should only be used inside the Scheduler class
+        # Worker connector:
+        # - Co-locate with worker process
+        return connector_cls(config, role)
+
+    @classmethod
+    def get_connector_class(
+        cls, ec_transfer_config: "ECTransferConfig"
+    ) -> type[ECConnectorBase]:
+        """Get the connector class by name.
+
+        A registered name is resolved through the registry. Otherwise the
+        class named `ec_connector` is read from the module given by
+        `ec_connector_module_path`.
+
+        Args:
+            ec_transfer_config (ECTransferConfig): the EC transfer config.
+
+        Returns:
+            type[ECConnectorBase]: the connector class.
+
+        Raises:
+            ValueError: if `ec_connector` is None, or if it is not
+                registered and no `ec_connector_module_path` is set.
+        """
+        connector_name = ec_transfer_config.ec_connector
+        if connector_name is None:
+            raise ValueError("EC connector must not be None")
+
+        if connector_name in cls._registry:
+            return cls._registry[connector_name]()
+
+        connector_module_path = ec_transfer_config.ec_connector_module_path
+        if connector_module_path is None:
+            raise ValueError(f"Unsupported connector type: {connector_name}")
+        connector_module = importlib.import_module(connector_module_path)
+        return getattr(connector_module, connector_name)
+
+
+# Register various connectors here.
+# The registration should not be done in each individual file, as we want to
+# only load the files corresponding to the current connector.
+# Arguments are, in order: the registry name, the module path to import
+# lazily, and the class name inside that module.
+
+ECConnectorFactory.register_connector(
+    "ECExampleConnector",
+    "vllm.distributed.ec_transfer.ec_connector.example_connector",
+    "ECExampleConnector",
+)
--- a/vllm/distributed/ec_transfer/ec_transfer_state.py
+++ b/vllm/distributed/ec_transfer/ec_transfer_state.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING
+
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorRole,
+)
+from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+# Module-level EC connector instance. It stays None until
+# ensure_ec_transfer_initialized() creates it.
+_EC_CONNECTOR_AGENT: ECConnectorBase | None = None
+
+
+def get_ec_transfer() -> ECConnectorBase:
+    """Return the module-level EC connector.
+
+    Fails an assertion if the connector has not been created yet.
+    """
+    connector = _EC_CONNECTOR_AGENT
+    assert connector is not None, "disaggregated EC cache is not initialized"
+    return connector
+
+
+def has_ec_transfer() -> bool:
+    """Return True if the module-level EC connector has been created."""
+    connector_missing = _EC_CONNECTOR_AGENT is None
+    return not connector_missing
+
+
+def ensure_ec_transfer_initialized(vllm_config: "VllmConfig") -> None:
+    """
+    Create the module-level EC connector if it is needed and missing.
+
+    Nothing happens when `vllm_config.ec_transfer_config` is None, when
+    that config is not an EC transfer instance, or when the connector
+    already exists. Otherwise a connector with the WORKER role is created
+    through `ECConnectorFactory` and stored in `_EC_CONNECTOR_AGENT`.
+    """
+    global _EC_CONNECTOR_AGENT
+
+    transfer_config = vllm_config.ec_transfer_config
+    if transfer_config is None:
+        return
+    if not transfer_config.is_ec_transfer_instance:
+        return
+    if _EC_CONNECTOR_AGENT is not None:
+        return
+
+    _EC_CONNECTOR_AGENT = ECConnectorFactory.create_connector(
+        config=vllm_config, role=ECConnectorRole.WORKER
+    )