init

108  vllm_vacc/vllm/v1/engine/async_llm.py  Normal file
@@ -0,0 +1,108 @@
from typing import Any, Optional, Union

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory

from vllm.engine.protocol import EngineClient
from vllm_vacc.vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import (OutputProcessor,
                                             RequestOutputCollector)
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.async_llm import logger


class AsyncLLM(EngineClient):

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
        enable_log_requests: bool = False,
        disable_log_stats: bool = False,
        client_addresses: Optional[dict[str, str]] = None,
        client_count: int = 1,
        client_index: int = 0,
        disable_log_requests: bool = True,  # Deprecated, will be removed.
    ) -> "AsyncLLM":
        # vacc only supports spec_num = 1; validate the speculative config
        # up front (a sketch of the checker follows this method).
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
                "This should not happen. As a workaround, try using "
                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
                "VLLM_USE_V1=0 or 1 and report this issue on Github.")

        # Create the AsyncLLM (delegating to the stock v1 implementation).
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        async_cls = DefaultAsyncLLM
        return async_cls(
            vllm_config=vllm_config,
            executor_class=Executor.get_class(vllm_config),
            start_engine_loop=start_engine_loop,
            stat_loggers=stat_loggers,
            log_requests=enable_log_requests,
            log_stats=not disable_log_stats,
            usage_context=usage_context,
            client_addresses=client_addresses,
            client_count=client_count,
            client_index=client_index,
        )
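
    # ``check_spec_model`` comes from the local ``vllm_config_checker``
    # module, which is not part of this file. A minimal sketch of what it
    # might look like, assuming the checker only enforces a single
    # speculative token (``num_speculative_tokens`` follows vLLM's
    # SpeculativeConfig); treat the body as an illustration, not the
    # shipped implementation:
    #
    #     def check_spec_model(vllm_config: VllmConfig) -> None:
    #         spec = vllm_config.speculative_config
    #         if spec is not None and spec.num_speculative_tokens != 1:
    #             raise ValueError(
    #                 "vacc only supports num_speculative_tokens=1, "
    #                 f"got {spec.num_speculative_tokens}")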

    @classmethod
    def from_engine_args(
        cls,
        engine_args: AsyncEngineArgs,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
    ) -> "AsyncLLM":
        """Create an AsyncLLM from the EngineArgs."""
        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)
        executor_class = Executor.get_class(vllm_config)

        # vacc only supports spec_num = 1; validate the speculative config.
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        # Create the AsyncLLM.
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        async_cls = DefaultAsyncLLM
        return async_cls(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_requests=not engine_args.disable_log_requests,
            log_stats=not engine_args.disable_log_stats,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )

    async def _add_request(self, request: EngineCoreRequest,
                           prompt: Optional[str],
                           parent_req: Optional[ParentRequest], index: int,
                           queue: RequestOutputCollector):

        # Add the request to OutputProcessor (this process).
        self.output_processor.add_request(request, prompt, parent_req, index,
                                          queue)

        # Add the EngineCoreRequest to EngineCore (separate process).
        await self.engine_core.add_request_async(request)

        if self.log_requests:
            if request.prompt_token_ids is not None:
                logger.info("Added request: %s, prompt length: %s",
                            request.request_id,
                            len(request.prompt_token_ids))
            else:
                logger.info("Added request %s.", request.request_id)