from typing import Any, Optional, Union

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import logger
from vllm.v1.engine.output_processor import (OutputProcessor,
                                             RequestOutputCollector)
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory

from vllm_vacc.vllm.v1.engine import EngineCoreRequest


class AsyncLLM(EngineClient):

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
        enable_log_requests: bool = False,
        disable_log_stats: bool = False,
        client_addresses: Optional[dict[str, str]] = None,
        client_count: int = 1,
        client_index: int = 0,
        disable_log_requests: bool = True,  # Deprecated, will be removed
    ) -> "AsyncLLM":
        # vacc support: validate that speculative decoding is configured
        # with spec_num = 1.
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
                "This should not happen. As a workaround, try using "
                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
                "VLLM_USE_V1=0 or 1 and report this issue on Github.")

        # Create the LLMEngine.
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        async_cls = DefaultAsyncLLM
        return async_cls(
            vllm_config=vllm_config,
            executor_class=Executor.get_class(vllm_config),
            start_engine_loop=start_engine_loop,
            stat_loggers=stat_loggers,
            log_requests=enable_log_requests,
            log_stats=not disable_log_stats,
            usage_context=usage_context,
            client_addresses=client_addresses,
            client_count=client_count,
            client_index=client_index,
        )

    @classmethod
    def from_engine_args(
        cls,
        engine_args: AsyncEngineArgs,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
    ) -> "AsyncLLM":
        """Create an AsyncLLM from the EngineArgs."""

        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)
        executor_class = Executor.get_class(vllm_config)

        # vacc support: validate that speculative decoding is configured
        # with spec_num = 1.
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        # Create the AsyncLLM.
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        async_cls = DefaultAsyncLLM
        return async_cls(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_requests=not engine_args.disable_log_requests,
            log_stats=not engine_args.disable_log_stats,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )

    async def _add_request(self, request: EngineCoreRequest,
                           prompt: Optional[str],
                           parent_req: Optional[ParentRequest], index: int,
                           queue: RequestOutputCollector):

        # Add the request to OutputProcessor (this process).
        self.output_processor.add_request(request, prompt, parent_req, index,
                                          queue)

        # Add the EngineCoreRequest to EngineCore (separate process).
        await self.engine_core.add_request_async(request)

        if self.log_requests:
            if request.prompt_token_ids is not None:
                logger.info("Added request: %s, prompt length: %s",
                            request.request_id,
                            len(request.prompt_token_ids))
            else:
                logger.info("Added request %s.", request.request_id)
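
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the vacc plugin itself):
# shows how the factory above would typically be driven. The model name
# "facebook/opt-125m" and the sampling parameters are assumptions chosen
# purely for demonstration; from_engine_args() runs check_spec_model() and
# returns the upstream vllm.v1 AsyncLLM, so the standard generate() API
# applies to the returned engine.
if __name__ == "__main__":
    import asyncio

    from vllm import SamplingParams

    async def _demo() -> None:
        # Hypothetical engine configuration for illustration only.
        engine_args = AsyncEngineArgs(model="facebook/opt-125m")
        engine = AsyncLLM.from_engine_args(engine_args)

        # Stream outputs for a single request and print the final text.
        params = SamplingParams(max_tokens=32)
        async for output in engine.generate("Hello, world",
                                            params,
                                            request_id="demo-0"):
            if output.finished:
                print(output.outputs[0].text)

    asyncio.run(_demo())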