from typing import Any, Optional, Union
|
||
|
|
|
||
|
|
import vllm.envs as envs
|
||
|
|
from vllm.config import VllmConfig
|
||
|
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||
|
|
from vllm.usage.usage_lib import UsageContext
|
||
|
|
from vllm.v1.executor.abstract import Executor
|
||
|
|
from vllm.v1.metrics.loggers import StatLoggerFactory
|
||
|
|
|
||
|
|
from vllm.engine.protocol import EngineClient
|
||
|
|
from vllm_vacc.vllm.v1.engine import EngineCoreRequest
|
||
|
|
from vllm.v1.engine.output_processor import (OutputProcessor,
|
||
|
|
RequestOutputCollector)
|
||
|
|
from vllm.v1.engine.parallel_sampling import ParentRequest
|
||
|
|
from vllm.v1.engine.async_llm import logger
|
||
|
|
|
||
|
|
|
||
|
|
class AsyncLLM(EngineClient):
    """vacc-specific ``AsyncLLM`` entry points.

    Thin wrapper around vLLM's default V1 ``AsyncLLM``: both constructors
    first validate the vacc speculative-decoding constraint (spec_num == 1)
    via :func:`check_spec_model`, then delegate engine construction to the
    upstream implementation.
    """

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
        enable_log_requests: bool = False,
        disable_log_stats: bool = False,
        client_addresses: Optional[dict[str, str]] = None,
        client_count: int = 1,
        client_index: int = 0,
        disable_log_requests: bool = True,  # Deprecated, will be removed
    ) -> "AsyncLLM":
        """Create an AsyncLLM from a fully built ``VllmConfig``.

        Args:
            vllm_config: Complete engine configuration.
            start_engine_loop: Whether to start the background engine loop.
            usage_context: Usage-reporting context passed to the engine.
            stat_loggers: Optional custom stat-logger factories.
            enable_log_requests: Log each added request when True.
            disable_log_stats: Suppress periodic stats logging when True.
            client_addresses: Optional pre-existing engine client addresses.
            client_count: Number of API-server clients.
            client_index: Index of this client among ``client_count``.
            disable_log_requests: Deprecated and ignored; use
                ``enable_log_requests`` instead.

        Returns:
            The upstream V1 ``AsyncLLM`` instance.

        Raises:
            ValueError: If ``envs.VLLM_USE_V1`` is falsy.
        """
        # Fix: this deprecated flag used to be silently ignored. Warn any
        # caller that still passes a non-default value so the dead knob is
        # visible before the parameter is removed.
        if disable_log_requests is not True:
            logger.warning(
                "disable_log_requests is deprecated and ignored; use "
                "enable_log_requests instead.")

        # vacc support spec_num = 1
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
                "This should not happen. As a workaround, try using "
                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
                "VLLM_USE_V1=0 or 1 and report this issue on Github.")

        # Delegate creation to the default V1 AsyncLLM; imported lazily to
        # avoid import cycles at module load time.
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        return DefaultAsyncLLM(
            vllm_config=vllm_config,
            executor_class=Executor.get_class(vllm_config),
            start_engine_loop=start_engine_loop,
            stat_loggers=stat_loggers,
            log_requests=enable_log_requests,
            log_stats=not disable_log_stats,
            usage_context=usage_context,
            client_addresses=client_addresses,
            client_count=client_count,
            client_index=client_index,
        )

    @classmethod
    def from_engine_args(
        cls,
        engine_args: AsyncEngineArgs,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
    ) -> "AsyncLLM":
        """Create an AsyncLLM from the EngineArgs.

        Builds the ``VllmConfig`` from ``engine_args``, validates the vacc
        speculative-decoding constraint, then delegates to the upstream V1
        ``AsyncLLM`` constructor.
        """
        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)
        executor_class = Executor.get_class(vllm_config)

        # vacc support spec_num = 1
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        # Create the AsyncLLM (lazy import to avoid import cycles).
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        return DefaultAsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_requests=not engine_args.disable_log_requests,
            log_stats=not engine_args.disable_log_stats,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )

    async def _add_request(self, request: EngineCoreRequest,
                           prompt: Optional[str],
                           parent_req: Optional[ParentRequest], index: int,
                           queue: RequestOutputCollector) -> None:
        """Register *request* locally, then forward it to the engine core.

        Args:
            request: The request to submit.
            prompt: Original prompt text, if available.
            parent_req: Parent request when this is one of n parallel samples.
            index: Sample index within the parent request.
            queue: Collector that will receive this request's outputs.
        """
        # Add the request to OutputProcessor (this process).
        self.output_processor.add_request(request, prompt, parent_req, index,
                                          queue)

        # Add the EngineCoreRequest to EngineCore (separate process).
        await self.engine_core.add_request_async(request)

        if self.log_requests:
            if request.prompt_token_ids is not None:
                logger.info("Added request: %s, prompt length: %s",
                            request.request_id,
                            len(request.prompt_token_ids))
            else:
                logger.info("Added request %s.", request.request_id)