2026-04-02 04:53:13 +00:00
parent 80932c96e5
commit 24df76db9d
1987 changed files with 447445 additions and 0 deletions

View File

@@ -0,0 +1,94 @@
import enum
import time
from collections.abc import Mapping
from typing import Any, Optional, Union
import msgspec
import torch
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
from vllm.v1.engine import EngineCoreOutput, UtilityOutput
from vllm_vacc.vllm.v1.metrics.stats import SchedulerStats
# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort")
class EngineCoreRequest(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True, # type: ignore[call-arg]
gc=False): # type: ignore[call-arg]
request_id: str
prompt_token_ids: Optional[list[int]]
mm_features: Optional[list[MultiModalFeatureSpec]]
sampling_params: Optional[SamplingParams]
pooling_params: Optional[PoolingParams]
eos_token_id: Optional[int]
arrival_time: float
lora_request: Optional[LoRARequest]
cache_salt: Optional[str]
data_parallel_rank: Optional[int]
prompt_embeds: Optional[torch.Tensor] = None
deepstack_input_embeds: Optional[torch.Tensor] = None
# Index of the client, used to ensure outputs are sent back to the same
# client for this request when scaling out the front-end.
client_index: int = 0
# Used in DP case to indicate which wave of requests this is expected to
# belong to, to cover a race condition where the request is sent before
# a wave finished notification is received.
current_wave: int = 0
priority: int = 0
trace_headers: Optional[Mapping[str, str]] = None
class EngineCoreOutputs(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True, # type: ignore[call-arg]
gc=False): # type: ignore[call-arg]
# NOTE(Nick): We could consider ways to make this more compact,
# e.g. columnwise layout
engine_index: int = 0
# [num_reqs]
outputs: list[EngineCoreOutput] = []
scheduler_stats: Optional[SchedulerStats] = None
timestamp: float = 0.0
utility_output: Optional[UtilityOutput] = None
finished_requests: Optional[set[str]] = None
# In DP case, used to signal that the current wave of requests
# has finished and the engines are paused.
wave_complete: Optional[int] = None
# In DP case, used to signal that a request was received for an
# "old" wave, so the next wave needs to be started in other engines.
start_wave: Optional[int] = None
def __post_init__(self):
if self.timestamp == 0.0:
self.timestamp = time.monotonic()
class EngineCoreRequestType(enum.Enum):
"""
Request types defined as hex byte strings, so they can be sent over sockets
without a separate encoding step.
"""
ADD = b'\x00'
ABORT = b'\x01'
START_DP_WAVE = b'\x02'
UTILITY = b'\x03'
# Sentinel used within EngineCoreProc.
EXECUTOR_FAILED = b'\x04'
ADD_BULK = b'\x05'
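For orientation, here is a minimal sketch (not code from this commit) of how an ADD_BULK message could be framed for the ZMQ DEALER socket that EngineCoreProc polls: the first frame carries the one-byte request type and the remaining frames carry the msgpack payload, which the input thread decodes with MsgpackDecoder(list[EngineCoreRequest]). It assumes MsgpackEncoder from vllm.v1.serial_utils yields the payload frames, mirroring how the single-request ADD path is serialized.

from vllm.v1.serial_utils import MsgpackEncoder

def frame_add_bulk(requests: list["EngineCoreRequest"]):
    # First frame: request-type byte; remaining frames: msgpack payload.
    encoder = MsgpackEncoder()
    return (EngineCoreRequestType.ADD_BULK.value, *encoder.encode(requests))

# Usage (assuming `socket` is the client's ZMQ input socket):
# socket.send_multipart(frame_add_bulk(requests), copy=False)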

View File

@@ -0,0 +1,108 @@
from typing import Any, Optional, Union
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory
from vllm.engine.protocol import EngineClient
from vllm_vacc.vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import (OutputProcessor,
RequestOutputCollector)
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.async_llm import logger
class AsyncLLM(EngineClient):
@classmethod
def from_vllm_config(
cls,
vllm_config: VllmConfig,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[list[StatLoggerFactory]] = None,
enable_log_requests: bool = False,
disable_log_stats: bool = False,
client_addresses: Optional[dict[str, str]] = None,
client_count: int = 1,
client_index: int = 0,
disable_log_requests: bool = True, # Deprecated, will be removed
) -> "AsyncLLM":
# vacc supports only num_speculative_tokens == 1
from .vllm_config_checker import check_spec_model
check_spec_model(vllm_config)
if not envs.VLLM_USE_V1:
raise ValueError(
"Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
"This should not happen. As a workaround, try using "
"AsyncLLMEngine.from_vllm_config(...) or explicitly set "
"VLLM_USE_V1=0 or 1 and report this issue on Github.")
# Create the LLMEngine.
from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
async_cls = DefaultAsyncLLM
return async_cls(
vllm_config=vllm_config,
executor_class=Executor.get_class(vllm_config),
start_engine_loop=start_engine_loop,
stat_loggers=stat_loggers,
log_requests=enable_log_requests,
log_stats=not disable_log_stats,
usage_context=usage_context,
client_addresses=client_addresses,
client_count=client_count,
client_index=client_index,
)
@classmethod
def from_engine_args(
cls,
engine_args: AsyncEngineArgs,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[list[StatLoggerFactory]] = None,
) -> "AsyncLLM":
"""Create an AsyncLLM from the EngineArgs."""
# Create the engine configs.
vllm_config = engine_args.create_engine_config(usage_context)
executor_class = Executor.get_class(vllm_config)
# vacc supports only num_speculative_tokens == 1
from .vllm_config_checker import check_spec_model
check_spec_model(vllm_config)
# Create the AsyncLLM.
from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
async_cls = DefaultAsyncLLM
return async_cls(
vllm_config=vllm_config,
executor_class=executor_class,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats,
start_engine_loop=start_engine_loop,
usage_context=usage_context,
stat_loggers=stat_loggers,
)
async def _add_request(self, request: EngineCoreRequest,
prompt: Optional[str],
parent_req: Optional[ParentRequest], index: int,
queue: RequestOutputCollector):
# Add the request to OutputProcessor (this process).
self.output_processor.add_request(request, prompt, parent_req, index,
queue)
# Add the EngineCoreRequest to EngineCore (separate process).
await self.engine_core.add_request_async(request)
if self.log_requests:
if request.prompt_token_ids is not None:
logger.info("Added request: %s, prompt length: %s", request.request_id, len(request.prompt_token_ids))
else:
logger.info("Added request %s.", request.request_id)

View File

@@ -0,0 +1,209 @@
import os
import queue
import signal
import sys
import threading
import time
from collections import deque
from collections.abc import Generator
from concurrent.futures import Future
from contextlib import ExitStack, contextmanager
from inspect import isclass, signature
from logging import DEBUG
from typing import Any, Callable, Optional, TypeVar, Union
import msgspec
import zmq
from vllm.config import ParallelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.utils import make_zmq_socket
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.engine import (EngineCoreRequest, EngineCoreRequestType)
from vllm.v1.core.kv_cache_utils import (BlockHash,
generate_scheduler_kv_cache_config,
get_kv_cache_configs,
get_request_block_hasher,
init_none_hash)
from vllm.v1.serial_utils import MsgpackDecoder
from vllm.v1.engine.core import EngineCore
from vllm.v1.request import Request, RequestStatus
logger = init_logger(__name__)
POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5
_R = TypeVar('_R') # Return type for collective_rpc
class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""
ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD'
def process_input_sockets(self, input_addresses: list[str],
coord_input_address: Optional[str],
identity: bytes, ready_event: threading.Event):
"""Input socket IO thread."""
# Msgpack serialization decoding.
add_request_decoder = MsgpackDecoder(EngineCoreRequest)
generic_decoder = MsgpackDecoder()
bulk_add_decoder = MsgpackDecoder(list[EngineCoreRequest])
with ExitStack() as stack, zmq.Context() as ctx:
input_sockets = [
stack.enter_context(
make_zmq_socket(ctx,
input_address,
zmq.DEALER,
identity=identity,
bind=False))
for input_address in input_addresses
]
if coord_input_address is None:
coord_socket = None
else:
coord_socket = stack.enter_context(
make_zmq_socket(ctx,
coord_input_address,
zmq.XSUB,
identity=identity,
bind=False))
# Send subscription message to coordinator.
coord_socket.send(b'\x01')
# Register sockets with poller.
poller = zmq.Poller()
for input_socket in input_sockets:
# Send initial message to each input socket - this is required
# before the front-end ROUTER socket can send input messages
# back to us.
input_socket.send(b'')
poller.register(input_socket, zmq.POLLIN)
if coord_socket is not None:
poller.register(coord_socket, zmq.POLLIN)
ready_event.set()
del ready_event
while True:
for input_socket, _ in poller.poll():
# (RequestType, RequestData)
type_frame, *data_frames = input_socket.recv_multipart(
copy=False)
request_type = EngineCoreRequestType(
bytes(type_frame.buffer))
if request_type == EngineCoreRequestType.ADD_BULK:
# Key point: decode the payload as list[EngineCoreRequest], then fan the
# requests out right here on the receiving thread.
requests = bulk_add_decoder.decode(data_frames)
for r in requests:
r = self.preprocess_add_request(r)
self.input_queue.put_nowait((EngineCoreRequestType.ADD, r))
continue
# Deserialize the request data.
if request_type == EngineCoreRequestType.ADD:
request = add_request_decoder.decode(data_frames)
request = self.preprocess_add_request(request)
else:
request = generic_decoder.decode(data_frames)
# Push to input queue for core busy loop.
self.input_queue.put_nowait((request_type, request))
class EngineCore:
"""Inner loop of vLLM's Engine."""
def _initialize_kv_caches(
self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
start = time.time()
# Get all kv cache needed by the model
kv_cache_specs = self.model_executor.get_kv_cache_specs()
# get_kv_cache_specs in model_runner
# for layer_name, layer in vllm_config.compilation_config.static_forward_context.items():
# print(f'layer_name = {layer_name}; layer = {layer}')
# # Only MoE layers show up here -- attention layers cannot be retrieved?
# # TODO: handle models with no KV cache
# has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
# if has_kv_cache:
# if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
# dp_group = getattr(self, "dp_group", None)
# assert dp_group is not None
# self.available_gpu_memory_for_kv_cache = \
# ParallelConfig.sync_kv_cache_memory_size(dp_group, -1)
# available_gpu_memory = [
# self.available_gpu_memory_for_kv_cache
# ] * len(kv_cache_specs)
# else:
# # Profiles the peak memory usage of the model to determine how
# # much memory can be allocated for kv cache.
# available_gpu_memory = (
# self.model_executor.determine_available_memory())
# self.available_gpu_memory_for_kv_cache = \
# available_gpu_memory[0]
# else:
# # Attention free models don't need memory for kv cache
# available_gpu_memory = [0] * len(kv_cache_specs)
memory_blocks = self.model_executor.determine_available_memory_block()  # one (available_memory, num_blocks) pair per rank
available_gpu_memory = [memory_block[0] for memory_block in memory_blocks]
num_gpu_blocks = memory_blocks[0][1]
assert len(kv_cache_specs) == len(available_gpu_memory)
kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
available_gpu_memory)
### Patch: rescale KV-cache tensor sizes to support long sequence lengths with MTP
for kv_cache_config in kv_cache_configs:
for ii in range(len(kv_cache_config.kv_cache_tensors)):
kv_cache_config.kv_cache_tensors[ii].size = kv_cache_config.kv_cache_tensors[ii].size * num_gpu_blocks // kv_cache_config.num_blocks
kv_cache_config.num_blocks = num_gpu_blocks
### End of MTP long-sequence patch (a small worked example follows this file)
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
kv_cache_configs)
num_gpu_blocks = scheduler_kv_cache_config.num_blocks
num_cpu_blocks = 0
# Initialize kv cache and warmup the execution
self.model_executor.initialize_from_config(kv_cache_configs)
elapsed = time.time() - start
logger.info(("init engine (profile, create kv cache, "
"warmup model) took %.2f seconds"), elapsed)
return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
def preprocess_add_request(
self, request: EngineCoreRequest) -> tuple[Request, int]:
"""Preprocess the request.
This function can be called directly from the input processing thread so
that request initialization runs in parallel with the model forward pass.
"""
# Note on thread safety: no race condition.
# `mm_receiver_cache` is reset at the end of LLMEngine init,
# and will only be accessed in the input processing thread afterwards.
if self.mm_receiver_cache is not None and request.mm_features:
request.mm_features = (
self.mm_receiver_cache.get_and_update_features(
request.mm_features))
req = Request.from_engine_core_request(request,
self.request_block_hasher)
if req.use_structured_output:
# Note on thread safety: no race condition.
# `grammar_init` is only invoked in input processing thread. For
# `structured_output_manager`, each request is independent and
# grammar compilation is async. Scheduler always checks grammar
# compilation status before scheduling request.
self.structured_output_manager.grammar_init(req)
return req, request.current_wave
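For clarity, the MTP patch in _initialize_kv_caches above keeps the bytes-per-block of each KV-cache tensor constant while forcing the block count to the value reported by determine_available_memory_block(). A small self-contained illustration of that integer arithmetic (the numbers are made up):

def rescale_tensor_size(tensor_size: int, old_num_blocks: int,
                        new_num_blocks: int) -> int:
    # Same arithmetic as the patch: scale the tensor size by the ratio of
    # block counts so bytes-per-block stays constant.
    return tensor_size * new_num_blocks // old_num_blocks

# A tensor sized for 8192 blocks at 2 MiB per block (16 GiB total), rescaled
# to a reported budget of 6144 blocks, becomes 12 GiB.
assert rescale_tensor_size(16 * 2**30, 8192, 6144) == 12 * 2**30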

View File

@@ -0,0 +1,76 @@
import asyncio
import contextlib
import queue
import sys
import uuid
import weakref
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from collections.abc import Awaitable, Sequence
from concurrent.futures import Future
from dataclasses import dataclass
from threading import Thread
from typing import Any, Callable, Optional, TypeVar, Union, List
import msgspec.msgpack
import zmq
import zmq.asyncio
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.utils import get_open_zmq_inproc_path, make_zmq_socket
from vllm.v1.engine import (EngineCoreOutputs,
EngineCoreRequestType, UtilityOutput)
from vllm_vacc.vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.coordinator import DPCoordinator
from vllm.v1.engine.core import EngineCore, EngineCoreProc
from vllm.v1.engine.exceptions import EngineDeadError
from vllm.v1.engine.utils import (CoreEngineActorManager,
CoreEngineProcManager, launch_core_engines)
from vllm.v1.executor.abstract import Executor
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
from vllm.v1.engine.core_client import MPClient
logger = init_logger(__name__)
AnyFuture = Union[asyncio.Future[Any], Future[Any]]
_R = TypeVar('_R') # Return type for collective_rpc
EngineIdentity = bytes
class EngineCoreClient(ABC):
"""
EngineCoreClient: subclasses handle different methods for pushing
and pulling from the EngineCore for asyncio / multiprocessing.
Subclasses:
* InprocClient: In process EngineCore (for V0-style LLMEngine use)
* SyncMPClient: ZMQ + background proc EngineCore (for LLM)
* AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
"""
@abstractmethod
def _send_input(self, req_type: EngineCoreRequestType, payload: Any) -> None:
"""Send a request to EngineCore."""
raise NotImplementedError
def add_requests(self, requests: List["EngineCoreRequest"]) -> None:
"""一次性发多条 ADDADD_BULK"""
if not requests:
return
self._send_input(EngineCoreRequestType.ADD_BULK, requests)
class SyncMPClient(MPClient):
"""Synchronous client for multi-proc EngineCore."""
def add_requests(self, requests: List[EngineCoreRequest]) -> None:
if not requests:
return
if self.is_dp:  # keep behavior consistent with add_request
self.engines_running = True
self._send_input(EngineCoreRequestType.ADD_BULK, requests)
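A hedged usage sketch of the batched client path: several already-processed requests are submitted in one ADD_BULK round trip instead of one ADD per request. `processor` and `client` are assumed to be an already-constructed Processor and SyncMPClient; the prompts and sampling parameters are placeholders.

from vllm.sampling_params import SamplingParams

prompts = ["Hello, world", "Explain KV caching in one sentence."]
params_list = [SamplingParams(max_tokens=32) for _ in prompts]

core_reqs = []
for i, (prompt, params) in enumerate(zip(prompts, params_list)):
    _, core_req = processor.process_inputs(   # `processor`: assumed Processor
        request_id=f"req-{i}",
        prompt=prompt,
        params=params,
    )
    core_reqs.append(core_req)

client.add_requests(core_reqs)  # `client`: assumed SyncMPClient; one ADD_BULK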

View File

@@ -0,0 +1,176 @@
from collections.abc import Mapping
from copy import copy
from typing import Any, Callable, Optional, Union
from typing_extensions import TypeVar
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
# from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase,
StatLoggerFactory)
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
from vllm.v1.metrics.stats import IterationStats
from vllm.v1.engine import EngineCoreRequest
logger = init_logger(__name__)
class LLMEngine:
@classmethod
def from_vllm_config(
cls,
vllm_config: VllmConfig,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[list[StatLoggerFactory]] = None,
disable_log_stats: bool = False,
) -> "LLMEngine":
# vacc supports only num_speculative_tokens == 1
from .vllm_config_checker import check_spec_model
check_spec_model(vllm_config)
from vllm.v1.engine.llm_engine import LLMEngine as DefaultLLM
default_cls = DefaultLLM
return default_cls(vllm_config=vllm_config,
executor_class=Executor.get_class(vllm_config),
log_stats=(not disable_log_stats),
usage_context=usage_context,
stat_loggers=stat_loggers,
multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)
@classmethod
def from_engine_args(
cls,
engine_args: EngineArgs,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[list[StatLoggerFactory]] = None,
enable_multiprocessing: bool = False,
) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
vllm_config = engine_args.create_engine_config(usage_context)
executor_class = Executor.get_class(vllm_config)
# vacc supports only num_speculative_tokens == 1
from .vllm_config_checker import check_spec_model
check_spec_model(vllm_config)
if envs.VLLM_ENABLE_V1_MULTIPROCESSING:
logger.debug("Enabling multiprocessing for LLMEngine.")
enable_multiprocessing = True
# Create the LLMEngine.
from vllm.v1.engine.llm_engine import LLMEngine as DefaultLLM
default_cls = DefaultLLM
return default_cls(vllm_config=vllm_config,
executor_class=executor_class,
log_stats=not engine_args.disable_log_stats,
usage_context=usage_context,
stat_loggers=stat_loggers,
multiprocess_mode=enable_multiprocessing)
"""Legacy LLMEngine for backwards compatibility."""
def add_requests(
self,
items: list[tuple[
str, # request_id
PromptType, # prompt
Union[SamplingParams, PoolingParams], # params
Optional[float], # arrival_time
Optional[LoRARequest], # lora_request
Optional[dict], # tokenization_kwargs
Optional[dict], # trace_headers
# Optional[PromptAdapterRequest], # prompt_adapter_request
int, # priority
]],
) -> None:
"""批量把请求送入 EngineCore一次性触发 ADD_BULK。"""
core_reqs: list[EngineCoreRequest] = []
for (request_id, prompt, params, arrival_time, lora_request,
tokenization_kwargs, trace_headers,
priority) in items:
# Reuse the parsing entry point of the existing per-request path.
prompt_str, request = self.processor.process_inputs(
request_id=request_id,
prompt=prompt,
params=params,
arrival_time=arrival_time,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
trace_headers=trace_headers,
# prompt_adapter_request=prompt_adapter_request,
priority=priority,
)
n = params.n if isinstance(params, SamplingParams) else 1
if n == 1:
# Make a new RequestState and queue.
self.output_processor.add_request(request, prompt_str, None, 0)
# Defer adding to EngineCore: collected and sent in one ADD_BULK below.
core_reqs.append(request)
continue
# Fan out child requests (for n>1).
parent_req = ParentRequest(request_id, params)
for idx in range(n):
request_id, params = parent_req.get_child_info(idx)
child_request = request if idx == n - 1 else copy(request)
child_request.request_id = request_id
child_request.sampling_params = params
# Make a new RequestState and queue.
self.output_processor.add_request(child_request, prompt_str,
parent_req, idx)
# Defer adding to EngineCore: collected and sent in one ADD_BULK below.
core_reqs.append(child_request)
# output_processor needs an entry for every req_id that actually enters the
# engine: for SamplingParams with n > 1 the request is split into parent and
# child requests above; otherwise the single request is registered directly.
# Key step: hand the whole batch to the core in one call; EngineCoreClient
# sends a single ADD_BULK message.
logger.debug("Submitting %d requests to EngineCore via ADD_BULK", len(core_reqs))
self.engine_core.add_requests(core_reqs)
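A hedged usage sketch of the LLMEngine-level batch path above; `engine` is assumed to be an already-constructed LLMEngine, and each item follows the 8-tuple layout documented in the add_requests signature.

from vllm.sampling_params import SamplingParams

items = [
    (f"req-{i}",                     # request_id
     prompt,                         # prompt
     SamplingParams(max_tokens=64),  # params
     None,                           # arrival_time (defaults to now)
     None,                           # lora_request
     None,                           # tokenization_kwargs
     None,                           # trace_headers
     0)                              # priority
    for i, prompt in enumerate(["Hello", "What is speculative decoding?"])
]

engine.add_requests(items)  # one ADD_BULK instead of N add_request() calls
# ...then drive engine.step() as usual to collect the RequestOutputs.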

View File

@@ -0,0 +1,184 @@
import time
from collections.abc import Mapping
from typing import Any, Literal, Optional, Union
import torch
from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
from vllm.inputs.parse import split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.cache import processor_cache_from_config
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.structured_output.backend_guidance import (
validate_guidance_grammar)
from vllm.v1.structured_output.backend_lm_format_enforcer import (
validate_structured_output_request_lm_format_enforcer)
from vllm.v1.structured_output.backend_outlines import (
validate_structured_output_request_outlines)
from vllm.v1.structured_output.backend_xgrammar import (
validate_xgrammar_grammar)
logger = init_logger(__name__)
class Processor:
def process_inputs(
self,
request_id: str,
prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]:
# TODO(woosuk): Support pooling models.
self._validate_lora(lora_request)
self._validate_params(params)
data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
if data_parallel_rank is not None and not (0 <= data_parallel_rank <
data_parallel_size):
raise ValueError(f"data_parallel_rank {data_parallel_rank} "
f"is out of range [0, {data_parallel_size}).")
if arrival_time is None:
arrival_time = time.time()
# Optionally generate multimodal hash overrides to avoid hashing
# multimodal data items by their content as their identifiers.
# NOTE: when users explicitly turn off BOTH prefix caching and input
# processing caching, no multimodal features or embeddings will be
# reused across requests, therefore identifying multimodal data items
# by their content is no longer necessary, and we create uuids with
# request id-modality-index as multimodal hash overrides.
if (self.model_config.multimodal_config and
self.model_config.multimodal_config.mm_processor_cache_gb == 0
and not self.cache_config.enable_prefix_caching):
mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
else:
# Otherwise, use user-provided uuids as multimodal hash overrides
# if provided.
self._validate_multi_modal_uuids(prompt)
if isinstance(prompt, dict):
mm_uuids = prompt.get("multi_modal_uuids")
else:
mm_uuids = None
# Process inputs, which includes:
# 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly.
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt,
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
from vllm.platforms import current_platform
current_platform.validate_request(
prompt=prompt,
params=params,
processed_inputs=processed_inputs,
)
eos_token_id = self.input_preprocessor.get_eos_token_id()
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
self._validate_model_inputs(encoder_inputs, decoder_inputs)
# Mypy does not always properly infer the types of some elements of
# discriminated unions of TypedDicts, because of how it handles
# inheritance of TypedDict. If we explicitly extract the items we want
# we can avoid type errors from using `dict.get` later in the method.
prompt_str: Optional[str] = None if decoder_inputs[
"type"] == "embeds" else decoder_inputs.get("prompt")
prompt_token_ids = decoder_inputs[
"prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
"type"] == "embeds" else None
deepstack_input_embeds = decoder_inputs["deepstack_input_embeds"] if decoder_inputs[
"type"] == "embeds" else None
# deepstack_input_embeds passed via llm.generate() may arrive as a dict of
# per-layer tensors; stack them into a single tensor (see the illustrative
# note after this function).
if isinstance(deepstack_input_embeds, dict):
all_tensors = []
for key in deepstack_input_embeds:
if isinstance(deepstack_input_embeds[key], torch.Tensor):
all_tensors.append(deepstack_input_embeds[key].unsqueeze(0))
if len(all_tensors) > 0:
deepstack_input_embeds = torch.concatenate(all_tensors, 0)
sampling_params = None
pooling_params = None
if isinstance(params, SamplingParams):
# TODO: can we avoid cloning here in multiproc case?
sampling_params = params.clone()
# If unset max tokens, then generate up to the max_model_len.
if sampling_params.max_tokens is None:
seq_len = length_from_prompt_token_ids_or_embeds(
prompt_token_ids, prompt_embeds)
sampling_params.max_tokens = \
self.model_config.max_model_len - seq_len
sampling_params.update_from_generation_config(
self.generation_config_fields, eos_token_id)
if self.tokenizer is not None:
sampling_params.update_from_tokenizer(self.tokenizer)
else:
pooling_params = params.clone()
# Multimodal related.
mm_features: Optional[list[MultiModalFeatureSpec]] = None
if decoder_inputs["type"] == "multimodal":
decoder_mm_inputs = decoder_inputs["mm_kwargs"]
decoder_mm_positions = decoder_inputs["mm_placeholders"]
decoder_mm_hashes = decoder_inputs["mm_hashes"]
# Merge and flatten multimodal placeholders, hashes and inputs
# from dictionaries to lists, and sort them by each item's position
# in the input sequence.
sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)
mm_features = []
for modality, idx in sorted_mm_idxs:
mm_features.append(
MultiModalFeatureSpec(
data=decoder_mm_inputs[modality][idx],
modality=modality,
identifier=decoder_mm_hashes[modality][idx],
mm_position=decoder_mm_positions[modality][idx]))
return prompt_str, EngineCoreRequest(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
prompt_embeds=prompt_embeds,
deepstack_input_embeds=deepstack_input_embeds,
mm_features=mm_features,
sampling_params=sampling_params,
pooling_params=pooling_params,
eos_token_id=eos_token_id,
arrival_time=arrival_time,
lora_request=lora_request,
cache_salt=decoder_inputs.get("cache_salt"),
priority=priority,
data_parallel_rank=data_parallel_rank,
trace_headers=trace_headers,
)
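As an aside on the deepstack handling above, the dict-stacking step is equivalent to stacking the per-layer tensors along a new leading dimension; a tiny standalone illustration (shapes are made up):

import torch

# Three per-layer embeddings of shape [seq_len, hidden] become one
# [num_layers, seq_len, hidden] tensor, matching the unsqueeze/concatenate above.
embeds = {"layer_0": torch.zeros(8, 16),
          "layer_1": torch.ones(8, 16),
          "layer_2": torch.full((8, 16), 2.0)}
stacked = torch.concatenate([t.unsqueeze(0) for t in embeds.values()], 0)
assert stacked.shape == (3, 8, 16)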

View File

@@ -0,0 +1,17 @@
# Check the speculative-decoding config and record the spec method in the
# vacc config manager.
def check_spec_model(vllm_config):
# Determine whether speculative decoding is configured.
speculative_mode = getattr(vllm_config, 'speculative_config', None) is not None
if speculative_mode and \
hasattr(vllm_config.speculative_config, 'num_speculative_tokens') and \
vllm_config.speculative_config.num_speculative_tokens != 1:
raise ValueError(f'run_mp_engine: only num_speculative_tokens == 1 is supported, but got {vllm_config.speculative_config.num_speculative_tokens}')
default_model_infos = "default"
if speculative_mode:
if hasattr(vllm_config.speculative_config, 'method'):
default_model_infos = vllm_config.speculative_config.method
from vllm_vacc.vllm.config_manager import vllm_vacc_config_manager
vllm_vacc_config_manager().update_model_infos(default_model_infos)
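Illustrative only: the checker accepts a config whose speculative_config is missing (or None) or has num_speculative_tokens == 1, and raises otherwise. SimpleNamespace stands in for a real VllmConfig here.

from types import SimpleNamespace

ok = SimpleNamespace(speculative_config=SimpleNamespace(
    num_speculative_tokens=1, method="mtp"))
bad = SimpleNamespace(speculative_config=SimpleNamespace(
    num_speculative_tokens=2, method="mtp"))

check_spec_model(ok)     # records method "mtp" via the vacc config manager
# check_spec_model(bad)  # would raise ValueError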