init
vllm_vacc/vllm/v1/engine/__init__.py (new file, 94 lines)
@@ -0,0 +1,94 @@
import enum
import time
from collections.abc import Mapping
from typing import Any, Optional, Union

import msgspec
import torch

from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
from vllm.v1.engine import EngineCoreOutput, UtilityOutput

from vllm_vacc.vllm.v1.metrics.stats import SchedulerStats

# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort")


class EngineCoreRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]

    request_id: str
    prompt_token_ids: Optional[list[int]]
    mm_features: Optional[list[MultiModalFeatureSpec]]
    sampling_params: Optional[SamplingParams]
    pooling_params: Optional[PoolingParams]
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]
    cache_salt: Optional[str]
    data_parallel_rank: Optional[int]
    prompt_embeds: Optional[torch.Tensor] = None
    deepstack_input_embeds: Optional[torch.Tensor] = None
    # Index of the client, used to ensure outputs are sent back to the same
    # client for this request when scaling out the front-end.
    client_index: int = 0

    # Used in DP case to indicate which wave of requests this is expected to
    # belong to, to cover a race condition where the request is sent before
    # a wave finished notification is received.
    current_wave: int = 0
    priority: int = 0

    trace_headers: Optional[Mapping[str, str]] = None


class EngineCoreOutputs(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]

    # NOTE(Nick): We could consider ways to make this more compact,
    # e.g. columnwise layout

    engine_index: int = 0

    # [num_reqs]
    outputs: list[EngineCoreOutput] = []
    scheduler_stats: Optional[SchedulerStats] = None
    timestamp: float = 0.0

    utility_output: Optional[UtilityOutput] = None
    finished_requests: Optional[set[str]] = None

    # In DP case, used to signal that the current wave of requests
    # has finished and the engines are paused.
    wave_complete: Optional[int] = None
    # In DP case, used to signal that a request was received for an
    # "old" wave, so the next wave needs to be started in other engines.
    start_wave: Optional[int] = None

    def __post_init__(self):
        if self.timestamp == 0.0:
            self.timestamp = time.monotonic()


class EngineCoreRequestType(enum.Enum):
    """
    Request types defined as hex byte strings, so they can be sent over
    sockets without a separate encoding step.
    """
    ADD = b'\x00'
    ABORT = b'\x01'
    START_DP_WAVE = b'\x02'
    UTILITY = b'\x03'
    # Sentinel used within EngineCoreProc.
    EXECUTOR_FAILED = b'\x04'
    ADD_BULK = b'\x05'
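The hex-byte values above double as wire-level frame tags. As a rough sketch (not part of this commit; the frame layout is assumed from EngineCoreProc.process_input_sockets(), which reads (type_frame, *data_frames) and recovers the type via EngineCoreRequestType(bytes(type_frame.buffer))), a client could frame a request roughly like this; in vLLM it is the MsgpackEncoder that actually handles tensor fields:

    import msgspec
    # `engine_request` and `input_socket` are hypothetical placeholders.
    encoder = msgspec.msgpack.Encoder()
    payload = encoder.encode(engine_request)
    input_socket.send_multipart(
        (EngineCoreRequestType.ADD.value, payload), copy=False)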
BIN vllm_vacc/vllm/v1/engine/__pycache__/__init__.cpython-312.pyc (new binary file, not shown)
BIN vllm_vacc/vllm/v1/engine/__pycache__/async_llm.cpython-312.pyc (new binary file, not shown)
BIN vllm_vacc/vllm/v1/engine/__pycache__/core.cpython-312.pyc (new binary file, not shown)
BIN vllm_vacc/vllm/v1/engine/__pycache__/core_client.cpython-312.pyc (new binary file, not shown)
BIN vllm_vacc/vllm/v1/engine/__pycache__/llm_engine.cpython-312.pyc (new binary file, not shown)
BIN vllm_vacc/vllm/v1/engine/__pycache__/processor.cpython-312.pyc (new binary file, not shown)
vllm_vacc/vllm/v1/engine/async_llm.py (new file, 108 lines)
@@ -0,0 +1,108 @@
from typing import Any, Optional, Union

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory

from vllm.engine.protocol import EngineClient
from vllm_vacc.vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import (OutputProcessor,
                                             RequestOutputCollector)
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.async_llm import logger


class AsyncLLM(EngineClient):

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
        enable_log_requests: bool = False,
        disable_log_stats: bool = False,
        client_addresses: Optional[dict[str, str]] = None,
        client_count: int = 1,
        client_index: int = 0,
        disable_log_requests: bool = True,  # Deprecated, will be removed
    ) -> "AsyncLLM":
        # vacc only supports spec_num = 1
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
                "This should not happen. As a workaround, try using "
                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
                "VLLM_USE_V1=0 or 1 and report this issue on Github.")

        # Create the AsyncLLM.
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        async_cls = DefaultAsyncLLM
        return async_cls(
            vllm_config=vllm_config,
            executor_class=Executor.get_class(vllm_config),
            start_engine_loop=start_engine_loop,
            stat_loggers=stat_loggers,
            log_requests=enable_log_requests,
            log_stats=not disable_log_stats,
            usage_context=usage_context,
            client_addresses=client_addresses,
            client_count=client_count,
            client_index=client_index,
        )

    @classmethod
    def from_engine_args(
        cls,
        engine_args: AsyncEngineArgs,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
    ) -> "AsyncLLM":
        """Create an AsyncLLM from the EngineArgs."""
        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)
        executor_class = Executor.get_class(vllm_config)

        # vacc only supports spec_num = 1
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        # Create the AsyncLLM.
        from vllm.v1.engine.async_llm import AsyncLLM as DefaultAsyncLLM
        async_cls = DefaultAsyncLLM
        return async_cls(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_requests=not engine_args.disable_log_requests,
            log_stats=not engine_args.disable_log_stats,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )

    async def _add_request(self, request: EngineCoreRequest,
                           prompt: Optional[str],
                           parent_req: Optional[ParentRequest], index: int,
                           queue: RequestOutputCollector):

        # Add the request to OutputProcessor (this process).
        self.output_processor.add_request(request, prompt, parent_req, index,
                                          queue)

        # Add the EngineCoreRequest to EngineCore (separate process).
        await self.engine_core.add_request_async(request)

        if self.log_requests:
            if request.prompt_token_ids is not None:
                logger.info("Added request: %s, prompt length: %s",
                            request.request_id,
                            len(request.prompt_token_ids))
            else:
                logger.info("Added request %s.", request.request_id)
vllm_vacc/vllm/v1/engine/core.py (new file, 209 lines)
@@ -0,0 +1,209 @@
import os
import queue
import signal
import sys
import threading
import time
from collections import deque
from collections.abc import Generator
from concurrent.futures import Future
from contextlib import ExitStack, contextmanager
from inspect import isclass, signature
from logging import DEBUG
from typing import Any, Callable, Optional, TypeVar, Union

import msgspec
import zmq

from vllm.config import ParallelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.utils import make_zmq_socket

from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.engine import (EngineCoreRequest, EngineCoreRequestType)
from vllm.v1.core.kv_cache_utils import (BlockHash,
                                         generate_scheduler_kv_cache_config,
                                         get_kv_cache_configs,
                                         get_request_block_hasher,
                                         init_none_hash)
from vllm.v1.serial_utils import MsgpackDecoder
from vllm.v1.engine.core import EngineCore
from vllm.v1.request import Request, RequestStatus

logger = init_logger(__name__)

POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5

_R = TypeVar('_R')  # Return type for collective_rpc


class EngineCoreProc(EngineCore):
    """ZMQ-wrapper for running EngineCore in background process."""

    ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD'

    def process_input_sockets(self, input_addresses: list[str],
                              coord_input_address: Optional[str],
                              identity: bytes, ready_event: threading.Event):
        """Input socket IO thread."""

        # Msgpack serialization decoding.
        add_request_decoder = MsgpackDecoder(EngineCoreRequest)
        generic_decoder = MsgpackDecoder()
        bulk_add_decoder = MsgpackDecoder(list[EngineCoreRequest])

        with ExitStack() as stack, zmq.Context() as ctx:
            input_sockets = [
                stack.enter_context(
                    make_zmq_socket(ctx,
                                    input_address,
                                    zmq.DEALER,
                                    identity=identity,
                                    bind=False))
                for input_address in input_addresses
            ]
            if coord_input_address is None:
                coord_socket = None
            else:
                coord_socket = stack.enter_context(
                    make_zmq_socket(ctx,
                                    coord_input_address,
                                    zmq.XSUB,
                                    identity=identity,
                                    bind=False))
                # Send subscription message to coordinator.
                coord_socket.send(b'\x01')

            # Register sockets with poller.
            poller = zmq.Poller()
            for input_socket in input_sockets:
                # Send initial message to each input socket - this is required
                # before the front-end ROUTER socket can send input messages
                # back to us.
                input_socket.send(b'')
                poller.register(input_socket, zmq.POLLIN)

            if coord_socket is not None:
                poller.register(coord_socket, zmq.POLLIN)

            ready_event.set()
            del ready_event
            while True:
                for input_socket, _ in poller.poll():
                    # (RequestType, RequestData)
                    type_frame, *data_frames = input_socket.recv_multipart(
                        copy=False)
                    request_type = EngineCoreRequestType(
                        bytes(type_frame.buffer))

                    if request_type == EngineCoreRequestType.ADD_BULK:
                        # Key point: decode as list[EngineCoreRequest], then
                        # fan out in place on the receiving thread.
                        requests = bulk_add_decoder.decode(data_frames)
                        for r in requests:
                            r = self.preprocess_add_request(r)
                            self.input_queue.put_nowait(
                                (EngineCoreRequestType.ADD, r))
                        continue

                    # Deserialize the request data.
                    if request_type == EngineCoreRequestType.ADD:
                        request = add_request_decoder.decode(data_frames)
                        request = self.preprocess_add_request(request)
                    else:
                        request = generic_decoder.decode(data_frames)

                    # Push to input queue for core busy loop.
                    self.input_queue.put_nowait((request_type, request))


class EngineCore:
    """Inner loop of vLLM's Engine."""

    def _initialize_kv_caches(
            self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
        start = time.time()

        # Get all kv cache needed by the model
        kv_cache_specs = self.model_executor.get_kv_cache_specs()

        # get_kv_cache_specs in model_runner
        # for layer_name, layer in vllm_config.compilation_config.static_forward_context.items():
        #     print(f'layer_name = {layer_name}; layer = {layer}')
        #     # only MoE layers show up here; attention layers are not reachable?
        #     # TODO for models with no kv cache

        # has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)

        # if has_kv_cache:
        #     if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
        #         dp_group = getattr(self, "dp_group", None)
        #         assert dp_group is not None
        #         self.available_gpu_memory_for_kv_cache = \
        #             ParallelConfig.sync_kv_cache_memory_size(dp_group, -1)
        #         available_gpu_memory = [
        #             self.available_gpu_memory_for_kv_cache
        #         ] * len(kv_cache_specs)
        #     else:
        #         # Profiles the peak memory usage of the model to determine how
        #         # much memory can be allocated for kv cache.
        #         available_gpu_memory = (
        #             self.model_executor.determine_available_memory())
        #         self.available_gpu_memory_for_kv_cache = \
        #             available_gpu_memory[0]
        # else:
        #     # Attention free models don't need memory for kv cache
        #     available_gpu_memory = [0] * len(kv_cache_specs)

        # [(memory, blocks) * rank_number]
        memory_blocks = self.model_executor.determine_available_memory_block()
        available_gpu_memory = [
            memory_block[0] for memory_block in memory_blocks
        ]
        num_gpu_blocks = memory_blocks[0][1]

        assert len(kv_cache_specs) == len(available_gpu_memory)

        kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
                                                available_gpu_memory)

        ### patch here to support long seq_length for mtp
        for kv_cache_config in kv_cache_configs:
            for ii in range(len(kv_cache_config.kv_cache_tensors)):
                kv_cache_config.kv_cache_tensors[ii].size = (
                    kv_cache_config.kv_cache_tensors[ii].size *
                    num_gpu_blocks // kv_cache_config.num_blocks)

            kv_cache_config.num_blocks = num_gpu_blocks
        ### patch here to support long seq_length for mtp end
        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
            kv_cache_configs)
        num_gpu_blocks = scheduler_kv_cache_config.num_blocks
        num_cpu_blocks = 0

        # Initialize kv cache and warmup the execution
        self.model_executor.initialize_from_config(kv_cache_configs)

        elapsed = time.time() - start
        logger.info(("init engine (profile, create kv cache, "
                     "warmup model) took %.2f seconds"), elapsed)
        return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config

    def preprocess_add_request(
            self, request: EngineCoreRequest) -> tuple[Request, int]:
        """Preprocess the request.

        This function can be called directly from the input processing thread
        so that request initialization runs in parallel with the model
        forward pass.
        """
        # Note on thread safety: no race condition.
        # `mm_receiver_cache` is reset at the end of LLMEngine init,
        # and will only be accessed in the input processing thread afterwards.
        if self.mm_receiver_cache is not None and request.mm_features:
            request.mm_features = (
                self.mm_receiver_cache.get_and_update_features(
                    request.mm_features))

        req = Request.from_engine_core_request(request,
                                               self.request_block_hasher)
        if req.use_structured_output:
            # Note on thread safety: no race condition.
            # `grammar_init` is only invoked in input processing thread. For
            # `structured_output_manager`, each request is independent and
            # grammar compilation is async. Scheduler always checks grammar
            # compilation status before scheduling request.
            self.structured_output_manager.grammar_init(req)
        return req, request.current_wave
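To sanity-check the MTP long-sequence patch in _initialize_kv_caches above, here is a small worked example with hypothetical numbers (not taken from this commit): each tensor size is rescaled so that bytes per block stay constant while the block count is forced to the executor-reported value.

    # Hypothetical numbers for illustration only.
    profiled_num_blocks = 8192        # kv_cache_config.num_blocks from get_kv_cache_configs
    profiled_tensor_size = 8 * 2**30  # 8 GiB total, i.e. 1 MiB per block
    device_num_gpu_blocks = 16384     # memory_blocks[0][1] from the executor

    new_size = profiled_tensor_size * device_num_gpu_blocks // profiled_num_blocks
    assert new_size == 16 * 2**30     # 16 GiB: per-block size (1 MiB) is unchanged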
vllm_vacc/vllm/v1/engine/core_client.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import asyncio
import contextlib
import queue
import sys
import uuid
import weakref
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from collections.abc import Awaitable, Sequence
from concurrent.futures import Future
from dataclasses import dataclass
from threading import Thread
from typing import Any, Callable, Optional, TypeVar, Union, List

import msgspec.msgpack
import zmq
import zmq.asyncio

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.utils import get_open_zmq_inproc_path, make_zmq_socket
from vllm.v1.engine import (EngineCoreOutputs,
                            EngineCoreRequestType, UtilityOutput)
from vllm_vacc.vllm.v1.engine import EngineCoreRequest

from vllm.v1.engine.coordinator import DPCoordinator
from vllm.v1.engine.core import EngineCore, EngineCoreProc
from vllm.v1.engine.exceptions import EngineDeadError
from vllm.v1.engine.utils import (CoreEngineActorManager,
                                  CoreEngineProcManager, launch_core_engines)
from vllm.v1.executor.abstract import Executor
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
from vllm.v1.engine.core_client import MPClient


logger = init_logger(__name__)

AnyFuture = Union[asyncio.Future[Any], Future[Any]]

_R = TypeVar('_R')  # Return type for collective_rpc

EngineIdentity = bytes


class EngineCoreClient(ABC):
    """
    EngineCoreClient: subclasses handle different methods for pushing
    and pulling from the EngineCore for asyncio / multiprocessing.

    Subclasses:
    * InprocClient: In process EngineCore (for V0-style LLMEngine use)
    * SyncMPClient: ZMQ + background proc EngineCore (for LLM)
    * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
    """

    @abstractmethod
    def _send_input(self, req_type: EngineCoreRequestType,
                    payload: Any) -> None:
        """Send a request to EngineCore."""
        raise NotImplementedError

    def add_requests(self, requests: List["EngineCoreRequest"]) -> None:
        """Send multiple ADD requests in a single message (ADD_BULK)."""
        if not requests:
            return
        self._send_input(EngineCoreRequestType.ADD_BULK, requests)


class SyncMPClient(MPClient):
    """Synchronous client for multi-proc EngineCore."""

    def add_requests(self, requests: List[EngineCoreRequest]) -> None:
        if not requests:
            return
        if self.is_dp:  # Keep consistent with add_request.
            self.engines_running = True
        self._send_input(EngineCoreRequestType.ADD_BULK, requests)
vllm_vacc/vllm/v1/engine/llm_engine.py (new file, 176 lines)
@@ -0,0 +1,176 @@
from collections.abc import Mapping
from copy import copy
from typing import Any, Callable, Optional, Union

from typing_extensions import TypeVar

import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
# from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase,
                                     StatLoggerFactory)
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
from vllm.v1.metrics.stats import IterationStats
from vllm.v1.engine import EngineCoreRequest

logger = init_logger(__name__)


class LLMEngine:
    """Legacy LLMEngine for backwards compatibility."""

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
        disable_log_stats: bool = False,
    ) -> "LLMEngine":
        # vacc only supports spec_num = 1
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        from vllm.v1.engine.llm_engine import LLMEngine as DefaultLLM
        default_cls = DefaultLLM
        return default_cls(
            vllm_config=vllm_config,
            executor_class=Executor.get_class(vllm_config),
            log_stats=(not disable_log_stats),
            usage_context=usage_context,
            stat_loggers=stat_loggers,
            multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[list[StatLoggerFactory]] = None,
        enable_multiprocessing: bool = False,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)
        executor_class = Executor.get_class(vllm_config)

        # vacc only supports spec_num = 1
        from .vllm_config_checker import check_spec_model
        check_spec_model(vllm_config)

        if envs.VLLM_ENABLE_V1_MULTIPROCESSING:
            logger.debug("Enabling multiprocessing for LLMEngine.")
            enable_multiprocessing = True

        # Create the LLMEngine.
        from vllm.v1.engine.llm_engine import LLMEngine as DefaultLLM
        default_cls = DefaultLLM
        return default_cls(vllm_config=vllm_config,
                           executor_class=executor_class,
                           log_stats=not engine_args.disable_log_stats,
                           usage_context=usage_context,
                           stat_loggers=stat_loggers,
                           multiprocess_mode=enable_multiprocessing)

    def add_requests(
        self,
        items: list[tuple[
            str,  # request_id
            PromptType,  # prompt
            Union[SamplingParams, PoolingParams],  # params
            Optional[float],  # arrival_time
            Optional[LoRARequest],  # lora_request
            Optional[dict],  # tokenization_kwargs
            Optional[dict],  # trace_headers
            # Optional[PromptAdapterRequest],  # prompt_adapter_request
            int,  # priority
        ]],
    ) -> None:
        """Submit requests to EngineCore in batch, triggering a single
        ADD_BULK message."""
        core_reqs: list[EngineCoreRequest] = []

        for (request_id, prompt, params, arrival_time, lora_request,
             tokenization_kwargs, trace_headers,
             priority) in items:

            # Reuse the parsing entry point of the existing per-request flow.
            prompt_str, request = self.processor.process_inputs(
                request_id=request_id,
                prompt=prompt,
                params=params,
                arrival_time=arrival_time,
                lora_request=lora_request,
                tokenization_kwargs=tokenization_kwargs,
                trace_headers=trace_headers,
                # prompt_adapter_request=prompt_adapter_request,
                priority=priority,
            )

            n = params.n if isinstance(params, SamplingParams) else 1

            if n == 1:
                # Make a new RequestState and queue.
                self.output_processor.add_request(request, prompt_str, None, 0)
                # Add the request to EngineCore.
                core_reqs.append(request)
                continue
                # self.engine_core.add_request(request)
                # return

            # Fan out child requests (for n>1).
            parent_req = ParentRequest(request_id, params)
            for idx in range(n):
                request_id, params = parent_req.get_child_info(idx)
                child_request = request if idx == n - 1 else copy(request)
                child_request.request_id = request_id
                child_request.sampling_params = params

                # Make a new RequestState and queue.
                self.output_processor.add_request(child_request, prompt_str,
                                                  parent_req, idx)
                # Add the request to EngineCore.
                # self.engine_core.add_request(child_request)
                # print("add_requests: child_request id=", child_request.request_id)
                core_reqs.append(child_request)

        # The output_processor needs an entry for every req_id that actually
        # enters the engine. For SamplingParams with n > 1 or best_of > 1 this
        # means a parent-children split; otherwise a single entry is
        # registered directly.
        # if isinstance(params, SamplingParams) and (
        #     (params.n is not None and params.n > 1) or
        #     (getattr(params, "best_of", 1) and getattr(params, "best_of", 1) > 1)
        # ):
        #     parent = self.parallel_sampler.create_parent(request_id, params)
        #     # Note: the last child can reuse `request` directly; the rest use copy.
        #     children = self.parallel_sampler.materialize_children(parent, request)
        #     for child_idx, child in enumerate(children):
        #         self.output_processor.add_request(
        #             request=child,
        #             prompt_str=prompt_str,
        #             parent=parent,
        #             child_index=child_idx,
        #         )
        #         core_reqs.append(child)
        # else:
        #     self.output_processor.add_request(request, prompt_str)
        #     core_reqs.append(request)

        # Key point: hand everything to the Core in one shot; EngineCoreClient
        # will send ADD_BULK.
        logger.debug("engine_core client: %s", self.engine_core)
        self.engine_core.add_requests(core_reqs)
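A minimal caller sketch for the batched add_requests path above (the engine instance and parameter values are illustrative only, not part of this commit):

    # Sketch only: batch several prompts into one ADD_BULK round trip.
    # `engine` is assumed to be an LLMEngine built via from_engine_args(...).
    from vllm import SamplingParams

    params = SamplingParams(max_tokens=32)
    items = [
        (f"req-{i}",            # request_id
         f"Prompt number {i}",  # prompt
         params,                # params
         None,                  # arrival_time
         None,                  # lora_request
         None,                  # tokenization_kwargs
         None,                  # trace_headers
         0)                     # priority
        for i in range(4)
    ]
    engine.add_requests(items)  # one ADD_BULK message instead of four ADD messages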
vllm_vacc/vllm/v1/engine/processor.py (new file, 184 lines)
@@ -0,0 +1,184 @@
import time
from collections.abc import Mapping
from typing import Any, Literal, Optional, Union

import torch

from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
from vllm.inputs.parse import split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.cache import processor_cache_from_config
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.structured_output.backend_guidance import (
    validate_guidance_grammar)
from vllm.v1.structured_output.backend_lm_format_enforcer import (
    validate_structured_output_request_lm_format_enforcer)
from vllm.v1.structured_output.backend_outlines import (
    validate_structured_output_request_outlines)
from vllm.v1.structured_output.backend_xgrammar import (
    validate_xgrammar_grammar)

logger = init_logger(__name__)


class Processor:

    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
    ) -> tuple[Optional[str], EngineCoreRequest]:

        # TODO(woosuk): Support pooling models.
        self._validate_lora(lora_request)
        self._validate_params(params)

        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
                                                   data_parallel_size):
            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
                             f"is out of range [0, {data_parallel_size}).")

        if arrival_time is None:
            arrival_time = time.time()

        # Optionally generate multimodal hash overrides to avoid hashing
        # multimodal data items by their content as their identifiers.

        # NOTE: when users explicitly turn off BOTH prefix caching and input
        # processing caching, no multimodal features or embeddings will be
        # reused across requests, therefore identifying multimodal data items
        # by their content is no longer necessary, and we create uuids with
        # request id-modality-index as multimodal hash overrides.
        if (self.model_config.multimodal_config and
                self.model_config.multimodal_config.mm_processor_cache_gb == 0
                and not self.cache_config.enable_prefix_caching):
            mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
        else:
            # Otherwise, use user-provided uuids as multimodal hash overrides
            # if provided.
            self._validate_multi_modal_uuids(prompt)
            if isinstance(prompt, dict):
                mm_uuids = prompt.get("multi_modal_uuids")
            else:
                mm_uuids = None

        # Process inputs, which includes:
        # 1. Tokenize text prompt, with LoRA request if one exists.
        # 2. For multimodal models with a merged preprocessor, preprocess
        #    multimodal data and expand prompt token ids accordingly.
        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            mm_uuids=mm_uuids,
        )
        from vllm.platforms import current_platform
        current_platform.validate_request(
            prompt=prompt,
            params=params,
            processed_inputs=processed_inputs,
        )

        eos_token_id = self.input_preprocessor.get_eos_token_id()

        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
        self._validate_model_inputs(encoder_inputs, decoder_inputs)

        # Mypy does not always properly infer the types of some elements of
        # discriminated unions of TypedDicts, because of how it handles
        # inheritance of TypedDict. If we explicitly extract the items we want
        # we can avoid type errors from using `dict.get` later in the method.
        prompt_str: Optional[str] = None if decoder_inputs[
            "type"] == "embeds" else decoder_inputs.get("prompt")
        prompt_token_ids = decoder_inputs[
            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
            "type"] == "embeds" else None
        deepstack_input_embeds = decoder_inputs[
            "deepstack_input_embeds"] if decoder_inputs[
                "type"] == "embeds" else None

        # for deepstack_input_embeds in llm.generate method
        if isinstance(deepstack_input_embeds, dict):
            all_tensors = []
            for key in deepstack_input_embeds:
                if isinstance(deepstack_input_embeds[key], torch.Tensor):
                    all_tensors.append(
                        deepstack_input_embeds[key].unsqueeze(0))
            if len(all_tensors) > 0:
                deepstack_input_embeds = torch.concatenate(all_tensors, 0)

        sampling_params = None
        pooling_params = None
        if isinstance(params, SamplingParams):
            # TODO: can we avoid cloning here in multiproc case?
            sampling_params = params.clone()
            # If unset max tokens, then generate up to the max_model_len.
            if sampling_params.max_tokens is None:
                seq_len = length_from_prompt_token_ids_or_embeds(
                    prompt_token_ids, prompt_embeds)
                sampling_params.max_tokens = \
                    self.model_config.max_model_len - seq_len
            sampling_params.update_from_generation_config(
                self.generation_config_fields, eos_token_id)
            if self.tokenizer is not None:
                sampling_params.update_from_tokenizer(self.tokenizer)
        else:
            pooling_params = params.clone()

        # Multimodal related.
        mm_features: Optional[list[MultiModalFeatureSpec]] = None

        if decoder_inputs["type"] == "multimodal":
            decoder_mm_inputs = decoder_inputs["mm_kwargs"]
            decoder_mm_positions = decoder_inputs["mm_placeholders"]
            decoder_mm_hashes = decoder_inputs["mm_hashes"]

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

            mm_features = []
            for modality, idx in sorted_mm_idxs:
                mm_features.append(
                    MultiModalFeatureSpec(
                        data=decoder_mm_inputs[modality][idx],
                        modality=modality,
                        identifier=decoder_mm_hashes[modality][idx],
                        mm_position=decoder_mm_positions[modality][idx]))

        return prompt_str, EngineCoreRequest(
            request_id=request_id,
            prompt_token_ids=prompt_token_ids,
            prompt_embeds=prompt_embeds,
            deepstack_input_embeds=deepstack_input_embeds,
            mm_features=mm_features,
            sampling_params=sampling_params,
            pooling_params=pooling_params,
            eos_token_id=eos_token_id,
            arrival_time=arrival_time,
            lora_request=lora_request,
            cache_salt=decoder_inputs.get("cache_salt"),
            priority=priority,
            data_parallel_rank=data_parallel_rank,
            trace_headers=trace_headers,
        )
vllm_vacc/vllm/v1/engine/vllm_config_checker.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# check spec model config
# write spec model func to config file
def check_spec_model(vllm_config):
    # add spec tag
    speculative_mode = hasattr(vllm_config, 'speculative_config')
    if speculative_mode and \
            hasattr(vllm_config.speculative_config, 'num_speculative_tokens') and \
            vllm_config.speculative_config.num_speculative_tokens != 1:
        raise ValueError(
            'run_mp_engine: only supports num_speculative_tokens == 1, '
            f'but got {vllm_config.speculative_config.num_speculative_tokens}')

    default_model_infos = "default"
    if speculative_mode:
        if hasattr(vllm_config.speculative_config, 'method'):
            default_model_infos = vllm_config.speculative_config.method

    from vllm_vacc.vllm.config_manager import vllm_vacc_config_manager
    vllm_vacc_config_manager().update_model_infos(default_model_infos)
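A small sketch of the behaviour check_spec_model enforces (the config object below is a stand-in, not a real VllmConfig):

    # Sketch only: the check raises for any num_speculative_tokens other than 1.
    from types import SimpleNamespace

    bad_cfg = SimpleNamespace(
        speculative_config=SimpleNamespace(num_speculative_tokens=4))
    try:
        check_spec_model(bad_cfg)
    except ValueError as err:
        print(err)  # "... only supports num_speculative_tokens == 1, but got 4"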