init
0
vllm_vacc/vllm/engine/__init__.py
Normal file
BIN
vllm_vacc/vllm/engine/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_vacc/vllm/engine/__pycache__/arg_utils.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_vacc/vllm/engine/__pycache__/llm_engine.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_vacc/vllm/engine/__pycache__/metrics.cpython-312.pyc
Normal file
Binary file not shown.
158
vllm_vacc/vllm/engine/arg_utils.py
Normal file
@@ -0,0 +1,158 @@
import argparse
import copy
import dataclasses
import functools
import json
import sys
import threading
import warnings
from dataclasses import MISSING, dataclass, fields, is_dataclass
from itertools import permutations
from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
                    Type, TypeVar, Union, cast, get_args, get_origin)

import regex as re
import torch
from pydantic import TypeAdapter, ValidationError
from typing_extensions import TypeIs, deprecated

import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.plugins import load_general_plugins
from vllm.reasoning import ReasoningParserManager
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
from vllm.transformers_utils.utils import check_gguf_file
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (FlexibleArgumentParser, GiB_bytes, get_ip,
                        is_in_ray_actor)
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import CpuArchEnum, current_platform

logger = init_logger(__name__)


def _set_default_args(self, usage_context: UsageContext,
                      model_config: ModelConfig) -> None:
    """Set default arguments for the V1 engine."""

    # Upstream V1 enables chunked prefill and prefix caching for
    # non-pooling tasks by default; this build disables both.
    # (For pooling tasks the default is False either way.)
    self.enable_chunked_prefill = False
    self.enable_prefix_caching = False
    if model_config.runner_type != "pooling":
        # TODO: When prefix caching supports prompt embeds inputs, this
        # check can be removed.
        if (self.enable_prompt_embeds
                and self.enable_prefix_caching is not False):
            logger.warning(
                "--enable-prompt-embeds and --enable-prefix-caching "
                "are not supported together in V1. Prefix caching has "
                "been disabled.")

    # V1 should use the new scheduler by default.
    # Swap it only if this arg is set to the original V0 default.
    if self.scheduler_cls == EngineArgs.scheduler_cls:
        self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"

    # When there is no user override, set the default values based on the
    # usage context, with different defaults for different hardware.

    # Try to query the device name on the current platform. If it fails,
    # it may be because the platform that imports vLLM is not the same
    # as the platform that vLLM is running on (e.g. the case of scaling
    # vLLM with Ray) and has no GPUs. In this case we use the default
    # values for non-H100/H200 GPUs.
    try:
        device_memory = current_platform.get_device_total_memory()
        device_name = current_platform.get_device_name().lower()
    except Exception:
        # This is only used to set default_max_num_batched_tokens.
        device_memory = 0
        # Defensive fallback: the memory check below fails first, so the
        # name is never read in this case.
        device_name = ""

    # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
    # throughput, see PR #17885 for more details.
    # So here we do an extra device name check to prevent such regression.
    if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
        # For GPUs like H100 and MI300x, use larger default values.
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 16384,
            UsageContext.OPENAI_API_SERVER: 8192,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 1024,
            UsageContext.OPENAI_API_SERVER: 1024,
        }
    else:
        # TODO(woosuk): Tune the default values for other hardware.
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 8192,
            UsageContext.OPENAI_API_SERVER: 2048,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 4,
            UsageContext.OPENAI_API_SERVER: 4,
        }

    # TPU-specific default values.
    if current_platform.is_tpu():
        default_max_num_batched_tokens_tpu = {
            UsageContext.LLM_CLASS: {
                'V6E': 2048,
                'V5E': 1024,
                'V5P': 512,
            },
            UsageContext.OPENAI_API_SERVER: {
                'V6E': 1024,
                'V5E': 512,
                'V5P': 256,
            }
        }

    # CPU-specific default values.
    if current_platform.is_cpu():
        world_size = self.pipeline_parallel_size * self.tensor_parallel_size
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 4096 * world_size,
            UsageContext.OPENAI_API_SERVER: 2048 * world_size,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 256 * world_size,
            UsageContext.OPENAI_API_SERVER: 128 * world_size,
        }

    use_context_value = usage_context.value if usage_context else None
    if (self.max_num_batched_tokens is None
            and usage_context in default_max_num_batched_tokens):
        if current_platform.is_tpu():
            chip_name = current_platform.get_device_name()
            if chip_name in default_max_num_batched_tokens_tpu[
                    usage_context]:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens_tpu[
                        usage_context][chip_name]
            else:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens[usage_context]
        else:
            if not self.enable_chunked_prefill:
                self.max_num_batched_tokens = model_config.max_model_len
            else:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens[usage_context]
        logger.debug(
            "Setting max_num_batched_tokens to %d for %s usage context.",
            self.max_num_batched_tokens, use_context_value)

    if (self.max_num_seqs is None
            and usage_context in default_max_num_seqs):
        self.max_num_seqs = min(default_max_num_seqs[usage_context],
                                self.max_num_batched_tokens or sys.maxsize)

        logger.debug("Setting max_num_seqs to %d for %s usage context.",
                     self.max_num_seqs, use_context_value)
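How this override is attached to vLLM is not shown in this commit. Below is a minimal sketch of one plausible wiring, assuming the plugin assigns the function onto EngineArgs at load time; the target attribute `_set_default_args_v1` and the helper `install_vacc_default_args` are assumptions, not part of the diff.

# Illustrative only: the commit does not show how _set_default_args is installed.
from vllm.engine.arg_utils import EngineArgs

import vllm_vacc.vllm.engine.arg_utils as vacc_arg_utils  # module added in this diff


def install_vacc_default_args() -> None:
    # Hypothetical helper: point vLLM's V1 default-argument hook at the
    # VACC implementation. The exact attribute vLLM exposes may differ.
    EngineArgs._set_default_args_v1 = vacc_arg_utils._set_default_args

Note that the fallback branch above caps max_num_seqs at 4, so concurrency on devices without large memory stays very low unless the user overrides it.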
49
vllm_vacc/vllm/engine/llm_engine.py
Normal file
@@ -0,0 +1,49 @@
from typing import Dict, Optional

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics_types import StatLoggerBase
from vllm.usage.usage_lib import UsageContext


class LLMEngine:

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)

        # Patch to prevent num_speculative_tokens > 1.
        speculative_mode = hasattr(vllm_config, 'speculative_config')
        if (speculative_mode
                and hasattr(vllm_config.speculative_config,
                            'num_speculative_tokens')
                and vllm_config.speculative_config.num_speculative_tokens != 1):
            raise ValueError(
                'from_engine_args: only num_speculative_tokens == 1 is '
                'supported, but got '
                f'{vllm_config.speculative_config.num_speculative_tokens}')

        default_model_infos = "default"
        if speculative_mode:
            if hasattr(vllm_config.speculative_config, 'method'):
                default_model_infos = vllm_config.speculative_config.method

        from vllm_vacc.vllm.config_manager import vllm_vacc_config_manager
        vllm_vacc_config_manager().update_model_infos(default_model_infos)

        import vllm.envs as envs
        engine_cls = None
        if envs.VLLM_USE_V1:
            from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
            engine_cls = V1LLMEngine
        else:
            from vllm.engine.llm_engine import LLMEngine as DefaultEngine
            engine_cls = DefaultEngine

        assert engine_cls is not None, f"LLMEngine is empty: {engine_cls}"

        return engine_cls.from_vllm_config(
            vllm_config=vllm_config,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
            disable_log_stats=engine_args.disable_log_stats,
        )
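A minimal offline usage sketch of this patched entry point, assuming callers import it in place of vLLM's own LLMEngine; the model name is just a placeholder.

from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext

from vllm_vacc.vllm.engine.llm_engine import LLMEngine  # path from this diff

engine_args = EngineArgs(model="facebook/opt-125m")  # placeholder model
engine = LLMEngine.from_engine_args(engine_args,
                                    usage_context=UsageContext.LLM_CLASS)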
69
vllm_vacc/vllm/engine/metrics.py
Normal file
@@ -0,0 +1,69 @@
from vllm.engine.metrics_types import (StatLoggerBase, Stats)

import vllm_vacc.vllm.model_executor.models.vars as global_vars


class LoggingStatLogger(StatLoggerBase):
    """LoggingStatLogger is used in LLMEngine to log to stdout."""

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine.

        Logs to stdout every self.local_interval seconds."""
        from vllm.engine.metrics import (local_interval_elapsed,
                                         get_throughput, logger)

        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
        self.num_generation_tokens.append(stats.num_generation_tokens_iter)

        # Update spec decode metrics.
        self.maybe_update_spec_decode_metrics(stats)

        # Log locally every local_interval seconds.
        if local_interval_elapsed(stats.now, self.last_local_log,
                                  self.local_interval):
            # Compute summary metrics for tracked stats (and log them
            # to Prometheus if applicable).
            prompt_throughput = get_throughput(self.num_prompt_tokens,
                                               now=stats.now,
                                               last_log=self.last_local_log)
            generation_throughput = get_throughput(
                self.num_generation_tokens,
                now=stats.now,
                last_log=self.last_local_log)

            log_fn = logger.info
            if not any((prompt_throughput, generation_throughput,
                        self.last_prompt_throughput,
                        self.last_generation_throughput)):
                # Avoid log noise on an idle production system.
                log_fn = logger.debug

            log_fn(
                "Avg prompt throughput: %.1f tokens/s, "
                "Avg generation throughput: %.1f tokens/s, "
                "Running: %d reqs, Swapped: %d reqs, "
                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
                "CPU KV cache usage: %.1f%%, "
                "DO sequence lengths: %s",
                prompt_throughput,
                generation_throughput,
                stats.num_running_sys,
                stats.num_swapped_sys,
                stats.num_waiting_sys,
                stats.gpu_cache_usage_sys * 100,
                stats.cpu_cache_usage_sys * 100,
                str(global_vars.DO_SEQ_LENS)
            )
            if (stats.cpu_prefix_cache_hit_rate >= 0
                    or stats.gpu_prefix_cache_hit_rate >= 0):
                log_fn(
                    "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%",
                    stats.gpu_prefix_cache_hit_rate * 100,
                    stats.cpu_prefix_cache_hit_rate * 100,
                )
            if self.spec_decode_metrics is not None:
                logger.debug(
                    self._format_spec_decode_metrics_str(
                        self.spec_decode_metrics))

            self._reset(stats, prompt_throughput, generation_throughput)
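local_interval_elapsed and get_throughput are imported from vllm.engine.metrics at call time. For reference, a self-contained approximation of what they compute; the real implementations may differ in detail.

from typing import List


def local_interval_elapsed(now: float, last_log: float,
                           local_interval: float) -> bool:
    # True once at least `local_interval` seconds have passed since the last log.
    return now - last_log > local_interval


def get_throughput(tracked_stats: List[int], now: float,
                   last_log: float) -> float:
    # Tokens accumulated since the last log, divided by the elapsed seconds.
    return float(sum(tracked_stats) / (now - last_log))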
0
vllm_vacc/vllm/engine/multiprocessing/__init__.py
Normal file
103
vllm_vacc/vllm/engine/multiprocessing/engine.py
Normal file
@@ -0,0 +1,103 @@
import signal

from vllm.config import VllmConfig
from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR,
                                         RPCError,
                                         RPCProcessRequest,
                                         RPCAbortRequest)
from vllm.logger import init_logger
from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
from vllm.usage.usage_lib import UsageContext

logger = init_logger(__name__)


class MQLLMEngine:

    def _handle_process_request(self, request: RPCProcessRequest):
        """Handle RPCProcessRequest by adding it to the LLMEngine."""
        request_id = request.request_id

        if self._errored_with is not None:
            rpc_err = RPCError(request_id=request_id,
                               is_engine_errored=True,
                               exception=ENGINE_DEAD_ERROR(self._errored_with))
            self._send_outputs(rpc_err)

        try:
            self.engine.add_request(
                request_id=request_id,
                prompt=request.prompt,
                params=request.params,
                lora_request=request.lora_request,
                trace_headers=request.trace_headers,
                prompt_adapter_request=request.prompt_adapter_request,
                priority=request.priority)

            if self.log_requests:
                from vllm.engine.multiprocessing.engine import logger

                if request.prompt.get('prompt_token_ids') is not None:
                    # logger.info("Added request: %s, %s, prompt length: %s", request.request_id, request.prompt['prompt_token_ids'], len(request.prompt['prompt_token_ids']))
                    logger.info("Added request: %s, prompt length: %s",
                                request.request_id,
                                len(request.prompt['prompt_token_ids']))
                else:
                    logger.info("Added request %s.", request.request_id)

        except Exception as e:
            # We do not set self._errored = True here, since the error
            # is due to an issue adding this request to the engine,
            # rather than an issue with the engine itself.
            is_errored = self._errored_with is not None
            rpc_err = RPCError(request_id=request_id,
                               is_engine_errored=is_errored,
                               exception=e)
            self._send_outputs(rpc_err)

            # Remove request from the engine.
            self.engine.abort_request(request_id)

    def _handle_abort_request(self, request: RPCAbortRequest):
        self.engine.abort_request(request.request_id)
        if self.log_requests:
            from vllm.engine.multiprocessing.engine import logger
            import vllm_vacc.vllm.model_executor.models.vars as global_vars
            logger.info("Aborted request: %s, prompt length: %s",
                        request.request_id, global_vars.DO_SEQ_LENS)


def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
                  ipc_path: str, disable_log_stats: bool,
                  disable_log_requests: bool, engine_alive):

    # Patch to prevent num_speculative_tokens > 1.
    speculative_mode = hasattr(vllm_config, 'speculative_config')
    if (speculative_mode
            and hasattr(vllm_config.speculative_config,
                        'num_speculative_tokens')
            and vllm_config.speculative_config.num_speculative_tokens != 1):
        raise ValueError(
            'run_mp_engine: only num_speculative_tokens == 1 is supported, '
            f'but got {vllm_config.speculative_config.num_speculative_tokens}')

    default_model_infos = "default"
    if speculative_mode:
        if hasattr(vllm_config.speculative_config, 'method'):
            default_model_infos = vllm_config.speculative_config.method

    from vllm_vacc.vllm.config_manager import vllm_vacc_config_manager
    vllm_vacc_config_manager().update_model_infos(default_model_infos)

    try:
        # Ensure we can serialize transformer config before spawning.
        maybe_register_config_serialize_by_value()

        from vllm.engine.multiprocessing.engine import MQLLMEngine, signal_handler
        engine = MQLLMEngine.from_vllm_config(
            vllm_config=vllm_config,
            usage_context=usage_context,
            disable_log_stats=disable_log_stats,
            disable_log_requests=disable_log_requests,
            ipc_path=ipc_path)

        signal.signal(signal.SIGTERM, signal_handler)

        engine.start()

    except BaseException as e:
        logger.exception(e)
        engine_alive.value = False
        raise e
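The launcher side is not part of this commit. As a hedged sketch, run_mp_engine is typically spawned from a front-end process roughly as follows; the argument order mirrors the signature above, while the model name, IPC path, and the helper start_engine_process are placeholders.

import multiprocessing

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext

from vllm_vacc.vllm.engine.multiprocessing.engine import run_mp_engine  # path from this diff


def start_engine_process(ipc_path: str) -> multiprocessing.Process:
    engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # placeholder model
    vllm_config = engine_args.create_engine_config(UsageContext.OPENAI_API_SERVER)
    # Shared flag the engine process clears if it dies.
    engine_alive = multiprocessing.Value('b', True, lock=False)
    proc = multiprocessing.Process(target=run_mp_engine,
                                   args=(vllm_config,
                                         UsageContext.OPENAI_API_SERVER,
                                         ipc_path,
                                         engine_args.disable_log_stats,
                                         engine_args.disable_log_requests,
                                         engine_alive))
    proc.start()
    return proc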