init
This commit is contained in:
0
vllm/engine/__init__.py
Normal file
0
vllm/engine/__init__.py
Normal file
649
vllm/engine/arg_utils.py
Normal file
649
vllm/engine/arg_utils.py
Normal file
@@ -0,0 +1,649 @@
|
||||
import argparse
|
||||
import dataclasses
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
|
||||
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, SpeculativeConfig,
|
||||
TokenizerPoolConfig, VisionLanguageConfig)
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.utils import str_to_int_tuple
|
||||
|
||||
|
||||
def nullable_str(val: str):
    """Treat empty strings and the literal string 'None' as a real None."""
    return None if not val or val == "None" else val
|
||||
|
||||
|
||||
@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model: str
    # FIX: was `Optional[Union[List[str]]]` — a one-member Union, i.e. just
    # Optional[List[str]]. Widened to also accept a bare str, matching how the
    # value is forwarded to ModelConfig (backward compatible).
    served_model_name: Optional[Union[str, List[str]]] = None
    tokenizer: Optional[str] = None
    skip_tokenizer_init: bool = False
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    enable_prefix_caching: bool = False
    use_v2_block_manager: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_logprobs: int = 5  # OpenAI default value
    disable_log_stats: bool = False
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    tokenizer_pool_size: int = 0
    tokenizer_pool_type: str = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    fully_sharded_loras: bool = False
    lora_extra_vocab_size: int = 256
    # BUG FIX: the annotation was missing (`lora_dtype = 'auto'`), which made
    # this a plain class attribute instead of a dataclass field, so
    # `from_cli_args` (which iterates dataclasses.fields) silently discarded
    # any --lora-dtype value parsed from the command line.
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'auto'
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
    model_loader_extra_config: Optional[dict] = None

    # Related to Vision-language models such as llava
    image_input_type: Optional[str] = None
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False

    guided_decoding_backend: str = 'outlines'
    # Speculative decoding configuration.
    speculative_model: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None
|
||||
|
||||
def __post_init__(self):
    # Default the tokenizer to the model name/path when none was supplied.
    self.tokenizer = self.model if self.tokenizer is None else self.tokenizer
|
||||
|
||||
@staticmethod
def add_cli_args(
        parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Shared CLI arguments for vLLM engine.

    Registers every engine-level flag on *parser* and returns it.
    NOTE(fix): several help strings previously concatenated adjacent
    literals without a separating space ("cuda versiongreater",
    "numberof GPU blocks", "previousprompt latency", "name(s)will",
    "metricstag"); the missing spaces are restored below, and the
    --max-logprobs help gained the missing word "when".
    """

    # Model arguments
    parser.add_argument(
        '--model',
        type=str,
        default='facebook/opt-125m',
        help='Name or path of the huggingface model to use.')
    parser.add_argument(
        '--tokenizer',
        type=nullable_str,
        default=EngineArgs.tokenizer,
        help='Name or path of the huggingface tokenizer to use.')
    parser.add_argument(
        '--skip-tokenizer-init',
        action='store_true',
        help='Skip initialization of tokenizer and detokenizer')
    parser.add_argument(
        '--revision',
        type=nullable_str,
        default=None,
        help='The specific model version to use. It can be a branch '
        'name, a tag name, or a commit id. If unspecified, will use '
        'the default version.')
    parser.add_argument(
        '--code-revision',
        type=nullable_str,
        default=None,
        help='The specific revision to use for the model code on '
        'Hugging Face Hub. It can be a branch name, a tag name, or a '
        'commit id. If unspecified, will use the default version.')
    parser.add_argument(
        '--tokenizer-revision',
        type=nullable_str,
        default=None,
        help='The specific tokenizer version to use. It can be a branch '
        'name, a tag name, or a commit id. If unspecified, will use '
        'the default version.')
    parser.add_argument(
        '--tokenizer-mode',
        type=str,
        default=EngineArgs.tokenizer_mode,
        choices=['auto', 'slow'],
        help='The tokenizer mode.\n\n* "auto" will use the '
        'fast tokenizer if available.\n* "slow" will '
        'always use the slow tokenizer.')
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='Trust remote code from huggingface.')
    parser.add_argument('--download-dir',
                        type=nullable_str,
                        default=EngineArgs.download_dir,
                        help='Directory to download and load the weights, '
                        'default to the default cache dir of '
                        'huggingface.')
    parser.add_argument(
        '--load-format',
        type=str,
        default=EngineArgs.load_format,
        choices=[
            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
        ],
        help='The format of the model weights to load.\n\n'
        '* "auto" will try to load the weights in the safetensors format '
        'and fall back to the pytorch bin format if safetensors format '
        'is not available.\n'
        '* "pt" will load the weights in the pytorch bin format.\n'
        '* "safetensors" will load the weights in the safetensors format.\n'
        '* "npcache" will load the weights in pytorch format and store '
        'a numpy cache to speed up the loading.\n'
        '* "dummy" will initialize the weights with random values, '
        'which is mainly for profiling.\n'
        '* "tensorizer" will load the weights using tensorizer from '
        'CoreWeave which assumes tensorizer_uri is set to the location of '
        'the serialized weights.')
    parser.add_argument(
        '--dtype',
        type=str,
        default=EngineArgs.dtype,
        choices=[
            'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
        ],
        help='Data type for model weights and activations.\n\n'
        '* "auto" will use FP16 precision for FP32 and FP16 models, and '
        'BF16 precision for BF16 models.\n'
        '* "half" for FP16. Recommended for AWQ quantization.\n'
        '* "float16" is the same as "half".\n'
        '* "bfloat16" for a balance between precision and range.\n'
        '* "float" is shorthand for FP32 precision.\n'
        '* "float32" for FP32 precision.')
    parser.add_argument(
        '--kv-cache-dtype',
        type=str,
        choices=['auto', 'fp8'],
        default=EngineArgs.kv_cache_dtype,
        help='Data type for kv cache storage. If "auto", will use model '
        'data type. FP8_E5M2 (without scaling) is only supported on cuda '
        'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
        'supported for common inference criteria.')
    parser.add_argument(
        '--quantization-param-path',
        type=nullable_str,
        default=None,
        help='Path to the JSON file containing the KV cache '
        'scaling factors. This should generally be supplied, when '
        'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
        'default to 1.0, which may cause accuracy issues. '
        # FIX: restored the missing space between "version" and "greater".
        'FP8_E5M2 (without scaling) is only supported on cuda version '
        'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
        'supported for common inference criteria.')
    parser.add_argument('--max-model-len',
                        type=int,
                        default=EngineArgs.max_model_len,
                        help='Model context length. If unspecified, will '
                        'be automatically derived from the model config.')
    parser.add_argument(
        '--guided-decoding-backend',
        type=str,
        default='outlines',
        choices=['outlines', 'lm-format-enforcer'],
        help='Which engine will be used for guided decoding'
        ' (JSON schema / regex etc) by default. Currently support '
        'https://github.com/outlines-dev/outlines and '
        'https://github.com/noamgat/lm-format-enforcer.'
        ' Can be overridden per request via guided_decoding_backend'
        ' parameter.')
    # Parallel arguments
    parser.add_argument('--worker-use-ray',
                        action='store_true',
                        help='Use Ray for distributed serving, will be '
                        'automatically set when using more than 1 GPU.')
    parser.add_argument('--pipeline-parallel-size',
                        '-pp',
                        type=int,
                        default=EngineArgs.pipeline_parallel_size,
                        help='Number of pipeline stages.')
    parser.add_argument('--tensor-parallel-size',
                        '-tp',
                        type=int,
                        default=EngineArgs.tensor_parallel_size,
                        help='Number of tensor parallel replicas.')
    parser.add_argument(
        '--max-parallel-loading-workers',
        type=int,
        default=EngineArgs.max_parallel_loading_workers,
        help='Load model sequentially in multiple batches, '
        'to avoid RAM OOM when using tensor '
        'parallel and large models.')
    parser.add_argument(
        '--ray-workers-use-nsight',
        action='store_true',
        help='If specified, use nsight to profile Ray workers.')
    # KV cache arguments
    parser.add_argument('--block-size',
                        type=int,
                        default=EngineArgs.block_size,
                        choices=[8, 16, 32],
                        help='Token block size for contiguous chunks of '
                        'tokens.')

    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='Enables automatic prefix caching.')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceMangerV2.')
    parser.add_argument(
        '--num-lookahead-slots',
        type=int,
        default=EngineArgs.num_lookahead_slots,
        help='Experimental scheduling config necessary for '
        'speculative decoding. This will be replaced by '
        'speculative config in the future; it is present '
        'to enable correctness tests until then.')

    parser.add_argument('--seed',
                        type=int,
                        default=EngineArgs.seed,
                        help='Random seed for operations.')
    parser.add_argument('--swap-space',
                        type=int,
                        default=EngineArgs.swap_space,
                        help='CPU swap space size (GiB) per GPU.')
    parser.add_argument(
        '--gpu-memory-utilization',
        type=float,
        default=EngineArgs.gpu_memory_utilization,
        help='The fraction of GPU memory to be used for the model '
        'executor, which can range from 0 to 1. For example, a value of '
        '0.5 would imply 50%% GPU memory utilization. If unspecified, '
        'will use the default value of 0.9.')
    parser.add_argument(
        '--num-gpu-blocks-override',
        type=int,
        default=None,
        # FIX: restored the missing space between "number" and "of".
        help='If specified, ignore GPU profiling result and use this '
        'number of GPU blocks. Used for testing preemption.')
    parser.add_argument('--max-num-batched-tokens',
                        type=int,
                        default=EngineArgs.max_num_batched_tokens,
                        help='Maximum number of batched tokens per '
                        'iteration.')
    parser.add_argument('--max-num-seqs',
                        type=int,
                        default=EngineArgs.max_num_seqs,
                        help='Maximum number of sequences per iteration.')
    parser.add_argument(
        '--max-logprobs',
        type=int,
        default=EngineArgs.max_logprobs,
        # FIX: the help text was missing the word "when".
        help=('Max number of log probs to return when logprobs is '
              'specified in SamplingParams.'))
    parser.add_argument('--disable-log-stats',
                        action='store_true',
                        help='Disable logging statistics.')
    # Quantization settings.
    parser.add_argument('--quantization',
                        '-q',
                        type=nullable_str,
                        choices=[*QUANTIZATION_METHODS, None],
                        default=EngineArgs.quantization,
                        help='Method used to quantize the weights. If '
                        'None, we first check the `quantization_config` '
                        'attribute in the model config file. If that is '
                        'None, we assume the model weights are not '
                        'quantized and use `dtype` to determine the data '
                        'type of the weights.')
    parser.add_argument('--enforce-eager',
                        action='store_true',
                        help='Always use eager-mode PyTorch. If False, '
                        'will use eager mode and CUDA graph in hybrid '
                        'for maximal performance and flexibility.')
    parser.add_argument('--max-context-len-to-capture',
                        type=int,
                        default=EngineArgs.max_context_len_to_capture,
                        help='Maximum context length covered by CUDA '
                        'graphs. When a sequence has context length '
                        'larger than this, we fall back to eager mode. '
                        '(DEPRECATED. Use --max-seq_len-to-capture instead'
                        ')')
    parser.add_argument('--max-seq_len-to-capture',
                        type=int,
                        default=EngineArgs.max_seq_len_to_capture,
                        help='Maximum sequence length covered by CUDA '
                        'graphs. When a sequence has context length '
                        'larger than this, we fall back to eager mode.')
    parser.add_argument('--disable-custom-all-reduce',
                        action='store_true',
                        default=EngineArgs.disable_custom_all_reduce,
                        help='See ParallelConfig.')
    parser.add_argument('--tokenizer-pool-size',
                        type=int,
                        default=EngineArgs.tokenizer_pool_size,
                        help='Size of tokenizer pool to use for '
                        'asynchronous tokenization. If 0, will '
                        'use synchronous tokenization.')
    parser.add_argument('--tokenizer-pool-type',
                        type=str,
                        default=EngineArgs.tokenizer_pool_type,
                        help='Type of tokenizer pool to use for '
                        'asynchronous tokenization. Ignored '
                        'if tokenizer_pool_size is 0.')
    parser.add_argument('--tokenizer-pool-extra-config',
                        type=nullable_str,
                        default=EngineArgs.tokenizer_pool_extra_config,
                        help='Extra config for tokenizer pool. '
                        'This should be a JSON string that will be '
                        'parsed into a dictionary. Ignored if '
                        'tokenizer_pool_size is 0.')
    # LoRA related configs
    parser.add_argument('--enable-lora',
                        action='store_true',
                        help='If True, enable handling of LoRA adapters.')
    parser.add_argument('--max-loras',
                        type=int,
                        default=EngineArgs.max_loras,
                        help='Max number of LoRAs in a single batch.')
    parser.add_argument('--max-lora-rank',
                        type=int,
                        default=EngineArgs.max_lora_rank,
                        help='Max LoRA rank.')
    parser.add_argument(
        '--lora-extra-vocab-size',
        type=int,
        default=EngineArgs.lora_extra_vocab_size,
        help=('Maximum size of extra vocabulary that can be '
              'present in a LoRA adapter (added to the base '
              'model vocabulary).'))
    parser.add_argument(
        '--lora-dtype',
        type=str,
        default=EngineArgs.lora_dtype,
        choices=['auto', 'float16', 'bfloat16', 'float32'],
        help=('Data type for LoRA. If auto, will default to '
              'base model dtype.'))
    parser.add_argument(
        '--max-cpu-loras',
        type=int,
        default=EngineArgs.max_cpu_loras,
        help=('Maximum number of LoRAs to store in CPU memory. '
              'Must be >= than max_num_seqs. '
              'Defaults to max_num_seqs.'))
    parser.add_argument(
        '--fully-sharded-loras',
        action='store_true',
        help=('By default, only half of the LoRA computation is '
              'sharded with tensor parallelism. '
              'Enabling this will use the fully sharded layers. '
              'At high sequence length, max rank or '
              'tensor parallel size, this is likely faster.'))
    parser.add_argument("--device",
                        type=str,
                        default=EngineArgs.device,
                        choices=["auto", "cuda", "neuron", "cpu", "musa"],
                        help='Device type for vLLM execution.')
    # Related to Vision-language models such as llava
    parser.add_argument(
        '--image-input-type',
        type=nullable_str,
        default=None,
        choices=[
            t.name.lower() for t in VisionLanguageConfig.ImageInputType
        ],
        help=('The image input type passed into vLLM. '
              'Should be one of "pixel_values" or "image_features".'))
    parser.add_argument('--image-token-id',
                        type=int,
                        default=None,
                        help=('Input id for image token.'))
    parser.add_argument(
        '--image-input-shape',
        type=nullable_str,
        default=None,
        help=('The biggest image input shape (worst for memory footprint) '
              'given an input type. Only used for vLLM\'s profile_run.'))
    parser.add_argument(
        '--image-feature-size',
        type=int,
        default=None,
        help=('The image feature size along the context dimension.'))
    parser.add_argument(
        '--scheduler-delay-factor',
        type=float,
        default=EngineArgs.scheduler_delay_factor,
        # FIX: restored the missing space between "previous" and "prompt".
        help='Apply a delay (of delay factor multiplied by previous '
        'prompt latency) before scheduling next prompt.')
    parser.add_argument(
        '--enable-chunked-prefill',
        action='store_true',
        help='If set, the prefill requests can be chunked based on the '
        'max_num_batched_tokens.')

    parser.add_argument(
        '--speculative-model',
        type=nullable_str,
        default=EngineArgs.speculative_model,
        help=
        'The name of the draft model to be used in speculative decoding.')

    parser.add_argument(
        '--num-speculative-tokens',
        type=int,
        default=EngineArgs.num_speculative_tokens,
        help='The number of speculative tokens to sample from '
        'the draft model in speculative decoding.')

    parser.add_argument(
        '--speculative-max-model-len',
        type=int,
        default=EngineArgs.speculative_max_model_len,
        help='The maximum sequence length supported by the '
        'draft model. Sequences over this length will skip '
        'speculation.')

    parser.add_argument(
        '--ngram-prompt-lookup-max',
        type=int,
        default=EngineArgs.ngram_prompt_lookup_max,
        help='Max size of window for ngram prompt lookup in speculative '
        'decoding.')

    parser.add_argument(
        '--ngram-prompt-lookup-min',
        type=int,
        default=EngineArgs.ngram_prompt_lookup_min,
        help='Min size of window for ngram prompt lookup in speculative '
        'decoding.')

    parser.add_argument('--model-loader-extra-config',
                        type=nullable_str,
                        default=EngineArgs.model_loader_extra_config,
                        help='Extra config for model loader. '
                        'This will be passed to the model loader '
                        'corresponding to the chosen load_format. '
                        'This should be a JSON string that will be '
                        'parsed into a dictionary.')

    parser.add_argument(
        "--served-model-name",
        nargs="+",
        type=str,
        default=None,
        # FIX: restored the missing spaces after "name(s)" and "metrics".
        help="The model name(s) used in the API. If multiple "
        "names are provided, the server will respond to any "
        "of the provided names. The model name in the model "
        "field of a response will be the first name in this "
        "list. If not specified, the model name will be the "
        "same as the `--model` argument. Noted that this name(s) "
        "will also be used in `model_name` tag content of "
        "prometheus metrics, if multiple names provided, metrics "
        "tag will take the first one.")

    return parser
|
||||
|
||||
@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
    """Build an EngineArgs from a parsed argparse namespace, copying one
    value per dataclass field."""
    field_names = (field.name for field in dataclasses.fields(cls))
    return cls(**{name: getattr(args, name) for name in field_names})
|
||||
|
||||
def create_engine_config(self, ) -> EngineConfig:
    """Translate these CLI-level arguments into the full set of vLLM
    config objects, bundled into a single EngineConfig."""
    model_cfg = ModelConfig(
        self.model, self.tokenizer, self.tokenizer_mode,
        self.trust_remote_code, self.dtype, self.seed, self.revision,
        self.code_revision, self.tokenizer_revision, self.max_model_len,
        self.quantization, self.quantization_param_path,
        self.enforce_eager, self.max_context_len_to_capture,
        self.max_seq_len_to_capture, self.max_logprobs,
        self.skip_tokenizer_init, self.served_model_name)

    device_cfg = DeviceConfig(self.device)

    cache_cfg = CacheConfig(self.block_size, self.gpu_memory_utilization,
                            self.swap_space, self.kv_cache_dtype,
                            self.num_gpu_blocks_override,
                            model_cfg.get_sliding_window(),
                            self.enable_prefix_caching)

    tokenizer_pool_cfg = TokenizerPoolConfig.create_config(
        self.tokenizer_pool_size,
        self.tokenizer_pool_type,
        self.tokenizer_pool_extra_config,
    )
    parallel_cfg = ParallelConfig(self.pipeline_parallel_size,
                                  self.tensor_parallel_size,
                                  self.worker_use_ray,
                                  self.max_parallel_loading_workers,
                                  self.disable_custom_all_reduce,
                                  tokenizer_pool_cfg,
                                  self.ray_workers_use_nsight)

    spec_cfg = SpeculativeConfig.maybe_create_spec_config(
        target_model_config=model_cfg,
        target_parallel_config=parallel_cfg,
        target_dtype=self.dtype,
        speculative_model=self.speculative_model,
        num_speculative_tokens=self.num_speculative_tokens,
        speculative_max_model_len=self.speculative_max_model_len,
        enable_chunked_prefill=self.enable_chunked_prefill,
        use_v2_block_manager=self.use_v2_block_manager,
        ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
        ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
    )

    # When speculative decoding is on, its config dictates the number of
    # lookahead slots; otherwise the CLI value is used.
    lookahead_slots = (self.num_lookahead_slots if spec_cfg is None else
                       spec_cfg.num_lookahead_slots)
    scheduler_cfg = SchedulerConfig(
        self.max_num_batched_tokens,
        self.max_num_seqs,
        model_cfg.max_model_len,
        self.use_v2_block_manager,
        num_lookahead_slots=lookahead_slots,
        delay_factor=self.scheduler_delay_factor,
        enable_chunked_prefill=self.enable_chunked_prefill,
    )

    if self.enable_lora:
        lora_cfg = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            fully_sharded_loras=self.fully_sharded_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras
            if self.max_cpu_loras and self.max_cpu_loras > 0 else None)
    else:
        lora_cfg = None

    load_cfg = LoadConfig(
        load_format=self.load_format,
        download_dir=self.download_dir,
        model_loader_extra_config=self.model_loader_extra_config,
    )

    vlm_cfg = None
    if self.image_input_type:
        # All four image settings must be supplied together.
        if not (self.image_token_id and self.image_input_shape
                and self.image_feature_size):
            raise ValueError(
                'Specify `image_token_id`, `image_input_shape` and '
                '`image_feature_size` together with `image_input_type`.')
        vlm_cfg = VisionLanguageConfig(
            image_input_type=VisionLanguageConfig.
            get_image_input_enum_type(self.image_input_type),
            image_token_id=self.image_token_id,
            image_input_shape=str_to_int_tuple(self.image_input_shape),
            image_feature_size=self.image_feature_size,
        )

    decoding_cfg = DecodingConfig(
        guided_decoding_backend=self.guided_decoding_backend)

    return EngineConfig(model_config=model_cfg,
                        cache_config=cache_cfg,
                        parallel_config=parallel_cfg,
                        scheduler_config=scheduler_cfg,
                        device_config=device_cfg,
                        lora_config=lora_cfg,
                        vision_language_config=vlm_cfg,
                        speculative_config=spec_cfg,
                        load_config=load_cfg,
                        decoding_config=decoding_cfg)
|
||||
|
||||
|
||||
@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser,
                     async_args_only: bool = False) -> argparse.ArgumentParser:
        """Register the async-specific flags; unless *async_args_only*,
        also register every shared engine flag first."""
        if not async_args_only:
            parser = EngineArgs.add_cli_args(parser)
        async_flags = (
            ('--engine-use-ray', {
                'action': 'store_true',
                'help': 'Use Ray to start the LLM engine in a '
                'separate process as the server process.'
            }),
            ('--disable-log-requests', {
                'action': 'store_true',
                'help': 'Disable logging requests.'
            }),
            ('--max-log-len', {
                'type': int,
                'default': None,
                'help': 'Max number of prompt characters or prompt '
                'ID numbers being printed in log.'
                '\n\nDefault: Unlimited'
            }),
        )
        for flag, options in async_flags:
            parser.add_argument(flag, **options)
        return parser
|
||||
|
||||
|
||||
# These functions are used by sphinx to build the documentation
|
||||
def _engine_args_parser():
    """Build a fresh parser carrying the sync engine's CLI flags (sphinx)."""
    parser = argparse.ArgumentParser()
    return EngineArgs.add_cli_args(parser)
|
||||
|
||||
|
||||
def _async_engine_args_parser():
    """Build a fresh parser carrying only the async-specific flags (sphinx)."""
    parser = argparse.ArgumentParser()
    return AsyncEngineArgs.add_cli_args(parser, async_args_only=True)
|
||||
737
vllm/engine/async_llm_engine.py
Normal file
737
vllm/engine/async_llm_engine.py
Normal file
@@ -0,0 +1,737 @@
|
||||
import asyncio
|
||||
import time
|
||||
from functools import partial
|
||||
from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
|
||||
Optional, Set, Tuple, Type, Union)
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import DecodingConfig, ModelConfig
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster, ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import ExecuteModelRequest, MultiModalData, SamplerOutput
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
|
||||
logger = init_logger(__name__)
|
||||
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
|
||||
|
||||
|
||||
class AsyncEngineDeadError(RuntimeError):
    """Raised when the background engine loop has stopped unexpectedly."""
|
||||
|
||||
|
||||
def _raise_exception_on_finish(
        task: asyncio.Task, error_callback: Callable[[Exception],
                                                     None]) -> None:
    """Done-callback for the engine loop task.

    The loop is expected to run forever, so ANY completion — normal or
    exceptional — is reported via *error_callback* and re-raised as an
    AsyncEngineDeadError chained to the original cause.
    """
    msg = ("Task finished unexpectedly. This should never happen! "
           "Please open an issue on Github.")

    captured = None
    try:
        task.result()
        # Normal exit is itself an error: the loop must never return.
        raise AsyncEngineDeadError(msg)
    except Exception as err:
        captured = err
        logger.error("Engine background task failed", exc_info=err)
        error_callback(captured)
        raise AsyncEngineDeadError(
            msg + " See stack trace above for the actual cause.") from err
|
||||
|
||||
|
||||
class AsyncStream:
    """An async-iterable stream of RequestOutputs produced for one request."""

    def __init__(self, request_id: str) -> None:
        self.request_id = request_id
        self._finished = False
        self._queue: asyncio.Queue = asyncio.Queue()

    @property
    def finished(self) -> bool:
        return self._finished

    def put(self, item: Union[RequestOutput, Exception]) -> None:
        # Silently drop anything arriving after the stream was closed.
        if not self._finished:
            self._queue.put_nowait(item)

    def finish(self) -> None:
        # The sentinel makes the consumer's async-for loop terminate.
        self._queue.put_nowait(StopAsyncIteration())
        self._finished = True

    def __aiter__(self):
        return self

    async def __anext__(self) -> RequestOutput:
        item = await self._queue.get()
        if isinstance(item, Exception):
            raise item
        return item
|
||||
|
||||
|
||||
class RequestTracker:
|
||||
"""Synchronous abstraction for tracking requests."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._request_streams: Dict[str, AsyncStream] = {}
|
||||
self._finished_requests: asyncio.Queue[str] = asyncio.Queue()
|
||||
self._new_requests: asyncio.Queue[Tuple[AsyncStream,
|
||||
dict]] = asyncio.Queue()
|
||||
self.new_requests_event = asyncio.Event()
|
||||
|
||||
def __contains__(self, item):
|
||||
return item in self._request_streams
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._request_streams)
|
||||
|
||||
def propagate_exception(self,
|
||||
exc: Exception,
|
||||
request_id: Optional[str] = None) -> None:
|
||||
"""Propagate an exception to request streams
|
||||
(all if request_id is None)."""
|
||||
if request_id is not None:
|
||||
self._request_streams[request_id].put(exc)
|
||||
self.abort_request(request_id)
|
||||
else:
|
||||
for rid, stream in self._request_streams.items():
|
||||
stream.put(exc)
|
||||
self.abort_request(rid)
|
||||
|
||||
def process_request_output(self,
|
||||
request_output: RequestOutput,
|
||||
*,
|
||||
verbose: bool = False) -> None:
|
||||
"""Process a request output from the engine."""
|
||||
request_id = request_output.request_id
|
||||
|
||||
self._request_streams[request_id].put(request_output)
|
||||
if request_output.finished:
|
||||
if verbose:
|
||||
logger.info("Finished request %s.", request_id)
|
||||
self.abort_request(request_id)
|
||||
|
||||
def process_exception(self,
|
||||
request_id: str,
|
||||
exception: Exception,
|
||||
*,
|
||||
verbose: bool = False) -> None:
|
||||
"""Propagate an exception from the engine."""
|
||||
self._request_streams[request_id].put(exception)
|
||||
if verbose:
|
||||
logger.info("Finished request %s.", request_id)
|
||||
self.abort_request(request_id)
|
||||
|
||||
def add_request(self, request_id: str,
|
||||
**engine_add_request_kwargs) -> AsyncStream:
|
||||
"""Add a request to be sent to the engine on the next background
|
||||
loop iteration."""
|
||||
if request_id in self._request_streams:
|
||||
raise KeyError(f"Request {request_id} already exists.")
|
||||
|
||||
stream = AsyncStream(request_id)
|
||||
self._new_requests.put_nowait((stream, {
|
||||
"request_id": request_id,
|
||||
**engine_add_request_kwargs
|
||||
}))
|
||||
|
||||
self.new_requests_event.set()
|
||||
|
||||
return stream
|
||||
|
||||
def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
|
||||
"""Abort a request during next background loop iteration."""
|
||||
if verbose:
|
||||
logger.info("Aborted request %s.", request_id)
|
||||
|
||||
self._finished_requests.put_nowait(request_id)
|
||||
|
||||
if request_id not in self._request_streams or self._request_streams[
|
||||
request_id].finished:
|
||||
# The request has already finished or been aborted.
|
||||
return
|
||||
|
||||
self._request_streams[request_id].finish()
|
||||
|
||||
def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]:
    """Drain the pending queues.

    Returns the new requests to hand to the engine and the set of
    request ids to abort, registering each surviving stream.
    """
    finished: Set[str] = set()
    # Drain aborted ids first so a request that was queued and then
    # aborted before this call is dropped below instead of registered.
    while not self._finished_requests.empty():
        rid = self._finished_requests.get_nowait()
        finished.add(rid)
        self._request_streams.pop(rid, None)

    pending: List[Dict] = []
    while not self._new_requests.empty():
        stream, request = self._new_requests.get_nowait()
        if stream.request_id in finished:
            # Aborted before it ever reached the engine.
            stream.finish()
        else:
            self._request_streams[stream.request_id] = stream
            pending.append(request)

    return pending, finished
|
||||
|
||||
async def wait_for_new_requests(self):
    """Block until at least one request is queued, then reset the event."""
    if self.has_new_requests():
        # Work already queued: no need to sleep on the event.
        self.new_requests_event.clear()
        return
    await self.new_requests_event.wait()
    self.new_requests_event.clear()
|
||||
|
||||
def has_new_requests(self):
    """Return True when at least one request awaits the engine."""
    queue_is_empty = self._new_requests.empty()
    return not queue_is_empty
|
||||
|
||||
|
||||
class _AsyncLLMEngine(LLMEngine):
    """Extension of LLMEngine to add async methods."""

    async def step_async(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.
        The workers are ran asynchronously if possible.

        This function performs one decoding iteration of the engine. It first
        schedules the sequences to be executed in the next iteration and the
        token blocks to be swapped in/out/copy. Then, it executes the model
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()

        if not scheduler_outputs.is_empty():
            # Execute the model.
            execute_model_req = ExecuteModelRequest(
                seq_group_metadata_list=seq_group_metadata_list,
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                running_queue_size=scheduler_outputs.running_queue_size,
            )
            output = await self.model_executor.execute_model_async(
                execute_model_req)
        else:
            # Nothing was scheduled this iteration; still fall through so
            # ignored sequence groups are reported and stats are logged.
            output = []

        request_outputs = self._process_model_outputs(
            output, scheduler_outputs.scheduled_seq_groups,
            scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)

        # Log stats.
        self.do_log_stats(scheduler_outputs, output)

        return request_outputs

    async def encode_request_async(
        self,
        request_id: str,  # pylint: disable=unused-argument
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = None,
        lora_request: Optional[LoRARequest] = None,
    ):
        """Tokenize ``prompt`` asynchronously unless token ids were given.

        Returns the prompt token ids (the caller-supplied list when one
        was provided, otherwise the freshly encoded one).
        """
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = await self.tokenizer.encode_async(
                request_id=request_id,
                prompt=prompt,
                lora_request=lora_request)
        return prompt_token_ids

    async def add_request_async(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        """Async variant of ``LLMEngine.add_request``: validates LoRA use,
        stamps the arrival time, tokenizes without blocking the event loop,
        then enqueues via the synchronous ``add_request``."""
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        if arrival_time is None:
            arrival_time = time.time()
        prompt_token_ids = await self.encode_request_async(
            request_id=request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            lora_request=lora_request)

        return self.add_request(request_id,
                                prompt=prompt,
                                prompt_token_ids=prompt_token_ids,
                                sampling_params=sampling_params,
                                arrival_time=arrival_time,
                                lora_request=lora_request,
                                multi_modal_data=multi_modal_data)

    async def check_health_async(self) -> None:
        # Delegates to the executor's synchronous health check.
        self.model_executor.check_health()
|
||||
|
||||
|
||||
class AsyncLLMEngine:
    """An asynchronous wrapper for LLMEngine.

    This class is used to wrap the LLMEngine class to make it asynchronous. It
    uses asyncio to create a background loop that keeps processing incoming
    requests. The LLMEngine is kicked by the generate method when there
    are requests in the waiting queue. The generate method yields the outputs
    from the LLMEngine to the caller.

    NOTE: For the comprehensive list of arguments, see `LLMEngine`.

    Args:
        worker_use_ray: Whether to use Ray for model workers. Required for
            distributed execution. Should be the same as
            `parallel_config.worker_use_ray`.
        engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
            async frontend will be executed in a separate process as the
            model workers.
        log_requests: Whether to log the requests.
        max_log_len: Maximum number of prompt characters or prompt ID numbers
            being printed in log.
        start_engine_loop: If True, the background task to run the engine
            will be automatically started in the generate call.
        *args: Arguments for LLMEngine.
        *kwargs: Arguments for LLMEngine.
    """

    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

    def __init__(self,
                 worker_use_ray: bool,
                 engine_use_ray: bool,
                 *args,
                 log_requests: bool = True,
                 max_log_len: Optional[int] = None,
                 start_engine_loop: bool = True,
                 **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray
        self.log_requests = log_requests
        self.max_log_len = max_log_len
        # Either a local _AsyncLLMEngine or a Ray actor handle, depending
        # on engine_use_ray.
        self.engine = self._init_engine(*args, **kwargs)

        self.background_loop: Optional[asyncio.Future] = None
        # We need to keep a reference to unshielded
        # task as well to prevent it from being garbage
        # collected
        self._background_loop_unshielded: Optional[asyncio.Task[Any]] = None
        self.start_engine_loop = start_engine_loop
        self._errored_with: Optional[BaseException] = None

        # Lazy initialized fields (created in start_background_loop so the
        # tracker binds to the running event loop).
        self._request_tracker: RequestTracker

    @classmethod
    def from_engine_args(
        cls,
        engine_args: AsyncEngineArgs,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> "AsyncLLMEngine":
        """Creates an async LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_config = engine_args.create_engine_config()

        # Select the async executor implementation for the target device /
        # parallelism setup. Imports are local to avoid pulling in optional
        # backends at module import time.
        if engine_config.device_config.device_type == "neuron":
            from vllm.executor.neuron_executor import NeuronExecutorAsync
            executor_class = NeuronExecutorAsync
        elif engine_config.device_config.device_type == "cpu":
            assert not engine_config.parallel_config.worker_use_ray, (
                "Ray is not supported with the CPU backend.")
            from vllm.executor.cpu_executor import CPUExecutorAsync
            executor_class = CPUExecutorAsync
        elif engine_config.parallel_config.worker_use_ray:
            initialize_ray_cluster(engine_config.parallel_config)
            from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
            executor_class = RayGPUExecutorAsync
        else:
            assert engine_config.parallel_config.world_size == 1, (
                "Ray is required if parallel_config.world_size > 1.")
            from vllm.executor.gpu_executor import GPUExecutorAsync
            executor_class = GPUExecutorAsync
        # Create the async LLM engine.
        engine = cls(
            engine_config.parallel_config.worker_use_ray,
            engine_args.engine_use_ray,
            **engine_config.to_dict(),
            executor_class=executor_class,
            log_requests=not engine_args.disable_log_requests,
            log_stats=not engine_args.disable_log_stats,
            max_log_len=engine_args.max_log_len,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
        )
        return engine

    @property
    def is_running(self) -> bool:
        # Running means the loop was started and its task has not completed.
        return (self.background_loop is not None
                and self._background_loop_unshielded is not None
                and not self._background_loop_unshielded.done())

    @property
    def is_stopped(self) -> bool:
        # Stopped means either an error was recorded or the loop task ended.
        return self.errored or (self.background_loop is not None and
                                self._background_loop_unshielded is not None
                                and self._background_loop_unshielded.done())

    @property
    def errored(self) -> bool:
        return self._errored_with is not None

    def set_errored(self, exc: Exception) -> None:
        self._errored_with = exc

    def _error_callback(self, exc: Exception) -> None:
        # Record the error, then fan it out to every open request stream.
        self.set_errored(exc)
        self._request_tracker.propagate_exception(exc)

    async def get_tokenizer(self) -> "PreTrainedTokenizer":
        if self.engine_use_ray:
            return await self.engine.get_tokenizer.remote()  # type: ignore
        else:
            return self.engine.get_tokenizer()

    def start_background_loop(self) -> None:
        """Start the background loop.

        Raises:
            AsyncEngineDeadError: if a previous loop already errored.
            RuntimeError: if the loop is already running.
        """
        if self.errored:
            raise AsyncEngineDeadError(
                "Background loop has errored already.") from self._errored_with
        if self.is_running:
            raise RuntimeError("Background loop is already running.")
        # Initialize the RequestTracker here so it uses the right event loop.
        self._request_tracker = RequestTracker()

        self._background_loop_unshielded = asyncio.get_event_loop(
        ).create_task(self.run_engine_loop())
        self._background_loop_unshielded.add_done_callback(
            partial(_raise_exception_on_finish,
                    error_callback=self._error_callback))
        # Shield so awaiting callers being cancelled does not cancel the
        # engine loop itself.
        self.background_loop = asyncio.shield(self._background_loop_unshielded)

    def _init_engine(self, *args,
                     **kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
        """Construct the wrapped engine: in-process, or as a Ray actor."""
        if not self.engine_use_ray:
            engine_class = self._engine_class
        elif self.worker_use_ray:
            engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
        else:
            # FIXME(woosuk): This is a bit hacky. Be careful when changing the
            # order of the arguments.
            cache_config = kwargs["cache_config"]
            parallel_config = kwargs["parallel_config"]
            if parallel_config.tensor_parallel_size == 1:
                num_gpus = cache_config.gpu_memory_utilization
            else:
                num_gpus = 1
            engine_class = ray.remote(num_gpus=num_gpus)(
                self._engine_class).remote
        return engine_class(*args, **kwargs)

    async def engine_step(self) -> bool:
        """Kick the engine to process the waiting requests.

        Returns True if there are in-progress requests."""

        new_requests, finished_requests = (
            self._request_tracker.get_new_and_finished_requests())

        for new_request in new_requests:
            # Add the request into the vLLM engine's waiting queue.
            # TODO: Maybe add add_request_batch to reduce Ray overhead
            try:
                if self.engine_use_ray:
                    await self.engine.add_request.remote(  # type: ignore
                        **new_request)
                else:
                    await self.engine.add_request_async(**new_request)
            except ValueError as e:
                # TODO: use a vLLM specific error for failed validation
                self._request_tracker.process_exception(
                    new_request["request_id"],
                    e,
                    verbose=self.log_requests,
                )

        if finished_requests:
            await self._engine_abort(finished_requests)

        if self.engine_use_ray:
            request_outputs = await self.engine.step.remote()  # type: ignore
        else:
            request_outputs = await self.engine.step_async()

        # Put the outputs into the corresponding streams.
        for request_output in request_outputs:
            self._request_tracker.process_request_output(
                request_output, verbose=self.log_requests)

        return len(request_outputs) > 0

    async def _engine_abort(self, request_ids: Iterable[str]):
        if self.engine_use_ray:
            await self.engine.abort_request.remote(request_ids)  # type: ignore
        else:
            self.engine.abort_request(request_ids)

    async def run_engine_loop(self):
        """Main background loop: wait for work, then step the engine."""
        has_requests_in_progress = False
        while True:
            if not has_requests_in_progress:
                logger.debug("Waiting for new requests...")
                await self._request_tracker.wait_for_new_requests()
                logger.debug("Got new requests!")

            # Abort if iteration takes too long due to unrecoverable errors
            # (eg. NCCL timeouts).
            try:
                has_requests_in_progress = await asyncio.wait_for(
                    self.engine_step(), ENGINE_ITERATION_TIMEOUT_S)
            except asyncio.TimeoutError as exc:
                logger.error(
                    "Engine iteration timed out. This should never happen!")
                self.set_errored(exc)
                raise
            # Yield to the event loop so other coroutines can run between
            # iterations.
            await asyncio.sleep(0)

    async def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> AsyncStream:
        """Register a request with the tracker and return its output stream.

        Starts the background loop on first use when start_engine_loop is
        set; tokenizes the prompt (locally or via the Ray actor) first.
        """
        if self.log_requests:
            # Truncate logged prompt/token ids to max_log_len characters.
            shortened_prompt = prompt
            shortened_token_ids = prompt_token_ids
            if self.max_log_len is not None:
                if shortened_prompt is not None:
                    shortened_prompt = shortened_prompt[:self.max_log_len]
                if shortened_token_ids is not None:
                    shortened_token_ids = shortened_token_ids[:self.
                                                              max_log_len]
            logger.info(
                "Received request %s: prompt: %r, "
                "sampling_params: %s, prompt_token_ids: %s, "
                "lora_request: %s.", request_id, shortened_prompt,
                sampling_params, shortened_token_ids, lora_request)

        if not self.is_running:
            if self.start_engine_loop:
                self.start_background_loop()
            else:
                raise AsyncEngineDeadError(
                    "Background loop is not running. If it was running, "
                    "inspect the output to find the stacktrace of the "
                    "error that caused the background loop to stop "
                    "(AsyncEngineDeadError).")

        if arrival_time is None:
            arrival_time = time.time()

        if self.engine_use_ray:
            prompt_token_ids = await (
                self.engine.encode_request_async.remote(  # type: ignore
                    request_id=request_id,
                    prompt=prompt,
                    prompt_token_ids=prompt_token_ids,
                    lora_request=lora_request))
        else:
            prompt_token_ids = await self.engine.encode_request_async(
                request_id=request_id,
                prompt=prompt,
                prompt_token_ids=prompt_token_ids,
                lora_request=lora_request)

        stream = self._request_tracker.add_request(
            request_id,
            prompt=prompt,
            sampling_params=sampling_params,
            prompt_token_ids=prompt_token_ids,
            arrival_time=arrival_time,
            lora_request=lora_request,
            multi_modal_data=multi_modal_data,
        )

        return stream

    async def generate(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        request_id: str,
        prompt_token_ids: Optional[List[int]] = None,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None
    ) -> AsyncIterator[RequestOutput]:
        """Generate outputs for a request.

        Generate outputs for a request. This method is a coroutine. It adds the
        request into the waiting queue of the LLMEngine and streams the outputs
        from the LLMEngine to the caller.

        Args:
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            lora_request: LoRA request to use for generation, if any.
            multi_modal_data: Multi modal data per request.

        Yields:
            The output `RequestOutput` objects from the LLMEngine for the
            request.

        Details:
            - If the engine is not running, start the background loop,
              which iteratively invokes
              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
              to process the waiting requests.
            - Add the request to the engine's `RequestTracker`.
              On the next background loop, this request will be sent to
              the underlying engine.
              Also, a corresponding `AsyncStream` will be created.
            - Wait for the request outputs from `AsyncStream` and yield them.

        Example:
            >>> # Please refer to entrypoints/api_server.py for
            >>> # the complete example.
            >>>
            >>> # initialize the engine and the example input
            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
            >>> example_input = {
            >>>     "prompt": "What is LLM?",
            >>>     "stream": False, # assume the non-streaming case
            >>>     "temperature": 0.0,
            >>>     "request_id": 0,
            >>> }
            >>>
            >>> # start the generation
            >>> results_generator = engine.generate(
            >>>    example_input["prompt"],
            >>>    SamplingParams(temperature=example_input["temperature"]),
            >>>    example_input["request_id"])
            >>>
            >>> # get the results
            >>> final_output = None
            >>> async for request_output in results_generator:
            >>>     if await request.is_disconnected():
            >>>         # Abort the request if the client disconnects.
            >>>         await engine.abort(request_id)
            >>>         # Return or raise an error
            >>>         ...
            >>>     final_output = request_output
            >>>
            >>> # Process and return the final output
            >>> ...
        """
        # Preprocess the request.
        arrival_time = time.time()

        try:
            stream = await self.add_request(
                request_id,
                prompt,
                sampling_params,
                prompt_token_ids=prompt_token_ids,
                arrival_time=arrival_time,
                lora_request=lora_request,
                multi_modal_data=multi_modal_data,
            )

            async for request_output in stream:
                yield request_output
        except (Exception, asyncio.CancelledError) as e:
            # If there is an exception or coroutine is cancelled, abort the
            # request.
            self._abort(request_id)
            raise e

    async def abort(self, request_id: str) -> None:
        """Abort a request.

        Abort a submitted request. If the request is finished or not found,
        this method will be a no-op.

        Args:
            request_id: The unique id of the request.
        """
        if not self.is_running:
            raise AsyncEngineDeadError(
                "Background loop is not running. If it was running, "
                "inspect the output to find the stacktrace of the "
                "error that caused the background loop to stop "
                "(AsyncEngineDeadError).")

        return self._abort(request_id)

    def _abort(self, request_id: str) -> None:
        """Abort a request.

        Abort a submitted request. If the request is finished or not found,
        this method will be a no-op.

        Args:
            request_id: The unique id of the request.
        """
        self._request_tracker.abort_request(request_id,
                                            verbose=self.log_requests)

    async def get_model_config(self) -> ModelConfig:
        """Get the model configuration of the vLLM engine."""
        if self.engine_use_ray:
            return await self.engine.get_model_config.remote()  # type: ignore
        else:
            return self.engine.get_model_config()

    async def get_decoding_config(self) -> DecodingConfig:
        """Get the decoding configuration of the vLLM engine."""
        if self.engine_use_ray:
            return await self.engine.get_decoding_config.remote(  # type: ignore
            )
        else:
            return self.engine.get_decoding_config()

    async def do_log_stats(
            self,
            scheduler_outputs: Optional[SchedulerOutputs] = None,
            model_output: Optional[List[SamplerOutput]] = None) -> None:
        # NOTE(review): the local branch drops the scheduler_outputs /
        # model_output arguments while the Ray branch forwards them —
        # presumably intentional, but worth confirming against
        # LLMEngine.do_log_stats.
        if self.engine_use_ray:
            await self.engine.do_log_stats.remote(  # type: ignore
                scheduler_outputs, model_output)
        else:
            self.engine.do_log_stats()

    async def check_health(self) -> None:
        """Raises an error if engine is unhealthy."""
        t = time.perf_counter()
        logger.debug("Starting health check...")
        if self.is_stopped:
            raise AsyncEngineDeadError("Background loop is stopped.")

        if self.engine_use_ray:
            try:
                await self.engine.check_health.remote()  # type: ignore
            except ray.exceptions.RayActorError as e:
                raise RuntimeError("Engine is dead.") from e
        else:
            await self.engine.check_health_async()
        logger.debug("Health check took %fs", time.perf_counter() - t)
|
||||
784
vllm/engine/llm_engine.py
Normal file
784
vllm/engine/llm_engine.py
Normal file
@@ -0,0 +1,784 @@
|
||||
import time
|
||||
from typing import Iterable, List, Optional, Type, Union
|
||||
|
||||
from transformers import GenerationConfig, PreTrainedTokenizer
|
||||
|
||||
import vllm
|
||||
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
|
||||
LoRAConfig, ModelConfig, ParallelConfig,
|
||||
SchedulerConfig, SpeculativeConfig,
|
||||
VisionLanguageConfig)
|
||||
from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
|
||||
SchedulerOutputs)
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.metrics import StatLogger, Stats
|
||||
from vllm.engine.output_processor.interfaces import (
|
||||
SequenceGroupOutputProcessor)
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.engine.output_processor.util import create_output_by_sequence_group
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (ExecuteModelRequest, MultiModalData, SamplerOutput,
|
||||
Sequence, SequenceGroup, SequenceGroupMetadata,
|
||||
SequenceStatus)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
|
||||
get_tokenizer_group)
|
||||
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
|
||||
usage_message)
|
||||
from vllm.utils import Counter
|
||||
|
||||
logger = init_logger(__name__)
# Minimum number of seconds between consecutive local stat log lines.
_LOCAL_LOGGING_INTERVAL_SEC = 5
|
||||
|
||||
|
||||
def _load_generation_config_dict(model_config: ModelConfig):
    """Load the model's HF generation config as a plain dict.

    Returns only the entries that differ from the HF defaults
    (``to_diff_dict``); returns an empty dict when the model ships no
    generation config file.
    """
    try:
        return GenerationConfig.from_pretrained(
            model_config.model,
            revision=model_config.revision,
        ).to_diff_dict()
    except OSError:
        # Not found.
        return {}
|
||||
|
||||
|
||||
class LLMEngine:
|
||||
"""An LLM engine that receives requests and generates texts.
|
||||
|
||||
This is the main class for the vLLM engine. It receives requests
|
||||
from clients and generates texts from the LLM. It includes a tokenizer, a
|
||||
language model (possibly distributed across multiple GPUs), and GPU memory
|
||||
space allocated for intermediate states (aka KV cache). This class utilizes
|
||||
iteration-level scheduling and efficient memory management to maximize the
|
||||
serving throughput.
|
||||
|
||||
The `LLM` class wraps this class for offline batched inference and the
|
||||
`AsyncLLMEngine` class wraps this class for online serving.
|
||||
|
||||
NOTE: The config arguments are derived from the `EngineArgs` class. For the
|
||||
comprehensive list of arguments, see `EngineArgs`.
|
||||
|
||||
Args:
|
||||
model_config: The configuration related to the LLM model.
|
||||
cache_config: The configuration related to the KV cache memory
|
||||
management.
|
||||
parallel_config: The configuration related to distributed execution.
|
||||
scheduler_config: The configuration related to the request scheduler.
|
||||
device_config: The configuration related to the device.
|
||||
lora_config (Optional): The configuration related to serving multi-LoRA.
|
||||
vision_language_config (Optional): The configuration related to vision
|
||||
language models.
|
||||
speculative_config (Optional): The configuration related to speculative
|
||||
decoding.
|
||||
executor_class: The model executor class for managing distributed
|
||||
execution.
|
||||
log_stats: Whether to log statistics.
|
||||
usage_context: Specified entry point, used for usage info collection
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    model_config: ModelConfig,
    cache_config: CacheConfig,
    parallel_config: ParallelConfig,
    scheduler_config: SchedulerConfig,
    device_config: DeviceConfig,
    load_config: LoadConfig,
    lora_config: Optional[LoRAConfig],
    vision_language_config: Optional[VisionLanguageConfig],
    speculative_config: Optional[SpeculativeConfig],
    decoding_config: Optional[DecodingConfig],
    executor_class: Type[ExecutorBase],
    log_stats: bool,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> None:
    """Build the engine: tokenizer, model executor, KV caches, scheduler,
    stat logging, and the sequence output processor.

    See the class docstring for the meaning of each config argument.
    """
    logger.info(
        "Initializing an LLM engine (v%s) with config: "
        "model=%r, speculative_config=%r, tokenizer=%r, "
        "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
        "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
        "max_seq_len=%d, download_dir=%r, load_format=%s, "
        "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
        "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
        "quantization_param_path=%s, device_config=%s, "
        "decoding_config=%r, seed=%d, served_model_name=%s)",
        vllm.__version__,
        model_config.model,
        speculative_config,
        model_config.tokenizer,
        model_config.skip_tokenizer_init,
        model_config.tokenizer_mode,
        model_config.revision,
        model_config.tokenizer_revision,
        model_config.trust_remote_code,
        model_config.dtype,
        model_config.max_model_len,
        load_config.download_dir,
        load_config.load_format,
        parallel_config.tensor_parallel_size,
        parallel_config.disable_custom_all_reduce,
        model_config.quantization,
        model_config.enforce_eager,
        cache_config.cache_dtype,
        model_config.quantization_param_path,
        device_config.device,
        decoding_config,
        model_config.seed,
        model_config.served_model_name,
    )
    # TODO(woosuk): Print more configs in debug mode.

    self.model_config = model_config
    self.cache_config = cache_config
    self.lora_config = lora_config
    self.vision_language_config = vision_language_config
    self.parallel_config = parallel_config
    self.scheduler_config = scheduler_config
    self.device_config = device_config
    self.speculative_config = speculative_config
    self.load_config = load_config
    self.decoding_config = decoding_config or DecodingConfig()
    self.log_stats = log_stats

    if not self.model_config.skip_tokenizer_init:
        self.tokenizer: BaseTokenizerGroup
        self._init_tokenizer()
        self.detokenizer = Detokenizer(self.tokenizer)
    else:
        # Caller opted out of tokenization; prompts must arrive as token
        # ids and no detokenization is performed.
        self.detokenizer = None
        self.tokenizer = None

    self.seq_counter = Counter()
    self.generation_config_fields = _load_generation_config_dict(
        model_config)

    self.model_executor = executor_class(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        device_config=device_config,
        lora_config=lora_config,
        vision_language_config=vision_language_config,
        speculative_config=speculative_config,
        load_config=load_config,
    )

    # Profiles available memory and allocates GPU/CPU KV cache blocks.
    self._initialize_kv_caches()

    # If usage stat is enabled, collect relevant info.
    if is_usage_stats_enabled():
        from vllm.model_executor.model_loader import (
            get_architecture_class_name)
        usage_message.report_usage(
            get_architecture_class_name(model_config),
            usage_context,
            extra_kvs={
                # Common configuration
                "dtype":
                str(model_config.dtype),
                "tensor_parallel_size":
                parallel_config.tensor_parallel_size,
                "block_size":
                cache_config.block_size,
                "gpu_memory_utilization":
                cache_config.gpu_memory_utilization,

                # Quantization
                "quantization":
                model_config.quantization,
                "kv_cache_dtype":
                cache_config.cache_dtype,

                # Feature flags
                "enable_lora":
                bool(lora_config),
                "enable_prefix_caching":
                cache_config.enable_prefix_caching,
                "enforce_eager":
                model_config.enforce_eager,
                "disable_custom_all_reduce":
                parallel_config.disable_custom_all_reduce,
            })

    if self.tokenizer:
        # Ping the tokenizer to ensure liveness if it runs in a
        # different process.
        self.tokenizer.ping()

    # Create the scheduler.
    # NOTE: the cache_config here have been updated with the numbers of
    # GPU and CPU blocks, which are profiled in the distributed executor.
    self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)

    # Metric Logging.
    if self.log_stats:
        self.stat_logger = StatLogger(
            local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
            labels=dict(model_name=model_config.served_model_name),
            max_model_len=self.model_config.max_model_len)
        self.stat_logger.info("cache_config", self.cache_config)

    # Create sequence output processor, e.g. for beam search or
    # speculative decoding.
    self.output_processor = (
        SequenceGroupOutputProcessor.create_output_processor(
            self.scheduler_config,
            self.detokenizer,
            self.scheduler,
            self.seq_counter,
            self.get_tokenizer_for_seq,
            stop_checker=StopChecker(
                self.scheduler_config.max_model_len,
                self.get_tokenizer_for_seq,
            ),
        ))
|
||||
|
||||
def _initialize_kv_caches(self) -> None:
    """Initialize the KV cache in the worker(s).

    The workers will determine the number of blocks in both the GPU cache
    and the swap CPU cache.
    """
    num_gpu_blocks, num_cpu_blocks = (
        self.model_executor.determine_num_available_blocks())

    if self.cache_config.num_gpu_blocks_override is not None:
        # An operator-supplied override takes precedence over the
        # profiled block count.
        num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
        logger.info(
            "Overriding num_gpu_blocks=%d with "
            "num_gpu_blocks_override=%d", num_gpu_blocks,
            num_gpu_blocks_override)
        num_gpu_blocks = num_gpu_blocks_override

    # Record the final block counts so the scheduler can use them.
    self.cache_config.num_gpu_blocks = num_gpu_blocks
    self.cache_config.num_cpu_blocks = num_cpu_blocks

    self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
|
||||
|
||||
@classmethod
def from_engine_args(
    cls,
    engine_args: EngineArgs,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> "LLMEngine":
    """Creates an LLM engine from the engine arguments.

    Args:
        engine_args: Parsed engine arguments used to build the configs.
        usage_context: Where the engine is being created from, for usage
            reporting.

    Returns:
        A fully constructed ``LLMEngine``.
    """
    # Create the engine configs.
    engine_config = engine_args.create_engine_config()

    # Initialize the cluster and specify the executor class.
    # NOTE: executors are imported lazily so that only the backend that
    # is actually selected gets imported.
    if engine_config.device_config.device_type == "neuron":
        from vllm.executor.neuron_executor import NeuronExecutor
        executor_class = NeuronExecutor
    elif engine_config.device_config.device_type == "cpu":
        from vllm.executor.cpu_executor import CPUExecutor
        executor_class = CPUExecutor
    elif engine_config.parallel_config.worker_use_ray:
        # Ray backend: the cluster must be up before workers are created.
        initialize_ray_cluster(engine_config.parallel_config)
        from vllm.executor.ray_gpu_executor import RayGPUExecutor
        executor_class = RayGPUExecutor
    else:
        # Single-process GPU backend; multi-worker setups require Ray.
        assert engine_config.parallel_config.world_size == 1, (
            "Ray is required if parallel_config.world_size > 1.")
        from vllm.executor.gpu_executor import GPUExecutor
        executor_class = GPUExecutor

    # Create the LLM engine.
    engine = cls(
        **engine_config.to_dict(),
        executor_class=executor_class,
        log_stats=not engine_args.disable_log_stats,
        usage_context=usage_context,
    )
    return engine
|
||||
|
||||
def __reduce__(self):
|
||||
# This is to ensure that the LLMEngine is not referenced in
|
||||
# the closure used to initialize Ray worker actors
|
||||
raise RuntimeError("LLMEngine should not be pickled!")
|
||||
|
||||
def __del__(self):
|
||||
# Shutdown model executor when engine is garbage collected
|
||||
# Use getattr since __init__ can fail before the field is set
|
||||
if model_executor := getattr(self, "model_executor", None):
|
||||
model_executor.shutdown()
|
||||
|
||||
def get_tokenizer(self) -> "PreTrainedTokenizer":
    """Return the base (non-LoRA) tokenizer."""
    base_tokenizer = self.tokenizer.get_lora_tokenizer(None)
    return base_tokenizer
|
||||
|
||||
def get_tokenizer_for_seq(self,
                          sequence: Sequence) -> "PreTrainedTokenizer":
    """Return the tokenizer matching the sequence's LoRA adapter, if any."""
    lora_request = sequence.lora_request
    return self.tokenizer.get_lora_tokenizer(lora_request)
|
||||
|
||||
def _init_tokenizer(self, **tokenizer_init_kwargs):
    """Build the tokenizer group from the model config.

    Keyword arguments supplied by the caller override the defaults
    derived from ``model_config`` / ``scheduler_config``.
    """
    init_kwargs = {
        "tokenizer_id": self.model_config.tokenizer,
        "enable_lora": bool(self.lora_config),
        "max_num_seqs": self.scheduler_config.max_num_seqs,
        "max_input_length": None,
        "tokenizer_mode": self.model_config.tokenizer_mode,
        "trust_remote_code": self.model_config.trust_remote_code,
        "revision": self.model_config.tokenizer_revision,
        # Caller-provided overrides take precedence.
        **tokenizer_init_kwargs,
    }
    self.tokenizer = get_tokenizer_group(
        self.parallel_config.tokenizer_pool_config, **init_kwargs)
|
||||
|
||||
def _verify_args(self) -> None:
|
||||
self.model_config.verify_with_parallel_config(self.parallel_config)
|
||||
self.cache_config.verify_with_parallel_config(self.parallel_config)
|
||||
if self.lora_config:
|
||||
self.lora_config.verify_with_model_config(self.model_config)
|
||||
self.lora_config.verify_with_scheduler_config(
|
||||
self.scheduler_config)
|
||||
|
||||
def encode_request(
    self,
    request_id: str,
    prompt: Optional[str],
    prompt_token_ids: Optional[List[int]] = None,
    lora_request: Optional[LoRARequest] = None,
):
    """Return token IDs for the request, tokenizing ``prompt`` if needed.

    If ``prompt_token_ids`` is already given it is returned unchanged;
    otherwise ``prompt`` must be provided and is tokenized (with the
    LoRA-specific tokenizer when a ``lora_request`` is set).
    """
    if prompt_token_ids is not None:
        return prompt_token_ids
    assert prompt is not None
    return self.tokenizer.encode(request_id=request_id,
                                 prompt=prompt,
                                 lora_request=lora_request)
|
||||
|
||||
def add_request(
    self,
    request_id: str,
    prompt: Optional[str],
    sampling_params: SamplingParams,
    prompt_token_ids: Optional[List[int]] = None,
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    multi_modal_data: Optional[MultiModalData] = None,
) -> None:
    """Add a request to the engine's request pool.

    The request is added to the request pool and will be processed by the
    scheduler as `engine.step()` is called. The exact scheduling policy is
    determined by the scheduler.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string. Can be None if prompt_token_ids is
            provided.
        sampling_params: The sampling parameters for text generation.
        prompt_token_ids: The token IDs of the prompt. If None, we
            use the tokenizer to convert the prompts to token IDs.
        arrival_time: The arrival time of the request. If None, we use
            the current monotonic time.
        lora_request: Optional LoRA adapter for this request; requires
            LoRA to be enabled on the engine.
        multi_modal_data: Multi modal data per request.

    Raises:
        ValueError: If a LoRA request is given while LoRA is disabled, or
            if more logprobs are requested than the engine allows.

    Details:
        - Set arrival_time to the current time if it is None.
        - Set prompt_token_ids to the encoded prompt if it is None.
        - Create `best_of` number of :class:`~vllm.Sequence` objects.
        - Create a :class:`~vllm.SequenceGroup` object
          from the list of :class:`~vllm.Sequence`.
        - Add the :class:`~vllm.SequenceGroup` object to the scheduler.

    Example:
        >>> # initialize engine
        >>> engine = LLMEngine.from_engine_args(engine_args)
        >>> # set request arguments
        >>> example_prompt = "Who is the president of the United States?"
        >>> sampling_params = SamplingParams(temperature=0.0)
        >>> request_id = 0
        >>>
        >>> # add the request to the engine
        >>> engine.add_request(
        >>>    str(request_id),
        >>>    example_prompt,
        >>>    SamplingParams(temperature=0.0))
        >>> # continue the request processing
        >>> ...
    """
    # Validate inputs before touching any engine state.
    if lora_request is not None and not self.lora_config:
        raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                         "not enabled!")
    max_logprobs = self.get_model_config().max_logprobs
    if (sampling_params.logprobs
            and sampling_params.logprobs > max_logprobs) or (
                sampling_params.prompt_logprobs
                and sampling_params.prompt_logprobs > max_logprobs):
        raise ValueError(f"Cannot request more than "
                         f"{max_logprobs} logprobs.")
    if arrival_time is None:
        arrival_time = time.time()
    # Tokenize the prompt unless token IDs were supplied by the caller.
    prompt_token_ids = self.encode_request(
        request_id=request_id,
        prompt=prompt,
        prompt_token_ids=prompt_token_ids,
        lora_request=lora_request)

    # Create the sequences.
    block_size = self.cache_config.block_size
    seq_id = next(self.seq_counter)
    eos_token_id = None
    if self.tokenizer:
        # EOS may differ per LoRA adapter, so resolve it through the
        # adapter-aware tokenizer lookup.
        eos_token_id = self.tokenizer.get_lora_tokenizer(
            lora_request).eos_token_id
    else:
        logger.warning("Use None for EOS token id because tokenizer is "
                       "not initialized")
    seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                   eos_token_id, lora_request)

    # Defensive copy of SamplingParams, which are used by the sampler,
    # this doesn't deep-copy LogitsProcessor objects
    sampling_params = sampling_params.clone()
    # Add the eos token id into the sampling_params to support min_tokens
    # processing
    if seq.eos_token_id is not None:
        sampling_params.all_stop_token_ids.add(seq.eos_token_id)
    sampling_params.update_from_generation_config(
        self.generation_config_fields)

    # Create the sequence group.
    seq_group = SequenceGroup(request_id, [seq], sampling_params,
                              arrival_time, lora_request, multi_modal_data)

    # Add the sequence group to the scheduler.
    self.scheduler.add_seq_group(seq_group)
|
||||
|
||||
def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
    """Abort one or more requests by ID.

    Args:
        request_id: A single request ID or an iterable of request IDs.

    The call is forwarded to
    :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` of
    :class:`~vllm.core.scheduler.Scheduler`.

    Example:
        >>> # initialize engine and add a request with request_id
        >>> request_id = str(0)
        >>> # abort the request
        >>> engine.abort_request(request_id)
    """
    self.scheduler.abort_seq_group(request_id)
|
||||
|
||||
def get_model_config(self) -> ModelConfig:
    """Return the engine's model configuration."""
    model_config = self.model_config
    return model_config
|
||||
|
||||
def get_decoding_config(self) -> DecodingConfig:
    """Return the engine's decoding configuration."""
    decoding_config = self.decoding_config
    return decoding_config
|
||||
|
||||
def get_num_unfinished_requests(self) -> int:
    """Return how many sequence groups are still unfinished."""
    return self.scheduler.get_num_unfinished_seq_groups()
|
||||
|
||||
def has_unfinished_requests(self) -> bool:
    """Return True while the scheduler still holds unfinished sequences."""
    return self.scheduler.has_unfinished_seqs()
|
||||
|
||||
def _process_model_outputs(
    self,
    output: List[SamplerOutput],
    scheduled_seq_groups: List[ScheduledSequenceGroup],
    ignored_seq_groups: List[SequenceGroup],
    seq_group_metadata_list: List[SequenceGroupMetadata],
) -> List[RequestOutput]:
    """Apply the model output to the sequences in the scheduled seq groups.

    Returns RequestOutputs that can be returned to the client.

    Args:
        output: Sampler outputs, one entry per model step.
        scheduled_seq_groups: The groups scheduled in this iteration,
            in the same order as the sampler outputs.
        ignored_seq_groups: Groups the scheduler skipped; they still get
            a RequestOutput so clients are notified.
        seq_group_metadata_list: Per-group metadata for this iteration,
            parallel to ``scheduled_seq_groups``.
    """

    now = time.time()

    # Organize outputs by [sequence group][step] instead of
    # [step][sequence group].
    output_by_sequence_group = create_output_by_sequence_group(
        sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))

    # Update the scheduled sequence groups with the model outputs.
    for scheduled_seq_group, outputs, seq_group_meta in zip(
            scheduled_seq_groups, output_by_sequence_group,
            seq_group_metadata_list):
        seq_group = scheduled_seq_group.seq_group
        # Account for the tokens processed this iteration (chunked
        # prefill may process only part of the prompt).
        seq_group.update_num_computed_tokens(
            scheduled_seq_group.token_chunk_size)

        self.output_processor.process_prompt_logprob(seq_group, outputs)
        # Only groups that actually sampled tokens have outputs to apply.
        if seq_group_meta.do_sample:
            self.output_processor.process_outputs(seq_group, outputs)

    # Free the finished sequence groups.
    self.scheduler.free_finished_seq_groups()

    # Create the outputs.
    request_outputs: List[RequestOutput] = []
    for scheduled_seq_group in scheduled_seq_groups:
        seq_group = scheduled_seq_group.seq_group
        seq_group.maybe_set_first_token_time(now)
        request_output = RequestOutput.from_seq_group(seq_group)
        request_outputs.append(request_output)
    for seq_group in ignored_seq_groups:
        request_output = RequestOutput.from_seq_group(seq_group)
        request_outputs.append(request_output)
    return request_outputs
|
||||
|
||||
def step(self) -> List[RequestOutput]:
    """Performs one decoding iteration and returns newly generated results.

    .. figure:: https://i.imgur.com/sv2HssD.png
        :alt: Overview of the step function
        :align: center

        Overview of the step function.

    Details:
        - Step 1: Schedules the sequences to be executed in the next
          iteration and the token blocks to be swapped in/out/copy.

            - Depending on the scheduling policy,
              sequences may be `preempted/reordered`.
            - A Sequence Group (SG) refer to a group of sequences
              that are generated from the same prompt.

        - Step 2: Calls the distributed executor to execute the model.
        - Step 3: Processes the model output. This mainly includes:

            - Decodes the relevant outputs.
            - Updates the scheduled sequence groups with model outputs
              based on its `sampling parameters` (`use_beam_search` or not).
            - Frees the finished sequence groups.

        - Finally, it creates and returns the newly generated results.

    Example:
        >>> # Please see the example/ folder for more detailed examples.
        >>>
        >>> # initialize engine and request arguments
        >>> engine = LLMEngine.from_engine_args(engine_args)
        >>> example_inputs = [(0, "What is LLM?",
        >>>    SamplingParams(temperature=0.0))]
        >>>
        >>> # Start the engine with an event loop
        >>> while True:
        >>>     if example_inputs:
        >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
        >>>         engine.add_request(str(req_id), prompt, sampling_params)
        >>>
        >>>     # continue the request processing
        >>>     request_outputs = engine.step()
        >>>     for request_output in request_outputs:
        >>>         if request_output.finished:
        >>>             # return or show the request output
        >>>
        >>>     if not (engine.has_unfinished_requests() or example_inputs):
        >>>         break
    """
    seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()

    # Skip the executor round-trip entirely when nothing was scheduled.
    if not scheduler_outputs.is_empty():
        execute_model_req = ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
            blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
            num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
            running_queue_size=scheduler_outputs.running_queue_size,
        )
        output = self.model_executor.execute_model(
            execute_model_req=execute_model_req)
    else:
        output = []

    request_outputs = self._process_model_outputs(
        output, scheduler_outputs.scheduled_seq_groups,
        scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)

    # Log stats.
    self.do_log_stats(scheduler_outputs, output)

    return request_outputs
|
||||
|
||||
def do_log_stats(
        self,
        scheduler_outputs: Optional[SchedulerOutputs] = None,
        model_output: Optional[List[SamplerOutput]] = None) -> None:
    """Collect current stats and emit them through the stat logger.

    No-op when stat logging is disabled; may be called with no arguments
    to force a log even when no requests are active.
    """
    if not self.log_stats:
        return
    stats = self._get_stats(scheduler_outputs, model_output)
    self.stat_logger.log(stats)
|
||||
|
||||
def _get_stats(
        self,
        scheduler_outputs: Optional[SchedulerOutputs],
        model_output: Optional[List[SamplerOutput]] = None) -> Stats:
    """Get Stats to be Logged to Prometheus.

    Args:
        scheduler_outputs: Optional, used to populate metrics related to
            the scheduled batch,
        model_output: Optional, used to emit speculative decoding metrics
            which are created by the workers.

    Returns:
        A :class:`Stats` snapshot of system, iteration, and request
        level metrics.
    """
    now = time.time()

    # System State
    # Scheduler State
    num_running_sys = len(self.scheduler.running)
    num_swapped_sys = len(self.scheduler.swapped)
    num_waiting_sys = len(self.scheduler.waiting)

    # KV Cache Usage in %
    num_total_gpu = self.cache_config.num_gpu_blocks
    num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
    gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)

    num_total_cpu = self.cache_config.num_cpu_blocks
    cpu_cache_usage_sys = 0.
    if num_total_cpu > 0:
        num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
        )
        cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)

    # Iteration stats
    num_prompt_tokens_iter = 0
    num_generation_tokens_iter = 0
    time_to_first_tokens_iter: List[float] = []
    time_per_output_tokens_iter: List[float] = []

    # Request stats
    # Latency
    time_e2e_requests: List[float] = []
    # Metadata
    num_prompt_tokens_requests: List[int] = []
    num_generation_tokens_requests: List[int] = []
    best_of_requests: List[int] = []
    n_requests: List[int] = []
    finished_reason_requests: List[str] = []

    # NOTE: This loop assumes prefill seq_groups are before
    # decode seq_groups in scheduled_seq_groups.
    if scheduler_outputs is not None:
        # Use an int accumulator: num_generation_tokens_iter is declared
        # as int in Stats, so a float literal here would silently
        # contaminate the stat (previously initialized to `0.`).
        num_generation_tokens_from_prefill_groups = 0
        # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
        # the len of scheduler_outputs.scheduled_seq_groups is !=
        # scheduler_outputs.num_prefill_groups, this means that
        # chunked prefills have been detected.

        for idx, scheduled_seq_group in enumerate(
                scheduler_outputs.scheduled_seq_groups):
            group_was_prefill = idx < scheduler_outputs.num_prefill_groups
            seq_group = scheduled_seq_group.seq_group

            # NOTE: a seq_group that completed all of its prefill tokens
            # in the last iteration will have seq_group.is_prefill() = False
            # with group_was_prefill = True
            if group_was_prefill:
                # Number of prompt tokens.
                num_prompt_tokens_iter += (
                    scheduled_seq_group.token_chunk_size)

                # If the seq_group just finished the prefill state
                # get TTFT.
                if not seq_group.is_prefill():
                    latency = seq_group.get_last_latency(now)
                    time_to_first_tokens_iter.append(latency)

                    # One generation token per finished prefill.
                    num_generation_tokens_from_prefill_groups += (
                        seq_group.num_seqs())
            else:
                # TPOTs.
                latency = seq_group.get_last_latency(now)
                time_per_output_tokens_iter.append(latency)

            # Because of chunked prefill, we can have a single sequence
            # group that does multiple prompt_runs. To prevent logging
            # the same metadata more than once per request, we standardize
            # on logging request level information for finished requests,
            # which can only happen once.
            if seq_group.is_finished():
                # Latency timings
                time_e2e_requests.append(now -
                                         seq_group.metrics.arrival_time)

                # Metadata
                num_prompt_tokens_requests.append(
                    len(seq_group.prompt_token_ids))
                num_generation_tokens_requests.extend([
                    seq.get_output_len()
                    for seq in seq_group.get_finished_seqs()
                ])
                best_of_requests.append(seq_group.sampling_params.best_of)
                n_requests.append(seq_group.sampling_params.n)
                finished_reason_requests.extend([
                    SequenceStatus.get_finished_reason(seq.status)
                    for seq in seq_group.get_finished_seqs()
                ])

        # Number of generation tokens.
        # num_batched_tokens equals the number of prompt_tokens plus the
        # number of decode_tokens in a single iteration. So,
        # num_generation_tokens = num_batched_tokens - num_prompt_tokens
        # + num_generation_tokens_from_prefill_groups (since we generate
        # one token on prefills on iters where the prefill finishes).
        num_generation_tokens_iter = (
            scheduler_outputs.num_batched_tokens - num_prompt_tokens_iter +
            num_generation_tokens_from_prefill_groups)

    # Spec decode, if enabled, emits specialized metrics from the worker in
    # sampler output.
    if model_output and (model_output[0].spec_decode_worker_metrics
                         is not None):
        spec_decode_metrics = model_output[0].spec_decode_worker_metrics
    else:
        spec_decode_metrics = None

    return Stats(
        now=now,

        # System stats
        # Scheduler State
        num_running_sys=num_running_sys,
        num_swapped_sys=num_swapped_sys,
        num_waiting_sys=num_waiting_sys,
        # KV Cache Usage in %
        gpu_cache_usage_sys=gpu_cache_usage_sys,
        cpu_cache_usage_sys=cpu_cache_usage_sys,

        # Iteration stats
        num_prompt_tokens_iter=num_prompt_tokens_iter,
        num_generation_tokens_iter=num_generation_tokens_iter,
        time_to_first_tokens_iter=time_to_first_tokens_iter,
        time_per_output_tokens_iter=time_per_output_tokens_iter,
        spec_decode_metrics=spec_decode_metrics,

        # Request stats
        # Latency
        time_e2e_requests=time_e2e_requests,
        # Metadata
        num_prompt_tokens_requests=num_prompt_tokens_requests,
        num_generation_tokens_requests=num_generation_tokens_requests,
        best_of_requests=best_of_requests,
        n_requests=n_requests,
        finished_reason_requests=finished_reason_requests,
    )
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
    """Register a LoRA adapter with the model executor."""
    added = self.model_executor.add_lora(lora_request)
    return added
|
||||
|
||||
def remove_lora(self, lora_id: int) -> bool:
    """Unregister the LoRA adapter with the given ID from the executor."""
    removed = self.model_executor.remove_lora(lora_id)
    return removed
|
||||
|
||||
def list_loras(self) -> List[int]:
    """Return the IDs of the LoRA adapters currently loaded."""
    return self.model_executor.list_loras()
|
||||
|
||||
def check_health(self) -> None:
    """Delegate a health check to the model executor (raises if unhealthy)."""
    self.model_executor.check_health()
|
||||
368
vllm/engine/metrics.py
Normal file
368
vllm/engine/metrics.py
Normal file
@@ -0,0 +1,368 @@
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Counter as CollectionsCounter
|
||||
from typing import Dict, List, Optional, Protocol, Union
|
||||
|
||||
import numpy as np
|
||||
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
|
||||
disable_created_metrics)
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
disable_created_metrics()
|
||||
|
||||
# The begin-* and end* here are used by the documentation generator
|
||||
# to extract the metrics definitions.
|
||||
|
||||
|
||||
# begin-metrics-definitions
|
||||
class Metrics:
    """Container for all vLLM Prometheus metric objects.

    Every metric shares the same label set (``labelnames``); histograms
    whose buckets depend on the model use ``max_model_len`` to size them.
    """
    # Extra label added (on top of the shared labels) to the
    # request-success counter to record why a request finished.
    labelname_finish_reason = "finished_reason"

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Unregister any existing vLLM collectors
        # NOTE(review): relies on the private `_collector_to_names`
        # attribute of prometheus_client's REGISTRY — may break on a
        # prometheus_client upgrade; confirm on version bumps.
        for collector in list(REGISTRY._collector_to_names):
            if hasattr(collector, "_name") and "vllm" in collector._name:
                REGISTRY.unregister(collector)

        # Config Information
        self.info_cache_config = Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

        # System stats
        # Scheduler State
        self.gauge_scheduler_running = Gauge(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
        self.gauge_scheduler_waiting = Gauge(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
        self.gauge_scheduler_swapped = Gauge(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
        # KV Cache Usage in %
        self.gauge_gpu_cache_usage = Gauge(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
        self.gauge_cpu_cache_usage = Gauge(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)

        # Iteration stats
        self.counter_prompt_tokens = Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames)
        self.counter_generation_tokens = Counter(
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames)
        self.histogram_time_to_first_token = Histogram(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
            ])
        self.histogram_time_per_output_token = Histogram(
            name="vllm:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                1.0, 2.5
            ])

        # Request stats
        # Latency
        self.histogram_e2e_time_request = Histogram(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of end to end request latency in seconds.",
            labelnames=labelnames,
            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
        # Metadata
        self.histogram_num_prompt_tokens_request = Histogram(
            name="vllm:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_num_generation_tokens_request = Histogram(
            name="vllm:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_best_of_request = Histogram(
            name="vllm:request_params_best_of",
            documentation="Histogram of the best_of request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.histogram_n_request = Histogram(
            name="vllm:request_params_n",
            documentation="Histogram of the n request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.counter_request_success = Counter(
            name="vllm:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + [Metrics.labelname_finish_reason])

        # Deprecated in favor of vllm:prompt_tokens_total
        self.gauge_avg_prompt_throughput = Gauge(
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
        )
        # Deprecated in favor of vllm:generation_tokens_total
        self.gauge_avg_generation_throughput = Gauge(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
        )
|
||||
|
||||
|
||||
# end-metrics-definitions
|
||||
|
||||
|
||||
def build_1_2_5_buckets(max_value: int):
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values (1, 2, 5) until the value exceeds the specified maximum.

    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    buckets = []
    exponent = 0
    while True:
        for mantissa in (1, 2, 5):
            candidate = mantissa * 10**exponent
            if candidate > max_value:
                return buckets
            buckets.append(candidate)
        exponent += 1
|
||||
|
||||
|
||||
@dataclass
class Stats:
    """Created by LLMEngine for use by StatLogger.

    A point-in-time snapshot of system, per-iteration, and per-request
    metrics, taken at wall-clock time ``now``.
    """
    # Wall-clock timestamp (time.time()) at which the snapshot was taken.
    now: float

    # System stats (should have _sys suffix)
    # Scheduler State
    num_running_sys: int
    num_waiting_sys: int
    num_swapped_sys: int
    # KV Cache Usage in %
    gpu_cache_usage_sys: float
    cpu_cache_usage_sys: float

    # Iteration stats (should have _iter suffix)
    num_prompt_tokens_iter: int
    num_generation_tokens_iter: int
    time_to_first_tokens_iter: List[float]
    time_per_output_tokens_iter: List[float]

    # Request stats (should have _requests suffix)
    # Latency
    time_e2e_requests: List[float]
    # Metadata
    num_prompt_tokens_requests: List[int]
    num_generation_tokens_requests: List[int]
    best_of_requests: List[int]
    n_requests: List[int]
    finished_reason_requests: List[str]

    # Populated only when speculative decoding is enabled.
    spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
|
||||
|
||||
|
||||
class SupportsMetricsInfo(Protocol):
    """Structural type for objects that expose metrics metadata.

    Anything with a ``metrics_info()`` returning a string-to-string dict
    (e.g. a config object) satisfies this protocol.
    """

    def metrics_info(self) -> Dict[str, str]:
        ...
|
||||
|
||||
|
||||
class StatLogger:
|
||||
"""StatLogger is used LLMEngine to log to Promethus and Stdout."""
|
||||
|
||||
def __init__(self, local_interval: float, labels: Dict[str, str],
             max_model_len: int) -> None:
    """Set up local-logging bookkeeping and the Prometheus metrics.

    Args:
        local_interval: Minimum seconds between stdout log lines.
        labels: Label name -> value pairs applied to every metric.
        max_model_len: Used to size the token-count histogram buckets.
    """
    # Metadata for logging locally.
    self.last_local_log = time.time()
    self.local_interval = local_interval

    # Tracked stats over current local logging interval.
    self.num_prompt_tokens: List[int] = []
    self.num_generation_tokens: List[int] = []

    # Prometheus metrics
    self.labels = labels
    self.metrics = Metrics(labelnames=list(labels.keys()),
                           max_model_len=max_model_len)
|
||||
|
||||
def info(self, type: str, obj: SupportsMetricsInfo) -> None:
    """Publish static config metadata; only "cache_config" is recognized."""
    if type != "cache_config":
        return
    self.metrics.info_cache_config.info(obj.metrics_info())
|
||||
|
||||
def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
|
||||
return float(np.sum(tracked_stats) / (now - self.last_local_log))
|
||||
|
||||
def _local_interval_elapsed(self, now: float) -> bool:
|
||||
elapsed_time = now - self.last_local_log
|
||||
return elapsed_time > self.local_interval
|
||||
|
||||
def _log_prometheus(self, stats: Stats) -> None:
    """Push every field of ``stats`` into its Prometheus metric."""
    # System state data
    self._log_gauge(self.metrics.gauge_scheduler_running,
                    stats.num_running_sys)
    self._log_gauge(self.metrics.gauge_scheduler_swapped,
                    stats.num_swapped_sys)
    self._log_gauge(self.metrics.gauge_scheduler_waiting,
                    stats.num_waiting_sys)
    self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                    stats.gpu_cache_usage_sys)
    self._log_gauge(self.metrics.gauge_cpu_cache_usage,
                    stats.cpu_cache_usage_sys)

    # Iteration level data
    self._log_counter(self.metrics.counter_prompt_tokens,
                      stats.num_prompt_tokens_iter)
    self._log_counter(self.metrics.counter_generation_tokens,
                      stats.num_generation_tokens_iter)
    self._log_histogram(self.metrics.histogram_time_to_first_token,
                        stats.time_to_first_tokens_iter)
    self._log_histogram(self.metrics.histogram_time_per_output_token,
                        stats.time_per_output_tokens_iter)

    # Request level data
    # Latency
    self._log_histogram(self.metrics.histogram_e2e_time_request,
                        stats.time_e2e_requests)
    # Metadata
    # Tally finish reasons so the success counter can be incremented
    # once per (reason, count) pair rather than per request.
    finished_reason_counter = CollectionsCounter(
        stats.finished_reason_requests)
    self._log_counter_labels(self.metrics.counter_request_success,
                             finished_reason_counter,
                             Metrics.labelname_finish_reason)
    self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
                        stats.num_prompt_tokens_requests)
    self._log_histogram(
        self.metrics.histogram_num_generation_tokens_request,
        stats.num_generation_tokens_requests)
    self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
    self._log_histogram(self.metrics.histogram_best_of_request,
                        stats.best_of_requests)
|
||||
|
||||
def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None:
    """Set ``gauge`` (with the standard label set) to ``data``."""
    labelled = gauge.labels(**self.labels)
    labelled.set(data)
|
||||
|
||||
def _log_counter(self, counter: Counter, data: Union[int, float]) -> None:
    """Increment ``counter`` (with the standard label set) by ``data``."""
    labelled = counter.labels(**self.labels)
    labelled.inc(data)
|
||||
|
||||
def _log_counter_labels(self, counter: Counter, data: CollectionsCounter,
                        label_key: str) -> None:
    """Increment ``counter`` per tallied entry, adding label_key=label."""
    for label, count in data.items():
        label_set = {**self.labels, label_key: label}
        counter.labels(**label_set).inc(count)
|
||||
|
||||
def _log_histogram(self, histogram: Histogram,
                   data: Union[List[int], List[float]]) -> None:
    """Observe every value in ``data`` on ``histogram``."""
    for value in data:
        histogram.labels(**self.labels).observe(value)
|
||||
|
||||
def _log_prometheus_interval(self, prompt_throughput: float,
                             generation_throughput: float) -> None:
    """Log interval-derived throughput gauges to prometheus.

    These legacy gauges carry throughput computed on the vLLM side once per
    logging interval. Moving forward, raw counters (counter_prompt_tokens,
    counter_generation_tokens) should be preferred, with summaries computed
    via rate() on the grafana/prometheus side. See
    https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
    """
    interval_gauges = (
        (self.metrics.gauge_avg_prompt_throughput, prompt_throughput),
        (self.metrics.gauge_avg_generation_throughput,
         generation_throughput),
    )
    for gauge, value in interval_gauges:
        gauge.labels(**self.labels).set(value)
|
||||
|
||||
def log(self, stats: Stats) -> None:
    """Called by LLMEngine.

    Logs to prometheus and tracked stats every iteration.
    Logs to Stdout every self.local_interval seconds.
    """

    # Log to prometheus.
    self._log_prometheus(stats)

    # Save tracked stats for token counters.
    self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
    self.num_generation_tokens.append(stats.num_generation_tokens_iter)

    # Log locally every local_interval seconds.
    if self._local_interval_elapsed(stats.now):
        # Compute summary metrics for tracked stats (and log them
        # to promethus if applicable).
        prompt_throughput = self._get_throughput(self.num_prompt_tokens,
                                                 now=stats.now)
        generation_throughput = self._get_throughput(
            self.num_generation_tokens, now=stats.now)
        self._log_prometheus_interval(
            prompt_throughput=prompt_throughput,
            generation_throughput=generation_throughput)

        # Log to stdout.
        # NOTE: uses lazy %-style args so formatting cost is only paid
        # when INFO-level logging is enabled.
        logger.info(
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
            "Running: %d reqs, Swapped: %d reqs, "
            "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
            "CPU KV cache usage: %.1f%%",
            prompt_throughput,
            generation_throughput,
            stats.num_running_sys,
            stats.num_swapped_sys,
            stats.num_waiting_sys,
            stats.gpu_cache_usage_sys * 100,
            stats.cpu_cache_usage_sys * 100,
        )

        # Reset tracked stats for next interval.
        # NOTE: the reset must happen after the throughput computation
        # above, since _get_throughput consumes the accumulated lists.
        self.num_prompt_tokens = []
        self.num_generation_tokens = []
        self.last_local_log = stats.now

        # Speculative-decode metrics are only present when spec decode
        # is enabled; logged on the same local interval.
        if stats.spec_decode_metrics is not None:
            logger.info(
                self._format_spec_decode_metrics_str(
                    stats.spec_decode_metrics))
|
||||
|
||||
def _format_spec_decode_metrics_str(
|
||||
self, metrics: "SpecDecodeWorkerMetrics") -> str:
|
||||
|
||||
return ("Speculative metrics: "
|
||||
f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
|
||||
f"System efficiency: {metrics.system_efficiency:.3f}, "
|
||||
f"Number of speculative tokens: {metrics.num_spec_tokens}, "
|
||||
f"Number of accepted tokens: {metrics.accepted_tokens}, "
|
||||
f"Number of draft tokens tokens: {metrics.draft_tokens}, "
|
||||
f"Number of emitted tokens tokens: {metrics.emitted_tokens}.")
|
||||
0
vllm/engine/output_processor/__init__.py
Normal file
0
vllm/engine/output_processor/__init__.py
Normal file
76
vllm/engine/output_processor/interfaces.py
Normal file
76
vllm/engine/output_processor/interfaces.py
Normal file
@@ -0,0 +1,76 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, List
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.config import SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
|
||||
class SequenceGroupOutputProcessor(ABC):
    """Interface for logic that processes new token ids in sequence groups,
    managing detokenization, stop checking, and freeing/forking sequences with
    the scheduler.

    This is highly coupled with the LLMEngine and should be seen as an extension
    of it. The logic is separated to simplify the LLMEngine class and allow
    separate implementations for single-step decoding (which supports beam
    search sequence forking) and multi-step decoding (which does not support
    beam search, but does support speculative decoding).
    """

    @staticmethod
    def create_output_processor(
        scheduler_config: SchedulerConfig,
        detokenizer: Detokenizer,
        scheduler: Scheduler,
        seq_counter: Counter,
        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
        stop_checker: "StopChecker",
    ):
        """Create an output processor.

        Returns a multi-step output processor when the scheduler is
        configured with lookahead slots (i.e. speculative decoding), and a
        single-step output processor otherwise.
        """
        multi_step = scheduler_config.num_lookahead_slots != 0
        if multi_step:
            # Imported lazily to avoid an import cycle.
            from vllm.engine.output_processor.multi_step import (
                MultiStepOutputProcessor)
            return MultiStepOutputProcessor(
                detokenizer,
                scheduler,
                seq_counter,
                get_tokenizer_for_seq,
                stop_checker,
            )

        # Imported lazily to avoid an import cycle.
        from vllm.engine.output_processor.single_step import (
            SingleStepOutputProcessor)
        return SingleStepOutputProcessor(
            scheduler_config,
            detokenizer,
            scheduler,
            seq_counter,
            stop_checker,
        )

    @abstractmethod
    def process_outputs(self, sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput]) -> None:
        """Process new token ids for the sequence group. Handles logic such as
        detokenization, stop checking, and freeing/forking sequences in the
        scheduler.
        """

    @abstractmethod
    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        """Update prompt logprobs received from outputs to seq_group."""
|
||||
142
vllm/engine/output_processor/multi_step.py
Normal file
142
vllm/engine/output_processor/multi_step.py
Normal file
@@ -0,0 +1,142 @@
|
||||
import functools
|
||||
from typing import Callable, List
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.interfaces import (
|
||||
SequenceGroupOutputProcessor)
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
|
||||
SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
    """SequenceGroupOutputProcessor which handles logic related to
    detokenization and stopping conditions. It specializes to "multi-step
    decoding", where vLLM's worker may generate multiple tokens per invocation.
    This is currently mutually exclusive with advanced sampling techniques like
    beam search, which motivates the separation of this logic from the single
    step output processor.

    This class is responsible for things such as correctly appending all new
    token ids to their sequence, detokenizing new token ids, truncating new
    output tokens after an eos token, and correctly handling the case where the
    number of new output tokens per sequence differs in a single batch.
    """

    def __init__(
        self,
        detokenizer: Detokenizer,
        scheduler: Scheduler,
        seq_counter: Counter,
        get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
        stop_checker: StopChecker,
    ):
        # Collaborators; all are shared with the owning LLMEngine.
        self.detokenizer = detokenizer
        self.scheduler = scheduler
        self.seq_counter = seq_counter
        self.get_tokenizer_for_seq = get_tokenizer_for_seq
        self.stop_checker = stop_checker

    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        """No-op: prompt logprobs are unsupported here; warn once."""
        # TODO(sang): Prompt logprob currently not implemented in multi step
        # workers.
        self._log_prompt_logprob_unsupported_warning_once()

    @staticmethod
    @functools.lru_cache()
    def _log_prompt_logprob_unsupported_warning_once():
        # lru_cache on a no-arg staticmethod => the warning is emitted at
        # most once per process.
        logger.warning(
            "Prompt logprob is not supported by multi step workers. "
            "(e.g., speculative decode uses multi step workers).")

    def process_outputs(self, sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput]) -> None:
        """Append new tokens in the outputs to sequences in the sequence group.

        This only supports sequence groups of size 1. It supports greater than
        one new token per sequence.

        This applies logic like stop condition checking and detokenization,
        including freeing finished sequences. It also handles cases where there
        are tokens emitted after the EOS token.
        """
        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)

        assert seqs, "expected running sequences"
        assert len(seqs) == 1, (
            "Beam search not supported in multi-step decoding.")
        seq = seqs[0]

        # Since there's only one sequence per sequence group, we can take the
        # first sample.
        samples = [outputs[step].samples[0] for step in range(len(outputs))]

        # -1 means the output token is not valid (eg. due to spec decode
        # rejecting tokens).
        valid_samples = [
            sample for sample in samples if sample.output_token != -1
        ]
        assert valid_samples

        self._process_seq_outputs(seq, valid_samples,
                                  sequence_group.sampling_params)

    def _process_seq_outputs(self, seq: Sequence,
                             valid_samples: List[SequenceOutput],
                             sampling_params: SamplingParams) -> None:
        # Apply the new tokens to `seq` in order: truncate to max_tokens,
        # truncate after EOS, then append one token at a time with
        # detokenization and stop checks after each append.
        output_token_ids = [sample.output_token for sample in valid_samples]
        output_logprobs = [sample.logprobs for sample in valid_samples]

        # Truncate to max_tokens if necessary.
        # remaining_tokens < 0 => slicing with the negative value drops the
        # surplus from the tail of both lists.
        remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
                                                         len(output_token_ids))
        if remaining_tokens < 0:
            valid_samples = valid_samples[:remaining_tokens]
            output_token_ids = output_token_ids[:remaining_tokens]

        # Truncate any tokens after EOS. This is required as spec decode
        # generates a fixed number of tokens without evaluating stopping
        # conditions within the block. This can cause an eos token to be
        # unintentionally ignored.
        if not sampling_params.ignore_eos:
            eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
            # Avoiding .index calls as exception throwing in the happy path
            # is expensive.
            for i in range(len(output_token_ids)):
                if output_token_ids[i] == eos_token_id:
                    # Keep the EOS token itself (:i + 1), drop the rest.
                    output_token_ids = output_token_ids[:i + 1]
                    valid_samples = valid_samples[:i + 1]
                    break

        # Incrementally append tokens to the sequence, as if we had only one new
        # token.
        # NOTE: zip stops at the shorter list, so the (possibly truncated)
        # output_token_ids bounds the iteration.
        for output_token_id, output_logprob in zip(output_token_ids,
                                                   output_logprobs):
            seq.append_token_id(
                token_id=output_token_id,
                logprobs=output_logprob,
            )

            new_char_count = 0
            if sampling_params.detokenize:
                new_char_count = self.detokenizer.decode_sequence_inplace(
                    seq, sampling_params)

            # Stop conditions are re-evaluated after every appended token so
            # stop strings / max len are honored mid-batch.
            self.stop_checker.maybe_stop_sequence(
                seq,
                new_char_count=new_char_count,
                sampling_params=sampling_params)
            if seq.is_finished():
                break

        if seq.is_finished():
            self.scheduler.free_seq(seq)
|
||||
284
vllm/engine/output_processor/single_step.py
Normal file
284
vllm/engine/output_processor/single_step.py
Normal file
@@ -0,0 +1,284 @@
|
||||
from typing import Dict, List, Tuple, Union
|
||||
|
||||
from vllm.config import SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.interfaces import (
|
||||
SequenceGroupOutputProcessor)
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
|
||||
SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
    """SequenceGroupOutputProcessor which handles "output processing" logic,
    which happens after the model returns generated token ids and before
    scheduling of the next batch. Output processing logic includes
    detokenization, and determining if a sequence is finished (e.g. via max len
    or eos token).

    The SingleStepOutputProcessor is specialized to the case where the model
    emits at most a single token per invocation, which precludes configurations
    such as speculative decoding or multi-step decoding. This enables beam
    search sampling, which requires forking/finishing/freeing sequences in a way
    that is currently difficult to schedule multiple steps ahead of time.
    """

    def __init__(
        self,
        scheduler_config: SchedulerConfig,
        detokenizer: Detokenizer,
        scheduler: Scheduler,
        seq_counter: Counter,
        stop_checker: StopChecker,
    ):
        # Collaborators; all are shared with the owning LLMEngine.
        self.scheduler_config = scheduler_config
        self.detokenizer = detokenizer
        self.scheduler = scheduler
        self.seq_counter = seq_counter
        self.stop_checker = stop_checker

    def process_outputs(self, sequence_group: SequenceGroup,
                        outputs: List[SequenceGroupOutput]) -> None:
        """Append all new tokens to sequences in the sequence group. Fork any
        surviving beam candidates; free any unsurviving ones.

        Invokes detokenizer to detokenize new tokens, and also marks sequences
        as finished if they meet stop conditions.
        """
        assert (len(outputs) == 1
                ), f"{type(self)} does not support multiple outputs per step"
        return self._process_sequence_group_outputs(sequence_group, outputs[0])

    def process_prompt_logprob(self, seq_group: SequenceGroup,
                               outputs: List[SequenceGroupOutput]) -> None:
        """Attach detokenized prompt logprobs from `outputs` to `seq_group`."""
        assert len(outputs) == 1, ("Single step should only has 1 output.")
        output = outputs[0]
        prompt_logprobs = output.prompt_logprobs
        if (prompt_logprobs is not None
                and seq_group.sampling_params.detokenize and self.detokenizer):
            self.detokenizer.decode_prompt_logprobs_inplace(
                seq_group, prompt_logprobs)
            if not seq_group.prompt_logprobs:
                # The first prompt token's logprob is None because it doesn't
                # have tokens that are precedent.
                seq_group.prompt_logprobs = [None]
            seq_group.prompt_logprobs.extend(prompt_logprobs)

    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
                                        outputs: SequenceGroupOutput) -> None:
        # Core per-step bookkeeping: append sampled tokens, fork beam
        # candidates, run stop checks, and reconcile the sequence group /
        # scheduler state. Statement order matters throughout (forking must
        # precede freeing).

        # Process samples
        samples = outputs.samples
        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        existing_finished_seqs = seq_group.get_finished_seqs()
        # Bucket samples by the parent sequence that produced them.
        parent_child_dict: Dict[int, List[SequenceOutput]] = {
            parent_seq.seq_id: []
            for parent_seq in parent_seqs
        }
        for sample in samples:
            parent_child_dict[sample.parent_seq_id].append(sample)
        # List of (child, parent)
        child_seqs: List[Tuple[Sequence, Sequence]] = []

        # Process the child samples for each parent sequence
        for parent in parent_seqs:
            child_samples: List[SequenceOutput] = parent_child_dict[
                parent.seq_id]
            if len(child_samples) == 0:
                # This parent sequence has no children samples. Remove
                # the parent sequence from the sequence group since it will
                # not be used in the future iterations.
                parent.status = SequenceStatus.FINISHED_ABORTED
                seq_group.remove(parent.seq_id)
                self.scheduler.free_seq(parent)
                continue
            # Fork the parent sequence if there are multiple child samples.
            for child_sample in child_samples[:-1]:
                new_child_seq_id: int = next(self.seq_counter)
                child = parent.fork(new_child_seq_id)
                child.append_token_id(child_sample.output_token,
                                      child_sample.logprobs)
                child_seqs.append((child, parent))
            # Continue the parent sequence for the last child sample.
            # We reuse the parent sequence here to reduce redundant memory
            # copies, especially when using non-beam search sampling methods.
            last_child_sample = child_samples[-1]
            parent.append_token_id(last_child_sample.output_token,
                                   last_child_sample.logprobs)
            child_seqs.append((parent, parent))

        # Detokenize (if enabled) and run stop checks on every candidate.
        for seq, _ in child_seqs:
            if seq_group.sampling_params.detokenize and self.detokenizer:
                new_char_count = self.detokenizer.decode_sequence_inplace(
                    seq, seq_group.sampling_params)
            else:
                new_char_count = 0
            self.stop_checker.maybe_stop_sequence(seq, new_char_count,
                                                  seq_group.sampling_params)

        # Non-beam search case
        if not seq_group.sampling_params.use_beam_search:
            # For newly created child sequences, add them to the sequence group
            # and fork them in block manager if they are not finished.
            for seq, parent in child_seqs:
                if seq is not parent:
                    seq_group.add(seq)
                    if not seq.is_finished():
                        self.scheduler.fork_seq(parent, seq)

            # Free the finished and selected parent sequences' memory in block
            # manager. Keep them in the sequence group as candidate output.
            # NOTE: we need to fork the new sequences before freeing the
            # old sequences.
            for seq, parent in child_seqs:
                if seq is parent and seq.is_finished():
                    self.scheduler.free_seq(seq)
            return

        # Beam search case
        # Select the child sequences to keep in the sequence group.
        selected_child_seqs = []
        unselected_child_seqs = []
        beam_width = seq_group.sampling_params.best_of
        length_penalty = seq_group.sampling_params.length_penalty

        # Select the newly finished sequences with the highest scores
        # to replace existing finished sequences.
        # Tuple of (seq, parent, is_new)
        existing_finished_seqs = [(seq, None, False)
                                  for seq in existing_finished_seqs]
        new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
                             if seq.is_finished()]
        all_finished_seqs = existing_finished_seqs + new_finished_seqs
        # Sort the finished sequences by their scores.
        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
                               reverse=True)
        for seq, parent, is_new in all_finished_seqs[:beam_width]:
            if is_new:
                # A newly generated child sequence finishes and has a high
                # score, so we will add it into the sequence group.
                selected_child_seqs.append((seq, parent))
        for seq, parent, is_new in all_finished_seqs[beam_width:]:
            if is_new:
                # A newly generated child sequence finishes but has a low
                # score, so we will not add it into the sequence group.
                # Additionally, if this sequence is a continuation of a
                # parent sequence, we will need remove the parent sequence
                # from the sequence group.
                unselected_child_seqs.append((seq, parent))
            else:
                # An existing finished sequence has a low score, so we will
                # remove it from the sequence group.
                seq_group.remove(seq.seq_id)

        # select the top beam_width sequences from the running
        # sequences for the next iteration to continue the beam
        # search.
        running_child_seqs = [(seq, parent) for seq, parent in child_seqs
                              if not seq.is_finished()]
        # Sort the running sequences by their scores.
        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
                                reverse=True)

        # Check if we can stop the beam search.
        if len(running_child_seqs) == 0:
            # No running sequences, stop the beam search.
            stop_beam_search = True
        elif len(all_finished_seqs) < beam_width:
            # Not enough finished sequences, continue the beam search.
            stop_beam_search = False
        else:
            # Check the early stopping criteria
            best_running_seq = running_child_seqs[0][0]
            current_worst_seq = all_finished_seqs[beam_width - 1][0]
            stop_beam_search = self._check_beam_search_early_stopping(
                seq_group.sampling_params.early_stopping,
                seq_group.sampling_params, best_running_seq, current_worst_seq)

        if stop_beam_search:
            # Stop the beam search and remove all the running sequences from
            # the sequence group.
            unselected_child_seqs.extend(running_child_seqs)
        else:
            # Continue the beam search and select the top beam_width sequences
            # to continue the beam search.
            selected_child_seqs.extend(running_child_seqs[:beam_width])
            # The remaining running sequences will not be used in the next
            # iteration. Again, if these sequences are continuations of
            # parent sequences, we will need to remove the parent sequences
            # from the sequence group.
            unselected_child_seqs.extend(running_child_seqs[beam_width:])

        # For newly created child sequences, add them to the sequence group
        # and fork them in block manager if they are not finished.
        for seq, parent in selected_child_seqs:
            if seq is not parent:
                seq_group.add(seq)
                if not seq.is_finished():
                    self.scheduler.fork_seq(parent, seq)

        # Free the finished and selected parent sequences' memory in block
        # manager. Keep them in the sequence group as candidate output.
        for seq, parent in selected_child_seqs:
            if seq is parent and seq.is_finished():
                self.scheduler.free_seq(seq)

        # Remove the unselected parent sequences from the sequence group and
        # free their memory in block manager.
        for seq, parent in unselected_child_seqs:
            if seq is parent:
                # Remove the parent sequence if it is not selected for next
                # iteration
                seq_group.remove(seq.seq_id)
                self.scheduler.free_seq(seq)

    def _check_beam_search_early_stopping(
        self,
        early_stopping: Union[bool, str],
        sampling_params: SamplingParams,
        best_running_seq: Sequence,
        current_worst_seq: Sequence,
    ) -> bool:
        # Decide whether beam search can stop now: returns True when no
        # running candidate can beat the current worst kept finished
        # sequence. early_stopping may be True, False, or "never"
        # (mirroring HF transformers' semantics).
        assert sampling_params.use_beam_search
        length_penalty = sampling_params.length_penalty
        if early_stopping is True:
            return True

        current_worst_score = current_worst_seq.get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=current_worst_seq.eos_token_id)
        if early_stopping is False:
            # Heuristic: score the best running sequence at its current
            # length.
            highest_attainable_score = best_running_seq.get_beam_search_score(
                length_penalty=length_penalty,
                eos_token_id=best_running_seq.eos_token_id)
        else:
            assert early_stopping == "never"
            if length_penalty > 0.0:
                # If length_penalty > 0.0, beam search will prefer longer
                # sequences. The highest attainable score calculation is
                # based on the longest possible sequence length in this case.
                max_possible_length = max(
                    best_running_seq.get_prompt_len() +
                    sampling_params.max_tokens,
                    self.scheduler_config.max_model_len)
                highest_attainable_score = (
                    best_running_seq.get_beam_search_score(
                        length_penalty=length_penalty,
                        eos_token_id=best_running_seq.eos_token_id,
                        seq_len=max_possible_length))
            else:
                # Otherwise, beam search will prefer shorter sequences. The
                # highest attainable score calculation is based on the current
                # sequence length.
                highest_attainable_score = (
                    best_running_seq.get_beam_search_score(
                        length_penalty=length_penalty,
                        eos_token_id=best_running_seq.eos_token_id))
        return current_worst_score >= highest_attainable_score
|
||||
101
vllm/engine/output_processor/stop_checker.py
Normal file
101
vllm/engine/output_processor/stop_checker.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from typing import Callable, Optional
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import Sequence, SequenceStatus
|
||||
|
||||
|
||||
class StopChecker:
    """LLMEngine helper class which separates out the logic involving stop
    checking. This checks things such as: whether the eos token was emitted,
    whether the max_tokens has been consumed, whether a stop string has been
    emitted, or if we have exceeded the max model len.
    """

    def __init__(self, max_model_len: int,
                 get_tokenizer_for_seq: Callable[[Sequence],
                                                 PreTrainedTokenizer]):
        # Hard cap on total sequence length (prompt + output).
        self.max_model_len = max_model_len
        # NOTE: stored but not used in this class's visible methods;
        # presumably kept for callers/subclasses — confirm before removing.
        self.get_tokenizer_for_seq = get_tokenizer_for_seq

    def maybe_stop_sequence(self, seq: Sequence, new_char_count: int,
                            sampling_params: SamplingParams) -> None:
        """Stop the finished sequences.

        new_char_count is the number of chars added to the
        sequence's output text for the newly generated token

        Checks are ordered: min_tokens gate, EOS, stop token ids, stop
        strings, max_model_len, max_tokens. The first match sets
        seq.status (and possibly seq.stop_reason) and returns.
        """

        # Check if the minimum number of tokens has been generated yet;
        # skip the stop string/token checks if not
        if seq.get_output_len() < sampling_params.min_tokens:
            return

        # Check if the sequence has generated the EOS token.
        if ((not sampling_params.ignore_eos)
                and seq.get_last_token_id() == seq.eos_token_id):
            seq.status = SequenceStatus.FINISHED_STOPPED
            return

        # Check if a stop token was encountered.
        # This assumes a single token produced per step.
        last_token_id = seq.get_last_token_id()
        if last_token_id in sampling_params.stop_token_ids:
            if new_char_count and (
                    not sampling_params.include_stop_str_in_output):
                # Remove last token
                seq.output_text = seq.output_text[:-new_char_count]
            seq.status = SequenceStatus.FINISHED_STOPPED
            seq.stop_reason = last_token_id
            return

        # Check if any stop strings are matched.
        stop_str = self._check_stop_strings(seq, new_char_count,
                                            sampling_params)
        if stop_str is not None:
            seq.status = SequenceStatus.FINISHED_STOPPED
            seq.stop_reason = stop_str
            return

        # Check if the sequence has reached max_model_len.
        if seq.get_len() > self.max_model_len:
            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
            return

        # Check if the sequence has reached max_tokens.
        if seq.get_output_len() == sampling_params.max_tokens:
            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
            return

    @staticmethod
    def _check_stop_strings(seq: Sequence, new_char_count: int,
                            sampling_params: SamplingParams) -> Optional[str]:
        """Check if any stop strings are matched and truncate sequence
        output text accordingly.

        Returns the stop string if matched or else None.
        """
        if not new_char_count:
            return None

        for stop_str in sampling_params.stop:
            stop_string_len = len(stop_str)
            # Avoid searching already-searched text.
            # The negative start offset limits str.find to the window where
            # a match ending in the newly added chars could begin.
            stop_index = seq.output_text.find(
                stop_str, -new_char_count - stop_string_len)
            if stop_index == -1:
                continue

            if sampling_params.include_stop_str_in_output:
                # Truncate to end of stop string.
                stop_index += stop_string_len
                if stop_index >= len(seq.output_text):
                    # No truncation required.
                    return stop_str

            # Truncate the output text to either the beginning
            # or end of the stop string.
            seq.output_text = seq.output_text[:stop_index]
            return stop_str
        return None
|
||||
19
vllm/engine/output_processor/util.py
Normal file
19
vllm/engine/output_processor/util.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from typing import List
|
||||
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupOutput
|
||||
|
||||
|
||||
def create_output_by_sequence_group(
        sampler_outputs: List[SamplerOutput],
        num_seq_groups: int) -> List[List[SequenceGroupOutput]]:
    """Helper method which transforms a 2d list organized by
    [step][sequence group] into [sequence group][step].

    Args:
        sampler_outputs: Per-step sampler outputs; each step is iterable and
            yields one SequenceGroupOutput per sequence group.
        num_seq_groups: Number of sequence groups per step.

    Returns:
        A list with one entry per sequence group, each holding that group's
        outputs across all steps, in step order.
    """
    # Fix: the local annotation previously claimed List[List[SamplerOutput]],
    # contradicting the declared return type — the collected elements are
    # SequenceGroupOutputs.
    # A comprehension (rather than `[[]] * n`) ensures the inner lists are
    # distinct objects.
    output_by_sequence_group: List[List[SequenceGroupOutput]] = [
        [] for _ in range(num_seq_groups)
    ]
    for step in sampler_outputs:
        for i, sequence_group_output in enumerate(step):
            output_by_sequence_group[i].append(sequence_group_output)

    return output_by_sequence_group
|
||||
Reference in New Issue
Block a user