################################################################################
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
import os

import torch

import vllm
import vllm.envs as envs
from vllm.config import VllmConfig, logger
from vllm.config.compilation import CompilationLevel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils import random_uuid

from .compilation import SUPAGraphMode

def supa_post_init(self):
|
|
"""Verify configs are valid & consistent with each other.
|
|
"""
|
|
|
|
self.try_verify_and_update_config()
|
|
|
|
if self.model_config is not None:
|
|
self.model_config.verify_with_parallel_config(self.parallel_config)
|
|
self.model_config.verify_dual_chunk_attention_config(self.load_config)
|
|
|
|
self.cache_config.verify_with_parallel_config(self.parallel_config)
|
|
|
|
if self.lora_config is not None:
|
|
self.lora_config.verify_with_cache_config(self.cache_config)
|
|
self.lora_config.verify_with_model_config(self.model_config)
|
|
|
|
if self.quant_config is None and self.model_config is not None:
|
|
self.quant_config = VllmConfig._get_quantization_config(
|
|
self.model_config, self.load_config)
|
|
|
|
from vllm.platforms import current_platform
|
|
if self.model_config is not None and \
|
|
self.scheduler_config.chunked_prefill_enabled and \
|
|
self.model_config.dtype == torch.float32 and \
|
|
current_platform.get_device_capability() == (7, 5):
|
|
logger.warning_once(
|
|
"Turing devices tensor cores do not support float32 matmul. "
|
|
"To workaround this limitation, vLLM will set 'ieee' input "
|
|
"precision for chunked prefill triton kernels.")
|
|
|
|
# If the user does not explicitly set a compilation level, then
|
|
# we use the default level. The default level depends on other
|
|
# settings (see the below code).
|
|
if self.compilation_config.level is None:
|
|
if envs.VLLM_USE_V1:
|
|
if (self.model_config is not None
|
|
and not self.model_config.enforce_eager):
|
|
self.compilation_config.level = CompilationLevel.PIECEWISE
|
|
else:
|
|
self.compilation_config.level = \
|
|
CompilationLevel.NO_COMPILATION
|
|
|
|
else:
|
|
# NB: Passing both --enforce-eager and a compilation level
|
|
# in V0 means the compilation level wins out.
|
|
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
|
|
|
# async tp is built on top of sequence parallelism
|
|
# and requires it to be enabled.
|
|
if self.compilation_config.pass_config.enable_async_tp:
|
|
self.compilation_config.pass_config.enable_sequence_parallelism = \
|
|
True
|
|
if self.compilation_config.pass_config.enable_sequence_parallelism:
|
|
self.compilation_config.custom_ops.append("+rms_norm")
|
|
|
|
if current_platform.support_static_graph_mode():
|
|
# if cudagraph_mode is not explicitly set by users, set default
|
|
# value
|
|
if self.compilation_config.cudagraph_mode is None:
|
|
if envs.VLLM_USE_V1 and self.compilation_config.level \
|
|
== CompilationLevel.PIECEWISE:
|
|
# default to full and piecewise for most models
|
|
self.compilation_config.cudagraph_mode = \
|
|
SUPAGraphMode.FULL_AND_PIECEWISE
|
|
|
|
# pooling models and encoder-decoder models
|
|
# do not support full cudagraphs
|
|
if self.model_config is not None and \
|
|
(self.model_config.pooler_config is not None
|
|
or self.model_config.is_encoder_decoder):
|
|
self.compilation_config.cudagraph_mode = \
|
|
SUPAGraphMode.PIECEWISE
|
|
else:
|
|
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
|
|
|
|
# disable cudagraph when enforce eager execution
|
|
if self.model_config is not None and \
|
|
self.model_config.enforce_eager:
|
|
logger.info("Cudagraph is disabled under eager mode")
|
|
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
|
|
elif envs.VLLM_USE_V1:
|
|
self.compilation_config.cudagraph_num_of_warmups = 1
|
|
|
|
self._set_cudagraph_sizes()
|
|
else:
|
|
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
|
|
|
|
if self.cache_config.kv_sharing_fast_prefill:
|
|
|
|
if self.speculative_config is not None and \
|
|
self.speculative_config.use_eagle():
|
|
raise NotImplementedError(
|
|
"Fast prefill optimization for KV sharing is not "
|
|
"compatible with EAGLE as EAGLE requires correct logits "
|
|
"for all tokens while fast prefill gives incorrect logits "
|
|
"for prompt tokens.")
|
|
|
|
logger.warning_once(
|
|
"--kv-sharing-fast-prefill requires changes on model side for "
|
|
"correctness and to realize prefill savings. ")
|
|
|
|
disable_chunked_prefill_reasons: list[str] = []
|
|
|
|
if self.model_config:
|
|
if self.model_config.pooler_config:
|
|
pooling_type = self.model_config.pooler_config.pooling_type
|
|
if pooling_type is None or pooling_type.lower() != "last":
|
|
disable_chunked_prefill_reasons.append(
|
|
"Only \"last\" pooling supports chunked "
|
|
"prefill and prefix caching; disabling both.")
|
|
if not getattr(self.model_config.hf_config, "is_causal", True):
|
|
disable_chunked_prefill_reasons.append(
|
|
"Only models using causal attention supports chunked "
|
|
"prefill and prefix caching; disabling both.")
|
|
elif self.model_config.is_encoder_decoder:
|
|
self.scheduler_config.max_num_encoder_input_tokens = \
|
|
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
|
logger.debug(
|
|
"Encoder-decoder model detected: setting "
|
|
"`max_num_encoder_input_tokens` to encoder length (%s)",
|
|
self.scheduler_config.max_num_encoder_input_tokens)
|
|
self.scheduler_config.disable_chunked_mm_input = True
|
|
disable_chunked_prefill_reasons.append(
|
|
"Encoder-decoder models do not support chunked prefill nor"
|
|
" prefix caching; disabling both.")
|
|
if (self.model_config.architecture
|
|
== "WhisperForConditionalGeneration" and
|
|
os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
|
|
logger.warning("Whisper is known to have issues with "
|
|
"forked workers. If startup is hanging, "
|
|
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
|
|
"to 'spawn'.")
|
|
|
|
if disable_chunked_prefill_reasons:
|
|
for reason in disable_chunked_prefill_reasons:
|
|
logger.info(reason)
|
|
self.scheduler_config.chunked_prefill_enabled = False
|
|
self.scheduler_config.long_prefill_token_threshold = 0
|
|
|
|
if self.cache_config is not None:
|
|
self.cache_config.enable_prefix_caching = False
|
|
|
|
if (self.kv_events_config is not None
|
|
and self.kv_events_config.enable_kv_cache_events
|
|
and not self.cache_config.enable_prefix_caching):
|
|
logger.warning(
|
|
"KV cache events are on, but prefix caching is not enabled."
|
|
"Use --enable-prefix-caching to enable.")
|
|
if (self.kv_events_config is not None
|
|
and self.kv_events_config.publisher != "null"
|
|
and not self.kv_events_config.enable_kv_cache_events):
|
|
logger.warning("KV cache events are disabled,"
|
|
"but the scheduler is configured to publish them."
|
|
"Modify KVEventsConfig.enable_kv_cache_events"
|
|
"to True to enable.")
|
|
current_platform.check_and_update_config(self)
|
|
|
|
# final check of cudagraph mode after platform-specific update
|
|
if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
|
|
if self.compilation_config.cudagraph_mode == SUPAGraphMode.FULL \
|
|
and self.model_config is not None and \
|
|
not self.model_config.disable_cascade_attn:
|
|
logger.info("SUPAGraphMode.FULL is not supported with "
|
|
"cascade attention currently. Disabling cascade"
|
|
"attention.")
|
|
self.model_config.disable_cascade_attn = True
|
|
|
|
if self.compilation_config.cudagraph_mode\
|
|
.requires_piecewise_compilation():
|
|
assert self.compilation_config.level == \
|
|
CompilationLevel.PIECEWISE, \
|
|
"Compilation level should be CompilationLevel.PIECEWISE "\
|
|
"when cudagraph_mode piecewise cudagraphs is used, "\
|
|
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
|
|
|
|
if self.parallel_config.enable_dbo:
|
|
a2a_backend = envs.VLLM_ALL2ALL_BACKEND
|
|
assert a2a_backend in \
|
|
["deepep_low_latency", "deepep_high_throughput"], \
|
|
"Microbatching currently only supports the deepep_low_latency and "\
|
|
f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
|
|
"supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
|
|
"variable to deepep_low_latency or deepep_high_throughput and "\
|
|
"install the DeepEP kernels."
|
|
|
|
if not self.instance_id:
|
|
self.instance_id = random_uuid()[:5]
|
|
|
|
# Do this after all the updates to compilation_config.level
|
|
if envs.VLLM_USE_V1 and \
|
|
self.compilation_config.level == CompilationLevel.PIECEWISE:
|
|
self.compilation_config.set_splitting_ops_for_v1()
|
|
|
|
if (envs.VLLM_USE_V1
|
|
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
|
|
# logger should only print warning message for hybrid models. As we
|
|
# can't know whether the model is hybrid or not now, so we don't log
|
|
# warning message here and will log it later.
|
|
if not current_platform.support_hybrid_kv_cache():
|
|
# Hybrid KV cache manager is not supported on non-GPU platforms.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
if self.kv_transfer_config is not None:
|
|
# Hybrid KV cache manager is not compatible with KV transfer.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
if self.kv_events_config is not None:
|
|
# Hybrid KV cache manager is not compatible with KV events.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
if self.model_config is not None and \
|
|
self.model_config.attention_chunk_size is not None:
|
|
if self.speculative_config is not None and \
|
|
self.speculative_config.use_eagle():
|
|
# Hybrid KV cache manager is not yet supported with chunked
|
|
# local attention + eagle.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
elif \
|
|
not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
|
|
logger.warning(
|
|
"There is a latency regression when using chunked local"
|
|
" attention with the hybrid KV cache manager. Disabling"
|
|
" it, by default. To enable it, set the environment "
|
|
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1.")
|
|
# Hybrid KV cache manager is not yet supported with chunked
|
|
# local attention.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
|
|
|
|
vllm.config.VllmConfig.__post_init__ = supa_post_init
|