Files
2026-03-10 13:31:25 +08:00

257 lines
12 KiB
Python

################################################################################
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
import os
import torch
import vllm
import vllm.envs as envs
from vllm.config import VllmConfig, logger
from vllm.config.compilation import CompilationLevel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils import random_uuid
from .compilation import SUPAGraphMode
def supa_post_init(self):
"""Verify configs are valid & consistent with each other.
"""
self.try_verify_and_update_config()
if self.model_config is not None:
self.model_config.verify_with_parallel_config(self.parallel_config)
self.model_config.verify_dual_chunk_attention_config(self.load_config)
self.cache_config.verify_with_parallel_config(self.parallel_config)
if self.lora_config is not None:
self.lora_config.verify_with_cache_config(self.cache_config)
self.lora_config.verify_with_model_config(self.model_config)
if self.quant_config is None and self.model_config is not None:
self.quant_config = VllmConfig._get_quantization_config(
self.model_config, self.load_config)
from vllm.platforms import current_platform
if self.model_config is not None and \
self.scheduler_config.chunked_prefill_enabled and \
self.model_config.dtype == torch.float32 and \
current_platform.get_device_capability() == (7, 5):
logger.warning_once(
"Turing devices tensor cores do not support float32 matmul. "
"To workaround this limitation, vLLM will set 'ieee' input "
"precision for chunked prefill triton kernels.")
# If the user does not explicitly set a compilation level, then
# we use the default level. The default level depends on other
# settings (see the below code).
if self.compilation_config.level is None:
if envs.VLLM_USE_V1:
if (self.model_config is not None
and not self.model_config.enforce_eager):
self.compilation_config.level = CompilationLevel.PIECEWISE
else:
self.compilation_config.level = \
CompilationLevel.NO_COMPILATION
else:
# NB: Passing both --enforce-eager and a compilation level
# in V0 means the compilation level wins out.
self.compilation_config.level = CompilationLevel.NO_COMPILATION
# async tp is built on top of sequence parallelism
# and requires it to be enabled.
if self.compilation_config.pass_config.enable_async_tp:
self.compilation_config.pass_config.enable_sequence_parallelism = \
True
if self.compilation_config.pass_config.enable_sequence_parallelism:
self.compilation_config.custom_ops.append("+rms_norm")
if current_platform.support_static_graph_mode():
# if cudagraph_mode is not explicitly set by users, set default
# value
if self.compilation_config.cudagraph_mode is None:
if envs.VLLM_USE_V1 and self.compilation_config.level \
== CompilationLevel.PIECEWISE:
# default to full and piecewise for most models
self.compilation_config.cudagraph_mode = \
SUPAGraphMode.FULL_AND_PIECEWISE
# pooling models and encoder-decoder models
# do not support full cudagraphs
if self.model_config is not None and \
(self.model_config.pooler_config is not None
or self.model_config.is_encoder_decoder):
self.compilation_config.cudagraph_mode = \
SUPAGraphMode.PIECEWISE
else:
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
# disable cudagraph when enforce eager execution
if self.model_config is not None and \
self.model_config.enforce_eager:
logger.info("Cudagraph is disabled under eager mode")
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
elif envs.VLLM_USE_V1:
self.compilation_config.cudagraph_num_of_warmups = 1
self._set_cudagraph_sizes()
else:
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
if self.cache_config.kv_sharing_fast_prefill:
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not "
"compatible with EAGLE as EAGLE requires correct logits "
"for all tokens while fast prefill gives incorrect logits "
"for prompt tokens.")
logger.warning_once(
"--kv-sharing-fast-prefill requires changes on model side for "
"correctness and to realize prefill savings. ")
disable_chunked_prefill_reasons: list[str] = []
if self.model_config:
if self.model_config.pooler_config:
pooling_type = self.model_config.pooler_config.pooling_type
if pooling_type is None or pooling_type.lower() != "last":
disable_chunked_prefill_reasons.append(
"Only \"last\" pooling supports chunked "
"prefill and prefix caching; disabling both.")
if not getattr(self.model_config.hf_config, "is_causal", True):
disable_chunked_prefill_reasons.append(
"Only models using causal attention supports chunked "
"prefill and prefix caching; disabling both.")
elif self.model_config.is_encoder_decoder:
self.scheduler_config.max_num_encoder_input_tokens = \
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
logger.debug(
"Encoder-decoder model detected: setting "
"`max_num_encoder_input_tokens` to encoder length (%s)",
self.scheduler_config.max_num_encoder_input_tokens)
self.scheduler_config.disable_chunked_mm_input = True
disable_chunked_prefill_reasons.append(
"Encoder-decoder models do not support chunked prefill nor"
" prefix caching; disabling both.")
if (self.model_config.architecture
== "WhisperForConditionalGeneration" and
os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
logger.warning("Whisper is known to have issues with "
"forked workers. If startup is hanging, "
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
"to 'spawn'.")
if disable_chunked_prefill_reasons:
for reason in disable_chunked_prefill_reasons:
logger.info(reason)
self.scheduler_config.chunked_prefill_enabled = False
self.scheduler_config.long_prefill_token_threshold = 0
if self.cache_config is not None:
self.cache_config.enable_prefix_caching = False
if (self.kv_events_config is not None
and self.kv_events_config.enable_kv_cache_events
and not self.cache_config.enable_prefix_caching):
logger.warning(
"KV cache events are on, but prefix caching is not enabled."
"Use --enable-prefix-caching to enable.")
if (self.kv_events_config is not None
and self.kv_events_config.publisher != "null"
and not self.kv_events_config.enable_kv_cache_events):
logger.warning("KV cache events are disabled,"
"but the scheduler is configured to publish them."
"Modify KVEventsConfig.enable_kv_cache_events"
"to True to enable.")
current_platform.check_and_update_config(self)
# final check of cudagraph mode after platform-specific update
if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
if self.compilation_config.cudagraph_mode == SUPAGraphMode.FULL \
and self.model_config is not None and \
not self.model_config.disable_cascade_attn:
logger.info("SUPAGraphMode.FULL is not supported with "
"cascade attention currently. Disabling cascade"
"attention.")
self.model_config.disable_cascade_attn = True
if self.compilation_config.cudagraph_mode\
.requires_piecewise_compilation():
assert self.compilation_config.level == \
CompilationLevel.PIECEWISE, \
"Compilation level should be CompilationLevel.PIECEWISE "\
"when cudagraph_mode piecewise cudagraphs is used, "\
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
if self.parallel_config.enable_dbo:
a2a_backend = envs.VLLM_ALL2ALL_BACKEND
assert a2a_backend in \
["deepep_low_latency", "deepep_high_throughput"], \
"Microbatching currently only supports the deepep_low_latency and "\
f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
"supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
"variable to deepep_low_latency or deepep_high_throughput and "\
"install the DeepEP kernels."
if not self.instance_id:
self.instance_id = random_uuid()[:5]
# Do this after all the updates to compilation_config.level
if envs.VLLM_USE_V1 and \
self.compilation_config.level == CompilationLevel.PIECEWISE:
self.compilation_config.set_splitting_ops_for_v1()
if (envs.VLLM_USE_V1
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
# logger should only print warning message for hybrid models. As we
# can't know whether the model is hybrid or not now, so we don't log
# warning message here and will log it later.
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_transfer_config is not None:
# Hybrid KV cache manager is not compatible with KV transfer.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.model_config is not None and \
self.model_config.attention_chunk_size is not None:
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
# Hybrid KV cache manager is not yet supported with chunked
# local attention + eagle.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
elif \
not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
logger.warning(
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1.")
# Hybrid KV cache manager is not yet supported with chunked
# local attention.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
vllm.config.VllmConfig.__post_init__ = supa_post_init