################################################################################
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import os

import torch

import vllm
import vllm.envs as envs
from vllm.config import VllmConfig, logger
from vllm.config.compilation import CompilationLevel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils import random_uuid

from .compilation import SUPAGraphMode


def supa_post_init(self):
    """Verify configs are valid & consistent with each other.

    SUPA-platform replacement for ``VllmConfig.__post_init__``: it
    cross-checks the sub-configs, fills in derived defaults (compilation
    level, cudagraph mode, chunked-prefill / prefix-caching switches,
    hybrid KV cache manager), and lets the current platform apply its own
    updates. It mirrors upstream vLLM's ``VllmConfig.__post_init__`` but
    uses ``SUPAGraphMode`` in place of ``CUDAGraphMode``.

    Side effects: mutates ``self`` (a ``VllmConfig``) in place.
    Raises ``NotImplementedError`` when KV-sharing fast prefill is
    combined with EAGLE speculative decoding, and ``AssertionError`` on
    invalid compilation-level / all2all-backend combinations.
    """
    self.try_verify_and_update_config()

    if self.model_config is not None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.model_config.verify_dual_chunk_attention_config(self.load_config)

    self.cache_config.verify_with_parallel_config(self.parallel_config)

    if self.lora_config is not None:
        self.lora_config.verify_with_cache_config(self.cache_config)
        self.lora_config.verify_with_model_config(self.model_config)

    if self.quant_config is None and self.model_config is not None:
        self.quant_config = VllmConfig._get_quantization_config(
            self.model_config, self.load_config)

    # Imported here (not at module top) so the active platform is resolved
    # only once config construction is underway.
    from vllm.platforms import current_platform
    if self.model_config is not None and \
            self.scheduler_config.chunked_prefill_enabled and \
            self.model_config.dtype == torch.float32 and \
            current_platform.get_device_capability() == (7, 5):
        logger.warning_once(
            "Turing devices tensor cores do not support float32 matmul. "
            "To workaround this limitation, vLLM will set 'ieee' input "
            "precision for chunked prefill triton kernels.")

    # If the user does not explicitly set a compilation level, then
    # we use the default level. The default level depends on other
    # settings (see the below code).
    if self.compilation_config.level is None:
        if envs.VLLM_USE_V1:
            if (self.model_config is not None
                    and not self.model_config.enforce_eager):
                self.compilation_config.level = CompilationLevel.PIECEWISE
            else:
                self.compilation_config.level = \
                    CompilationLevel.NO_COMPILATION
        else:
            # NB: Passing both --enforce-eager and a compilation level
            # in V0 means the compilation level wins out.
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

    # async tp is built on top of sequence parallelism
    # and requires it to be enabled.
    if self.compilation_config.pass_config.enable_async_tp:
        self.compilation_config.pass_config.enable_sequence_parallelism = \
            True
    if self.compilation_config.pass_config.enable_sequence_parallelism:
        self.compilation_config.custom_ops.append("+rms_norm")

    if current_platform.support_static_graph_mode():
        # if cudagraph_mode is not explicitly set by users, set default
        # value
        if self.compilation_config.cudagraph_mode is None:
            if envs.VLLM_USE_V1 and self.compilation_config.level \
                    == CompilationLevel.PIECEWISE:
                # default to full and piecewise for most models
                self.compilation_config.cudagraph_mode = \
                    SUPAGraphMode.FULL_AND_PIECEWISE

                # pooling models and encoder-decoder models
                # do not support full cudagraphs
                if self.model_config is not None and \
                        (self.model_config.pooler_config is not None
                         or self.model_config.is_encoder_decoder):
                    self.compilation_config.cudagraph_mode = \
                        SUPAGraphMode.PIECEWISE
            else:
                self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE

        # disable cudagraph when enforce eager execution
        if self.model_config is not None and \
                self.model_config.enforce_eager:
            logger.info("Cudagraph is disabled under eager mode")
            self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
        elif envs.VLLM_USE_V1:
            self.compilation_config.cudagraph_num_of_warmups = 1

        self._set_cudagraph_sizes()
    else:
        self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE

    if self.cache_config.kv_sharing_fast_prefill:
        if self.speculative_config is not None and \
                self.speculative_config.use_eagle():
            raise NotImplementedError(
                "Fast prefill optimization for KV sharing is not "
                "compatible with EAGLE as EAGLE requires correct logits "
                "for all tokens while fast prefill gives incorrect logits "
                "for prompt tokens.")

        logger.warning_once(
            "--kv-sharing-fast-prefill requires changes on model side for "
            "correctness and to realize prefill savings. ")

    # Collect every reason to turn off chunked prefill + prefix caching,
    # then apply them all at once below.
    disable_chunked_prefill_reasons: list[str] = []

    if self.model_config:
        if self.model_config.pooler_config:
            pooling_type = self.model_config.pooler_config.pooling_type
            if pooling_type is None or pooling_type.lower() != "last":
                disable_chunked_prefill_reasons.append(
                    "Only \"last\" pooling supports chunked "
                    "prefill and prefix caching; disabling both.")
            if not getattr(self.model_config.hf_config, "is_causal", True):
                disable_chunked_prefill_reasons.append(
                    "Only models using causal attention supports chunked "
                    "prefill and prefix caching; disabling both.")
        elif self.model_config.is_encoder_decoder:
            self.scheduler_config.max_num_encoder_input_tokens = \
                MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(
                    self.model_config)
            logger.debug(
                "Encoder-decoder model detected: setting "
                "`max_num_encoder_input_tokens` to encoder length (%s)",
                self.scheduler_config.max_num_encoder_input_tokens)
            self.scheduler_config.disable_chunked_mm_input = True
            disable_chunked_prefill_reasons.append(
                "Encoder-decoder models do not support chunked prefill nor"
                " prefix caching; disabling both.")
            if (self.model_config.architecture
                    == "WhisperForConditionalGeneration"
                    and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
                    != "spawn"):
                logger.warning(
                    "Whisper is known to have issues with "
                    "forked workers. If startup is hanging, "
                    "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                    "to 'spawn'.")

    if disable_chunked_prefill_reasons:
        for reason in disable_chunked_prefill_reasons:
            logger.info(reason)
        self.scheduler_config.chunked_prefill_enabled = False
        self.scheduler_config.long_prefill_token_threshold = 0

        if self.cache_config is not None:
            self.cache_config.enable_prefix_caching = False

    # FIX: the concatenated literals below were missing separating spaces
    # (the messages previously rendered as "...enabled.Use..." and
    # "...disabled,but...them.Modify...eventsto True...").
    if (self.kv_events_config is not None
            and self.kv_events_config.enable_kv_cache_events
            and not self.cache_config.enable_prefix_caching):
        logger.warning(
            "KV cache events are on, but prefix caching is not enabled. "
            "Use --enable-prefix-caching to enable.")
    if (self.kv_events_config is not None
            and self.kv_events_config.publisher != "null"
            and not self.kv_events_config.enable_kv_cache_events):
        logger.warning(
            "KV cache events are disabled, "
            "but the scheduler is configured to publish them. "
            "Modify KVEventsConfig.enable_kv_cache_events "
            "to True to enable.")

    current_platform.check_and_update_config(self)

    # final check of cudagraph mode after platform-specific update
    if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
        # FIX: message previously rendered "Disabling cascadeattention."
        # due to a missing space between adjacent string literals.
        if self.compilation_config.cudagraph_mode == SUPAGraphMode.FULL \
                and self.model_config is not None and \
                not self.model_config.disable_cascade_attn:
            logger.info("SUPAGraphMode.FULL is not supported with "
                        "cascade attention currently. Disabling cascade "
                        "attention.")
            self.model_config.disable_cascade_attn = True

        if self.compilation_config.cudagraph_mode\
                .requires_piecewise_compilation():
            assert self.compilation_config.level == \
                CompilationLevel.PIECEWISE, \
                "Compilation level should be CompilationLevel.PIECEWISE "\
                "when cudagraph_mode piecewise cudagraphs is used, "\
                f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

    if self.parallel_config.enable_dbo:
        a2a_backend = envs.VLLM_ALL2ALL_BACKEND
        assert a2a_backend in \
            ["deepep_low_latency", "deepep_high_throughput"], \
            "Microbatching currently only supports the deepep_low_latency and "\
            f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
            "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
            "variable to deepep_low_latency or deepep_high_throughput and "\
            "install the DeepEP kernels."

    if not self.instance_id:
        self.instance_id = random_uuid()[:5]

    # Do this after all the updates to compilation_config.level
    if envs.VLLM_USE_V1 and \
            self.compilation_config.level == CompilationLevel.PIECEWISE:
        self.compilation_config.set_splitting_ops_for_v1()

    if (envs.VLLM_USE_V1
            and not self.scheduler_config.disable_hybrid_kv_cache_manager):
        # logger should only print warning message for hybrid models. As we
        # can't know whether the model is hybrid or not now, so we don't log
        # warning message here and will log it later.
        if not current_platform.support_hybrid_kv_cache():
            # Hybrid KV cache manager is not supported on non-GPU platforms.
            self.scheduler_config.disable_hybrid_kv_cache_manager = True
        if self.kv_transfer_config is not None:
            # Hybrid KV cache manager is not compatible with KV transfer.
            self.scheduler_config.disable_hybrid_kv_cache_manager = True
        if self.kv_events_config is not None:
            # Hybrid KV cache manager is not compatible with KV events.
            self.scheduler_config.disable_hybrid_kv_cache_manager = True
        if self.model_config is not None and \
                self.model_config.attention_chunk_size is not None:
            if self.speculative_config is not None and \
                    self.speculative_config.use_eagle():
                # Hybrid KV cache manager is not yet supported with chunked
                # local attention + eagle.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            elif \
                    not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                logger.warning(
                    "There is a latency regression when using chunked local"
                    " attention with the hybrid KV cache manager. Disabling"
                    " it, by default. To enable it, set the environment "
                    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1.")
                # Hybrid KV cache manager is not yet supported with chunked
                # local attention.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True


# Monkey-patch vLLM so every VllmConfig runs the SUPA-aware post-init.
vllm.config.VllmConfig.__post_init__ = supa_post_init