################################################################################
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
import os

import torch

import vllm
import vllm.envs as envs
from vllm.config import VllmConfig, logger
from vllm.config.compilation import CompilationLevel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils import random_uuid

from .compilation import SUPAGraphMode

def supa_post_init(self):
|
|
"""Verify configs are valid & consistent with each other.
|
|
"""
|
|
|
|
self.try_verify_and_update_config()
|
|
|
|
if self.model_config is not None:
|
|
self.model_config.verify_with_parallel_config(self.parallel_config)
|
|
self.model_config.verify_dual_chunk_attention_config(self.load_config)
|
|
|
|
self.cache_config.verify_with_parallel_config(self.parallel_config)
|
|
|
|
if self.lora_config is not None:
|
|
self.lora_config.verify_with_cache_config(self.cache_config)
|
|
self.lora_config.verify_with_model_config(self.model_config)
|
|
|
|
if self.quant_config is None and self.model_config is not None:
|
|
self.quant_config = VllmConfig._get_quantization_config(
|
|
self.model_config, self.load_config)
|
|
|
|
from vllm.platforms import current_platform
|
|
if self.model_config is not None and \
|
|
self.scheduler_config.chunked_prefill_enabled and \
|
|
self.model_config.dtype == torch.float32 and \
|
|
current_platform.get_device_capability() == (7, 5):
|
|
logger.warning_once(
|
|
"Turing devices tensor cores do not support float32 matmul. "
|
|
"To workaround this limitation, vLLM will set 'ieee' input "
|
|
"precision for chunked prefill triton kernels.")
|
|
|
|
# If the user does not explicitly set a compilation level, then
|
|
# we use the default level. The default level depends on other
|
|
# settings (see the below code).
|
|
if self.compilation_config.level is None:
|
|
if envs.VLLM_USE_V1:
|
|
if (self.model_config is not None
|
|
and not self.model_config.enforce_eager):
|
|
self.compilation_config.level = CompilationLevel.PIECEWISE
|
|
else:
|
|
self.compilation_config.level = \
|
|
CompilationLevel.NO_COMPILATION
|
|
|
|
else:
|
|
# NB: Passing both --enforce-eager and a compilation level
|
|
# in V0 means the compilation level wins out.
|
|
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
|
|
|
# async tp is built on top of sequence parallelism
|
|
# and requires it to be enabled.
|
|
if self.compilation_config.pass_config.enable_async_tp:
|
|
self.compilation_config.pass_config.enable_sequence_parallelism = \
|
|
True
|
|
if self.compilation_config.pass_config.enable_sequence_parallelism:
|
|
self.compilation_config.custom_ops.append("+rms_norm")
|
|
|
|
if current_platform.support_static_graph_mode():
|
|
# if cudagraph_mode is not explicitly set by users, set default
|
|
# value
|
|
if self.compilation_config.cudagraph_mode is None:
|
|
if envs.VLLM_USE_V1 and self.compilation_config.level \
|
|
== CompilationLevel.PIECEWISE:
|
|
# default to full and piecewise for most models
|
|
self.compilation_config.cudagraph_mode = \
|
|
SUPAGraphMode.FULL_AND_PIECEWISE
|
|
|
|
# pooling models and encoder-decoder models
|
|
# do not support full cudagraphs
|
|
if self.model_config is not None and \
|
|
(self.model_config.pooler_config is not None
|
|
or self.model_config.is_encoder_decoder):
|
|
self.compilation_config.cudagraph_mode = \
|
|
SUPAGraphMode.PIECEWISE
|
|
else:
|
|
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
|
|
|
|
# disable cudagraph when enforce eager execution
|
|
if self.model_config is not None and \
|
|
self.model_config.enforce_eager:
|
|
logger.info("Cudagraph is disabled under eager mode")
|
|
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
|
|
elif envs.VLLM_USE_V1:
|
|
self.compilation_config.cudagraph_num_of_warmups = 1
|
|
|
|
self._set_cudagraph_sizes()
|
|
else:
|
|
self.compilation_config.cudagraph_mode = SUPAGraphMode.NONE
|
|
|
|
if self.cache_config.kv_sharing_fast_prefill:
|
|
|
|
if self.speculative_config is not None and \
|
|
self.speculative_config.use_eagle():
|
|
raise NotImplementedError(
|
|
"Fast prefill optimization for KV sharing is not "
|
|
"compatible with EAGLE as EAGLE requires correct logits "
|
|
"for all tokens while fast prefill gives incorrect logits "
|
|
"for prompt tokens.")
|
|
|
|
logger.warning_once(
|
|
"--kv-sharing-fast-prefill requires changes on model side for "
|
|
"correctness and to realize prefill savings. ")
|
|
|
|
disable_chunked_prefill_reasons: list[str] = []
|
|
|
|
if self.model_config:
|
|
if self.model_config.pooler_config:
|
|
pooling_type = self.model_config.pooler_config.pooling_type
|
|
if pooling_type is None or pooling_type.lower() != "last":
|
|
disable_chunked_prefill_reasons.append(
|
|
"Only \"last\" pooling supports chunked "
|
|
"prefill and prefix caching; disabling both.")
|
|
if not getattr(self.model_config.hf_config, "is_causal", True):
|
|
disable_chunked_prefill_reasons.append(
|
|
"Only models using causal attention supports chunked "
|
|
"prefill and prefix caching; disabling both.")
|
|
elif self.model_config.is_encoder_decoder:
|
|
self.scheduler_config.max_num_encoder_input_tokens = \
|
|
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
|
logger.debug(
|
|
"Encoder-decoder model detected: setting "
|
|
"`max_num_encoder_input_tokens` to encoder length (%s)",
|
|
self.scheduler_config.max_num_encoder_input_tokens)
|
|
self.scheduler_config.disable_chunked_mm_input = True
|
|
disable_chunked_prefill_reasons.append(
|
|
"Encoder-decoder models do not support chunked prefill nor"
|
|
" prefix caching; disabling both.")
|
|
if (self.model_config.architecture
|
|
== "WhisperForConditionalGeneration" and
|
|
os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
|
|
logger.warning("Whisper is known to have issues with "
|
|
"forked workers. If startup is hanging, "
|
|
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
|
|
"to 'spawn'.")
|
|
|
|
if disable_chunked_prefill_reasons:
|
|
for reason in disable_chunked_prefill_reasons:
|
|
logger.info(reason)
|
|
self.scheduler_config.chunked_prefill_enabled = False
|
|
self.scheduler_config.long_prefill_token_threshold = 0
|
|
|
|
if self.cache_config is not None:
|
|
self.cache_config.enable_prefix_caching = False
|
|
|
|
if (self.kv_events_config is not None
|
|
and self.kv_events_config.enable_kv_cache_events
|
|
and not self.cache_config.enable_prefix_caching):
|
|
logger.warning(
|
|
"KV cache events are on, but prefix caching is not enabled."
|
|
"Use --enable-prefix-caching to enable.")
|
|
if (self.kv_events_config is not None
|
|
and self.kv_events_config.publisher != "null"
|
|
and not self.kv_events_config.enable_kv_cache_events):
|
|
logger.warning("KV cache events are disabled,"
|
|
"but the scheduler is configured to publish them."
|
|
"Modify KVEventsConfig.enable_kv_cache_events"
|
|
"to True to enable.")
|
|
current_platform.check_and_update_config(self)
|
|
|
|
# final check of cudagraph mode after platform-specific update
|
|
if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
|
|
if self.compilation_config.cudagraph_mode == SUPAGraphMode.FULL \
|
|
and self.model_config is not None and \
|
|
not self.model_config.disable_cascade_attn:
|
|
logger.info("SUPAGraphMode.FULL is not supported with "
|
|
"cascade attention currently. Disabling cascade"
|
|
"attention.")
|
|
self.model_config.disable_cascade_attn = True
|
|
|
|
if self.compilation_config.cudagraph_mode\
|
|
.requires_piecewise_compilation():
|
|
assert self.compilation_config.level == \
|
|
CompilationLevel.PIECEWISE, \
|
|
"Compilation level should be CompilationLevel.PIECEWISE "\
|
|
"when cudagraph_mode piecewise cudagraphs is used, "\
|
|
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
|
|
|
|
if self.parallel_config.enable_dbo:
|
|
a2a_backend = envs.VLLM_ALL2ALL_BACKEND
|
|
assert a2a_backend in \
|
|
["deepep_low_latency", "deepep_high_throughput"], \
|
|
"Microbatching currently only supports the deepep_low_latency and "\
|
|
f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
|
|
"supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
|
|
"variable to deepep_low_latency or deepep_high_throughput and "\
|
|
"install the DeepEP kernels."
|
|
|
|
if not self.instance_id:
|
|
self.instance_id = random_uuid()[:5]
|
|
|
|
# Do this after all the updates to compilation_config.level
|
|
if envs.VLLM_USE_V1 and \
|
|
self.compilation_config.level == CompilationLevel.PIECEWISE:
|
|
self.compilation_config.set_splitting_ops_for_v1()
|
|
|
|
if (envs.VLLM_USE_V1
|
|
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
|
|
# logger should only print warning message for hybrid models. As we
|
|
# can't know whether the model is hybrid or not now, so we don't log
|
|
# warning message here and will log it later.
|
|
if not current_platform.support_hybrid_kv_cache():
|
|
# Hybrid KV cache manager is not supported on non-GPU platforms.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
if self.kv_transfer_config is not None:
|
|
# Hybrid KV cache manager is not compatible with KV transfer.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
if self.kv_events_config is not None:
|
|
# Hybrid KV cache manager is not compatible with KV events.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
if self.model_config is not None and \
|
|
self.model_config.attention_chunk_size is not None:
|
|
if self.speculative_config is not None and \
|
|
self.speculative_config.use_eagle():
|
|
# Hybrid KV cache manager is not yet supported with chunked
|
|
# local attention + eagle.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
elif \
|
|
not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
|
|
logger.warning(
|
|
"There is a latency regression when using chunked local"
|
|
" attention with the hybrid KV cache manager. Disabling"
|
|
" it, by default. To enable it, set the environment "
|
|
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1.")
|
|
# Hybrid KV cache manager is not yet supported with chunked
|
|
# local attention.
|
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
|
|
|
|
|
vllm.config.VllmConfig.__post_init__ = supa_post_init
|