Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -50,8 +50,6 @@ from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.import_utils import LazyLoader
from vllm.v1.attention.backends.registry import AttentionBackendEnum

import os

if TYPE_CHECKING:
    from transformers import PretrainedConfig
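For context on this hunk: guarding the transformers import behind TYPE_CHECKING keeps the heavy dependency out of the runtime import path. A minimal standalone sketch of the same pattern (not vLLM code):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported for type annotations only; never executed at runtime,
    # so importing this module does not pull in transformers.
    from transformers import PretrainedConfig


def config_name(config: "PretrainedConfig") -> str:
    # The quoted annotation defers evaluation to type-checking time.
    return type(config).__name__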
@@ -128,6 +126,7 @@ class ModelConfig:
    - "slow" will always use the slow tokenizer.\n
    - "mistral" will always use the tokenizer from `mistral_common`.\n
    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
    - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
    - Other custom values can be supported via plugins."""
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
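The overlay extends the documented tokenizer_mode values with "qwen_vl". As a rough illustration of how such a mode string fans out to a tokenizer backend, here is a hypothetical dispatch helper (the function and mapping are invented for illustration, not the vLLM implementation):

def select_tokenizer_backend(tokenizer_mode: str) -> str:
    # Hypothetical sketch: map each documented mode string to the
    # backend that would load the tokenizer.
    known = {
        "auto": "fast tokenizer if available",
        "slow": "slow (Python) tokenizer",
        "mistral": "mistral_common",
        "deepseek_v32": "deepseek_v32",
        "qwen_vl": "qwen_vl",
    }
    if tokenizer_mode not in known:
        # "Other custom values can be supported via plugins."
        raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
    return known[tokenizer_mode]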
@@ -463,8 +462,6 @@ class ModelConfig:

        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)

        from vllm.platforms import current_platform

        if self.override_attention_dtype is not None and not current_platform.is_rocm():
            warnings.warn(
                "override-attention-dtype is set but not using ROCm platform",
@@ -473,10 +470,9 @@ class ModelConfig:

        if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
            raise ValueError("Sleep mode is not supported on current platform.")

        temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)

        hf_config = get_config(
            temp_hf_config_path or self.hf_config_path or self.model,
            self.hf_config_path or self.model,
            self.trust_remote_code,
            self.revision,
            self.code_revision,
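The CUSTOM_QUANT_CONFIG environment variable lets the overlay substitute a different HF config path when resolving the model config. A minimal sketch of that precedence logic, with the helper name invented for illustration:

import os


def resolve_config_source(hf_config_path, model):
    # Hypothetical helper mirroring the call above: the env-var override
    # wins over --hf-config-path, which wins over the model id/path.
    temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)
    return temp_hf_config_path or hf_config_path or model


# e.g. with CUSTOM_QUANT_CONFIG=/tmp/quant_cfg set:
# resolve_config_source(None, "meta-llama/Llama-3-8B") -> "/tmp/quant_cfg"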
@@ -622,6 +618,16 @@ class ModelConfig:
        self._try_verify_and_update_model_config()
        self._verify_quantization()
        self._verify_cuda_graph()
        import os
        enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH", None)
        if enforce_cuda_graph is not None and enforce_cuda_graph in ["1", "y", "Y"]:
            self.enforce_eager = False
        else:
            self.enforce_eager = True
            logger.warning_once(
                "Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable cuda graph. "
                "For now, cuda graph is not used and --enforce-eager is "
                "enabled; we are trying to use cuda graph as the default mode.")
        self._verify_bnb_config()

    def get_model_arch_config(
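This hunk only enables CUDA graph capture when VLLM_ENFORCE_CUDA_GRAPH is explicitly set to an affirmative value; otherwise eager mode is enforced. A standalone sketch of the same check:

import os


def cuda_graph_requested() -> bool:
    # Sketch of the gate above: CUDA graph only when the user opted in
    # via VLLM_ENFORCE_CUDA_GRAPH=1 (or "y"/"Y").
    value = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH", None)
    return value is not None and value in ["1", "y", "Y"]


# enforce_eager is the inverse of the opt-in:
# enforce_eager = not cuda_graph_requested()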
@@ -886,6 +892,7 @@ class ModelConfig:
            "modelopt",
            "modelopt_fp4",
            "modelopt_mxfp8",
            "modelopt_mixed",
            "petit_nvfp4",
            # Ensure heavy backends are probed last to avoid unnecessary
            # imports during override detection (e.g., MXFP4 imports Triton)
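The "heavy backends probed last" comment carries a real import-cost concern: merely importing some quantization backends pulls in large dependencies (the example given is MXFP4 importing Triton). A generic cheap-first probing sketch, with all names hypothetical:

from collections.abc import Callable, Iterable


def first_matching_backend(
    probes: Iterable[tuple[str, Callable[[], bool]]],
):
    # Probes are ordered cheapest-first; each callable performs its own
    # (possibly expensive) import, so later entries are never touched
    # once an earlier one matches.
    for name, probe in probes:
        if probe():
            return name
    return None


# Heavy backends go last so their imports only happen when needed:
# first_matching_backend([("modelopt", cheap_probe), ("mxfp4", heavy_probe)])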
@@ -942,8 +949,6 @@ class ModelConfig:
                    f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}."
                )
            from vllm.platforms import current_platform

            current_platform.verify_quantization(self.quantization)

            if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1813,8 +1818,6 @@ def _resolve_auto_dtype(
    *,
    is_pooling_model: bool,
):
    from vllm.platforms import current_platform

    supported_dtypes = [
        dtype
        for dtype in current_platform.supported_dtypes
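_resolve_auto_dtype filters the model's dtype against what the current platform supports. A simplified sketch of that kind of fallback, not the actual vLLM logic:

import torch


def pick_dtype(config_dtype: torch.dtype,
               supported_dtypes: list) -> torch.dtype:
    # Simplified sketch: keep the checkpoint dtype when the platform
    # supports it, otherwise fall back to the first supported dtype.
    if config_dtype in supported_dtypes:
        return config_dtype
    return supported_dtypes[0]


# pick_dtype(torch.float16, [torch.bfloat16, torch.float32]) -> torch.bfloat16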