Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -50,8 +50,6 @@ from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.import_utils import LazyLoader
from vllm.v1.attention.backends.registry import AttentionBackendEnum
import os
if TYPE_CHECKING:
    from transformers import PretrainedConfig
@@ -128,6 +126,7 @@ class ModelConfig:
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
@@ -463,8 +462,6 @@ class ModelConfig:
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
from vllm.platforms import current_platform
if self.override_attention_dtype is not None and not current_platform.is_rocm():
    warnings.warn(
        "override-attention-dtype is set but not using ROCm platform",
@@ -473,10 +470,9 @@ class ModelConfig:
if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
    raise ValueError("Sleep mode is not supported on current platform.")
temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)
hf_config = get_config(
    temp_hf_config_path or self.hf_config_path or self.model,
    self.hf_config_path or self.model,
    self.trust_remote_code,
    self.revision,
    self.code_revision,
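
A hedged before/after sketch of the config-source resolution changed in this hunk; the helper names are invented for illustration:

# Illustrative sketch, not part of the diff.
import os

def config_source_before(hf_config_path, model):
    # Old overlay behavior: CUSTOM_QUANT_CONFIG could override the config path.
    return os.environ.get("CUSTOM_QUANT_CONFIG", None) or hf_config_path or model

def config_source_after(hf_config_path, model):
    # After this commit: only the explicit path or the model id is consulted.
    return hf_config_path or model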
@@ -622,6 +618,16 @@ class ModelConfig:
self._try_verify_and_update_model_config()
self._verify_quantization()
self._verify_cuda_graph()
import os
enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH", None)
if enforce_cuda_graph in ["1", "y", "Y"]:
    self.enforce_eager = False
else:
    self.enforce_eager = True
    logger.warning_once(
        "Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable CUDA graph. "
        "For now, CUDA graph is not used and --enforce-eager is enabled; "
        "we intend to make CUDA graph the default mode.")
self._verify_bnb_config()
def get_model_arch_config(
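
A hedged standalone sketch of the gate added above: only the exact values "1", "y", or "Y" enable CUDA graph capture, and anything else (including an unset variable) forces eager execution. The helper name is invented:

# Illustrative sketch, not part of the diff; resolve_enforce_eager is a
# hypothetical helper mirroring the logic in the hunk above.
import os

def resolve_enforce_eager() -> bool:
    return os.environ.get("VLLM_ENFORCE_CUDA_GRAPH") not in ["1", "y", "Y"]

To opt in, export VLLM_ENFORCE_CUDA_GRAPH=1 in the environment before launching vLLM.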
@@ -886,6 +892,7 @@ class ModelConfig:
"modelopt",
"modelopt_fp4",
"modelopt_mxfp8",
"modelopt_mixed",
"petit_nvfp4",
# Ensure heavy backends are probed last to avoid unnecessary
# imports during override detection (e.g., MXFP4 imports Triton)
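
A hedged sketch of requesting the newly registered method from the public API; whether "modelopt_mixed" is valid for a given checkpoint depends on the ModelOpt backend, and the model id is a placeholder:

# Illustrative sketch, not part of the diff; the model id is a placeholder.
from vllm import LLM

llm = LLM(model="org/modelopt-quantized-model", quantization="modelopt_mixed")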
@@ -942,8 +949,6 @@ class ModelConfig:
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}."
)
from vllm.platforms import current_platform
current_platform.verify_quantization(self.quantization)
if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1813,8 +1818,6 @@ def _resolve_auto_dtype(
    *,
    is_pooling_model: bool,
):
    from vllm.platforms import current_platform
    supported_dtypes = [
        dtype
        for dtype in current_platform.supported_dtypes
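
The hunk is truncated here, but the pattern is filtering dtype candidates through the current platform; a hedged sketch of that pattern, with the fallback choice invented for illustration:

# Illustrative sketch, not part of the diff; the float32 fallback is an
# assumption, not vLLM's actual resolution order.
import torch
from vllm.platforms import current_platform

def first_supported_dtype() -> torch.dtype:
    supported = list(current_platform.supported_dtypes)
    return supported[0] if supported else torch.float32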