Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -50,8 +50,6 @@ from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.import_utils import LazyLoader
from vllm.v1.attention.backends.registry import AttentionBackendEnum

import os

if TYPE_CHECKING:
    from transformers import PretrainedConfig
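For context on this hunk: guarding the transformers import behind TYPE_CHECKING keeps the heavy dependency out of the runtime import path. A minimal standalone sketch of the same pattern (not vLLM code):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported for type annotations only; never executed at runtime,
    # so importing this module does not pull in transformers.
    from transformers import PretrainedConfig


def config_name(config: "PretrainedConfig") -> str:
    # The quoted annotation defers evaluation to type-checking time.
    return type(config).__name__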
@@ -128,6 +126,7 @@ class ModelConfig:
    - "slow" will always use the slow tokenizer.\n
    - "mistral" will always use the tokenizer from `mistral_common`.\n
    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
    - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
    - Other custom values can be supported via plugins."""
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
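The overlay extends the documented tokenizer_mode values with "qwen_vl". As a rough illustration of how such a mode string fans out to a tokenizer backend, here is a hypothetical dispatch helper (the function and mapping are invented for illustration, not the vLLM implementation):

def select_tokenizer_backend(tokenizer_mode: str) -> str:
    # Hypothetical sketch: map each documented mode string to the
    # backend that would load the tokenizer.
    known = {
        "auto": "fast tokenizer if available",
        "slow": "slow (Python) tokenizer",
        "mistral": "mistral_common",
        "deepseek_v32": "deepseek_v32",
        "qwen_vl": "qwen_vl",
    }
    if tokenizer_mode not in known:
        # "Other custom values can be supported via plugins."
        raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
    return known[tokenizer_mode]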
@@ -463,8 +462,6 @@ class ModelConfig:

        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)

        from vllm.platforms import current_platform

        if self.override_attention_dtype is not None and not current_platform.is_rocm():
            warnings.warn(
                "override-attention-dtype is set but not using ROCm platform",
@@ -473,10 +470,9 @@ class ModelConfig:

        if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
            raise ValueError("Sleep mode is not supported on current platform.")

        temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)

        hf_config = get_config(
            temp_hf_config_path or self.hf_config_path or self.model,
            self.hf_config_path or self.model,
            self.trust_remote_code,
            self.revision,
            self.code_revision,
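The CUSTOM_QUANT_CONFIG environment variable lets the overlay substitute a different HF config path when resolving the model config. A minimal sketch of that precedence logic, with the helper name invented for illustration:

import os


def resolve_config_source(hf_config_path, model):
    # Hypothetical helper mirroring the call above: the env-var override
    # wins over --hf-config-path, which wins over the model id/path.
    temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)
    return temp_hf_config_path or hf_config_path or model


# e.g. with CUSTOM_QUANT_CONFIG=/tmp/quant_cfg set:
# resolve_config_source(None, "meta-llama/Llama-3-8B") -> "/tmp/quant_cfg"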
@@ -622,6 +618,16 @@ class ModelConfig:
        self._try_verify_and_update_model_config()
        self._verify_quantization()
        self._verify_cuda_graph()
        import os
        enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH", None)
        if enforce_cuda_graph is not None and enforce_cuda_graph in ["1", "y", "Y"]:
            self.enforce_eager = False
        else:
            self.enforce_eager = True
            logger.warning_once(
                "Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable cuda graph. "
                "For now, cuda graph is not used and --enforce-eager is "
                "enabled; we are trying to use cuda graph as the default mode.")
        self._verify_bnb_config()

    def get_model_arch_config(
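This hunk only enables CUDA graph capture when VLLM_ENFORCE_CUDA_GRAPH is explicitly set to an affirmative value; otherwise eager mode is enforced. A standalone sketch of the same check:

import os


def cuda_graph_requested() -> bool:
    # Sketch of the gate above: CUDA graph only when the user opted in
    # via VLLM_ENFORCE_CUDA_GRAPH=1 (or "y"/"Y").
    value = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH", None)
    return value is not None and value in ["1", "y", "Y"]


# enforce_eager is the inverse of the opt-in:
# enforce_eager = not cuda_graph_requested()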
@@ -886,6 +892,7 @@ class ModelConfig:
            "modelopt",
            "modelopt_fp4",
            "modelopt_mxfp8",
            "modelopt_mixed",
            "petit_nvfp4",
            # Ensure heavy backends are probed last to avoid unnecessary
            # imports during override detection (e.g., MXFP4 imports Triton)
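The "heavy backends probed last" comment carries a real import-cost concern: merely importing some quantization backends pulls in large dependencies (the example given is MXFP4 importing Triton). A generic cheap-first probing sketch, with all names hypothetical:

from collections.abc import Callable, Iterable


def first_matching_backend(
    probes: Iterable[tuple[str, Callable[[], bool]]],
):
    # Probes are ordered cheapest-first; each callable performs its own
    # (possibly expensive) import, so later entries are never touched
    # once an earlier one matches.
    for name, probe in probes:
        if probe():
            return name
    return None


# Heavy backends go last so their imports only happen when needed:
# first_matching_backend([("modelopt", cheap_probe), ("mxfp4", heavy_probe)])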
@@ -942,8 +949,6 @@ class ModelConfig:
                    f"Unknown quantization method: {self.quantization}. Must "
                    f"be one of {supported_quantization}."
                )
            from vllm.platforms import current_platform

            current_platform.verify_quantization(self.quantization)

            if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1813,8 +1818,6 @@ def _resolve_auto_dtype(
    *,
    is_pooling_model: bool,
):
    from vllm.platforms import current_platform

    supported_dtypes = [
        dtype
        for dtype in current_platform.supported_dtypes
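_resolve_auto_dtype filters the model's dtype against what the current platform supports. A simplified sketch of that kind of fallback, not the actual vLLM logic:

import torch


def pick_dtype(config_dtype: torch.dtype,
               supported_dtypes: list) -> torch.dtype:
    # Simplified sketch: keep the checkpoint dtype when the platform
    # supports it, otherwise fall back to the first supported dtype.
    if config_dtype in supported_dtypes:
        return config_dtype
    return supported_dtypes[0]


# pick_dtype(torch.float16, [torch.bfloat16, torch.float32]) -> torch.bfloat16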