Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
# tp-dp combination broken:
# https://github.com/vllm-project/vllm/issues/34458
and cfg.parallel_config.data_parallel_size == 1
+# tp-pp combination broken:
+# https://github.com/vllm-project/vllm/issues/35426
+and cfg.parallel_config.pipeline_parallel_size == 1
)
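
Net effect of the three added lines: the allreduce+RMSNorm fusion now stays off unless both data parallelism and pipeline parallelism are disabled, mirroring the tp-dp restriction already in place. A minimal sketch of the resulting predicate, with the earlier checks elided and the TP condition an assumption:

def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
    parallel = cfg.parallel_config
    return (
        parallel.tensor_parallel_size > 1  # assumed: fusion only matters under TP
        # tp-dp combination broken: vllm-project/vllm#34458
        and parallel.data_parallel_size == 1
        # tp-pp combination broken: vllm-project/vllm#35426
        and parallel.pipeline_parallel_size == 1
    )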
@@ -857,7 +860,7 @@ class VllmConfig:
self.compilation_config.pass_config.fuse_gemm_comms = False
else:
# Compute SP threshold early; disable if None (model too
-# small) before +rms_norm gets forced into custom_ops.
+# small for SP to be beneficial).
pass_config = self.compilation_config.pass_config
if pass_config.sp_min_token_num is None:
from vllm.compilation.passes.fusion.sequence_parallelism import (
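
The reworded comment clarifies why the sequence-parallelism threshold is resolved this early: if the model is too small for SP to pay off, the pass (and the dependent GEMM-comm fusion) must be switched off before +rms_norm is forced into custom_ops further down. A hedged sketch of that flow; compute_sp_min_token_num is a hypothetical stand-in for whatever helper the truncated import above brings in:

pass_config = self.compilation_config.pass_config
if pass_config.sp_min_token_num is None:
    # Hypothetical helper name; the real one comes from the
    # sequence_parallelism module imported in the hunk.
    pass_config.sp_min_token_num = compute_sp_min_token_num(self)
if pass_config.sp_min_token_num is None:
    # Model too small for SP to be beneficial: disable both passes.
    pass_config.enable_sp = False
    pass_config.fuse_gemm_comms = False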
@@ -880,15 +883,13 @@ class VllmConfig:
self.compilation_config.pass_config.enable_sp = False
self.compilation_config.pass_config.fuse_gemm_comms = False
if self.compilation_config.pass_config.enable_sp:
if "-rms_norm" in self.compilation_config.custom_ops:
logger.warning(
"RMS norm force disabled, sequence parallelism might break"
)
else:
self.compilation_config.custom_ops.append("+rms_norm")
from vllm.utils.torch_utils import HAS_OPAQUE_TYPE
if self.compilation_config.fast_moe_cold_start is None:
if HAS_OPAQUE_TYPE:
# On torch >= 2.11 the hoisted OpaqueObject approach supersedes
# fast_moe_cold_start, so force it off.
self.compilation_config.fast_moe_cold_start = False
elif self.compilation_config.fast_moe_cold_start is None:
# resolve default behavior: try to be as safe as possible
# this config is unsafe if any spec decoding draft model has a MOE.
# We'll conservatively turn it off if we see spec decoding.
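
The conservative default described in the trailing comments can be read as: when fast_moe_cold_start is left unset and torch's OpaqueObject path is unavailable, enable it only if no speculative-decoding draft model (which may contain a MoE) is configured. A hedged sketch of that resolution; the speculative_config check is an assumption:

if self.compilation_config.fast_moe_cold_start is None:
    # Unsafe if any spec-decoding draft model has a MoE, so be
    # conservative and keep it off whenever spec decoding is enabled.
    has_spec_decoding = self.speculative_config is not None
    self.compilation_config.fast_moe_cold_start = not has_spec_decoding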
@@ -907,9 +908,9 @@ class VllmConfig:
):
logger.warning_once(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
"Overriding cudagraph_mode to NONE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif (
model_config.is_encoder_decoder
and self.compilation_config.cudagraph_mode
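
One behavioral note on this hunk: pooling models used to be downgraded to PIECEWISE CUDA graphs but are now downgraded all the way to NONE, while the encoder-decoder path still lands on FULL_DECODE_ONLY. A rough sketch of the resulting fallback order; the boolean flags are assumptions, only the enum values come from the diff:

from vllm.config import CUDAGraphMode  # assumed import location

def pick_cudagraph_mode(is_pooling: bool, is_encoder_decoder: bool,
                        requested: CUDAGraphMode) -> CUDAGraphMode:
    if is_pooling:
        return CUDAGraphMode.NONE              # full graphs unsupported
    if is_encoder_decoder:
        return CUDAGraphMode.FULL_DECODE_ONLY  # capture decode steps only
    return requested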
@@ -924,6 +925,33 @@ class VllmConfig:
CUDAGraphMode.FULL_DECODE_ONLY
)
+# Check if KV connector requires PIECEWISE mode for CUDA graphs
+if (
+    self.kv_transfer_config is not None
+    and self.kv_transfer_config.is_kv_transfer_instance
+    and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+):
+    # Lazy import to avoid circular dependencies
+    from vllm.distributed.kv_transfer.kv_connector.factory import (
+        KVConnectorFactory,
+    )
+    connector_cls = KVConnectorFactory.get_connector_class(
+        self.kv_transfer_config
+    )
+    if connector_cls.requires_piecewise_for_cudagraph(
+        self.kv_transfer_config.kv_connector_extra_config
+    ):
+        logger.warning_once(
+            "KV connector %s requires PIECEWISE CUDA graph mode "
+            "due to layerwise async operations that cannot be "
+            "captured in CUDA graphs. "
+            "Overriding cudagraph_mode from %s to PIECEWISE.",
+            connector_cls.__name__,
+            self.compilation_config.cudagraph_mode.name,
+        )
+        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# disable cudagraph when enforce eager execution
if self.model_config is not None and self.model_config.enforce_eager:
logger.info("Cudagraph is disabled under eager mode")
@@ -1113,6 +1141,20 @@ class VllmConfig:
if not self.instance_id:
self.instance_id = random_uuid()[:5]
+def is_ixserver_connector(kv_transfer_config) -> bool:
+    if kv_transfer_config is not None and hasattr(
+        kv_transfer_config, "kv_connector"
+    ):
+        connector = kv_transfer_config.kv_connector
+        if isinstance(connector, str):
+            connector_name = connector
+        else:
+            connector_name = getattr(
+                type(connector), "__name__", str(connector)
+            )
+        return "IxServer" in connector_name
+    return False
# Hybrid KV cache manager (HMA) runtime rules:
# - Explicit enable (--no-disable-kv-cache-manager): error if runtime
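
is_ixserver_connector handles both spellings of the connector field, matching on the name whether kv_connector is a string or a class/instance. Roughly, assuming the standard KVTransferConfig constructor:

from vllm.config import KVTransferConfig  # assumed import path

cfg = KVTransferConfig(kv_connector="IxServerConnector", kv_role="kv_both")
assert is_ixserver_connector(cfg)       # string name contains "IxServer"
assert not is_ixserver_connector(None)  # no kv-transfer config at all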
@@ -1154,21 +1196,29 @@ class VllmConfig:
if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
# Default to disable HMA, but only if the user didn't express a preference.
if self.kv_transfer_config is not None:
-    # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
-    need_disable_hybrid_kv_cache_manager = True
-    logger.warning(
-        "Turning off hybrid kv cache manager because "
-        "`--kv-transfer-config` is set. This will reduce the "
-        "performance of vLLM on LLMs with sliding window attention "
-        "or Mamba attention. If you are a developer of kv connector"
-        ", please consider supporting hybrid kv cache manager for "
-        "your connector by making sure your connector is a subclass"
-        " of `SupportsHMA` defined in kv_connector/v1/base.py and"
-        " use --no-disable-hybrid-kv-cache-manager to start vLLM."
-    )
-self.scheduler_config.disable_hybrid_kv_cache_manager = (
-    need_disable_hybrid_kv_cache_manager
-)
+    if is_ixserver_connector(self.kv_transfer_config):
+        pass
+    else:
+        # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
+        need_disable_hybrid_kv_cache_manager = True
+        logger.warning(
+            "Turning off hybrid kv cache manager because "
+            "`--kv-transfer-config` is set. This will reduce the "
+            "performance of vLLM on LLMs with sliding window attention "
+            "or Mamba attention. If you are a developer of kv connector"
+            ", please consider supporting hybrid kv cache manager for "
+            "your connector by making sure your connector is a subclass"
+            " of `SupportsHMA` defined in kv_connector/v1/base.py and"
+            " use --no-disable-hybrid-kv-cache-manager to start vLLM."
+        )
+        self.scheduler_config.disable_hybrid_kv_cache_manager = (
+            need_disable_hybrid_kv_cache_manager
+        )
+else:
+    self.scheduler_config.disable_hybrid_kv_cache_manager = (
+        need_disable_hybrid_kv_cache_manager
+    )
elif (
self.scheduler_config.disable_hybrid_kv_cache_manager is False
and need_disable_hybrid_kv_cache_manager
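
Per the warning text, a connector avoids this forced HMA shutdown by declaring hybrid-KV-cache support and letting the user pass --no-disable-hybrid-kv-cache-manager. A minimal sketch of that opt-in; the module path is taken from the message (kv_connector/v1/base.py) but the exact class names are assumptions:

from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorBase_V1,  # assumed
    SupportsHMA,         # named in the warning message
)

class MyHMAAwareConnector(KVConnectorBase_V1, SupportsHMA):
    """Subclassing SupportsHMA signals that sliding-window and Mamba
    layers can keep the hybrid KV cache manager enabled."""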
@@ -1466,22 +1516,22 @@ class VllmConfig:
if compile_range_end is not None:
computed_compile_ranges_split_points.append(compile_range_end)
-# # Add the compile ranges for flashinfer
-# if compilation_config.pass_config.fuse_allreduce_rms:
-#     tp_size = self.parallel_config.tensor_parallel_size
-#     max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
-#     if max_size is not None:
-#         max_token_num = max_size // (
-#             self.model_config.get_hidden_size()
-#             * self.model_config.dtype.itemsize
-#         )
-#         if compile_range_end is not None and max_token_num < compile_range_end:
-#             computed_compile_ranges_split_points.append(max_token_num)
-#         else:
-#             logger.debug(
-#                 "Max num batched tokens below allreduce-rms fusion threshold, "
-#                 "allreduce-rms fusion will be enabled for all num_tokens."
-#             )
+# Add the compile ranges for flashinfer
+if compilation_config.pass_config.fuse_allreduce_rms:
+    tp_size = self.parallel_config.tensor_parallel_size
+    max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+    if max_size is not None:
+        max_token_num = max_size // (
+            self.model_config.get_hidden_size()
+            * self.model_config.dtype.itemsize
+        )
+        if compile_range_end is not None and max_token_num < compile_range_end:
+            computed_compile_ranges_split_points.append(max_token_num)
+        else:
+            logger.debug(
+                "Max num batched tokens below allreduce-rms fusion threshold, "
+                "allreduce-rms fusion will be enabled for all num_tokens."
+            )
# Add the compile ranges for sequence parallelism
if compilation_config.pass_config.enable_sp:
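
The re-enabled flashinfer block above derives a token-count split point from a byte limit: max_token_num is the largest batch (in tokens) whose hidden states still fit under flashinfer_max_size for the given TP size. A worked example with assumed numbers, for a 4096-wide model in bf16:

max_size = 33_554_432  # hypothetical flashinfer_max_size(tp_size), 32 MiB
hidden_size = 4096     # model_config.get_hidden_size()
itemsize = 2           # torch.bfloat16 bytes per element
max_token_num = max_size // (hidden_size * itemsize)  # == 4096 tokens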
@@ -1618,6 +1668,7 @@ class VllmConfig:
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " # noqa
f"kv_cache_dtype={self.cache_config.cache_dtype}, "