Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
         # tp-dp combination broken:
         # https://github.com/vllm-project/vllm/issues/34458
         and cfg.parallel_config.data_parallel_size == 1
+        # tp-pp combination broken:
+        # https://github.com/vllm-project/vllm/issues/35426
+        and cfg.parallel_config.pipeline_parallel_size == 1
     )
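For readers skimming the hunk: the fusion gate now requires both DP and PP to be 1, each tied to a tracked upstream breakage. A runnable sketch of the resulting predicate, with a minimal stand-in config class (the `tensor_parallel_size > 1` guard is an assumption, not shown in this hunk):

```python
# Sketch of the fusion gate after this hunk: allreduce+rms fusion stays off
# unless data and pipeline parallelism are both disabled.
from dataclasses import dataclass


@dataclass
class ParallelConfig:  # minimal stand-in for vLLM's parallel config
    tensor_parallel_size: int = 1
    data_parallel_size: int = 1
    pipeline_parallel_size: int = 1


def allreduce_rms_fusion_allowed(parallel: ParallelConfig) -> bool:
    return (
        parallel.tensor_parallel_size > 1  # assumed: fusion only matters with TP
        # tp-dp combination broken: vllm issue 34458
        and parallel.data_parallel_size == 1
        # tp-pp combination broken: vllm issue 35426
        and parallel.pipeline_parallel_size == 1
    )


print(allreduce_rms_fusion_allowed(ParallelConfig(tensor_parallel_size=4)))   # True
print(allreduce_rms_fusion_allowed(ParallelConfig(4, data_parallel_size=2)))  # False
```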
@@ -857,7 +860,7 @@ class VllmConfig:
             self.compilation_config.pass_config.fuse_gemm_comms = False
         else:
             # Compute SP threshold early; disable if None (model too
-            # small) before +rms_norm gets forced into custom_ops.
+            # small for SP to be beneficial).
             pass_config = self.compilation_config.pass_config
             if pass_config.sp_min_token_num is None:
                 from vllm.compilation.passes.fusion.sequence_parallelism import (
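The comment change is cosmetic; the behavior is that `sp_min_token_num` is resolved while it is still `None`, before downstream code depends on it. A hedged illustration of that None-means-auto resolution (the heuristic below is invented for the example; the real default comes from `vllm.compilation.passes.fusion.sequence_parallelism` and may differ):

```python
# Hypothetical illustration only: resolve an unset SP token threshold from
# model size. The real helper lives in
# vllm.compilation.passes.fusion.sequence_parallelism.
def resolve_sp_min_token_num(sp_min_token_num: int | None,
                             hidden_size: int, tp_size: int) -> int | None:
    if sp_min_token_num is not None:
        return sp_min_token_num  # user already chose a threshold
    if tp_size <= 1:
        return None  # SP needs tensor parallelism; None here means "disable SP"
    # Assumed heuristic: require enough tokens to amortize the extra
    # all-gather/reduce-scatter traffic that sequence parallelism adds.
    return max(128, 8192 // max(hidden_size // 1024, 1))


print(resolve_sp_min_token_num(None, hidden_size=4096, tp_size=4))  # 2048
```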
@@ -880,15 +883,13 @@ class VllmConfig:
                 self.compilation_config.pass_config.enable_sp = False
                 self.compilation_config.pass_config.fuse_gemm_comms = False
 
-        if self.compilation_config.pass_config.enable_sp:
-            if "-rms_norm" in self.compilation_config.custom_ops:
-                logger.warning(
-                    "RMS norm force disabled, sequence parallelism might break"
-                )
-            else:
-                self.compilation_config.custom_ops.append("+rms_norm")
-        elif self.compilation_config.fast_moe_cold_start is None:
+        from vllm.utils.torch_utils import HAS_OPAQUE_TYPE
+
+        if self.compilation_config.fast_moe_cold_start is None:
+            if HAS_OPAQUE_TYPE:
+                # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
+                # fast_moe_cold_start, so force it off.
+                self.compilation_config.fast_moe_cold_start = False
             # resolve default behavior: try to be as safe as possible
             # this config is unsafe if any spec decoding draft model has a MOE.
             # We'll conservatively turn it off if we see spec decoding.
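The pattern here is the same None-as-auto resolution as above, now gated on a torch capability probe. A condensed sketch of the defaulting logic (the spec-decode rule is paraphrased from the comment in the diff; the "on by default otherwise" branch is an assumption):

```python
# Condensed sketch of the fast_moe_cold_start defaulting in this hunk.
# HAS_OPAQUE_TYPE comes from vllm.utils.torch_utils (torch >= 2.11).
def resolve_fast_moe_cold_start(current: bool | None,
                                has_opaque_type: bool,
                                uses_spec_decode: bool) -> bool:
    if has_opaque_type and current is None:
        # The hoisted OpaqueObject path supersedes fast_moe_cold_start.
        return False
    if current is None:
        # Safest default: off whenever spec decoding might add a MoE draft
        # model; assumed on otherwise, since it is a cold-start optimization.
        return not uses_spec_decode
    return current  # explicit user setting wins


print(resolve_fast_moe_cold_start(None, has_opaque_type=True, uses_spec_decode=False))   # False
print(resolve_fast_moe_cold_start(None, has_opaque_type=False, uses_spec_decode=False))  # True
```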
@@ -907,9 +908,9 @@ class VllmConfig:
         ):
             logger.warning_once(
                 "Pooling models do not support full cudagraphs. "
-                "Overriding cudagraph_mode to PIECEWISE."
+                "Overriding cudagraph_mode to NONE."
             )
-            self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         elif (
             model_config.is_encoder_decoder
             and self.compilation_config.cudagraph_mode
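The overlay is stricter than the upstream fallback here: pooling models drop all the way to NONE rather than PIECEWISE. A small sketch of the distinction (illustrative enum; the real `CUDAGraphMode` lives in vLLM's config module and has more members, e.g. `FULL_DECODE_ONLY` seen in the next hunk's context):

```python
# Illustrative only: the capture strategies this override chooses between.
import enum


class CUDAGraphMode(enum.Enum):
    NONE = 0       # no CUDA graph capture; every step runs eagerly
    PIECEWISE = 1  # capture compiled pieces; attention stays outside the graph
    FULL = 2       # capture the whole forward pass


def mode_for_pooling_model(requested: CUDAGraphMode) -> CUDAGraphMode:
    # Upstream falls back to PIECEWISE; this overlay disables capture outright.
    if requested is CUDAGraphMode.FULL:
        return CUDAGraphMode.NONE
    return requested


print(mode_for_pooling_model(CUDAGraphMode.FULL))  # CUDAGraphMode.NONE
```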
@@ -924,6 +925,33 @@ class VllmConfig:
                     CUDAGraphMode.FULL_DECODE_ONLY
                 )
 
+        # Check if KV connector requires PIECEWISE mode for CUDA graphs
+        if (
+            self.kv_transfer_config is not None
+            and self.kv_transfer_config.is_kv_transfer_instance
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
+            # Lazy import to avoid circular dependencies
+            from vllm.distributed.kv_transfer.kv_connector.factory import (
+                KVConnectorFactory,
+            )
+
+            connector_cls = KVConnectorFactory.get_connector_class(
+                self.kv_transfer_config
+            )
+            if connector_cls.requires_piecewise_for_cudagraph(
+                self.kv_transfer_config.kv_connector_extra_config
+            ):
+                logger.warning_once(
+                    "KV connector %s requires PIECEWISE CUDA graph mode "
+                    "due to layerwise async operations that cannot be "
+                    "captured in CUDA graphs. "
+                    "Overriding cudagraph_mode from %s to PIECEWISE.",
+                    connector_cls.__name__,
+                    self.compilation_config.cudagraph_mode.name,
+                )
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         # disable cudagraph when enforce eager execution
         if self.model_config is not None and self.model_config.enforce_eager:
             logger.info("Cudagraph is disabled under eager mode")
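The new block queries a connector-side hook before allowing full graphs. A hedged sketch of a connector opting into the override (class and body are assumptions; only the hook name and its extra-config argument appear in the diff):

```python
# Hypothetical connector implementing the hook consulted above. A connector
# that moves KV layer by layer with async ops cannot have those transfers
# captured inside one full CUDA graph, so it requests PIECEWISE mode.
class LayerwiseAsyncConnector:
    @classmethod
    def requires_piecewise_for_cudagraph(cls, extra_config: dict) -> bool:
        # Assumed knob: let the deployment opt out via extra config.
        return bool(extra_config.get("layerwise_transfer", True))


print(LayerwiseAsyncConnector.requires_piecewise_for_cudagraph({}))                             # True
print(LayerwiseAsyncConnector.requires_piecewise_for_cudagraph({"layerwise_transfer": False}))  # False
```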
@@ -1113,6 +1141,20 @@ class VllmConfig:
 
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]
 
+        def is_ixserver_connector(kv_transfer_config) -> bool:
+            if kv_transfer_config is not None and hasattr(
+                kv_transfer_config, "kv_connector"
+            ):
+                connector = kv_transfer_config.kv_connector
+                if isinstance(connector, str):
+                    connector_name = connector
+                else:
+                    connector_name = getattr(
+                        type(connector), "__name__", str(connector)
+                    )
+                return "IxServer" in connector_name
+            return False
+
         # Hybrid KV cache manager (HMA) runtime rules:
         # - Explicit enable (--no-disable-kv-cache-manager): error if runtime
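Worth noting why the helper branches on `isinstance(connector, str)`: `kv_connector` may hold either a connector name or an instantiated connector object, and both spellings should match by name. A quick usage check (helper copied from the hunk; the namespace objects stand in for a real kv_transfer_config, and `IxServerConnector` is hypothetical):

```python
from types import SimpleNamespace


def is_ixserver_connector(kv_transfer_config) -> bool:
    # Copied from the hunk above.
    if kv_transfer_config is not None and hasattr(kv_transfer_config, "kv_connector"):
        connector = kv_transfer_config.kv_connector
        if isinstance(connector, str):
            connector_name = connector
        else:
            connector_name = getattr(type(connector), "__name__", str(connector))
        return "IxServer" in connector_name
    return False


class IxServerConnector:  # hypothetical connector type for the demo
    pass


print(is_ixserver_connector(SimpleNamespace(kv_connector="IxServerConnector")))  # True
print(is_ixserver_connector(SimpleNamespace(kv_connector=IxServerConnector()))) # True
print(is_ixserver_connector(SimpleNamespace(kv_connector="NixlConnector")))     # False
print(is_ixserver_connector(None))                                              # False
```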
@@ -1154,21 +1196,29 @@ class VllmConfig:
         if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
             # Default to disable HMA, but only if the user didn't express a preference.
             if self.kv_transfer_config is not None:
-                need_disable_hybrid_kv_cache_manager = True
-                logger.warning(
-                    "Turning off hybrid kv cache manager because "
-                    "`--kv-transfer-config` is set. This will reduce the "
-                    "performance of vLLM on LLMs with sliding window attention "
-                    "or Mamba attention. If you are a developer of kv connector"
-                    ", please consider supporting hybrid kv cache manager for "
-                    "your connector by making sure your connector is a subclass"
-                    " of `SupportsHMA` defined in kv_connector/v1/base.py and"
-                    " use --no-disable-hybrid-kv-cache-manager to start vLLM."
-                )
-            self.scheduler_config.disable_hybrid_kv_cache_manager = (
-                need_disable_hybrid_kv_cache_manager
-            )
+                if is_ixserver_connector(self.kv_transfer_config):
+                    pass
+                else:
+                    # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
+                    need_disable_hybrid_kv_cache_manager = True
+                    logger.warning(
+                        "Turning off hybrid kv cache manager because "
+                        "`--kv-transfer-config` is set. This will reduce the "
+                        "performance of vLLM on LLMs with sliding window attention "
+                        "or Mamba attention. If you are a developer of kv connector"
+                        ", please consider supporting hybrid kv cache manager for "
+                        "your connector by making sure your connector is a subclass"
+                        " of `SupportsHMA` defined in kv_connector/v1/base.py and"
+                        " use --no-disable-hybrid-kv-cache-manager to start vLLM."
+                    )
+                self.scheduler_config.disable_hybrid_kv_cache_manager = (
+                    need_disable_hybrid_kv_cache_manager
+                )
+            else:
+                self.scheduler_config.disable_hybrid_kv_cache_manager = (
+                    need_disable_hybrid_kv_cache_manager
+                )
         elif (
             self.scheduler_config.disable_hybrid_kv_cache_manager is False
             and need_disable_hybrid_kv_cache_manager
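Once the IxServer carve-out is added, the control flow above reduces to a small decision table. A condensed sketch (simplified: `need_disable_hybrid_kv_cache_manager` is assumed False absent other runtime conditions, and the real code also handles the conflict case where an explicit `--no-disable-hybrid-kv-cache-manager` meets a runtime that needs HMA off, per the `elif` at the end of the hunk):

```python
# Simplified restatement of the HMA default chosen in this hunk. user_pref is
# the tri-state CLI flag: None = no preference, True/False = explicit.
def resolve_disable_hma(user_pref: bool | None,
                        has_kv_transfer: bool,
                        is_ixserver: bool) -> bool:
    if user_pref is not None:
        return user_pref  # explicit preference wins (conflicts handled elsewhere)
    if has_kv_transfer and not is_ixserver:
        return True  # generic connectors: HMA off by default, with a warning
    return False  # no connector, or the IxServer connector: keep HMA on


print(resolve_disable_hma(None, has_kv_transfer=True, is_ixserver=False))   # True
print(resolve_disable_hma(None, has_kv_transfer=True, is_ixserver=True))    # False
print(resolve_disable_hma(False, has_kv_transfer=True, is_ixserver=False))  # False
```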
@@ -1466,22 +1516,22 @@ class VllmConfig:
             if compile_range_end is not None:
                 computed_compile_ranges_split_points.append(compile_range_end)
 
-        # # Add the compile ranges for flashinfer
-        # if compilation_config.pass_config.fuse_allreduce_rms:
-        #     tp_size = self.parallel_config.tensor_parallel_size
-        #     max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
-        #     if max_size is not None:
-        #         max_token_num = max_size // (
-        #             self.model_config.get_hidden_size()
-        #             * self.model_config.dtype.itemsize
-        #         )
-        #         if compile_range_end is not None and max_token_num < compile_range_end:
-        #             computed_compile_ranges_split_points.append(max_token_num)
-        #         else:
-        #             logger.debug(
-        #                 "Max num batched tokens below allreduce-rms fusion threshold, "
-        #                 "allreduce-rms fusion will be enabled for all num_tokens."
-        #             )
+        # Add the compile ranges for flashinfer
+        if compilation_config.pass_config.fuse_allreduce_rms:
+            tp_size = self.parallel_config.tensor_parallel_size
+            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+            if max_size is not None:
+                max_token_num = max_size // (
+                    self.model_config.get_hidden_size()
+                    * self.model_config.dtype.itemsize
+                )
+                if compile_range_end is not None and max_token_num < compile_range_end:
+                    computed_compile_ranges_split_points.append(max_token_num)
+                else:
+                    logger.debug(
+                        "Max num batched tokens below allreduce-rms fusion threshold, "
+                        "allreduce-rms fusion will be enabled for all num_tokens."
+                    )
 
         # Add the compile ranges for sequence parallelism
         if compilation_config.pass_config.enable_sp:
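The byte-budget-to-token conversion in the re-enabled block is easy to sanity-check by hand. Illustrative numbers below; the actual budget comes from `flashinfer_max_size(tp_size)`:

```python
# Worked example of max_token_num = max_size // (hidden_size * itemsize).
hidden_size = 4096           # model hidden dimension
itemsize = 2                 # bytes per element for bf16/fp16
max_size = 64 * 1024 * 1024  # assumed 64 MiB fusion budget for this TP size

bytes_per_token = hidden_size * itemsize      # 8192 bytes
max_token_num = max_size // bytes_per_token   # 8192 tokens
print(max_token_num)

# If the compile range ends above this, 8192 becomes an extra split point so
# the fused allreduce+rms kernel only serves batches under the cap.
```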
@@ -1618,6 +1668,7 @@ class VllmConfig:
             f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
             f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
             f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
             f"quantization={self.model_config.quantization}, "
             f"enforce_eager={self.model_config.enforce_eager}, "
+            f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, "  # noqa
             f"kv_cache_dtype={self.cache_config.cache_dtype}, "