Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -86,9 +86,16 @@ class CUDAGraphMode(enum.Enum):
def separate_routine(self) -> bool:
return isinstance(self.value, tuple)
def decode_use_graph(self) -> bool:
return self.decode_mode() == CUDAGraphMode.FULL
def valid_runtime_modes(self) -> bool:
return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
@classmethod
def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
def is_valid_runtime_mode(self) -> bool:
return self in CUDAGraphMode.valid_runtime_modes()
def __str__(self) -> str:
return self.name
@@ -385,7 +392,7 @@ class CompilationConfig:
Please use mode. Currently all levels are mapped to mode.
"""
# Top-level Compilation control
mode: CompilationMode = Field(default=None)
mode: CompilationMode = Field(default=CompilationMode.NONE)
"""The compilation approach used for torch.compile-based compilation of the
model.
@@ -503,7 +510,7 @@ class CompilationConfig:
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
# CudaGraph compilation
cudagraph_mode: CUDAGraphMode = Field(default=None)
cudagraph_mode: CUDAGraphMode = Field(default=CUDAGraphMode.FULL_DECODE_ONLY)
"""
The mode of the cudagraph:
@@ -1003,6 +1010,7 @@ class CompilationConfig:
# https://github.com/vllm-project/vllm/issues/33267
if not self.use_inductor_graph_partition:
self.splitting_ops.append("vllm::unified_kv_cache_update")
self.splitting_ops.append("vllm::unified_mla_kv_cache_update")
elif len(self.splitting_ops) == 0:
if (
@@ -1045,7 +1053,7 @@ class CompilationConfig:
"are optimized for prefill and are incompatible with CUDA Graphs. "
"In order to use CUDA Graphs for decode-optimized workloads, "
"use --all2all-backend with another option, such as "
"deepep_low_latency, pplx, or allgather_reducescatter."
"deepep_low_latency or allgather_reducescatter."
)
self.cudagraph_mode = CUDAGraphMode.NONE