Upgrade to vLLM 0.17.0 (Corex v4.1 overlay).
This commit is contained in:
@@ -86,9 +86,16 @@ class CUDAGraphMode(enum.Enum):
|
||||
|
||||
def separate_routine(self) -> bool:
|
||||
return isinstance(self.value, tuple)
|
||||
|
||||
def decode_use_graph(self) -> bool:
|
||||
return self.decode_mode() == CUDAGraphMode.FULL
|
||||
|
||||
def valid_runtime_modes(self) -> bool:
|
||||
return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
|
||||
@classmethod
|
||||
def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
|
||||
return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
|
||||
|
||||
def is_valid_runtime_mode(self) -> bool:
|
||||
return self in CUDAGraphMode.valid_runtime_modes()
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.name
|
||||
@@ -385,7 +392,7 @@ class CompilationConfig:
     Please use mode. Currently all levels are mapped to mode.
     """
     # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)
+    mode: CompilationMode = Field(default=CompilationMode.NONE)
     """The compilation approach used for torch.compile-based compilation of the
     model.
@@ -503,7 +510,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)
+    cudagraph_mode: CUDAGraphMode = Field(default=CUDAGraphMode.FULL_DECODE_ONLY)
     """
     The mode of the cudagraph:
@@ -1003,6 +1010,7 @@ class CompilationConfig:
             # https://github.com/vllm-project/vllm/issues/33267
             if not self.use_inductor_graph_partition:
                 self.splitting_ops.append("vllm::unified_kv_cache_update")
+                self.splitting_ops.append("vllm::unified_mla_kv_cache_update")

         elif len(self.splitting_ops) == 0:
             if (
@@ -1045,7 +1053,7 @@ class CompilationConfig:
                 "are optimized for prefill and are incompatible with CUDA Graphs. "
                 "In order to use CUDA Graphs for decode-optimized workloads, "
                 "use --all2all-backend with another option, such as "
-                "deepep_low_latency, pplx, or allgather_reducescatter."
+                "deepep_low_latency or allgather_reducescatter."
             )
             self.cudagraph_mode = CUDAGraphMode.NONE
Reference in New Issue
Block a user