Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Set as AbstractSet
 from dataclasses import replace
 from itertools import product

@@ -136,7 +137,7 @@ class CudagraphDispatcher:
        num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]

        if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL):
-            num_reqs = num_tokens_padded // uniform_decode_query_len
+            num_reqs = min(num_tokens_padded // uniform_decode_query_len, max_num_seqs)
            assert num_tokens_padded % uniform_decode_query_len == 0
        else:
            uniform_decode = False
@@ -232,8 +233,9 @@ class CudagraphDispatcher:
        num_tokens: int,
        uniform_decode: bool = False,
        has_lora: bool = False,
-        disable_full: bool = False,
        num_active_loras: int = 0,
+        valid_modes: AbstractSet[CUDAGraphMode] | None = None,
+        invalid_modes: AbstractSet[CUDAGraphMode] | None = None,
    ) -> tuple[CUDAGraphMode, BatchDescriptor]:
        """
        Given conditions(e.g.,batch descriptor and if using piecewise only),
@@ -246,15 +248,29 @@ class CudagraphDispatcher:
            uniform_decode: Whether the batch is uniform decode (i.e. uniform and query
                length is uniform_decode_query_len).
            has_lora: Whether LoRA is active.
-            disable_full: If True, skip FULL cudagraph checks and
-                return PIECEWISE or NONE only. (can be used for features like
-                cascade attention that are not supported by full cudagraphs)
            num_active_loras: Number of distinct active LoRA adapters.
+            valid_modes: Set of cudagraph modes that are allowed. None means
+                all modes are allowed.
+            invalid_modes: Set of cudagraph modes to exclude. Subtracted from
+                valid_modes to compute allowed modes. (e.g., {FULL} for
+                features like cascade attention not supported by full
+                cudagraphs). None means no modes are excluded.
        """
+        allowed_modes = valid_modes or CUDAGraphMode.valid_runtime_modes()
+
+        if invalid_modes:
+            allowed_modes -= invalid_modes
+
+        assert len(allowed_modes) >= 1, (
+            f"No allowed cudagraph modes: valid_modes={valid_modes}, "
+            f"invalid_modes={invalid_modes}"
+        )
+
        if (
            not self.keys_initialized
            or self.cudagraph_mode == CUDAGraphMode.NONE
            or num_tokens > self.compilation_config.max_cudagraph_capture_size
+            or allowed_modes <= {CUDAGraphMode.NONE}
        ):
            return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)

@@ -281,24 +297,26 @@ class CudagraphDispatcher:
            num_tokens, uniform_decode, has_lora, effective_num_active_loras
        )

-        # check if key exists for full cudagraph
-        # For pure FULL mode, keys are registered with uniform=False.
-        batch_desc_to_check = batch_desc
-        if self.cudagraph_mode == CUDAGraphMode.FULL:
-            batch_desc_to_check = replace(batch_desc, uniform=False)
-        if (
-            not disable_full
-            and batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.FULL]
-        ):
-            return CUDAGraphMode.FULL, batch_desc_to_check
+        if CUDAGraphMode.FULL in allowed_modes:
+            # check if key exists for full cudagraph
+            # For pure FULL mode, keys are registered with uniform=False.
+            batch_desc_to_check = batch_desc
+            if self.cudagraph_mode == CUDAGraphMode.FULL:
+                batch_desc_to_check = replace(batch_desc, uniform=False)
+            if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.FULL]:
+                return CUDAGraphMode.FULL, batch_desc_to_check

-        # also check if the relaxed key exists for more "general"
-        # piecewise cudagraph
-        batch_desc_to_check = replace(batch_desc, num_reqs=None, uniform=False)
-        if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]:
-            return CUDAGraphMode.PIECEWISE, batch_desc_to_check
+        if CUDAGraphMode.PIECEWISE in allowed_modes:
+            # also check if the relaxed key exists for more "general"
+            # piecewise cudagraph
+            batch_desc_to_check = replace(batch_desc, num_reqs=None, uniform=False)
+            if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]:
+                return CUDAGraphMode.PIECEWISE, batch_desc_to_check

-        # finally, just return no cudagraphs and a trivial batch descriptor
+        assert CUDAGraphMode.NONE in allowed_modes, (
+            f"No matching cudagraph found and NONE is not in "
+            f"allowed_modes={allowed_modes}"
+        )
        return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)

    def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: