Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -5,8 +5,6 @@ from dataclasses import dataclass
 from functools import cached_property
 from typing import TYPE_CHECKING

-from vllm._bc_linter import bc_linter_include
-
 if TYPE_CHECKING:
    import numpy as np
    import numpy.typing as npt
@@ -29,7 +27,6 @@ else:
    Request = object


-@bc_linter_include
 @dataclass
 class NewRequestData:
    req_id: str
@@ -109,7 +106,6 @@ class NewRequestData:
        )


-@bc_linter_include
 @dataclass
 class CachedRequestData:
    req_ids: list[str]
@@ -179,7 +175,6 @@ class CachedRequestData:
        )


-@bc_linter_include
 @dataclass
 class SchedulerOutput:
    # list of the requests that are scheduled for the first time.
@@ -217,6 +212,9 @@ class SchedulerOutput:
    # freed from the encoder cache.
    free_encoder_mm_hashes: list[str]

+    # Request IDs that are resumed from preemption in this step.
+    scheduled_resumed_reqs: list[str] | None = None
+
    # Request IDs that are preempted in this step.
    # Only used for v2 model runner.
    preempted_req_ids: set[str] | None = None
@@ -238,6 +236,11 @@ class SchedulerOutput:
    # EC Cache Connector metadata
    ec_connector_metadata: ECConnectorMetadata | None = None

+    # Block IDs freshly allocated from the pool during this scheduling step.
+    # The worker zeros the corresponding GPU memory before the blocks are used,
+    # preventing stale NaN/data from corrupting attention or SSM computation.
+    new_block_ids_to_zero: list[int] | None = None
+
    @classmethod
    def make_empty(cls) -> "SchedulerOutput":
        return cls(