Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -2,8 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, NamedTuple, TypeAlias
from typing import TYPE_CHECKING, NamedTuple, TypeAlias, TypeVar
import numpy as np
import torch
@@ -120,6 +121,20 @@ class SamplerOutput:
logprobs_tensors: LogprobsTensors | None
T = TypeVar("T")
def _combine_non_none(f: Callable[[T, T], T], items: list[T | None]) -> T | None:
non_none = [item for item in items if item is not None]
if len(non_none) == 0:
return None
combined = non_none[0]
for item in non_none[1:]:
combined = f(combined, item)
return combined
@dataclass
class KVConnectorOutput:
# [req_ids]
@@ -146,6 +161,43 @@ class KVConnectorOutput:
and not self.invalid_block_ids
)
@classmethod
def merge(cls, *outputs: "KVConnectorOutput"):
    """Fold one or more KVConnectorOutput instances into a single one.

    Set-valued fields are unioned; stats and cache events are combined via
    their own ``aggregate``/``merge`` methods. All inputs must agree on
    ``expected_finished_count``.
    """
    assert len(outputs) > 0, "Cannot merge empty outputs"
    sending = _combine_non_none(
        set.union, [out.finished_sending for out in outputs]
    )
    recving = _combine_non_none(
        set.union, [out.finished_recving for out in outputs]
    )
    stats = _combine_non_none(
        lambda a, b: a.aggregate(b),
        [out.kv_connector_stats for out in outputs],
    )
    events = _combine_non_none(
        lambda a, b: a.merge(b),
        [out.kv_cache_events for out in outputs],
    )
    invalid = _combine_non_none(
        set.union, [out.invalid_block_ids for out in outputs]
    )
    # invalid_block_ids is never None on any input, so the union is non-None.
    assert invalid is not None
    first_expected = outputs[0].expected_finished_count
    assert all(
        out.expected_finished_count == first_expected for out in outputs
    )
    return cls(
        finished_sending=sending,
        finished_recving=recving,
        kv_connector_stats=stats,
        kv_cache_events=events,
        invalid_block_ids=invalid,
        expected_finished_count=first_expected,
    )
@dataclass
class ECConnectorOutput:
@@ -153,7 +205,12 @@ class ECConnectorOutput:
finished_sending: set[str] | None = None
finished_recving: set[str] | None = None
@dataclass
class DraftTokenIds:
    """Pairs each request id with its list of draft token ids."""

    # [num_reqs]
    req_ids: list[str]
    # num_reqs x num_draft_tokens
    draft_token_ids: list[list[int]]
# ModelRunnerOutput is serialized and sent to the scheduler process.
# This is expensive for torch.Tensor so prefer to use list instead.
@dataclass
@@ -191,6 +248,8 @@ class ModelRunnerOutput:
# req_id -> num_nans_in_logits
num_nans_in_logits: dict[str, int] | None = None
draft_token_ids: DraftTokenIds | None = None
# information related to cudagraph execution
cudagraph_stats: CUDAGraphStat | None = None
@@ -209,13 +268,6 @@ class AsyncModelRunnerOutput(ABC):
pass
@dataclass
class DraftTokenIds:
    """Pairs each request id with its list of draft token ids."""

    # [num_reqs]
    req_ids: list[str]
    # num_reqs x num_draft_tokens
    draft_token_ids: list[list[int]]
def make_empty_encoder_model_runner_output(
scheduler_output: "SchedulerOutput",