Upgrade to vLLM 0.17.0 (Corex v4.1 overlay)
This commit is contained in:
@@ -2,8 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, NamedTuple, TypeAlias
|
||||
from typing import TYPE_CHECKING, NamedTuple, TypeAlias, TypeVar
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -120,6 +121,20 @@ class SamplerOutput:
|
||||
logprobs_tensors: LogprobsTensors | None
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def _combine_non_none(f: Callable[[T, T], T], items: list[T | None]) -> T | None:
|
||||
non_none = [item for item in items if item is not None]
|
||||
if len(non_none) == 0:
|
||||
return None
|
||||
|
||||
combined = non_none[0]
|
||||
for item in non_none[1:]:
|
||||
combined = f(combined, item)
|
||||
return combined
|
||||
|
||||
|
||||
@dataclass
|
||||
class KVConnectorOutput:
|
||||
# [req_ids]
|
||||
@@ -146,6 +161,43 @@ class KVConnectorOutput:
|
||||
and not self.invalid_block_ids
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def merge(cls, *outputs: "KVConnectorOutput"):
|
||||
assert len(outputs) > 0, "Cannot merge empty outputs"
|
||||
finished_sending = _combine_non_none(
|
||||
set.union, [output.finished_sending for output in outputs]
|
||||
)
|
||||
finished_recving = _combine_non_none(
|
||||
set.union, [output.finished_recving for output in outputs]
|
||||
)
|
||||
kv_connector_stats = _combine_non_none(
|
||||
lambda x, y: x.aggregate(y),
|
||||
[output.kv_connector_stats for output in outputs],
|
||||
)
|
||||
kv_cache_events = _combine_non_none(
|
||||
lambda x, y: x.merge(y),
|
||||
[output.kv_cache_events for output in outputs],
|
||||
)
|
||||
invalid_block_ids = _combine_non_none(
|
||||
set.union, [output.invalid_block_ids for output in outputs]
|
||||
)
|
||||
assert invalid_block_ids is not None
|
||||
|
||||
assert all(
|
||||
output.expected_finished_count == outputs[0].expected_finished_count
|
||||
for output in outputs
|
||||
)
|
||||
expected_finished_count = outputs[0].expected_finished_count
|
||||
|
||||
return cls(
|
||||
finished_sending=finished_sending,
|
||||
finished_recving=finished_recving,
|
||||
kv_connector_stats=kv_connector_stats,
|
||||
kv_cache_events=kv_cache_events,
|
||||
invalid_block_ids=invalid_block_ids,
|
||||
expected_finished_count=expected_finished_count,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ECConnectorOutput:
|
||||
@@ -153,7 +205,12 @@ class ECConnectorOutput:
|
||||
finished_sending: set[str] | None = None
|
||||
finished_recving: set[str] | None = None
|
||||
|
||||
|
||||
@dataclass
class DraftTokenIds:
    """Speculative-decoding draft tokens, grouped per request."""

    # Request identifiers, one per scheduled request: [num_reqs].
    req_ids: list[str]
    # Draft token ids per request: num_reqs x num_draft_tokens.
    draft_token_ids: list[list[int]]
|
||||
# ModelRunnerOutput is serialized and sent to the scheduler process.
|
||||
# This is expensive for torch.Tensor so prefer to use list instead.
|
||||
@dataclass
|
||||
@@ -191,6 +248,8 @@ class ModelRunnerOutput:
|
||||
|
||||
# req_id -> num_nans_in_logits
|
||||
num_nans_in_logits: dict[str, int] | None = None
|
||||
|
||||
draft_token_ids: DraftTokenIds | None = None
|
||||
|
||||
# information related to cudagraph execution
|
||||
cudagraph_stats: CUDAGraphStat | None = None
|
||||
@@ -209,13 +268,6 @@ class AsyncModelRunnerOutput(ABC):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
class DraftTokenIds:
    """Per-request draft token ids produced by the speculative decoder."""

    # One request id per entry: [num_reqs].
    req_ids: list[str]
    # Token ids for each request: num_reqs x num_draft_tokens.
    draft_token_ids: list[list[int]]
|
||||
|
||||
|
||||
def make_empty_encoder_model_runner_output(
|
||||
scheduler_output: "SchedulerOutput",
|
||||
|
||||
Reference in New Issue
Block a user