Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from contextlib import contextmanager
+
+import numpy as np
+import torch
+
+from vllm.v1.outputs import (
+    AsyncModelRunnerOutput,
+    LogprobsTensors,
+    ModelRunnerOutput,
+)
+from vllm.v1.worker.gpu.sample.output import SamplerOutput
+
+
+class AsyncOutput(AsyncModelRunnerOutput):
+    def __init__(
+        self,
+        model_runner_output: ModelRunnerOutput,
+        sampler_output: SamplerOutput,
+        num_sampled_tokens: torch.Tensor,
+        copy_stream: torch.cuda.Stream,
+        copy_event: torch.cuda.Event,
+    ):
+        # NOTE(woosuk): We must retain references to the GPU tensors,
+        # as the copy operations are performed on a different CUDA stream than
+        # the one where the tensors were created.
+        self.model_runner_output = model_runner_output
+        self.sampler_output = sampler_output
+        self.num_sampled_tokens = num_sampled_tokens
+        self.copy_stream = copy_stream
+        self.copy_event = copy_event
+
+        default_stream = torch.cuda.current_stream()
+        with torch.cuda.stream(self.copy_stream):
+            self.copy_stream.wait_stream(default_stream)
+
+            self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
+            if sampler_output.logprobs_tensors is not None:
+                self.logprobs_tensors: LogprobsTensors | None = (
+                    sampler_output.logprobs_tensors.to_cpu_nonblocking()
+                )
+            else:
+                self.logprobs_tensors = None
+            if sampler_output.num_nans is not None:
+                self.num_nans = async_copy_to_np(sampler_output.num_nans)
+            else:
+                self.num_nans = None
+            self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens)
+            self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
+            if self.model_runner_output.prompt_logprobs_dict:
+                for k, v in self.model_runner_output.prompt_logprobs_dict.items():
+                    if v is not None:
+                        self.prompt_logprobs_dict[k] = v.to_cpu_nonblocking()
+                    else:
+                        self.prompt_logprobs_dict[k] = None
+            self.copy_event.record(self.copy_stream)
+
+    def get_output(self) -> ModelRunnerOutput:
+        self.copy_event.synchronize()
+
+        # NOTE(woosuk): The following code is to ensure compatibility with
+        # the existing model runner.
+        # Going forward, we should keep the data structures as NumPy arrays
+        # rather than Python lists.
+        sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
+        num_reqs = len(sampled_token_ids)
+        num_sampled_tokens = self.num_sampled_tokens_np.tolist()
+        for i in range(num_reqs):
+            del sampled_token_ids[i][num_sampled_tokens[i] :]
+        self.model_runner_output.sampled_token_ids = sampled_token_ids
+
+        if self.num_nans is not None:
+            num_nans = self.num_nans.tolist()
+            self.model_runner_output.num_nans_in_logits = {
+                req_id: num_nans[i]
+                for i, req_id in enumerate(self.model_runner_output.req_ids)
+            }
+
+        if self.logprobs_tensors is not None:
+            self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
+        self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
+        return self.model_runner_output
+
+
+@contextmanager
+def async_barrier(event: torch.cuda.Event | None):
+    if event is not None:
+        event.synchronize()
+    try:
+        yield
+    finally:
+        if event is not None:
+            event.record()
+
+
+def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
+    return x.to("cpu", non_blocking=True).numpy()