Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -70,6 +70,42 @@ class AsyncOutput(AsyncModelRunnerOutput):
return self.model_runner_output
class AsyncPoolingOutput(AsyncModelRunnerOutput):
def __init__(
self,
model_runner_output: ModelRunnerOutput,
pooler_output: torch.Tensor,
is_valid: torch.Tensor | None,
main_stream: torch.cuda.Stream,
copy_stream: torch.cuda.Stream,
copy_event: torch.cuda.Event,
):
self.model_runner_output = model_runner_output
self.pooler_output = pooler_output
self.is_valid = is_valid
self.copy_event = copy_event
with stream(copy_stream, main_stream):
copy_stream.wait_stream(main_stream)
self.pooler_output_cpu = self.pooler_output.to("cpu", non_blocking=True)
if self.is_valid is not None:
self.is_valid_cpu = self.is_valid.to("cpu", non_blocking=True)
else:
self.is_valid_cpu = None
self.copy_event.record(copy_stream)
def get_output(self) -> ModelRunnerOutput:
self.copy_event.synchronize()
pooler_output = self.pooler_output_cpu.unbind(dim=0)
if self.is_valid_cpu is not None:
is_valid_cpu = self.is_valid_cpu.tolist()
for i, is_valid in enumerate(is_valid_cpu):
if not is_valid:
pooler_output[i] = None
self.model_runner_output.pooler_output = pooler_output
return self.model_runner_output
def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
return x.to("cpu", non_blocking=True).numpy()