from concurrent.futures import Future
from typing import Callable, Union

import torch
import torch.distributed as dist

from vllm.config import VllmConfig
from vllm.executor.executor_base import ExecutorBase
from vllm.executor.uniproc_executor import (  # noqa
    ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0)
from vllm.executor.uniproc_executor import (  # noqa
    UniProcExecutor as UniProcExecutorV0)
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput

# A callable that takes no arguments and returns nothing.
# NOTE(review): the name suggests it is invoked when the executor fails;
# nothing visible here uses it, so confirm against the callers.
FailureCallback = Callable[[], None]


class Executor(ExecutorBase):
    """Executor built on top of ``ExecutorBase``.

    Everything this class does is delegated to ``self.collective_rpc``,
    which this class does not define itself (presumably inherited from
    ``ExecutorBase`` -- TODO confirm).
    """

    def determine_available_memory_block(self) -> list[tuple[int, int]]:
        """Run ``determine_available_memory_block`` via ``collective_rpc``.

        Returns:
            Whatever ``collective_rpc`` returns for that call, unmodified.
            Annotated as a list of ``(int, int)`` pairs expressed in bytes.
            NOTE(review): presumably one pair per worker; the meaning of
            the two integers is not visible here -- confirm against the
            worker-side implementation.
        """
        # ``list[(int, int)]`` is the same subscription as ``list[int, int]``,
        # which is not a valid type; a list of pairs is ``list[tuple[int, int]]``.
        return self.collective_rpc("determine_available_memory_block")