24 lines
783 B
Python
24 lines
783 B
Python
|
|
|
||
|
|
from concurrent.futures import Future
|
||
|
|
from typing import Callable, Union
|
||
|
|
|
||
|
|
import torch
|
||
|
|
import torch.distributed as dist
|
||
|
|
|
||
|
|
from vllm.config import VllmConfig
|
||
|
|
from vllm.executor.executor_base import ExecutorBase
|
||
|
|
from vllm.executor.uniproc_executor import ( # noqa
|
||
|
|
ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0)
|
||
|
|
from vllm.executor.uniproc_executor import ( # noqa
|
||
|
|
UniProcExecutor as UniProcExecutorV0)
|
||
|
|
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
||
|
|
from vllm.v1.outputs import ModelRunnerOutput
|
||
|
|
|
||
|
|
# Type alias for a callable that takes no arguments and returns None.
# NOTE(review): presumably invoked when an executor fails; no use of this
# alias is visible in this part of the file, so confirm against callers.
FailureCallback = Callable[[], None]
|
||
|
|
|
||
|
|
|
||
|
|
class Executor(ExecutorBase):
    """Executor that forwards memory-probing requests to its workers.

    Builds on ``ExecutorBase`` and relies on its ``collective_rpc`` method
    to dispatch calls by method name.
    """

    def determine_available_memory_block(self) -> list[tuple[int, int]]:
        """Ask the workers for their available memory, in bytes.

        Issues a ``collective_rpc`` call for the
        ``"determine_available_memory_block"`` method and returns the
        result unchanged.

        Returns:
            The raw ``collective_rpc`` result, annotated as a list of
            ``(int, int)`` pairs expressed in bytes.
            NOTE(review): presumably one pair per worker; the meaning of
            each element of the pair is not visible here -- confirm
            against the worker implementation.
        """
        # The previous annotation ``list[(int, int)]`` was not a valid type
        # expression (it evaluates to ``list[int, int]``); a list of 2-int
        # tuples is spelled ``list[tuple[int, int]]``.
        return self.collective_rpc("determine_available_memory_block")
|