forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
0  vllm-v0.6.2/vllm/executor/__init__.py  Normal file
BIN  vllm-v0.6.2/vllm/executor/__pycache__/__init__.cpython-310.pyc  Normal file
Binary file not shown.
BIN  vllm-v0.6.2/vllm/executor/__pycache__/ray_utils.cpython-310.pyc  Normal file
Binary file not shown.
377  vllm-v0.6.2/vllm/executor/cpu_executor.py  Normal file
@@ -0,0 +1,377 @@
import os
from functools import partial
from typing import Any, Awaitable, List, Optional, Set, Tuple, Union

import vllm.envs as envs
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port,
                        get_vllm_instance_id, make_async)
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)


class CPUExecutor(ExecutorBase):

    uses_ray: bool = False

    def _init_executor(self) -> None:
        assert self.device_config.device_type == "cpu"
        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
        # If the feature combo become valid
        assert self.lora_config is None, "cpu backend doesn't support LoRA"

        #
        # Environment variables for CPU executor
        #

        # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
        os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()

        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

        # Intel OpenMP setting
        ld_prealod_str = os.getenv("LD_PRELOAD", "")
        if "libiomp5.so" in ld_prealod_str:
            # The time(milliseconds) that a thread should wait after
            # completing the execution of a parallel region, before sleeping.
            os.environ['KMP_BLOCKTIME'] = "1"
            # Prevents the CPU to run into low performance state
            os.environ['KMP_TPAUSE'] = "0"
            # Provides fine granularity parallelism
            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"

        # To hint IPEX uses shared memory based AllReduce
        os.environ["LOCAL_WORLD_SIZE"] = str(
            self.parallel_config.tensor_parallel_size)

        self.model_config = _verify_and_get_model_config(self.model_config)
        self.cache_config = _verify_and_get_cache_config(self.cache_config)
        self.scheduler_config = _verify_and_get_scheduler_config(
            self.scheduler_config)
        self.parallel_config = _verify_and_get_parallel_config(
            self.parallel_config)

        # Multiprocessing-based executor does not support multi-node setting.
        # Since it only works for single node, we can use the loopback address
        # 127.0.0.1 for communication.
        ip = "127.0.0.1"
        port = get_open_port()
        self.distributed_init_method = get_distributed_init_method(ip, port)

        is_async = isinstance(self, CPUExecutorAsync)

        world_size = self.parallel_config.tensor_parallel_size
        result_handler = ResultHandler()
        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
        self.workers = []

        if is_async:
            self.workers = [
                ProcessWorkerWrapper(
                    result_handler,
                    partial(
                        self._create_worker,
                        rank=rank,
                        local_rank=rank,
                    )) for rank in range(0, world_size)
            ]
            self.driver_worker = self.workers[0]
            self.workers = self.workers[1:]
            self.driver_method_invoker = _async_driver_method_invoker
        else:
            self.driver_worker = self._create_worker()
            self.driver_method_invoker = _driver_method_invoker

            if world_size != 1:
                self.workers = [
                    ProcessWorkerWrapper(
                        result_handler,
                        partial(
                            self._create_worker,
                            rank=rank,
                            local_rank=rank,
                        )) for rank in range(1, world_size)
                ]

        self.worker_monitor = None
        if world_size != 1 or is_async:
            if is_async:
                async_worker_list = self.workers + [self.driver_worker]
            else:
                async_worker_list = self.workers
            self.worker_monitor = WorkerMonitor(async_worker_list,
                                                result_handler)
            result_handler.start()
            self.worker_monitor.start()

        self._run_workers("init_device")
        self._run_workers("load_model")

    def _create_worker(
        self,
        local_rank: int = 0,
        rank: int = 0,
    ):
        worker_module_name = "vllm.worker.cpu_worker"
        worker_class_name = "CPUWorker"

        wrapper = WorkerWrapperBase(
            worker_module_name=worker_module_name,
            worker_class_name=worker_class_name,
        )

        assert self.distributed_init_method is not None

        kwargs = dict(
            vllm_config=self.vllm_config,
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=self.distributed_init_method,
            kv_cache_dtype=self.cache_config.cache_dtype,
            is_driver_worker=rank == 0,
        )
        wrapper.init_worker(**kwargs)

        return wrapper.worker

    def _run_workers(
        self,
        method: str,
        *args,
        async_run_remote_workers_only: bool = False,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers.

        Args:
            async_run_remote_workers_only: If True the method will be run only
                in the remote workers, not the driver worker. It will also be
                run asynchronously and return a list of futures rather than
                blocking on the results.
        """

        if max_concurrent_workers:
            raise NotImplementedError(
                "max_concurrent_workers is not supported yet.")

        # Start the workers first.
        worker_outputs = [
            worker.execute_method(method, *args, **kwargs)
            for worker in self.workers
        ]

        if async_run_remote_workers_only:
            # Just return futures
            return worker_outputs

        driver_worker_output = self.driver_method_invoker(
            self.driver_worker, method, *args, **kwargs)

        # Get the results of the workers.
        return [driver_worker_output
                ] + [output.get() for output in worker_outputs]

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker.
        """
        return self.driver_method_invoker(self.driver_worker,
                                          "determine_num_available_blocks")

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache by invoking the underlying worker.
        """
        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        # NOTE: `cpu block` for CPU backend is located on CPU memory but is
        # referred as `gpu block`. Because we want to reuse the existing block
        # management procedure.
        logger.info("# CPU blocks: %d", num_gpu_blocks)

        self._run_workers("initialize_cache",
                          num_gpu_blocks=num_gpu_blocks,
                          num_cpu_blocks=num_cpu_blocks)

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        if (self.parallel_config.tensor_parallel_size > 1
                and self.parallel_worker_tasks is None):
            self.parallel_worker_tasks = self._run_workers(
                "start_worker_execution_loop",
                async_run_remote_workers_only=True,
            )
        output = self.driver_method_invoker(self.driver_worker,
                                            "execute_model", execute_model_req)
        return output

    def stop_remote_worker_execution_loop(self) -> None:
        if self.parallel_worker_tasks is None:
            return
        """
        Passing None will cause the driver to stop the model execution
        loop running in each of the remote workers.
        """
        self.driver_method_invoker(self.driver_worker, "execute_model", None)
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        self._wait_for_tasks_completion(parallel_worker_tasks)

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return all(self._run_workers("add_lora", lora_request))

    def remove_lora(self, lora_id: int) -> bool:
        return all(self._run_workers("remove_lora", lora_id))

    def pin_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return all(self._run_workers(
            "pin_lora",
            lora_id=lora_id,
        ))

    def list_loras(self) -> Set[int]:
        return self.driver_method_invoker(self.driver_worker, "list_loras")

    def add_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        return all(
            self._run_workers(
                "add_prompt_adapter",
                prompt_adapter_request,
            ))

    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        return all(
            self._run_workers(
                "remove_prompt_adapter",
                prompt_adapter_id,
            ))

    def list_prompt_adapters(self) -> Set[int]:
        return self.driver_method_invoker(self.driver_worker,
                                          "list_prompt_adapters")

    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        return all(self._run_workers(
            "pin_prompt_adapter",
            prompt_adapter_id,
        ))

    def check_health(self) -> None:
        """Raises an error if engine is unhealthy."""
        if self.worker_monitor is not None and not self.worker_monitor.is_alive(
        ):
            raise RuntimeError("Worker processes are not running")

    def shutdown(self):
        if (worker_monitor := getattr(self, "worker_monitor",
                                      None)) is not None:
            worker_monitor.close()

    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
        """Wait for futures returned from _run_workers() with
        async_run_remote_workers_only to complete."""
        for result in parallel_worker_tasks:
            result.get()

    def start_profile(self) -> None:
        self.driver_method_invoker(self.driver_worker, "start_profile")

    def stop_profile(self) -> None:
        self.driver_method_invoker(self.driver_worker, "stop_profile")


class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        output = await make_async(self.execute_model
                                  )(execute_model_req=execute_model_req, )
        return output

    async def check_health_async(self) -> None:
        self.check_health()


def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
    # If the feature combo become valid
    if not config.enforce_eager:
        logger.warning(
            "CUDA graph is not supported on CPU, fallback to the eager "
            "mode.")
        config.enforce_eager = True
    return config


def _verify_and_get_scheduler_config(
        config: SchedulerConfig) -> SchedulerConfig:
    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
    # If the feature combo become valid
    if config.chunked_prefill_enabled:
        logger.warning("Chunked prefill is not supported on CPU, disable it.")
        config.chunked_prefill_enabled = False

    return config


def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
    # If the feature combo become valid
    if config.enable_prefix_caching:
        logger.warning("Prefix caching is not supported on CPU, disable it.")
        config.enable_prefix_caching = False

    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE

    if kv_cache_space >= 0:
        if kv_cache_space == 0:
            config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
            logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
                           "for CPU backend is not set, using 4 by default.")
        else:
            config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore
    else:
        raise RuntimeError(
            "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
            f" {kv_cache_space}, expect a positive integer value.")

    return config


def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
    if (config.distributed_executor_backend is not None
            and config.distributed_executor_backend != "mp"):
        logger.warning(
            "%s is not supported on CPU, fallback to mp distributed executor "
            "backend.", config.distributed_executor_backend)
        config.distributed_executor_backend = "mp"
    return config


def _driver_method_invoker(driver, method: str, *args, **kwargs):
    return getattr(driver, method)(*args, **kwargs)


def _async_driver_method_invoker(driver, method: str, *args, **kwargs):
    return driver.execute_method(method, *args, **kwargs).get()
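A hedged usage sketch (not part of this commit): the KV-cache budget checked by _verify_and_get_cache_config above is read from the VLLM_CPU_KVCACHE_SPACE environment variable, so a CPU-backend run might be set up as follows; the model name and the 8 GiB value are illustrative assumptions, and a CPU build of vLLM is required.

import os

os.environ["VLLM_CPU_KVCACHE_SPACE"] = "8"  # GiB reserved for the CPU KV cache (assumed value)

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", device="cpu", enforce_eager=True)
print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))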
212  vllm-v0.6.2/vllm/executor/distributed_gpu_executor.py  Normal file
@@ -0,0 +1,212 @@
import asyncio
from abc import abstractmethod
from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union

from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest

logger = init_logger(__name__)


class DistributedGPUExecutor(GPUExecutor):
    """Abstract superclass of multi-GPU executor implementations."""

    def __init__(self, *args, **kwargs):
        # This is non-None when the execute model loop is running
        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
        # Updated by implementations that require additional args to be passed
        # to the _run_workers execute_model call
        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}

        super().__init__(*args, **kwargs)

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks.

        This invokes `determine_num_available_blocks` on each worker and takes
        the min of the results, guaranteeing that the selected cache sizes are
        compatible with all workers.

        Returns:
            - tuple[num_gpu_blocks, num_cpu_blocks]
        """
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self._run_workers("determine_num_available_blocks", )

        # Since we use a shared centralized controller, we take the minimum
        # number of blocks across all workers to make sure all the memory
        # operators can be applied to all workers.
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)

        return num_gpu_blocks, num_cpu_blocks

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache in all workers.
        """

        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                           self.model_config.max_model_len)
        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
                    self.model_config.max_model_len, max_concurrency)

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self._run_workers("initialize_cache",
                          num_gpu_blocks=num_gpu_blocks,
                          num_cpu_blocks=num_cpu_blocks)

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        if self.parallel_worker_tasks is None:
            self.parallel_worker_tasks = self._run_workers(
                "start_worker_execution_loop",
                async_run_tensor_parallel_workers_only=True,
                **self.extra_execute_model_run_workers_kwargs)

        # Only the driver worker returns the sampling results.
        driver_outputs = self._driver_execute_model(execute_model_req)
        assert driver_outputs is not None
        return driver_outputs

    def stop_remote_worker_execution_loop(self) -> None:
        if self.parallel_worker_tasks is None:
            return

        self._driver_execute_model(execute_model_req=None)
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        self._wait_for_tasks_completion(parallel_worker_tasks)

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "add_lora",
            lora_request=lora_request,
        )

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "remove_lora",
            lora_id=lora_id,
        )

    def pin_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "pin_lora",
            lora_id=lora_id,
        )

    def list_loras(self) -> Set[int]:
        return self._run_workers("list_loras")

    def save_sharded_state(
        self,
        path: str,
        pattern: Optional[str] = None,
        max_size: Optional[int] = None,
    ) -> None:
        self._run_workers("save_sharded_state",
                          path=path,
                          pattern=pattern,
                          max_size=max_size)

    @abstractmethod
    def _driver_execute_model(
        self, execute_model_req: Optional[ExecuteModelRequest]
    ) -> Optional[List[SamplerOutput]]:
        """Run execute_model in the driver worker.

        Passing None will cause the driver to stop the model execution loop
        running in each of the remote workers. In this case, this method
        returns None. Otherwise, this method returns the model output.
        """
        raise NotImplementedError

    @abstractmethod
    def _run_workers(
        self,
        method: str,
        *args,
        async_run_tensor_parallel_workers_only: bool = False,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers.

        Args:
            async_run_tensor_parallel_workers_only: If True the method will be
                run only in the remote TP workers, not the driver worker.
                It will also be run asynchronously and return a list of futures
                rather than blocking on the results.
        """
        raise NotImplementedError

    @abstractmethod
    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
        """Wait for futures returned from _run_workers() with
        async_run_remote_workers_only to complete."""
        raise NotImplementedError


class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        if self.parallel_worker_tasks is None:
            # Start model execution loop running in the parallel workers
            self.parallel_worker_tasks = asyncio.create_task(
                self._start_worker_execution_loop())

        # Only the driver worker returns the sampling results.
        return await self._driver_execute_model_async(execute_model_req)

    async def stop_remote_worker_execution_loop_async(self) -> None:
        if self.parallel_worker_tasks is None:
            return

        await self._driver_execute_model_async()
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        await parallel_worker_tasks

    @abstractmethod
    async def _driver_execute_model_async(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None,
    ) -> List[SamplerOutput]:
        """Execute the model asynchronously in the driver worker.

        Passing None will cause the driver to stop the model execution
        loop running in each of the remote workers.
        """
        raise NotImplementedError

    @abstractmethod
    async def _start_worker_execution_loop(self):
        """Run execution loop on all workers. It guarantees all workers run
        the loop or None of them is running the loop. Loop can be stopped by
        `stop_remote_worker_execution_loop`.
        The API is idempotent (guarantee only 1 loop run at any moment)."""
        raise NotImplementedError
212  vllm-v0.6.2/vllm/executor/distributed_mlu_executor.py  Normal file
@@ -0,0 +1,212 @@
import asyncio
from abc import abstractmethod
from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union

from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.mlu_executor import MLUExecutor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest
from vllm.model_executor.layers.sampler import SamplerOutput

logger = init_logger(__name__)


class DistributedMLUExecutor(MLUExecutor):
    """Abstract superclass of multi-MLU executor implementations."""

    def __init__(self, *args, **kwargs):
        # This is non-None when the execute model loop is running
        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
        # Updated by implementations that require additional args to be passed
        # to the _run_workers execute_model call
        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}

        super().__init__(*args, **kwargs)

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks.

        This invokes `determine_num_available_blocks` on each worker and takes
        the min of the results, guaranteeing that the selected cache sizes are
        compatible with all workers.

        Returns:
            - tuple[num_gpu_blocks, num_cpu_blocks]
        """
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self._run_workers("determine_num_available_blocks", )

        # Since we use a shared centralized controller, we take the minimum
        # number of blocks across all workers to make sure all the memory
        # operators can be applied to all workers.
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)

        return num_gpu_blocks, num_cpu_blocks

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache in all workers.
        """

        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        logger.info("# MLU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                           self.model_config.max_model_len)
        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
                    self.model_config.max_model_len, max_concurrency)

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self._run_workers("initialize_cache",
                          num_gpu_blocks=num_gpu_blocks,
                          num_cpu_blocks=num_cpu_blocks)

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        if self.parallel_worker_tasks is None:
            self.parallel_worker_tasks = self._run_workers(
                "start_worker_execution_loop",
                async_run_tensor_parallel_workers_only=True,
                **self.extra_execute_model_run_workers_kwargs)

        # Only the driver worker returns the sampling results.
        driver_outputs = self._driver_execute_model(execute_model_req)
        assert driver_outputs is not None
        return driver_outputs

    def stop_remote_worker_execution_loop(self) -> None:
        if self.parallel_worker_tasks is None:
            return

        self._driver_execute_model(execute_model_req=None)
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        self._wait_for_tasks_completion(parallel_worker_tasks)

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "add_lora",
            lora_request=lora_request,
        )

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "remove_lora",
            lora_id=lora_id,
        )

    def pin_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "pin_lora",
            lora_id=lora_id,
        )

    def list_loras(self) -> Set[int]:
        return self._run_workers("list_loras")

    def save_sharded_state(
        self,
        path: str,
        pattern: Optional[str] = None,
        max_size: Optional[int] = None,
    ) -> None:
        self._run_workers("save_sharded_state",
                          path=path,
                          pattern=pattern,
                          max_size=max_size)

    @abstractmethod
    def _driver_execute_model(
        self, execute_model_req: Optional[ExecuteModelRequest]
    ) -> Optional[List[SamplerOutput]]:
        """Run execute_model in the driver worker.

        Passing None will cause the driver to stop the model execution loop
        running in each of the remote workers. In this case, this method
        returns None. Otherwise, this method returns the model output.
        """
        raise NotImplementedError

    @abstractmethod
    def _run_workers(
        self,
        method: str,
        *args,
        async_run_tensor_parallel_workers_only: bool = False,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers.

        Args:
            async_run_tensor_parallel_workers_only: If True the method will be
                run only in the remote TP workers, not the driver worker.
                It will also be run asynchronously and return a list of futures
                rather than blocking on the results.
        """
        raise NotImplementedError

    @abstractmethod
    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
        """Wait for futures returned from _run_workers() with
        async_run_remote_workers_only to complete."""
        raise NotImplementedError


class DistributedMLUExecutorAsync(DistributedMLUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        if self.parallel_worker_tasks is None:
            # Start model execution loop running in the parallel workers
            self.parallel_worker_tasks = asyncio.create_task(
                self._start_worker_execution_loop())

        # Only the driver worker returns the sampling results.
        return await self._driver_execute_model_async(execute_model_req)

    async def stop_remote_worker_execution_loop_async(self) -> None:
        if self.parallel_worker_tasks is None:
            return

        await self._driver_execute_model_async()
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        await parallel_worker_tasks

    @abstractmethod
    async def _driver_execute_model_async(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None,
    ) -> List[SamplerOutput]:
        """Execute the model asynchronously in the driver worker.

        Passing None will cause the driver to stop the model execution
        loop running in each of the remote workers.
        """
        raise NotImplementedError

    @abstractmethod
    async def _start_worker_execution_loop(self):
        """Run execution loop on all workers. It guarantees all workers run
        the loop or None of them is running the loop. Loop can be stopped by
        `stop_remote_worker_execution_loop`.
        The API is idempotent (guarantee only 1 loop run at any moment)."""
        raise NotImplementedError
139  vllm-v0.6.2/vllm/executor/executor_base.py  Normal file
@@ -0,0 +1,139 @@
from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple

from vllm.config import VllmConfig
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest


class ExecutorBase(ABC):
    """Base class for all executors.

    An executor is responsible for executing the model on a specific device
    type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
    that can execute the model on multiple devices.
    """

    uses_ray: bool  # whether the executor uses Ray for orchestration.

    def __init__(
        self,
        vllm_config: VllmConfig,
    ) -> None:
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.load_config = vllm_config.load_config
        self.parallel_config = vllm_config.parallel_config
        self.scheduler_config = vllm_config.scheduler_config
        self.device_config = vllm_config.device_config
        self.speculative_config = vllm_config.speculative_config
        self.prompt_adapter_config = vllm_config.prompt_adapter_config
        self.observability_config = vllm_config.observability_config
        self._init_executor()

    @abstractmethod
    def _init_executor(self) -> None:
        pass

    @abstractmethod
    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available blocks for the GPU KV cache and
        swappable CPU KV cache.

        Normally, this should simply delegate to the underlying Worker. Some
        ExecutorBase may require modification of the result, e.g. to ensure the
        selected cache sizes are compatible with all workers.

        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
        are blocks that are "active" on the device and can be appended to.
        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
        appended to.
        """
        raise NotImplementedError

    @abstractmethod
    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache with the given size in blocks.
        """
        raise NotImplementedError

    @abstractmethod
    def execute_model(
            self, execute_model_req: ExecuteModelRequest
    ) -> Optional[List[SamplerOutput]]:
        """Executes at least one model step on the given sequences."""
        raise NotImplementedError

    def stop_remote_worker_execution_loop(self) -> None:
        """Releases parallel workers from model loop."""
        return

    @abstractmethod
    def add_lora(self, lora_request: LoRARequest) -> bool:
        raise NotImplementedError

    @abstractmethod
    def remove_lora(self, lora_id: int) -> bool:
        raise NotImplementedError

    @abstractmethod
    def pin_lora(self, lora_id: int) -> bool:
        raise NotImplementedError  # type: ignore

    @abstractmethod
    def list_loras(self) -> Set[int]:
        raise NotImplementedError

    @abstractmethod
    def add_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        raise NotImplementedError

    @abstractmethod
    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError

    @abstractmethod
    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError  # type: ignore

    @abstractmethod
    def list_prompt_adapters(self) -> Set[int]:
        raise NotImplementedError

    @abstractmethod
    def check_health(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        raise NotImplementedError

    def shutdown(self) -> None:
        """Shutdown the executor."""
        return

    def __del__(self):
        self.shutdown()


class ExecutorAsyncBase(ExecutorBase):

    @abstractmethod
    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Executes one model step on the given sequences."""
        raise NotImplementedError

    async def stop_remote_worker_execution_loop_async(self) -> None:
        """Releases parallel workers from model loop."""
        return

    async def check_health_async(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        self.check_health()
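For orientation, a minimal sketch (assumed, not part of this commit) of the call sequence an engine drives through this interface; `executor` is any concrete ExecutorBase and `req` an ExecuteModelRequest built by the scheduler.

# Hedged sketch of the executor lifecycle as seen from the engine.
num_gpu_blocks, num_cpu_blocks = executor.determine_num_available_blocks()
executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
outputs = executor.execute_model(req)  # List[SamplerOutput] from the driver worker
executor.check_health()
executor.shutdown()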
182  vllm-v0.6.2/vllm/executor/gpu_executor.py  Normal file
@@ -0,0 +1,182 @@
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union

from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, PoolerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)
from vllm.worker.worker_base import WorkerBase, WorkerWrapperBase

logger = init_logger(__name__)


def create_worker(worker_module_name: str, worker_class_name: str,
                  worker_class_fn: Optional[Callable[[], Type[WorkerBase]]],
                  **kwargs):
    wrapper = WorkerWrapperBase(
        worker_module_name=worker_module_name,
        worker_class_name=worker_class_name,
        worker_class_fn=worker_class_fn,
    )
    wrapper.init_worker(**kwargs)
    return wrapper.worker


class GPUExecutor(ExecutorBase):

    uses_ray: bool = False

    def _init_executor(self) -> None:
        """Initialize the worker and load the model.
        """
        assert self.parallel_config.world_size == 1, (
            "GPUExecutor only supports single GPU.")

        self.driver_worker = self._create_worker()
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def _get_worker_kwargs(
            self,
            local_rank: int = 0,
            rank: int = 0,
            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
        """Return worker init args for a given rank."""
        if distributed_init_method is None:
            distributed_init_method = get_distributed_init_method(
                get_ip(), get_open_port())
        return dict(
            vllm_config=self.vllm_config,
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=distributed_init_method,
            is_driver_worker=(not self.parallel_config)
            or (rank % self.parallel_config.tensor_parallel_size == 0),
        )

    def _get_worker_module_and_class(
            self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]:
        worker_class_fn = None
        if self.scheduler_config.is_multi_step:
            worker_module_name = "vllm.worker.multi_step_worker"
            worker_class_name = "MultiStepWorker"
        elif self.speculative_config:
            worker_module_name = "vllm.spec_decode.spec_decode_worker"
            worker_class_name = "create_spec_worker"
        else:
            worker_module_name = "vllm.worker.worker"
            worker_class_name = "Worker"
        return (worker_module_name, worker_class_name, worker_class_fn)

    def _get_create_worker_kwargs(
            self,
            local_rank: int = 0,
            rank: int = 0,
            distributed_init_method: Optional[str] = None) -> Dict:
        worker_kwargs = self._get_worker_kwargs(local_rank, rank,
                                                distributed_init_method)

        (worker_module_name, worker_class_name,
         worker_class_fn) = self._get_worker_module_and_class()
        worker_kwargs.update(
            worker_module_name=worker_module_name,
            worker_class_name=worker_class_name,
            worker_class_fn=worker_class_fn,
        )

        return worker_kwargs

    def _create_worker(self,
                       local_rank: int = 0,
                       rank: int = 0,
                       distributed_init_method: Optional[str] = None):
        return create_worker(**self._get_create_worker_kwargs(
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=distributed_init_method))

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker.
        """
        return self.driver_worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
        """Initialize the KV cache by invoking the underlying worker.
        """
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                           self.model_config.max_model_len)
        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
                    self.model_config.max_model_len, max_concurrency)

        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(
        self, execute_model_req: ExecuteModelRequest
    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
        output = self.driver_worker.execute_model(execute_model_req)
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()

    def add_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        assert prompt_adapter_request.prompt_adapter_id > 0, \
            "prompt_adapter_id must be greater than 0."
        return self.driver_worker.add_prompt_adapter(prompt_adapter_request)

    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        assert prompt_adapter_id > 0, \
            "prompt_adapter_id must be greater than 0."
        return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)

    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        assert prompt_adapter_id > 0, \
            "prompt_adapter_id must be greater than 0."
        return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)

    def list_prompt_adapters(self) -> Set[int]:
        return self.driver_worker.list_prompt_adapters()

    def check_health(self) -> None:
        # GPUExecutor will always be healthy as long as
        # it's running.
        return

    def start_profile(self) -> None:
        self.driver_worker.start_profile()

    def stop_profile(self) -> None:
        self.driver_worker.stop_profile()


class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[Union[SamplerOutput, PoolerOutput]]:
        output = await make_async(self.driver_worker.execute_model
                                  )(execute_model_req=execute_model_req)
        return output
205  vllm-v0.6.2/vllm/executor/hpu_executor.py  Normal file
@@ -0,0 +1,205 @@
###############################################################################
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
###############################################################################

import contextlib
import os
from typing import Any, Dict, List, Optional, Set, Tuple

from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)


class HPUExecutor(ExecutorBase):

    uses_ray: bool = False

    def _init_executor(self) -> None:
        """Initialize the worker and load the model."""
        self._init_worker()

    def _get_worker_kwargs(
            self,
            local_rank: int = 0,
            rank: int = 0,
            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
        """Return worker init args for a given rank."""
        if distributed_init_method is None:
            distributed_init_method = get_distributed_init_method(
                get_ip(), get_open_port())
        return dict(
            vllm_config=self.vllm_config,
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=distributed_init_method,
            is_driver_worker=rank == 0,
        )

    def _create_worker(self,
                       local_rank: int = 0,
                       rank: int = 0,
                       distributed_init_method: Optional[str] = None):
        wrapper = WorkerWrapperBase(
            worker_module_name="vllm.worker.hpu_worker",
            worker_class_name="HPUWorker",
        )
        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
                                                      distributed_init_method))
        return wrapper.worker

    def _init_worker(self):
        assert self.parallel_config.world_size == 1, (
            "GPUExecutor only supports single GPU.")

        self.driver_worker = self._create_worker()
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker.
        """
        return self.driver_worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
        """Initialize the KV cache by invoking the underlying worker.
        """
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
        logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        from vllm_hpu_extension.profiler import HabanaMemoryProfiler
        with HabanaMemoryProfiler() as cache_init_m:
            self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
        msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
        logger.info(msg)

    def finish_measurements(self):
        self.driver_worker.finish_measurements()

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
        log_graph_compilation_all = os.environ.get(
            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
        log_graph_compilation = os.environ.get(
            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
            '0') != '0' or log_graph_compilation_all
        log_cpu_fallbacks_all = os.environ.get(
            'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
        log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                           '0') != '0' or log_cpu_fallbacks_all
        if log_graph_compilation or log_cpu_fallbacks:
            from habana_frameworks.torch.hpu.metrics import metric_localcontext
            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
            is_prompt = any([
                seq_group_metadata.is_prompt
                for seq_group_metadata in seq_group_metadata_list
            ])
            max_context_len = max([
                max([
                    len(v.prompt_token_ids) + len(v.output_token_ids)
                    for v in seq_group_metadata.seq_data.values()
                ]) for seq_group_metadata in seq_group_metadata_list
            ])  # whoa, that's some spicy stuff right here
            max_num_blocks = (
                (max_context_len - 1) // self.cache_config.block_size) + 1
            input_stats = (f'is_prompt: {is_prompt}, '
                           f'num_seqs: {len(seq_group_metadata_list)}, '
                           f'max_context_len: {max_context_len}, '
                           f'max_num_blocks {max_num_blocks}')
            gc_ctx = metric_localcontext(
                "graph_compilation"
            ) if log_graph_compilation else contextlib.nullcontext()
            cpu_fallback_ctx = metric_localcontext(
                "cpu_fallback"
            ) if log_cpu_fallbacks else contextlib.nullcontext()
            with gc_ctx as gc_local_metric, \
                    cpu_fallback_ctx as cpu_fallback_local_metric:
                output = self.driver_worker.execute_model(execute_model_req)
            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
                ) or log_graph_compilation_all:
                msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
                       f"{gc_local_metric.stats()}, {input_stats}")
                logger.warning(msg)
            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
                    0) or log_cpu_fallbacks_all:
                msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
                       f"{cpu_fallback_local_metric.stats()}, {input_stats}")
                logger.warning(msg)

            return output

        output = self.driver_worker.execute_model(execute_model_req)
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()

    def add_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def list_prompt_adapters(self) -> Set[int]:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def check_health(self) -> None:
        # GPUExecutor will always be healthy as long as
        # it's running.
        return

    def start_profile(self) -> None:
        self.driver_worker.start_profile()

    def stop_profile(self) -> None:
        self.driver_worker.stop_profile()

    def shutdown(self) -> None:
        self.driver_worker.shutdown_inc()


class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        output = await make_async(self.driver_worker.execute_model
                                  )(execute_model_req=execute_model_req, )
        return output
64  vllm-v0.6.2/vllm/executor/mlu_executor.py  Normal file
@@ -0,0 +1,64 @@
from typing import Callable, List, Optional, Tuple, Union, Type

from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest, PoolerOutput
from vllm.utils import make_async
from vllm.executor.gpu_executor import GPUExecutor
from vllm.worker.worker_base import WorkerBase

logger = init_logger(__name__)


class MLUExecutor(GPUExecutor):

    def _init_executor(self) -> None:
        """Initialize the worker and load the model.
        """
        assert self.parallel_config.world_size == 1, (
            "MLUExecutor only supports single MLU.")

        self.driver_worker = self._create_worker()
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def _get_worker_module_and_class(
            self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]:
        worker_class_fn = None
        if self.scheduler_config.is_multi_step:
            worker_module_name = "vllm.worker.mlu_multi_step_worker"
            worker_class_name = "MLUMultiStepWorker"
        elif self.speculative_config:
            worker_module_name = "vllm.spec_decode.mlu_spec_decode_worker"
            worker_class_name = "create_mlu_spec_worker"
        else:
            worker_module_name = "vllm.worker.mlu_worker"
            worker_class_name = "MLUWorker"
        return (worker_module_name, worker_class_name, worker_class_fn)

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
        """Initialize the KV cache by invoking the underlying worker.
        """
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
        logger.info("# MLU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                           self.model_config.max_model_len)
        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
                    self.model_config.max_model_len, max_concurrency)

        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)


class MLUExecutorAsync(MLUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[Union[SamplerOutput, PoolerOutput]]:
        output = await make_async(self.driver_worker.execute_model
                                  )(execute_model_req=execute_model_req)
        return output
27  vllm-v0.6.2/vllm/executor/msgspec_utils.py  Normal file
@@ -0,0 +1,27 @@
|
||||
from array import array
|
||||
from typing import Any, Type
|
||||
|
||||
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
|
||||
|
||||
|
||||
def encode_hook(obj: Any) -> Any:
|
||||
"""Custom msgspec enc hook that supports array types.
|
||||
|
||||
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
|
||||
"""
|
||||
if isinstance(obj, array):
|
||||
assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
|
||||
f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
|
||||
f"Given array has a type code of {obj.typecode}.")
|
||||
return obj.tobytes()
|
||||
|
||||
|
||||
def decode_hook(type: Type, obj: Any) -> Any:
|
||||
"""Custom msgspec dec hook that supports array types.
|
||||
|
||||
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
|
||||
"""
|
||||
if type is array:
|
||||
deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
|
||||
deserialized.frombytes(obj)
|
||||
return deserialized
|
||||
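The two hooks above exist so msgspec can serialize the token-id arrays carried inside ExecuteModelRequest; the Ray executor later in this commit wires encode_hook into msgspec.msgpack.Encoder. A minimal round-trip sketch (standalone illustration, not part of the diff):

from array import array

import msgspec

from vllm.executor.msgspec_utils import decode_hook, encode_hook
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE

encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
decoder = msgspec.msgpack.Decoder(array, dec_hook=decode_hook)

token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3])
restored = decoder.decode(encoder.encode(token_ids))
assert restored == token_ids   # arrays compare equal element-wise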
261
vllm-v0.6.2/vllm/executor/multiproc_gpu_executor.py
Normal file
@@ -0,0 +1,261 @@
|
||||
import asyncio
|
||||
import os
|
||||
from functools import partial
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
|
||||
DistributedGPUExecutor, DistributedGPUExecutorAsync)
|
||||
from vllm.executor.gpu_executor import create_worker
|
||||
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
|
||||
ResultHandler, WorkerMonitor)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.triton_utils.importing import HAS_TRITON
|
||||
from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
|
||||
cuda_is_initialized, get_distributed_init_method,
|
||||
get_open_port, get_vllm_instance_id, make_async,
|
||||
update_environment_variables)
|
||||
|
||||
if HAS_TRITON:
|
||||
from vllm.triton_utils import maybe_set_triton_cache_manager
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MultiprocessingGPUExecutor(DistributedGPUExecutor):
|
||||
"""Python multiprocessing-based multi-GPU executor"""
|
||||
|
||||
uses_ray: bool = False
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
self._check_executor_parameters()
|
||||
|
||||
# Create the parallel GPU workers.
|
||||
world_size = self.parallel_config.world_size
|
||||
tensor_parallel_size = self.parallel_config.tensor_parallel_size
|
||||
|
||||
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
|
||||
os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
|
||||
|
||||
# Disable torch async compiling which won't work with daemonic processes
|
||||
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
||||
|
||||
# Configure thread parallelism if OMP_NUM_THREADS isn't set
|
||||
#
|
||||
# Helps to avoid CPU contention. The default of spawning a thread per
|
||||
# core combined with multiprocessing for each GPU can have a negative
|
||||
# impact on performance. The contention is amplified when running in a
|
||||
# container where CPU limits can cause throttling.
|
||||
default_omp_num_threads = 1
|
||||
if "OMP_NUM_THREADS" not in os.environ and (
|
||||
current_parallelism :=
|
||||
torch.get_num_threads()) > default_omp_num_threads:
|
||||
logger.warning(
|
||||
"Reducing Torch parallelism from %d threads to %d to avoid "
|
||||
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
||||
"external environment to tune this value as needed.",
|
||||
current_parallelism, default_omp_num_threads)
|
||||
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
||||
torch.set_num_threads(default_omp_num_threads)
|
||||
|
||||
# workaround for https://github.com/vllm-project/vllm/issues/6103
|
||||
if HAS_TRITON and world_size > 1:
|
||||
maybe_set_triton_cache_manager()
|
||||
|
||||
# Multiprocessing-based executor does not support multi-node setting.
|
||||
# Since it only works for single node, we can use the loopback address
|
||||
# 127.0.0.1 for communication.
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
"127.0.0.1", get_open_port())
|
||||
|
||||
self.workers: List[ProcessWorkerWrapper] = []
|
||||
# This is the list of workers that are rank 0 of each TP group EXCEPT
|
||||
# global rank 0. These are the workers that will broadcast to the
|
||||
# rest of the workers.
|
||||
self.tp_driver_workers: List[ProcessWorkerWrapper] = []
|
||||
# This is the list of workers that are not drivers and not the first
|
||||
# worker in a TP group. These are the workers that will be
|
||||
# broadcasted to.
|
||||
self.non_driver_workers: List[ProcessWorkerWrapper] = []
|
||||
|
||||
if world_size == 1:
|
||||
self.worker_monitor = None
|
||||
else:
|
||||
result_handler = ResultHandler()
|
||||
for rank in range(1, world_size):
|
||||
worker = ProcessWorkerWrapper(
|
||||
result_handler,
|
||||
partial(
|
||||
create_worker,
|
||||
**self._get_create_worker_kwargs(
|
||||
rank=rank,
|
||||
local_rank=rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
)))
|
||||
self.workers.append(worker)
|
||||
if rank % tensor_parallel_size == 0:
|
||||
self.tp_driver_workers.append(worker)
|
||||
else:
|
||||
self.non_driver_workers.append(worker)
|
||||
|
||||
self.worker_monitor = WorkerMonitor(self.workers, result_handler)
|
||||
result_handler.start()
|
||||
self.worker_monitor.start()
|
||||
|
||||
# Set up signal handlers to shutdown the executor cleanly
|
||||
# sometimes gc does not work well
|
||||
|
||||
self.driver_worker = self._create_worker(
|
||||
distributed_init_method=distributed_init_method)
|
||||
self._run_workers("init_device")
|
||||
self._run_workers("load_model",
|
||||
max_concurrent_workers=self.parallel_config.
|
||||
max_parallel_loading_workers)
|
||||
|
||||
def _check_executor_parameters(self):
|
||||
world_size = self.parallel_config.world_size
|
||||
tensor_parallel_size = self.parallel_config.tensor_parallel_size
|
||||
|
||||
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
|
||||
if "CUDA_VISIBLE_DEVICES" not in os.environ:
|
||||
update_environment_variables({
|
||||
"CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
|
||||
})
|
||||
|
||||
if (cuda_is_initialized()
|
||||
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
|
||||
logger.warning("CUDA was previously initialized. We must use "
|
||||
"the `spawn` multiprocessing start method. Setting "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
cuda_device_count = cuda_device_count_stateless()
|
||||
# Use a message geared to the more common TP-only case.
|
||||
assert tensor_parallel_size <= cuda_device_count, (
|
||||
f"please set tensor_parallel_size ({tensor_parallel_size}) "
|
||||
f"to less than max local gpu count ({cuda_device_count})")
|
||||
|
||||
assert world_size <= cuda_device_count, (
|
||||
f"please ensure that world_size ({world_size}) "
|
||||
f"is less than than max local gpu count ({cuda_device_count})")
|
||||
|
||||
def shutdown(self):
|
||||
if (worker_monitor := getattr(self, "worker_monitor",
|
||||
None)) is not None:
|
||||
worker_monitor.close()
|
||||
|
||||
def _driver_execute_model(
|
||||
self, execute_model_req: Optional[ExecuteModelRequest]
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
"""Run execute_model in the driver worker.
|
||||
|
||||
Passing None will cause the driver to stop the model execution
|
||||
loop running in each of the remote workers.
|
||||
"""
|
||||
return self.driver_worker.execute_model(execute_model_req)
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
method: str,
|
||||
*args,
|
||||
async_run_tensor_parallel_workers_only: bool = False,
|
||||
max_concurrent_workers: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Any:
|
||||
"""Runs the given method on all workers.
|
||||
|
||||
Args:
|
||||
async_run_tensor_parallel_workers_only: If True the method will be
|
||||
run only in the remote TP workers, not the driver worker.
|
||||
It will also be run asynchronously and return a list of futures
|
||||
rather than blocking on the results.
|
||||
"""
|
||||
|
||||
if max_concurrent_workers:
|
||||
raise NotImplementedError(
|
||||
"max_concurrent_workers is not supported yet.")
|
||||
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
# Run only non-driver workers and just return futures.
|
||||
return [
|
||||
worker.execute_method(method, *args, **kwargs)
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
|
||||
# Start all remote workers first.
|
||||
worker_outputs = [
|
||||
worker.execute_method(method, *args, **kwargs)
|
||||
for worker in self.workers
|
||||
]
|
||||
|
||||
driver_worker_method = getattr(self.driver_worker, method)
|
||||
driver_worker_output = driver_worker_method(*args, **kwargs)
|
||||
|
||||
# Get the results of the workers.
|
||||
return [driver_worker_output
|
||||
] + [output.get() for output in worker_outputs]
|
||||
|
||||
def check_health(self) -> None:
|
||||
"""Raises an error if engine is unhealthy."""
|
||||
if self.worker_monitor is not None and not self.worker_monitor.is_alive(
|
||||
):
|
||||
raise RuntimeError("Worker processes are not running")
|
||||
|
||||
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
|
||||
"""Wait for futures returned from _run_workers() with
|
||||
async_run_remote_workers_only to complete."""
|
||||
for result in parallel_worker_tasks:
|
||||
result.get()
|
||||
|
||||
|
||||
class MultiprocessingGPUExecutorAsync(MultiprocessingGPUExecutor,
|
||||
DistributedGPUExecutorAsync):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.driver_exec_model = make_async(self.driver_worker.execute_model)
|
||||
self.pp_locks: Optional[List[asyncio.Lock]] = None
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None
|
||||
) -> List[SamplerOutput]:
|
||||
if not self.tp_driver_workers:
|
||||
return await self.driver_exec_model(execute_model_req)
|
||||
|
||||
if self.pp_locks is None:
|
||||
# This locks each pipeline parallel stage so multiple virtual
|
||||
# engines can't execute on the same stage at the same time
|
||||
# We create the locks here to avoid creating them in the constructor
|
||||
# which uses a different asyncio loop.
|
||||
self.pp_locks = [
|
||||
asyncio.Lock()
|
||||
for _ in range(self.parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(self.driver_exec_model, self.pp_locks[0],
|
||||
execute_model_req))
|
||||
]
|
||||
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
|
||||
start=1):
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(driver_worker.execute_method_async,
|
||||
self.pp_locks[pp_rank],
|
||||
"execute_model", execute_model_req)))
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Only the last PP stage has the final results.
|
||||
return results[-1]
|
||||
|
||||
async def _start_worker_execution_loop(self):
|
||||
coros = [
|
||||
worker.execute_method_async("start_worker_execution_loop")
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
return await asyncio.gather(*coros)
|
||||
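For orientation, the rank bookkeeping in _init_executor above splits the spawned processes purely by rank modulo tensor_parallel_size; rank 0 stays in-process as the driver. A small sketch of the resulting layout (the sizes are hypothetical):

# With world_size=8 and tensor_parallel_size=4:
#   ranks 1..7       -> ProcessWorkerWrapper child processes
#   rank 4           -> tp_driver_workers (rank 0 of the second TP group)
#   1, 2, 3, 5, 6, 7 -> non_driver_workers (broadcast targets)
world_size, tensor_parallel_size = 8, 4
tp_driver_ranks = [r for r in range(1, world_size) if r % tensor_parallel_size == 0]
non_driver_ranks = [r for r in range(1, world_size) if r % tensor_parallel_size != 0]
print(tp_driver_ranks)    # [4]
print(non_driver_ranks)   # [1, 2, 3, 5, 6, 7]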
261
vllm-v0.6.2/vllm/executor/multiproc_mlu_executor.py
Normal file
@@ -0,0 +1,261 @@
|
||||
import asyncio
|
||||
import os
|
||||
import signal
|
||||
import threading
|
||||
import weakref
|
||||
from functools import partial
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.executor.distributed_mlu_executor import ( # yapf: disable
|
||||
DistributedMLUExecutor, DistributedMLUExecutorAsync)
|
||||
from vllm.executor.gpu_executor import create_worker
|
||||
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
|
||||
ResultHandler, WorkerMonitor)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.triton_utils.importing import HAS_TRITON
|
||||
from vllm.utils import (_run_task_with_lock, mlu_device_count_stateless,
|
||||
get_distributed_init_method, get_open_port,
|
||||
get_vllm_instance_id, make_async,
|
||||
update_environment_variables)
|
||||
if HAS_TRITON:
|
||||
from vllm.triton_utils import maybe_set_triton_cache_manager
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MultiprocessingMLUExecutor(DistributedMLUExecutor):
|
||||
"""Python multiprocessing-based multi-MLU executor"""
|
||||
|
||||
uses_ray: bool = False
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
self._check_executor_parameters()
|
||||
|
||||
# Create the parallel MLU workers.
|
||||
world_size = self.parallel_config.world_size
|
||||
tensor_parallel_size = self.parallel_config.tensor_parallel_size
|
||||
|
||||
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
|
||||
os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
|
||||
|
||||
# Disable torch async compiling which won't work with daemonic processes
|
||||
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
||||
|
||||
# Configure thread parallelism if OMP_NUM_THREADS isn't set
|
||||
#
|
||||
# Helps to avoid CPU contention. The default of spawning a thread per
|
||||
# core combined with multiprocessing for each GPU can have a negative
|
||||
# impact on performance. The contention is amplified when running in a
|
||||
# container where CPU limits can cause throttling.
|
||||
default_omp_num_threads = 1
|
||||
if "OMP_NUM_THREADS" not in os.environ and (
|
||||
current_parallelism :=
|
||||
torch.get_num_threads()) > default_omp_num_threads:
|
||||
logger.warning(
|
||||
"Reducing Torch parallelism from %d threads to %d to avoid "
|
||||
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
|
||||
"external environment to tune this value as needed.",
|
||||
current_parallelism, default_omp_num_threads)
|
||||
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
||||
torch.set_num_threads(default_omp_num_threads)
|
||||
|
||||
# workaround for https://github.com/vllm-project/vllm/issues/6103
|
||||
if HAS_TRITON and world_size > 1:
|
||||
maybe_set_triton_cache_manager()
|
||||
|
||||
# Multiprocessing-based executor does not support multi-node setting.
|
||||
# Since it only works for single node, we can use the loopback address
|
||||
# 127.0.0.1 for communication.
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
"127.0.0.1", get_open_port())
|
||||
|
||||
self.workers: List[ProcessWorkerWrapper] = []
|
||||
# This is the list of workers that are rank 0 of each TP group EXCEPT
|
||||
# global rank 0. These are the workers that will broadcast to the
|
||||
# rest of the workers.
|
||||
self.tp_driver_workers: List[ProcessWorkerWrapper] = []
|
||||
# This is the list of workers that are not drivers and not the first
|
||||
# worker in a TP group. These are the workers that will be
|
||||
# broadcasted to.
|
||||
self.non_driver_workers: List[ProcessWorkerWrapper] = []
|
||||
|
||||
if world_size == 1:
|
||||
self.worker_monitor = None
|
||||
else:
|
||||
result_handler = ResultHandler()
|
||||
for rank in range(1, world_size):
|
||||
worker = ProcessWorkerWrapper(
|
||||
result_handler,
|
||||
partial(
|
||||
create_worker,
|
||||
**self._get_create_worker_kwargs(
|
||||
rank=rank,
|
||||
local_rank=rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
)))
|
||||
self.workers.append(worker)
|
||||
if rank % tensor_parallel_size == 0:
|
||||
self.tp_driver_workers.append(worker)
|
||||
else:
|
||||
self.non_driver_workers.append(worker)
|
||||
|
||||
self.worker_monitor = WorkerMonitor(self.workers, result_handler)
|
||||
result_handler.start()
|
||||
self.worker_monitor.start()
|
||||
|
||||
# Set up signal handlers to shutdown the executor cleanly
|
||||
# sometimes gc does not work well
|
||||
|
||||
self.driver_worker = self._create_worker(
|
||||
distributed_init_method=distributed_init_method)
|
||||
self._run_workers("init_device")
|
||||
self._run_workers("load_model",
|
||||
max_concurrent_workers=self.parallel_config.
|
||||
max_parallel_loading_workers)
|
||||
|
||||
def _check_executor_parameters(self):
|
||||
world_size = self.parallel_config.world_size
|
||||
tensor_parallel_size = self.parallel_config.tensor_parallel_size
|
||||
|
||||
# Set MLU_VISIBLE_DEVICES for the driver, inherited by workers
|
||||
if "MLU_VISIBLE_DEVICES" not in os.environ:
|
||||
update_environment_variables({
|
||||
"MLU_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
|
||||
})
|
||||
|
||||
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn":
|
||||
logger.warning("We must use the `spawn` multiprocessing start method. Setting "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
mlu_device_count = mlu_device_count_stateless()
|
||||
# Use a message geared to the more common TP-only case.
|
||||
assert tensor_parallel_size <= mlu_device_count, (
|
||||
f"please set tensor_parallel_size ({tensor_parallel_size}) "
|
||||
f"to less than max local mlu count ({mlu_device_count})")
|
||||
|
||||
assert world_size <= mlu_device_count, (
|
||||
f"please ensure that world_size ({world_size}) "
|
||||
f"is less than than max local mlu count ({mlu_device_count})")
|
||||
|
||||
def shutdown(self):
|
||||
if (worker_monitor := getattr(self, "worker_monitor",
|
||||
None)) is not None:
|
||||
worker_monitor.close()
|
||||
|
||||
def _driver_execute_model(
|
||||
self, execute_model_req: Optional[ExecuteModelRequest]
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
"""Run execute_model in the driver worker.
|
||||
|
||||
Passing None will cause the driver to stop the model execution
|
||||
loop running in each of the remote workers.
|
||||
"""
|
||||
return self.driver_worker.execute_model(execute_model_req)
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
method: str,
|
||||
*args,
|
||||
async_run_tensor_parallel_workers_only: bool = False,
|
||||
max_concurrent_workers: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Any:
|
||||
"""Runs the given method on all workers.
|
||||
|
||||
Args:
|
||||
async_run_tensor_parallel_workers_only: If True the method will be
|
||||
run only in the remote TP workers, not the driver worker.
|
||||
It will also be run asynchronously and return a list of futures
|
||||
rather than blocking on the results.
|
||||
"""
|
||||
|
||||
if max_concurrent_workers:
|
||||
raise NotImplementedError(
|
||||
"max_concurrent_workers is not supported yet.")
|
||||
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
# Run only non-driver workers and just return futures.
|
||||
return [
|
||||
worker.execute_method(method, *args, **kwargs)
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
|
||||
# Start all remote workers first.
|
||||
worker_outputs = [
|
||||
worker.execute_method(method, *args, **kwargs)
|
||||
for worker in self.workers
|
||||
]
|
||||
|
||||
driver_worker_method = getattr(self.driver_worker, method)
|
||||
driver_worker_output = driver_worker_method(*args, **kwargs)
|
||||
|
||||
# Get the results of the workers.
|
||||
return [driver_worker_output
|
||||
] + [output.get() for output in worker_outputs]
|
||||
|
||||
def check_health(self) -> None:
|
||||
"""Raises an error if engine is unhealthy."""
|
||||
if self.worker_monitor is not None and not self.worker_monitor.is_alive(
|
||||
):
|
||||
raise RuntimeError("Worker processes are not running")
|
||||
|
||||
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
|
||||
"""Wait for futures returned from _run_workers() with
|
||||
async_run_remote_workers_only to complete."""
|
||||
for result in parallel_worker_tasks:
|
||||
result.get()
|
||||
|
||||
|
||||
class MultiprocessingMLUExecutorAsync(MultiprocessingMLUExecutor,
|
||||
DistributedMLUExecutorAsync):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.driver_exec_model = make_async(self.driver_worker.execute_model)
|
||||
self.pp_locks: Optional[List[asyncio.Lock]] = None
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None
|
||||
) -> List[SamplerOutput]:
|
||||
if not self.tp_driver_workers:
|
||||
return await self.driver_exec_model(execute_model_req)
|
||||
|
||||
if self.pp_locks is None:
|
||||
# This locks each pipeline parallel stage so multiple virtual
|
||||
# engines can't execute on the same stage at the same time
|
||||
# We create the locks here to avoid creating them in the constructor
|
||||
# which uses a different asyncio loop.
|
||||
self.pp_locks = [
|
||||
asyncio.Lock()
|
||||
for _ in range(self.parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(self.driver_exec_model, self.pp_locks[0],
|
||||
execute_model_req))
|
||||
]
|
||||
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
|
||||
start=1):
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(driver_worker.execute_method_async,
|
||||
self.pp_locks[pp_rank],
|
||||
"execute_model", execute_model_req)))
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Only the last PP stage has the final results.
|
||||
return results[-1]
|
||||
|
||||
async def _start_worker_execution_loop(self):
|
||||
coros = [
|
||||
worker.execute_method_async("start_worker_execution_loop")
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
return await asyncio.gather(*coros)
|
||||
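The pipeline-parallel dispatch in _driver_execute_model_async above keeps one asyncio.Lock per PP stage so that different virtual engines never run on the same stage concurrently. A rough sketch of the locking pattern it relies on (run_with_lock is a stand-in written here; the real helper is vllm.utils._run_task_with_lock and may differ in detail):

import asyncio

async def run_with_lock(task, lock, *args):
    # Assumed behaviour: at most one request holds a given PP stage at a time.
    async with lock:
        return await task(*args)

async def main():
    stage_lock = asyncio.Lock()

    async def fake_stage(x):
        await asyncio.sleep(0)
        return x + 1

    results = await asyncio.gather(
        *(run_with_lock(fake_stage, stage_lock, i) for i in range(3)))
    print(results)   # [1, 2, 3]

asyncio.run(main())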
272
vllm-v0.6.2/vllm/executor/multiproc_worker_utils.py
Normal file
@@ -0,0 +1,272 @@
|
||||
import asyncio
|
||||
import multiprocessing
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing import Queue
|
||||
from multiprocessing.connection import wait
|
||||
from multiprocessing.process import BaseProcess
|
||||
from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
|
||||
TypeVar, Union)
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
_TERMINATE = "TERMINATE" # sentinel
|
||||
|
||||
# ANSI color codes
|
||||
CYAN = '\033[1;36m'
|
||||
RESET = '\033[0;0m'
|
||||
|
||||
JOIN_TIMEOUT_S = 2
|
||||
|
||||
|
||||
@dataclass
|
||||
class Result(Generic[T]):
|
||||
"""Result of task dispatched to worker"""
|
||||
|
||||
task_id: uuid.UUID
|
||||
value: Optional[T] = None
|
||||
exception: Optional[BaseException] = None
|
||||
|
||||
|
||||
class ResultFuture(threading.Event, Generic[T]):
|
||||
"""Synchronous future for non-async case"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.result: Optional[Result[T]] = None
|
||||
|
||||
def set_result(self, result: Result[T]):
|
||||
self.result = result
|
||||
self.set()
|
||||
|
||||
def get(self) -> T:
|
||||
self.wait()
|
||||
assert self.result is not None
|
||||
if self.result.exception is not None:
|
||||
raise self.result.exception
|
||||
return self.result.value # type: ignore[return-value]
|
||||
|
||||
|
||||
def _set_future_result(future: Union[ResultFuture, asyncio.Future],
|
||||
result: Result):
|
||||
if isinstance(future, ResultFuture):
|
||||
future.set_result(result)
|
||||
return
|
||||
loop = future.get_loop()
|
||||
if not loop.is_closed():
|
||||
if result.exception is not None:
|
||||
loop.call_soon_threadsafe(future.set_exception, result.exception)
|
||||
else:
|
||||
loop.call_soon_threadsafe(future.set_result, result.value)
|
||||
|
||||
|
||||
class ResultHandler(threading.Thread):
|
||||
"""Handle results from all workers (in background thread)"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__(daemon=True)
|
||||
self.result_queue = get_mp_context().Queue()
|
||||
self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}
|
||||
|
||||
def run(self):
|
||||
for result in iter(self.result_queue.get, _TERMINATE):
|
||||
future = self.tasks.pop(result.task_id)
|
||||
_set_future_result(future, result)
|
||||
# Ensure that all waiters will receive an exception
|
||||
for task_id, future in self.tasks.items():
|
||||
_set_future_result(
|
||||
future,
|
||||
Result(task_id=task_id,
|
||||
exception=ChildProcessError("worker died")))
|
||||
|
||||
def close(self):
|
||||
self.result_queue.put(_TERMINATE)
|
||||
|
||||
|
||||
class WorkerMonitor(threading.Thread):
|
||||
"""Monitor worker status (in background thread)"""
|
||||
|
||||
def __init__(self, workers: List['ProcessWorkerWrapper'],
|
||||
result_handler: ResultHandler):
|
||||
super().__init__(daemon=True)
|
||||
self.workers = workers
|
||||
self.result_handler = result_handler
|
||||
self._close = False
|
||||
|
||||
def run(self) -> None:
|
||||
# Blocks until any worker exits
|
||||
dead_sentinels = wait([w.process.sentinel for w in self.workers])
|
||||
if not self._close:
|
||||
self._close = True
|
||||
|
||||
# Kill / cleanup all workers
|
||||
for worker in self.workers:
|
||||
process = worker.process
|
||||
if process.sentinel in dead_sentinels:
|
||||
process.join(JOIN_TIMEOUT_S)
|
||||
if process.exitcode is not None and process.exitcode != 0:
|
||||
logger.error("Worker %s pid %s died, exit code: %s",
|
||||
process.name, process.pid, process.exitcode)
|
||||
# Cleanup any remaining workers
|
||||
if logger:
|
||||
logger.info("Killing local vLLM worker processes")
|
||||
for worker in self.workers:
|
||||
worker.kill_worker()
|
||||
# Must be done after worker task queues are all closed
|
||||
self.result_handler.close()
|
||||
|
||||
for worker in self.workers:
|
||||
worker.process.join(JOIN_TIMEOUT_S)
|
||||
|
||||
def close(self):
|
||||
if self._close:
|
||||
return
|
||||
self._close = True
|
||||
logger.info("Terminating local vLLM worker processes")
|
||||
for worker in self.workers:
|
||||
worker.terminate_worker()
|
||||
# Must be done after worker task queues are all closed
|
||||
self.result_handler.close()
|
||||
|
||||
|
||||
class ProcessWorkerWrapper:
|
||||
"""Local process wrapper for vllm.worker.Worker,
|
||||
for handling single-node multi-GPU tensor parallel."""
|
||||
|
||||
def __init__(self, result_handler: ResultHandler,
|
||||
worker_factory: Callable[[], Any]) -> None:
|
||||
self.mp = get_mp_context()
|
||||
self._task_queue = self.mp.Queue()
|
||||
self.result_queue = result_handler.result_queue
|
||||
self.tasks = result_handler.tasks
|
||||
self.process: BaseProcess = self.mp.Process( # type: ignore[attr-defined]
|
||||
target=_run_worker_process,
|
||||
name="VllmWorkerProcess",
|
||||
kwargs=dict(
|
||||
worker_factory=worker_factory,
|
||||
task_queue=self._task_queue,
|
||||
result_queue=self.result_queue,
|
||||
),
|
||||
daemon=True)
|
||||
|
||||
self.process.start()
|
||||
|
||||
def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
|
||||
method: str, args, kwargs):
|
||||
task_id = uuid.uuid4()
|
||||
self.tasks[task_id] = future
|
||||
try:
|
||||
self._task_queue.put((task_id, method, args, kwargs))
|
||||
except SystemExit:
|
||||
raise
|
||||
except BaseException as e:
|
||||
del self.tasks[task_id]
|
||||
raise ChildProcessError("worker died") from e
|
||||
|
||||
def execute_method(self, method: str, *args, **kwargs):
|
||||
future: ResultFuture = ResultFuture()
|
||||
self._enqueue_task(future, method, args, kwargs)
|
||||
return future
|
||||
|
||||
async def execute_method_async(self, method: str, *args, **kwargs):
|
||||
future = asyncio.get_running_loop().create_future()
|
||||
self._enqueue_task(future, method, args, kwargs)
|
||||
return await future
|
||||
|
||||
def terminate_worker(self):
|
||||
try:
|
||||
self._task_queue.put(_TERMINATE)
|
||||
except ValueError:
|
||||
self.process.kill()
|
||||
self._task_queue.close()
|
||||
|
||||
def kill_worker(self):
|
||||
self._task_queue.close()
|
||||
self.process.kill()
|
||||
|
||||
|
||||
def _run_worker_process(
|
||||
worker_factory: Callable[[], Any],
|
||||
task_queue: Queue,
|
||||
result_queue: Queue,
|
||||
) -> None:
|
||||
"""Worker process event loop"""
|
||||
|
||||
# Add process-specific prefix to stdout and stderr
|
||||
process_name = get_mp_context().current_process().name
|
||||
pid = os.getpid()
|
||||
_add_prefix(sys.stdout, process_name, pid)
|
||||
_add_prefix(sys.stderr, process_name, pid)
|
||||
|
||||
# Initialize worker
|
||||
worker = worker_factory()
|
||||
del worker_factory
|
||||
|
||||
# Accept tasks from the engine in task_queue
|
||||
# and return task output in result_queue
|
||||
logger.info("Worker ready; awaiting tasks")
|
||||
try:
|
||||
for items in iter(task_queue.get, _TERMINATE):
|
||||
output = None
|
||||
exception = None
|
||||
task_id, method, args, kwargs = items
|
||||
try:
|
||||
executor = getattr(worker, method)
|
||||
output = executor(*args, **kwargs)
|
||||
except SystemExit:
|
||||
raise
|
||||
except KeyboardInterrupt:
|
||||
break
|
||||
except BaseException as e:
|
||||
logger.exception(
|
||||
"Exception in worker %s while processing method %s.",
|
||||
process_name, method)
|
||||
exception = e
|
||||
result_queue.put(
|
||||
Result(task_id=task_id, value=output, exception=exception))
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
except Exception:
|
||||
logger.exception("Worker failed")
|
||||
|
||||
logger.info("Worker exiting")
|
||||
|
||||
|
||||
def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
|
||||
"""Prepend each output line with process-specific prefix"""
|
||||
|
||||
prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
|
||||
file_write = file.write
|
||||
|
||||
def write_with_prefix(s: str):
|
||||
if not s:
|
||||
return
|
||||
if file.start_new_line: # type: ignore[attr-defined]
|
||||
file_write(prefix)
|
||||
idx = 0
|
||||
while (next_idx := s.find('\n', idx)) != -1:
|
||||
next_idx += 1
|
||||
file_write(s[idx:next_idx])
|
||||
if next_idx == len(s):
|
||||
file.start_new_line = True # type: ignore[attr-defined]
|
||||
return
|
||||
file_write(prefix)
|
||||
idx = next_idx
|
||||
file_write(s[idx:])
|
||||
file.start_new_line = False # type: ignore[attr-defined]
|
||||
|
||||
file.start_new_line = True # type: ignore[attr-defined]
|
||||
file.write = write_with_prefix # type: ignore[method-assign]
|
||||
|
||||
|
||||
def get_mp_context():
|
||||
mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
|
||||
return multiprocessing.get_context(mp_method)
|
||||
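The multiprocessing executors above wire these utilities together in a fixed pattern: one ResultHandler thread, one ProcessWorkerWrapper per non-driver rank, and a WorkerMonitor watching all of them. A condensed sketch of that wiring (DummyWorker is a placeholder standing in for the real worker built by create_worker; assumes the default fork start method):

from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)


class DummyWorker:
    # Placeholder for the real vLLM worker object.
    def ping(self):
        return "pong"


if __name__ == "__main__":
    result_handler = ResultHandler()
    workers = [ProcessWorkerWrapper(result_handler, DummyWorker) for _ in range(2)]
    worker_monitor = WorkerMonitor(workers, result_handler)
    result_handler.start()
    worker_monitor.start()

    # execute_method returns a ResultFuture; .get() blocks until the child replies.
    futures = [w.execute_method("ping") for w in workers]
    print([f.get() for f in futures])   # ['pong', 'pong']

    worker_monitor.close()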
26
vllm-v0.6.2/vllm/executor/multiproc_xpu_executor.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.multiproc_gpu_executor import (
|
||||
MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync)
|
||||
from vllm.executor.xpu_executor import XPUExecutor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import make_async
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor):
|
||||
"""Python multiprocessing-based multi-XPU executor"""
|
||||
|
||||
def _check_executor_parameters(self):
|
||||
mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
|
||||
if mp_method != "spawn":
|
||||
raise RuntimeError(
|
||||
"XPU multiprocess executor only support spawn as mp method")
|
||||
|
||||
|
||||
class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor,
|
||||
MultiprocessingGPUExecutorAsync):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.driver_exec_model = make_async(self.driver_worker.execute_model)
|
||||
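The check above rejects anything other than the spawn start method, so callers have to set it up front. A minimal way to satisfy it before constructing the executor (the variable name is taken from the check itself):

import os

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"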
111
vllm-v0.6.2/vllm/executor/neuron_executor.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from typing import List, Set, Tuple
|
||||
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
|
||||
make_async)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class NeuronExecutor(ExecutorBase):
|
||||
|
||||
uses_ray: bool = False
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
assert (self.lora_config is
|
||||
None), "LoRA is not supported for Neuron backend."
|
||||
assert (not self.speculative_config
|
||||
), "Speculative decoding not yet supported for Neuron backend."
|
||||
|
||||
# Instantiate the worker and load the model to the device.
|
||||
self._init_worker()
|
||||
|
||||
def _init_worker(self):
|
||||
from vllm.worker.neuron_worker import NeuronWorker
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
get_ip(), get_open_port())
|
||||
self.driver_worker = NeuronWorker(
|
||||
vllm_config=self.vllm_config,
|
||||
local_rank=0,
|
||||
rank=0,
|
||||
distributed_init_method=distributed_init_method)
|
||||
self.driver_worker.init_device()
|
||||
self.driver_worker.load_model()
|
||||
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of available KV blocks by invoking the
|
||||
underlying worker.
|
||||
"""
|
||||
return self.driver_worker.determine_num_available_blocks()
|
||||
|
||||
def initialize_cache(self, num_gpu_blocks: int,
|
||||
num_cpu_blocks: int) -> None:
|
||||
"""Initialize the KV cache by invoking the underlying worker.
|
||||
"""
|
||||
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
assert (not execute_model_req.blocks_to_swap_in
|
||||
and not execute_model_req.blocks_to_swap_out
|
||||
and not execute_model_req.blocks_to_copy), (
|
||||
"Cache operations are not supported for Neuron backend.")
|
||||
assert execute_model_req.num_lookahead_slots == 0, (
|
||||
"lookahead not supported for Neuron backend.")
|
||||
|
||||
output = self.driver_worker.execute_model(execute_model_req)
|
||||
return output
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
|
||||
return self.driver_worker.add_lora(lora_request)
|
||||
|
||||
def remove_lora(self, lora_id: int) -> bool:
|
||||
return self.driver_worker.remove_lora(lora_id)
|
||||
|
||||
def pin_lora(self, lora_id: int) -> bool:
|
||||
return self.driver_worker.pin_lora(lora_id)
|
||||
|
||||
def list_loras(self) -> Set[int]:
|
||||
return self.driver_worker.list_loras()
|
||||
|
||||
def add_prompt_adapter(self, prompt_adapter_request) -> bool:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the Neuron backend.")
|
||||
|
||||
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the Neuron backend.")
|
||||
|
||||
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the Neuron backend.")
|
||||
|
||||
def list_prompt_adapters(self) -> Set[int]:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the Neuron backend.")
|
||||
|
||||
def check_health(self) -> None:
|
||||
# NeuronExecutor will always be healthy as long as
|
||||
# it's running.
|
||||
return
|
||||
|
||||
|
||||
class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
|
||||
|
||||
async def execute_model_async(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest,
|
||||
) -> List[SamplerOutput]:
|
||||
output = await make_async(self.driver_worker.execute_model
|
||||
)(execute_model_req=execute_model_req, )
|
||||
return output
|
||||
|
||||
async def check_health_async(self) -> None:
|
||||
# NeuronExecutor will always be healthy as long as
|
||||
# it's running.
|
||||
return
|
||||
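Both async executor variants above push the blocking driver-worker call off the event loop with vllm.utils.make_async. A rough equivalent of that helper, written out only to show the pattern (the real implementation may differ in details):

import asyncio
from functools import partial


def make_async_sketch(func):
    # Run a blocking callable in the default thread-pool executor.
    async def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, partial(func, *args, **kwargs))
    return _async_wrapper


async def main():
    doubled = make_async_sketch(lambda x: x * 2)
    print(await doubled(21))   # 42

asyncio.run(main())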
201
vllm-v0.6.2/vllm/executor/openvino_executor.py
Normal file
@@ -0,0 +1,201 @@
|
||||
from typing import List, Set, Tuple
|
||||
|
||||
import openvino as ov
|
||||
import openvino.properties.hint as hints
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import CacheConfig, ModelConfig
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip,
|
||||
get_open_port, make_async)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class OpenVINOExecutor(ExecutorBase):
|
||||
|
||||
uses_ray: bool = False
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
assert self.device_config.device_type == "openvino"
|
||||
assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
|
||||
assert current_platform.is_openvino_cpu() or \
|
||||
current_platform.is_openvino_gpu(), \
|
||||
"OpenVINO backend supports only CPU and GPU devices"
|
||||
|
||||
self.ov_core = ov.Core()
|
||||
self.model_config = _verify_and_get_model_config(self.model_config)
|
||||
self.cache_config = _verify_and_get_cache_config(
|
||||
self.ov_core, self.cache_config)
|
||||
|
||||
# Instantiate the worker and load the model to CPU.
|
||||
self._init_worker()
|
||||
|
||||
def _init_worker(self):
|
||||
from vllm.worker.openvino_worker import OpenVINOWorker
|
||||
|
||||
assert (
|
||||
self.parallel_config.world_size == 1
|
||||
), "OpenVINOExecutor only supports single CPU socket currently."
|
||||
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
get_ip(), get_open_port())
|
||||
self.driver_worker = OpenVINOWorker(
|
||||
ov_core=self.ov_core,
|
||||
vllm_config=self.vllm_config,
|
||||
local_rank=0,
|
||||
rank=0,
|
||||
distributed_init_method=distributed_init_method,
|
||||
kv_cache_dtype=self.cache_config.cache_dtype,
|
||||
is_driver_worker=True,
|
||||
)
|
||||
self.driver_worker.init_device()
|
||||
self.driver_worker.load_model()
|
||||
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of available KV blocks by invoking the
|
||||
underlying worker.
|
||||
"""
|
||||
return self.driver_worker.determine_num_available_blocks()
|
||||
|
||||
def initialize_cache(self, num_gpu_blocks: int,
|
||||
num_cpu_blocks: int) -> None:
|
||||
"""Initialize the KV cache by invoking the underlying worker."""
|
||||
# NOTE: We log here to avoid multiple logs when number of workers is
|
||||
# greater than one. We could log in the engine, but not all executors
|
||||
# have GPUs.
|
||||
# NOTE: In case of a CPU device, `cpu block` for OpenVINO backend
|
||||
# is located in CPU memory but is referred to as a `gpu block`.
|
||||
# Because we want to reuse the existing block management procedure.
|
||||
device_blocks = num_gpu_blocks
|
||||
swap_blocks = num_cpu_blocks
|
||||
logger.info("OpenVINO %s: # device blocks: %d; # swap blocks: %d",
|
||||
envs.VLLM_OPENVINO_DEVICE, device_blocks, swap_blocks)
|
||||
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
output = self.driver_worker.execute_model(execute_model_req)
|
||||
return output
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
|
||||
return self.driver_worker.add_lora(lora_request)
|
||||
|
||||
def remove_lora(self, lora_id: int) -> bool:
|
||||
return self.driver_worker.remove_lora(lora_id)
|
||||
|
||||
def pin_lora(self, lora_id: int) -> bool:
|
||||
return self.driver_worker.pin_lora(lora_id)
|
||||
|
||||
def list_loras(self) -> Set[int]:
|
||||
return self.driver_worker.list_loras()
|
||||
|
||||
def add_prompt_adapter(self, prompt_adapter_request) -> bool:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the OPENVINO backend.")
|
||||
|
||||
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the OPENVINO backend.")
|
||||
|
||||
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the OPENVINO backend.")
|
||||
|
||||
def list_prompt_adapters(self) -> Set[int]:
|
||||
raise NotImplementedError(
|
||||
"Soft prompt is currently not supported by the OPENVINO backend.")
|
||||
|
||||
def check_health(self) -> None:
|
||||
# OpenVINOExecutor will always be healthy as long as
|
||||
# it's running.
|
||||
return
|
||||
|
||||
|
||||
class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
|
||||
|
||||
async def execute_model_async(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
output = await make_async(self.driver_worker.execute_model
|
||||
)(execute_model_req=execute_model_req, )
|
||||
return output
|
||||
|
||||
async def check_health_async(self) -> None:
|
||||
# OpenVINOExecutor will always be healthy as long as
|
||||
# it's running.
|
||||
return
|
||||
|
||||
|
||||
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
|
||||
if config.dtype != torch.float32:
|
||||
logger.warning(
|
||||
f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501
|
||||
)
|
||||
config.dtype = torch.float32
|
||||
if not config.enforce_eager:
|
||||
logger.warning(
|
||||
"CUDA graph is not supported on OpenVINO backend, fallback to the "
|
||||
"eager mode.")
|
||||
config.enforce_eager = True
|
||||
return config
|
||||
|
||||
|
||||
def _verify_and_get_cache_config(ov_core: ov.Core,
|
||||
config: CacheConfig) -> CacheConfig:
|
||||
if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
|
||||
if not current_platform.is_openvino_cpu():
|
||||
logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
|
||||
"ignored for GPU, f16 data type will be used.")
|
||||
config.cache_dtype = ov.Type.f16
|
||||
else:
|
||||
logger.info("KV cache type is overridden to u8 via "
|
||||
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
|
||||
config.cache_dtype = ov.Type.u8
|
||||
else:
|
||||
if current_platform.is_openvino_cpu():
|
||||
ov_device = envs.VLLM_OPENVINO_DEVICE
|
||||
inference_precision = ov_core.get_property(
|
||||
ov_device, hints.inference_precision)
|
||||
if inference_precision == ov.Type.bf16:
|
||||
config.cache_dtype = ov.Type.bf16
|
||||
else:
|
||||
config.cache_dtype = ov.Type.f16
|
||||
else:
|
||||
config.cache_dtype = ov.Type.f16
|
||||
|
||||
if current_platform.is_openvino_cpu():
|
||||
if config.block_size != 32:
|
||||
logger.info(
|
||||
f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501
|
||||
)
|
||||
config.block_size = 32
|
||||
else:
|
||||
if config.block_size != 16:
|
||||
logger.info(
|
||||
f"OpenVINO GPU optimal block size is 16, overriding currently set {config.block_size}" # noqa: G004, E501
|
||||
)
|
||||
config.block_size = 16
|
||||
|
||||
kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
|
||||
if kv_cache_space >= 0:
|
||||
if kv_cache_space == 0 and current_platform.is_openvino_cpu():
|
||||
config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore
|
||||
logger.warning(
|
||||
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
|
||||
"for OpenVINO backend is not set, using 4 by default.")
|
||||
else:
|
||||
config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
|
||||
f" {kv_cache_space}, expect a positive integer value.")
|
||||
|
||||
return config
|
||||
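The tail of _verify_and_get_cache_config maps VLLM_OPENVINO_KVCACHE_SPACE (GiB) to bytes, with a 4 GiB fallback when the variable is 0/unset on CPU. A small standalone sketch of that mapping (values are illustrative):

GiB_bytes = 1 << 30   # same constant vllm.utils exposes as GiB_bytes


def openvino_kvcache_bytes(kv_cache_space_gib: int, is_cpu: bool) -> int:
    # Mirrors the branch above: 0 on CPU falls back to 4 GiB, negatives are rejected.
    if kv_cache_space_gib < 0:
        raise RuntimeError("expected a non-negative VLLM_OPENVINO_KVCACHE_SPACE")
    if kv_cache_space_gib == 0 and is_cpu:
        return 4 * GiB_bytes
    return kv_cache_space_gib * GiB_bytes


print(openvino_kvcache_bytes(0, is_cpu=True))    # 4294967296 (4 GiB default)
print(openvino_kvcache_bytes(8, is_cpu=False))   # 8589934592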
580
vllm-v0.6.2/vllm/executor/ray_gpu_executor.py
Normal file
@@ -0,0 +1,580 @@
|
||||
import asyncio
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from itertools import islice, repeat
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import msgspec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
|
||||
DistributedGPUExecutor, DistributedGPUExecutorAsync)
|
||||
from vllm.executor.msgspec_utils import encode_hook
|
||||
from vllm.executor.ray_utils import RayWorkerWrapper, ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
|
||||
get_ip, get_open_port, get_vllm_instance_id,
|
||||
make_async)
|
||||
|
||||
if ray is not None:
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RayGPUExecutor(DistributedGPUExecutor):
|
||||
|
||||
uses_ray: bool = True
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
self.forward_dag: Optional[ray.dag.CompiledDAG] = None
|
||||
# If the env var is set, it uses the Ray's compiled DAG API
|
||||
# which optimizes the control plane overhead.
|
||||
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
|
||||
# Currently, this requires USE_RAY_SPMD_WORKER=True.
|
||||
self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
|
||||
# If the env var is set, then we do not distinguish between the
|
||||
# "driver worker" vs other workers. Also, the rank 0 worker will
|
||||
# be executed in a remote Ray worker. Currently this requires
|
||||
# USE_RAY_COMPILED_DAG=True.
|
||||
self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
if self.use_ray_compiled_dag:
|
||||
assert self.use_ray_spmd_worker, (
|
||||
"VLLM_USE_RAY_COMPILED_DAG=1 requires "
|
||||
"VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
if self.use_ray_spmd_worker:
|
||||
# TODO: Support SPMD worker for non-DAG Ray executor.
|
||||
assert self.use_ray_compiled_dag, (
|
||||
"VLLM_USE_RAY_SPMD_WORKER=1 requires "
|
||||
"VLLM_USE_RAY_COMPILED_DAG=1")
|
||||
|
||||
assert self.uses_ray
|
||||
placement_group = self.parallel_config.placement_group
|
||||
|
||||
# Disable Ray usage stats collection.
|
||||
ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
|
||||
if ray_usage != "1":
|
||||
os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
|
||||
|
||||
# Create the parallel GPU workers.
|
||||
self._init_workers_ray(placement_group)
|
||||
|
||||
self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
|
||||
self.output_decoder = msgspec.msgpack.Decoder(
|
||||
Optional[List[SamplerOutput]])
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if hasattr(self, "forward_dag") and self.forward_dag is not None:
|
||||
self.forward_dag.teardown()
|
||||
import ray
|
||||
for worker in self.workers:
|
||||
ray.kill(worker)
|
||||
self.forward_dag = None
|
||||
|
||||
def _configure_ray_workers_use_nsight(self,
|
||||
ray_remote_kwargs) -> Dict[str, Any]:
|
||||
# If nsight profiling is enabled, we need to set the profiling
|
||||
# configuration for the ray workers as runtime env.
|
||||
runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
|
||||
runtime_env.update({
|
||||
"nsight": {
|
||||
"t": "cuda,cudnn,cublas",
|
||||
"o": "'worker_process_%p'",
|
||||
"cuda-graph-trace": "node",
|
||||
}
|
||||
})
|
||||
|
||||
return ray_remote_kwargs
|
||||
|
||||
def _get_worker_wrapper_args(self) -> Dict[str, Any]:
|
||||
(worker_module_name, worker_class_name,
|
||||
worker_class_fn) = self._get_worker_module_and_class()
|
||||
|
||||
return dict(
|
||||
worker_module_name=worker_module_name,
|
||||
worker_class_name=worker_class_name,
|
||||
worker_class_fn=worker_class_fn,
|
||||
trust_remote_code=self.model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
# child class could overwrite this to return actual env vars.
|
||||
def _get_env_vars_to_be_updated(self):
|
||||
return self._env_vars_for_all_workers
|
||||
|
||||
def _init_workers_ray(self, placement_group: "PlacementGroup",
|
||||
**ray_remote_kwargs):
|
||||
if (self.parallel_config.tensor_parallel_size == 1
|
||||
and self.parallel_config.pipeline_parallel_size == 1):
|
||||
# For single GPU case, we use a ray worker with constrained memory.
|
||||
num_gpus = self.cache_config.gpu_memory_utilization
|
||||
else:
|
||||
# Otherwise, the ray workers are allocated with a full GPU.
|
||||
num_gpus = 1
|
||||
|
||||
# The driver dummy worker does not actually use any resources.
|
||||
# It holds the resource for the driver worker.
|
||||
self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
|
||||
# The remaining workers are the actual ray actors.
|
||||
self.workers: List[RayWorkerWrapper] = []
|
||||
|
||||
# Used in ray compiled DAG: indexed first by PP rank,
|
||||
# and then TP rank. In other words, the inner list is
|
||||
# the TP group of workers for a PP rank.
|
||||
self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
|
||||
|
||||
if self.parallel_config.ray_workers_use_nsight:
|
||||
ray_remote_kwargs = self._configure_ray_workers_use_nsight(
|
||||
ray_remote_kwargs)
|
||||
|
||||
logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
|
||||
|
||||
# Create the workers.
|
||||
driver_ip = get_ip()
|
||||
worker_wrapper_kwargs = self._get_worker_wrapper_args()
|
||||
for bundle_id, bundle in enumerate(placement_group.bundle_specs):
|
||||
if not bundle.get("GPU", 0):
|
||||
continue
|
||||
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
||||
placement_group=placement_group,
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=bundle_id,
|
||||
)
|
||||
|
||||
worker = ray.remote(
|
||||
num_cpus=0,
|
||||
num_gpus=num_gpus,
|
||||
scheduling_strategy=scheduling_strategy,
|
||||
**ray_remote_kwargs,
|
||||
)(RayWorkerWrapper).remote(**worker_wrapper_kwargs)
|
||||
|
||||
if self.use_ray_spmd_worker:
|
||||
self.workers.append(worker)
|
||||
else:
|
||||
worker_ip = ray.get(worker.get_node_ip.remote())
|
||||
if worker_ip == driver_ip and self.driver_dummy_worker is None:
|
||||
# If the worker is on the same node as the driver, we use it
|
||||
# as the resource holder for the driver process.
|
||||
self.driver_dummy_worker = worker
|
||||
self.driver_worker = RayWorkerWrapper(
|
||||
**worker_wrapper_kwargs)
|
||||
else:
|
||||
# Else, added to the list of workers.
|
||||
self.workers.append(worker)
|
||||
|
||||
logger.debug("workers: %s", self.workers)
|
||||
logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
|
||||
if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
|
||||
raise ValueError(
|
||||
"Ray does not allocate any GPUs on the driver node. Consider "
|
||||
"adjusting the Ray placement group or running the driver on a "
|
||||
"GPU node.")
|
||||
|
||||
worker_ips = [
|
||||
ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined]
|
||||
for worker in self.workers
|
||||
]
|
||||
ip_counts: Dict[str, int] = {}
|
||||
for ip in worker_ips:
|
||||
ip_counts[ip] = ip_counts.get(ip, 0) + 1
|
||||
|
||||
def sort_by_driver_then_worker_ip(worker):
|
||||
"""
|
||||
Sort the workers based on 3 properties:
|
||||
1. If the worker is on the same node as the driver (vllm engine),
|
||||
it should be placed first.
|
||||
2. Then, if the worker is on a node with fewer workers, it should
|
||||
be placed first.
|
||||
3. Finally, if the worker is on a node with a smaller IP address, it
|
||||
should be placed first.
|
||||
"""
|
||||
ip = ray.get(worker.get_node_ip.remote())
|
||||
return (ip != driver_ip, ip_counts[ip], ip)
|
||||
|
||||
# After sorting, the workers on the same node will be
|
||||
# close to each other, and the workers on the driver
|
||||
# node will be placed first.
|
||||
self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
|
||||
|
||||
# Get the set of GPU IDs used on each node.
|
||||
worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
|
||||
use_dummy_driver=True)
|
||||
|
||||
node_workers = defaultdict(list) # node id -> list of worker ranks
|
||||
node_gpus = defaultdict(list) # node id -> list of gpu ids
|
||||
|
||||
for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
|
||||
node_workers[node_id].append(i)
|
||||
# `gpu_ids` can be a list of strings or integers.
|
||||
# convert them to integers for consistency.
|
||||
# NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
|
||||
# string sorting is not sufficient.
|
||||
# see https://github.com/vllm-project/vllm/issues/5590
|
||||
gpu_ids = [int(x) for x in gpu_ids]
|
||||
node_gpus[node_id].extend(gpu_ids)
|
||||
for node_id, gpu_ids in node_gpus.items():
|
||||
node_gpus[node_id] = sorted(gpu_ids)
|
||||
|
||||
all_ips = set(worker_ips + [driver_ip])
|
||||
n_ips = len(all_ips)
|
||||
n_nodes = len(node_workers)
|
||||
|
||||
if n_nodes != n_ips:
|
||||
raise RuntimeError(
|
||||
f"Every node should have a unique IP address. Got {n_nodes}"
|
||||
f" nodes with node ids {list(node_workers.keys())} and "
|
||||
f"{n_ips} unique IP addresses {all_ips}. Please check your"
|
||||
" network configuration. If you set `VLLM_HOST_IP` or "
|
||||
"`HOST_IP` environment variable, make sure it is unique for"
|
||||
" each node.")
|
||||
|
||||
VLLM_INSTANCE_ID = get_vllm_instance_id()
|
||||
|
||||
# Set environment variables for the driver and workers.
|
||||
all_args_to_update_environment_variables = [({
|
||||
"CUDA_VISIBLE_DEVICES":
|
||||
",".join(map(str, node_gpus[node_id])),
|
||||
"VLLM_INSTANCE_ID":
|
||||
VLLM_INSTANCE_ID,
|
||||
"VLLM_TRACE_FUNCTION":
|
||||
str(envs.VLLM_TRACE_FUNCTION),
|
||||
**({
|
||||
"VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
|
||||
} if envs.VLLM_ATTENTION_BACKEND is not None else {})
|
||||
}, ) for (node_id, _) in worker_node_and_gpu_ids]
|
||||
|
||||
self._env_vars_for_all_workers = (
|
||||
all_args_to_update_environment_variables)
|
||||
|
||||
self._run_workers("update_environment_variables",
|
||||
all_args=self._get_env_vars_to_be_updated())
|
||||
|
||||
if len(node_gpus) == 1:
|
||||
# in single node case, we don't need to get the IP address.
|
||||
# the loopback address is sufficient
|
||||
# NOTE: a node may have several IP addresses, one for each
|
||||
# network interface. `get_ip()` might return any of them,
|
||||
# while they might not work for communication inside the node
|
||||
# if the network setup is complicated. Using the loopback address
|
||||
# solves this issue, as it always works for communication inside
|
||||
# the node.
|
||||
driver_ip = "127.0.0.1"
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
driver_ip, get_open_port())
|
||||
|
||||
# Initialize the actual workers inside worker wrapper.
|
||||
init_worker_all_kwargs = [
|
||||
self._get_worker_kwargs(
|
||||
local_rank=node_workers[node_id].index(rank),
|
||||
rank=rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
|
||||
]
|
||||
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
|
||||
|
||||
self._run_workers("init_device")
|
||||
self._run_workers("load_model",
|
||||
max_concurrent_workers=self.parallel_config.
|
||||
max_parallel_loading_workers)
|
||||
|
||||
if self.use_ray_spmd_worker:
|
||||
for pp_rank in range(self.parallel_config.pipeline_parallel_size):
|
||||
self.pp_tp_workers.append([])
|
||||
for tp_rank in range(
|
||||
self.parallel_config.tensor_parallel_size):
|
||||
# PP=2, TP=4
|
||||
# pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
|
||||
rank = (pp_rank * self.parallel_config.tensor_parallel_size
|
||||
) + tp_rank
|
||||
assert len(self.pp_tp_workers[pp_rank]) == tp_rank
|
||||
assert pp_rank < len(self.pp_tp_workers)
|
||||
self.pp_tp_workers[pp_rank].append(self.workers[rank])
|
||||
|
||||
# This is the list of workers that are rank 0 of each TP group EXCEPT
|
||||
# global rank 0. These are the workers that will broadcast to the
|
||||
# rest of the workers.
|
||||
self.tp_driver_workers: List[RayWorkerWrapper] = []
|
||||
# This is the list of workers that are not drivers and not the first
|
||||
# worker in a TP group. These are the workers that will be
|
||||
# broadcasted to.
|
||||
self.non_driver_workers: List[RayWorkerWrapper] = []
|
||||
|
||||
# Enforce rank order for correct rank to return final output.
|
||||
for index, worker in enumerate(self.workers):
|
||||
# The driver worker is rank 0 and not in self.workers.
|
||||
rank = index + 1
|
||||
if rank % self.parallel_config.tensor_parallel_size == 0:
|
||||
self.tp_driver_workers.append(worker)
|
||||
else:
|
||||
self.non_driver_workers.append(worker)
|
||||
|
||||
def _driver_execute_model(
|
||||
self, execute_model_req: Optional[ExecuteModelRequest]
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
"""Run execute_model in the driver worker.
|
||||
|
||||
Passing None will cause the driver to stop the model execution
|
||||
loop running in each of the remote workers.
|
||||
"""
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
return self.driver_worker.execute_method("execute_model",
|
||||
execute_model_req)
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
if not self.use_ray_spmd_worker:
|
||||
return super().execute_model(execute_model_req)
|
||||
|
||||
if self.forward_dag is None:
|
||||
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
|
||||
|
||||
serialized_data = self.input_encoder.encode(execute_model_req)
|
||||
outputs = ray.get(self.forward_dag.execute(serialized_data))
|
||||
output = self.output_decoder.decode(outputs[0])
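# Every TP rank in the last PP stage produces the same SamplerOutput, so
# decoding the first DAG output suffices.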
|
||||
return output
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
method: str,
|
||||
*args,
|
||||
async_run_tensor_parallel_workers_only: bool = False,
|
||||
all_args: Optional[List[Tuple[Any, ...]]] = None,
|
||||
all_kwargs: Optional[List[Dict[str, Any]]] = None,
|
||||
use_dummy_driver: bool = False,
|
||||
max_concurrent_workers: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Any:
|
||||
"""Runs the given method on all workers. Can be used in the following
|
||||
ways:
|
||||
|
||||
Args:
|
||||
- async_run_tensor_parallel_workers_only: If True the method will be
|
||||
run only in the remote TP workers, not the driver worker.
|
||||
It will also be run asynchronously and return a list of futures
|
||||
rather than blocking on the results.
|
||||
- args/kwargs: All workers share the same args/kwargs
|
||||
- all_args/all_kwargs: args/kwargs for each worker are specified
|
||||
individually
|
||||
"""
|
||||
if self.use_ray_spmd_worker:
|
||||
assert not async_run_tensor_parallel_workers_only, (
|
||||
"async_run_tensor_parallel_workers_only is not supported for "
|
||||
"spmd mode.")
|
||||
|
||||
if max_concurrent_workers:
|
||||
raise NotImplementedError(
|
||||
"max_concurrent_workers is not supported yet.")
|
||||
|
||||
count = len(self.workers) if not \
|
||||
async_run_tensor_parallel_workers_only \
|
||||
else len(self.non_driver_workers)
|
||||
# If using SPMD worker, all workers are the same, so we should execute
|
||||
# the args on all workers. Otherwise, we skip the first worker's args
|
||||
# because those args will go to the driver worker.
|
||||
first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
|
||||
all_worker_args = repeat(args, count) if all_args is None \
|
||||
else islice(all_args, first_worker_args_index, None)
|
||||
all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
|
||||
else islice(all_kwargs, first_worker_args_index, None)
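# e.g. in non-SPMD mode with all_args = [driver_args, w1_args, w2_args],
# islice(..., 1, None) routes w1_args/w2_args to the remote Ray workers,
# while all_args[0] is kept for the driver worker below.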
|
||||
|
||||
# Start the ray workers first.
|
||||
ray_workers = self.workers
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
ray_workers = self.non_driver_workers
|
||||
ray_worker_outputs = [
|
||||
worker.execute_method.remote(method, *worker_args, **worker_kwargs)
|
||||
for (worker, worker_args, worker_kwargs
|
||||
) in zip(ray_workers, all_worker_args, all_worker_kwargs)
|
||||
]
|
||||
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
# Just return futures
|
||||
return ray_worker_outputs
|
||||
|
||||
driver_worker_output = []
|
||||
# In SPMD mode, the driver worker is the same as any other worker,
|
||||
# so we only explicitly execute on the driver worker if using a
|
||||
# non-SPMD worker class.
|
||||
if not self.use_ray_spmd_worker:
|
||||
driver_args = args if all_args is None else all_args[0]
|
||||
driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
|
||||
|
||||
# Start the driver worker after all the ray workers.
|
||||
if not use_dummy_driver:
|
||||
driver_worker_output = [
|
||||
self.driver_worker.execute_method(method, *driver_args,
|
||||
**driver_kwargs)
|
||||
]
|
||||
else:
|
||||
assert self.driver_dummy_worker is not None
|
||||
driver_worker_output = [
|
||||
ray.get(
|
||||
self.driver_dummy_worker.execute_method.remote(
|
||||
method, *driver_args, **driver_kwargs))
|
||||
]
|
||||
|
||||
# Get the results of the ray workers.
|
||||
if self.workers:
|
||||
ray_worker_outputs = ray.get(ray_worker_outputs)
|
||||
|
||||
return driver_worker_output + ray_worker_outputs
|
||||
|
||||
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
|
||||
"""Wait for futures returned from _run_workers() with
|
||||
async_run_remote_workers_only to complete."""
|
||||
ray.get(parallel_worker_tasks)
|
||||
|
||||
def _check_ray_adag_installation(self):
|
||||
import pkg_resources
|
||||
from packaging import version
|
||||
|
||||
required_version = version.parse("2.35")
|
||||
current_version = version.parse(
|
||||
pkg_resources.get_distribution("ray").version)
|
||||
# TODO: update the constraint once we adapt to the backward
|
||||
# incompatible API change from ray 2.36
|
||||
if current_version != required_version:
|
||||
raise ValueError(f"Ray version {required_version} is "
|
||||
f"required, but found {current_version}")
|
||||
|
||||
import importlib.util
|
||||
adag_spec = importlib.util.find_spec(
|
||||
"ray.experimental.compiled_dag_ref")
|
||||
if adag_spec is None:
|
||||
raise ValueError("Ray accelerated DAG is not installed. "
|
||||
"Run `pip install ray[adag]` to install it.")
|
||||
|
||||
cupy_spec = importlib.util.find_spec("cupy")
|
||||
if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
|
||||
raise ValueError(
|
||||
"cupy is not installed but required since "
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set."
|
||||
"Run `pip install ray[adag]` and check cupy installation.")
|
||||
|
||||
def _compiled_ray_dag(self, enable_asyncio: bool):
|
||||
assert self.parallel_config.use_ray
|
||||
self._check_ray_adag_installation()
|
||||
from ray.dag import InputNode, MultiOutputNode
|
||||
from ray.experimental.channel.torch_tensor_type import TorchTensorType
|
||||
|
||||
logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s",
|
||||
envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
|
||||
with InputNode() as input_data:
|
||||
# Example DAG: PP=2, TP=4
|
||||
# (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501
|
||||
# -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput # noqa: E501
|
||||
# -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput # noqa: E501
|
||||
# -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput # noqa: E501
|
||||
|
||||
# All workers in the first TP group will take in the
|
||||
# ExecuteModelRequest as input.
|
||||
outputs = [input_data for _ in self.pp_tp_workers[0]]
|
||||
for pp_rank, tp_group in enumerate(self.pp_tp_workers):
|
||||
# Each PP worker takes in the output of the previous PP worker,
|
||||
# and the TP group executes in SPMD fashion.
|
||||
outputs = [
|
||||
worker.execute_model_spmd.
|
||||
bind( # type: ignore[attr-defined]
|
||||
outputs[i]) for i, worker in enumerate(tp_group)
|
||||
]
|
||||
|
||||
last_pp_rank = len(self.pp_tp_workers) - 1
|
||||
if pp_rank < last_pp_rank:
|
||||
# Specify how intermediate tensors should be passed
|
||||
# between pp stages, no need to specify for the last
|
||||
# pp stage.
|
||||
transport = "nccl" \
|
||||
if envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL \
|
||||
else "auto"
|
||||
outputs = [
|
||||
output.with_type_hint(
|
||||
TorchTensorType(transport=transport))
|
||||
for output in outputs
|
||||
]
|
||||
|
||||
forward_dag = MultiOutputNode(outputs)
|
||||
|
||||
return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
|
||||
|
||||
class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.pp_locks: Optional[List[asyncio.Lock]] = None
|
||||
self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
if not self.use_ray_compiled_dag:
|
||||
self.driver_exec_method = make_async(
|
||||
self.driver_worker.execute_method)
|
||||
|
||||
async def execute_model_async(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
if not self.use_ray_spmd_worker:
|
||||
return await super().execute_model_async(execute_model_req)
|
||||
|
||||
if self.forward_dag is None:
|
||||
self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
|
||||
|
||||
serialized_data = self.input_encoder.encode(execute_model_req)
|
||||
dag_future = await self.forward_dag.execute_async(serialized_data)
|
||||
outputs = await dag_future
|
||||
return self.output_decoder.decode(outputs[0])
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None
|
||||
) -> List[SamplerOutput]:
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
if not self.tp_driver_workers:
|
||||
return await self.driver_exec_method("execute_model",
|
||||
execute_model_req)
|
||||
if self.pp_locks is None:
|
||||
# This locks each pipeline parallel stage so multiple virtual
|
||||
# engines can't execute on the same stage at the same time
|
||||
# We create the locks here to avoid creating them in the constructor
|
||||
# which uses a different asyncio loop.
|
||||
self.pp_locks = [
|
||||
asyncio.Lock()
|
||||
for _ in range(self.parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
|
||||
"execute_model", execute_model_req))
|
||||
]
|
||||
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
|
||||
start=1):
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(driver_worker.execute_method.remote,
|
||||
self.pp_locks[pp_rank],
|
||||
"execute_model", execute_model_req)))
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Only the last PP stage has the final results.
|
||||
return results[-1]
|
||||
|
||||
async def _start_worker_execution_loop(self):
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
coros = [
|
||||
worker.execute_method.remote("start_worker_execution_loop")
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
return await asyncio.gather(*coros)
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
554
vllm-v0.6.2/vllm/executor/ray_hpu_executor.py
Normal file
@@ -0,0 +1,554 @@
|
||||
import asyncio
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from itertools import islice, repeat
|
||||
from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
|
||||
Type)
|
||||
|
||||
import msgspec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
|
||||
DistributedGPUExecutor, DistributedGPUExecutorAsync)
|
||||
from vllm.executor.msgspec_utils import encode_hook
|
||||
from vllm.executor.ray_utils import RayWorkerWrapper, ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
|
||||
get_ip, get_open_port, get_vllm_instance_id,
|
||||
make_async)
|
||||
from vllm.worker.worker_base import WorkerBase
|
||||
|
||||
if ray is not None:
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RayHPUExecutor(DistributedGPUExecutor):
|
||||
|
||||
uses_ray: bool = True
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
self.forward_dag: Optional[ray.dag.CompiledDAG] = None
|
||||
# If the env var is set, it uses the Ray's compiled DAG API
|
||||
# which optimizes the control plane overhead.
|
||||
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
|
||||
# Currently, this requires USE_RAY_SPMD_WORKER=True.
|
||||
self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
|
||||
# If the env var is set, then we do not distinguish between the
|
||||
# "driver worker" vs other workers. Also, the rank 0 worker will
|
||||
# be executed in a remote Ray worker. Currently this requires
|
||||
# USE_RAY_COMPILED_DAG=True.
|
||||
self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
if self.use_ray_compiled_dag:
|
||||
assert self.use_ray_spmd_worker, (
|
||||
"VLLM_USE_RAY_COMPILED_DAG=1 requires "
|
||||
"VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
if self.use_ray_spmd_worker:
|
||||
# TODO: Support SPMD worker for non-DAG Ray executor.
|
||||
assert self.use_ray_compiled_dag, (
|
||||
"VLLM_USE_RAY_SPMD_WORKER=1 requires "
|
||||
"VLLM_USE_RAY_COMPILED_DAG=1")
|
||||
|
||||
assert self.uses_ray
|
||||
placement_group = self.parallel_config.placement_group
|
||||
|
||||
# Disable Ray usage stats collection.
|
||||
ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
|
||||
if ray_usage != "1":
|
||||
os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
|
||||
|
||||
# Create the parallel GPU workers.
|
||||
self._init_workers_ray(placement_group)
|
||||
|
||||
self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
|
||||
self.output_decoder = msgspec.msgpack.Decoder(
|
||||
Optional[List[SamplerOutput]])
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if hasattr(self, "forward_dag") and self.forward_dag is not None:
|
||||
self.forward_dag.teardown()
|
||||
import ray
|
||||
for worker in self.workers:
|
||||
ray.kill(worker)
|
||||
self.forward_dag = None
|
||||
|
||||
def finish_measurements(self):
|
||||
self._run_workers("finish_measurements")
|
||||
|
||||
def _get_worker_module_and_class(
|
||||
self
|
||||
) -> Tuple[str, str, Optional[Callable[[],
|
||||
Type[WorkerBase]]]]: # noqa: F821
|
||||
worker_class_fn = None
|
||||
if self.scheduler_config.is_multi_step:
|
||||
raise NotImplementedError(
|
||||
"Multi-step execution is not implemented for HPU")
|
||||
elif self.speculative_config:
|
||||
raise NotImplementedError(
|
||||
"Speculative decoding is not implemented for HPU")
|
||||
else:
|
||||
worker_module_name = "vllm.worker.hpu_worker"
|
||||
worker_class_name = "HPUWorker"
|
||||
return (worker_module_name, worker_class_name, worker_class_fn)
|
||||
|
||||
def _get_worker_wrapper_args(self) -> Dict[str, Any]:
|
||||
(worker_module_name, worker_class_name,
|
||||
worker_class_fn) = self._get_worker_module_and_class()
|
||||
|
||||
return dict(
|
||||
worker_module_name=worker_module_name,
|
||||
worker_class_name=worker_class_name,
|
||||
worker_class_fn=worker_class_fn,
|
||||
trust_remote_code=self.model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
def _init_workers_ray(self, placement_group: "PlacementGroup",
|
||||
**ray_remote_kwargs):
|
||||
# Each Ray worker is allocated a full HPU.
|
||||
num_gpus = 1
|
||||
|
||||
# The driver dummy worker does not actually use any resources.
|
||||
# It holds the resource for the driver worker.
|
||||
self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
|
||||
# The remaining workers are the actual ray actors.
|
||||
self.workers: List[RayWorkerWrapper] = []
|
||||
|
||||
# Used in ray compiled DAG: indexed first by PP rank,
|
||||
# and then TP rank. In other words, the inner list is
|
||||
# the TP group of workers for a PP rank.
|
||||
self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
|
||||
|
||||
logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
|
||||
|
||||
# Create the workers.
|
||||
driver_ip = get_ip()
|
||||
worker_wrapper_kwargs = self._get_worker_wrapper_args()
|
||||
for bundle_id, bundle in enumerate(placement_group.bundle_specs):
|
||||
if not bundle.get("HPU", 0):
|
||||
continue
|
||||
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
||||
placement_group=placement_group,
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=bundle_id,
|
||||
)
|
||||
|
||||
worker = ray.remote(
|
||||
num_cpus=0,
|
||||
num_gpus=0,
|
||||
resources={'HPU': num_gpus},
|
||||
scheduling_strategy=scheduling_strategy,
|
||||
**ray_remote_kwargs,
|
||||
)(RayWorkerWrapper).remote(**worker_wrapper_kwargs)
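# Each HPU bundle gets a dedicated actor that reserves one "HPU" custom
# resource and requests no CPUs or GPUs from Ray's scheduler.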
|
||||
|
||||
if self.use_ray_spmd_worker:
|
||||
self.workers.append(worker)
|
||||
else:
|
||||
worker_ip = ray.get(worker.get_node_ip.remote())
|
||||
if worker_ip == driver_ip and self.driver_dummy_worker is None:
|
||||
# If the worker is on the same node as the driver, we use it
|
||||
# as the resource holder for the driver process.
|
||||
self.driver_dummy_worker = worker
|
||||
self.driver_worker = RayWorkerWrapper(
|
||||
**worker_wrapper_kwargs)
|
||||
else:
|
||||
# Else, added to the list of workers.
|
||||
self.workers.append(worker)
|
||||
|
||||
logger.debug("workers: %s", self.workers)
|
||||
logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
|
||||
if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
|
||||
raise ValueError(
|
||||
"Ray does not allocate any GPUs on the driver node. Consider "
|
||||
"adjusting the Ray placement group or running the driver on a "
|
||||
"GPU node.")
|
||||
|
||||
worker_ips = [
|
||||
ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined]
|
||||
for worker in self.workers
|
||||
]
|
||||
ip_counts: Dict[str, int] = {}
|
||||
for ip in worker_ips:
|
||||
ip_counts[ip] = ip_counts.get(ip, 0) + 1
|
||||
|
||||
def sort_by_driver_then_worker_ip(worker):
|
||||
"""
|
||||
Sort the workers based on 3 properties:
|
||||
1. If the worker is on the same node as the driver (vllm engine),
|
||||
it should be placed first.
|
||||
2. Then, if the worker is on a node with fewer workers, it should
|
||||
be placed first.
|
||||
3. Finally, if the worker is on a node with a smaller IP address, it
|
||||
should be placed first.
|
||||
"""
|
||||
ip = ray.get(worker.get_node_ip.remote())
|
||||
return (ip != driver_ip, ip_counts[ip], ip)
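# Sort-key example: a worker sharing the driver's IP yields
# (False, count, ip) and sorts before any (True, ...) entry; ties are broken
# by how many workers share the node, then by the IP string itself.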
|
||||
|
||||
# After sorting, the workers on the same node will be
|
||||
# close to each other, and the workers on the driver
|
||||
# node will be placed first.
|
||||
self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
|
||||
|
||||
# Get the set of GPU IDs used on each node.
|
||||
worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
|
||||
use_dummy_driver=True)
|
||||
|
||||
node_workers = defaultdict(list) # node id -> list of worker ranks
|
||||
node_gpus = defaultdict(list) # node id -> list of gpu ids
|
||||
|
||||
for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
|
||||
node_workers[node_id].append(i)
|
||||
# `gpu_ids` can be a list of strings or integers.
|
||||
# convert them to integers for consistency.
|
||||
# NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
|
||||
# string sorting is not sufficient.
|
||||
# see https://github.com/vllm-project/vllm/issues/5590
|
||||
gpu_ids = [int(x) for x in gpu_ids]
|
||||
node_gpus[node_id].extend(gpu_ids)
|
||||
for node_id, gpu_ids in node_gpus.items():
|
||||
node_gpus[node_id] = sorted(gpu_ids)
|
||||
|
||||
all_ips = set(worker_ips + [driver_ip])
|
||||
n_ips = len(all_ips)
|
||||
n_nodes = len(node_workers)
|
||||
|
||||
if n_nodes != n_ips:
|
||||
raise RuntimeError(
|
||||
f"Every node should have a unique IP address. Got {n_nodes}"
|
||||
f" nodes with node ids {list(node_workers.keys())} and "
|
||||
f"{n_ips} unique IP addresses {all_ips}. Please check your"
|
||||
" network configuration. If you set `VLLM_HOST_IP` or "
|
||||
"`HOST_IP` environment variable, make sure it is unique for"
|
||||
" each node.")
|
||||
|
||||
VLLM_INSTANCE_ID = get_vllm_instance_id()
|
||||
|
||||
# Set environment variables for the driver and workers.
|
||||
all_args_to_update_environment_variables = [({
|
||||
"VLLM_INSTANCE_ID":
|
||||
VLLM_INSTANCE_ID,
|
||||
"VLLM_TRACE_FUNCTION":
|
||||
str(envs.VLLM_TRACE_FUNCTION),
|
||||
}, ) for (node_id, _) in worker_node_and_gpu_ids]
|
||||
self._run_workers("update_environment_variables",
|
||||
all_args=all_args_to_update_environment_variables)
|
||||
|
||||
if len(node_gpus) == 1:
|
||||
# in single node case, we don't need to get the IP address.
|
||||
# the loopback address is sufficient
|
||||
# NOTE: a node may have several IP addresses, one for each
|
||||
# network interface. `get_ip()` might return any of them,
|
||||
# while they might not work for communication inside the node
|
||||
# if the network setup is complicated. Using the loopback address
|
||||
# solves this issue, as it always works for communication inside
|
||||
# the node.
|
||||
driver_ip = "127.0.0.1"
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
driver_ip, get_open_port())
|
||||
|
||||
# Initialize the actual workers inside worker wrapper.
|
||||
init_worker_all_kwargs = [
|
||||
self._get_worker_kwargs(
|
||||
local_rank=node_workers[node_id].index(rank),
|
||||
rank=rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
|
||||
]
|
||||
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
|
||||
|
||||
self._run_workers("init_device")
|
||||
self._run_workers("load_model",
|
||||
max_concurrent_workers=self.parallel_config.
|
||||
max_parallel_loading_workers)
|
||||
|
||||
if self.use_ray_spmd_worker:
|
||||
for pp_rank in range(self.parallel_config.pipeline_parallel_size):
|
||||
self.pp_tp_workers.append([])
|
||||
for tp_rank in range(
|
||||
self.parallel_config.tensor_parallel_size):
|
||||
# PP=2, TP=4
|
||||
# pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
|
||||
rank = (pp_rank * self.parallel_config.tensor_parallel_size
|
||||
) + tp_rank
|
||||
assert len(self.pp_tp_workers[pp_rank]) == tp_rank
|
||||
assert pp_rank < len(self.pp_tp_workers)
|
||||
self.pp_tp_workers[pp_rank].append(self.workers[rank])
|
||||
|
||||
# This is the list of workers that are rank 0 of each TP group EXCEPT
|
||||
# global rank 0. These are the workers that will broadcast to the
|
||||
# rest of the workers.
|
||||
self.tp_driver_workers: List[RayWorkerWrapper] = []
|
||||
# This is the list of workers that are not drivers and not the first
|
||||
# worker in a TP group. These are the workers that will be
|
||||
# broadcasted to.
|
||||
self.non_driver_workers: List[RayWorkerWrapper] = []
|
||||
|
||||
# Enforce rank order for correct rank to return final output.
|
||||
for index, worker in enumerate(self.workers):
|
||||
# The driver worker is rank 0 and not in self.workers.
|
||||
rank = index + 1
|
||||
if rank % self.parallel_config.tensor_parallel_size == 0:
|
||||
self.tp_driver_workers.append(worker)
|
||||
else:
|
||||
self.non_driver_workers.append(worker)
|
||||
|
||||
def _driver_execute_model(
|
||||
self, execute_model_req: Optional[ExecuteModelRequest]
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
"""Run execute_model in the driver worker.
|
||||
|
||||
Passing None will cause the driver to stop the model execution
|
||||
loop running in each of the remote workers.
|
||||
"""
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
return self.driver_worker.execute_method("execute_model",
|
||||
execute_model_req)
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
if not self.use_ray_spmd_worker:
|
||||
return super().execute_model(execute_model_req)
|
||||
|
||||
if self.forward_dag is None:
|
||||
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
|
||||
|
||||
serialized_data = self.input_encoder.encode(execute_model_req)
|
||||
outputs = ray.get(self.forward_dag.execute(serialized_data))
|
||||
output = self.output_decoder.decode(outputs[0])
|
||||
return output
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
method: str,
|
||||
*args,
|
||||
async_run_tensor_parallel_workers_only: bool = False,
|
||||
all_args: Optional[List[Tuple[Any, ...]]] = None,
|
||||
all_kwargs: Optional[List[Dict[str, Any]]] = None,
|
||||
use_dummy_driver: bool = False,
|
||||
max_concurrent_workers: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Any:
|
||||
"""Runs the given method on all workers. Can be used in the following
|
||||
ways:
|
||||
|
||||
Args:
|
||||
- async_run_tensor_parallel_workers_only: If True the method will be
|
||||
run only in the remote TP workers, not the driver worker.
|
||||
It will also be run asynchronously and return a list of futures
|
||||
rather than blocking on the results.
|
||||
- args/kwargs: All workers share the same args/kwargs
|
||||
- all_args/all_kwargs: args/kwargs for each worker are specified
|
||||
individually
|
||||
"""
|
||||
if self.use_ray_spmd_worker:
|
||||
assert not async_run_tensor_parallel_workers_only, (
|
||||
"async_run_tensor_parallel_workers_only is not supported for "
|
||||
"spmd mode.")
|
||||
|
||||
if max_concurrent_workers:
|
||||
raise NotImplementedError(
|
||||
"max_concurrent_workers is not supported yet.")
|
||||
|
||||
count = len(self.workers) if not \
|
||||
async_run_tensor_parallel_workers_only \
|
||||
else len(self.non_driver_workers)
|
||||
# If using SPMD worker, all workers are the same, so we should execute
|
||||
# the args on all workers. Otherwise, we skip the first worker's args
|
||||
# because those args will go to the driver worker.
|
||||
first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
|
||||
all_worker_args = repeat(args, count) if all_args is None \
|
||||
else islice(all_args, first_worker_args_index, None)
|
||||
all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
|
||||
else islice(all_kwargs, first_worker_args_index, None)
|
||||
|
||||
# Start the ray workers first.
|
||||
ray_workers = self.workers
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
ray_workers = self.non_driver_workers
|
||||
ray_worker_outputs = [
|
||||
worker.execute_method.remote(method, *worker_args, **worker_kwargs)
|
||||
for (worker, worker_args, worker_kwargs
|
||||
) in zip(ray_workers, all_worker_args, all_worker_kwargs)
|
||||
]
|
||||
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
# Just return futures
|
||||
return ray_worker_outputs
|
||||
|
||||
driver_worker_output = []
|
||||
# In SPMD mode, the driver worker is the same as any other worker,
|
||||
# so we only explicitly execute on the driver worker if using a
|
||||
# non-SPMD worker class.
|
||||
if not self.use_ray_spmd_worker:
|
||||
driver_args = args if all_args is None else all_args[0]
|
||||
driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
|
||||
|
||||
# Start the driver worker after all the ray workers.
|
||||
if not use_dummy_driver:
|
||||
driver_worker_output = [
|
||||
self.driver_worker.execute_method(method, *driver_args,
|
||||
**driver_kwargs)
|
||||
]
|
||||
else:
|
||||
assert self.driver_dummy_worker is not None
|
||||
driver_worker_output = [
|
||||
ray.get(
|
||||
self.driver_dummy_worker.execute_method.remote(
|
||||
method, *driver_args, **driver_kwargs))
|
||||
]
|
||||
|
||||
# Get the results of the ray workers.
|
||||
if self.workers:
|
||||
ray_worker_outputs = ray.get(ray_worker_outputs)
|
||||
|
||||
return driver_worker_output + ray_worker_outputs
|
||||
|
||||
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
|
||||
"""Wait for futures returned from _run_workers() with
|
||||
async_run_remote_workers_only to complete."""
|
||||
ray.get(parallel_worker_tasks)
|
||||
|
||||
def _check_ray_adag_installation(self):
|
||||
import pkg_resources
|
||||
from packaging import version
|
||||
|
||||
required_version = version.parse("2.35")
|
||||
current_version = version.parse(
|
||||
pkg_resources.get_distribution("ray").version)
|
||||
# TODO: update the constraint once we adapt to the backward
|
||||
# incompatible API change from ray 2.36
|
||||
if current_version != required_version:
|
||||
raise ValueError(f"Ray version {required_version} is "
|
||||
f"required, but found {current_version}")
|
||||
|
||||
import importlib.util
|
||||
adag_spec = importlib.util.find_spec(
|
||||
"ray.experimental.compiled_dag_ref")
|
||||
if adag_spec is None:
|
||||
raise ValueError("Ray accelerated DAG is not installed. "
|
||||
"Run `pip install ray[adag]` to install it.")
|
||||
|
||||
def _compiled_ray_dag(self, enable_asyncio: bool):
|
||||
assert self.parallel_config.use_ray
|
||||
self._check_ray_adag_installation()
|
||||
from ray.dag import InputNode, MultiOutputNode
|
||||
from ray.experimental.channel.torch_tensor_type import TorchTensorType
|
||||
|
||||
with InputNode() as input_data:
|
||||
# Example DAG: PP=2, TP=4
|
||||
# (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501
|
||||
# -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput # noqa: E501
|
||||
# -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput # noqa: E501
|
||||
# -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput # noqa: E501
|
||||
|
||||
# All workers in the first TP group will take in the
|
||||
# ExecuteModelRequest as input.
|
||||
outputs = [input_data for _ in self.pp_tp_workers[0]]
|
||||
for pp_rank, tp_group in enumerate(self.pp_tp_workers):
|
||||
# Each PP worker takes in the output of the previous PP worker,
|
||||
# and the TP group executes in SPMD fashion.
|
||||
outputs = [
|
||||
worker.execute_model_spmd.
|
||||
bind( # type: ignore[attr-defined]
|
||||
outputs[i]) for i, worker in enumerate(tp_group)
|
||||
]
|
||||
|
||||
last_pp_rank = len(self.pp_tp_workers) - 1
|
||||
if pp_rank < last_pp_rank:
|
||||
# Specify how intermediate tensors should be passed
|
||||
# between pp stages, no need to specify for the last
|
||||
# pp stage.
|
||||
transport = "auto"
|
||||
outputs = [
|
||||
output.with_type_hint(
|
||||
TorchTensorType(transport=transport))
|
||||
for output in outputs
|
||||
]
|
||||
|
||||
forward_dag = MultiOutputNode(outputs)
|
||||
|
||||
return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
|
||||
|
||||
class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.pp_locks: Optional[List[asyncio.Lock]] = None
|
||||
self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
if not self.use_ray_compiled_dag:
|
||||
self.driver_exec_method = make_async(
|
||||
self.driver_worker.execute_method)
|
||||
|
||||
async def execute_model_async(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
if not self.use_ray_spmd_worker:
|
||||
return await super().execute_model_async(execute_model_req)
|
||||
|
||||
if self.forward_dag is None:
|
||||
self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
|
||||
|
||||
serialized_data = self.input_encoder.encode(execute_model_req)
|
||||
dag_future = await self.forward_dag.execute_async(serialized_data)
|
||||
outputs = await dag_future
|
||||
return self.output_decoder.decode(outputs[0])
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None
|
||||
) -> List[SamplerOutput]:
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
if not self.tp_driver_workers:
|
||||
return await self.driver_exec_method("execute_model",
|
||||
execute_model_req)
|
||||
if self.pp_locks is None:
|
||||
# This locks each pipeline parallel stage so multiple virtual
|
||||
# engines can't execute on the same stage at the same time
|
||||
# We create the locks here to avoid creating them in the constructor
|
||||
# which uses a different asyncio loop.
|
||||
self.pp_locks = [
|
||||
asyncio.Lock()
|
||||
for _ in range(self.parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
|
||||
"execute_model", execute_model_req))
|
||||
]
|
||||
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
|
||||
start=1):
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(driver_worker.execute_method.remote,
|
||||
self.pp_locks[pp_rank],
|
||||
"execute_model", execute_model_req)))
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Only the last PP stage has the final results.
|
||||
return results[-1]
|
||||
|
||||
async def _start_worker_execution_loop(self):
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
coros = [
|
||||
worker.execute_method.remote("start_worker_execution_loop")
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
return await asyncio.gather(*coros)
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
497
vllm-v0.6.2/vllm/executor/ray_mlu_executor.py
Normal file
@@ -0,0 +1,497 @@
|
||||
import asyncio
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from itertools import islice, repeat
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import msgspec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.distributed_mlu_executor import (
|
||||
DistributedMLUExecutor, DistributedMLUExecutorAsync
|
||||
)
|
||||
from vllm.executor.msgspec_utils import encode_hook
|
||||
from vllm.executor.ray_utils import RayWorkerWrapper, ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.utils import (_run_task_with_lock,
|
||||
get_distributed_init_method, get_ip, get_open_port,
|
||||
get_vllm_instance_id, make_async)
|
||||
|
||||
if ray is not None:
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RayMLUExecutor(DistributedMLUExecutor):
|
||||
|
||||
uses_ray: bool = True
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
self.forward_dag: Optional[ray.dag.CompiledDAG] = None
|
||||
# If the env var is set, it uses the Ray's compiled DAG API
|
||||
# which optimizes the control plane overhead.
|
||||
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
|
||||
# Currently, this requires USE_RAY_SPMD_WORKER=True.
|
||||
self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
|
||||
# If the env var is set, then we do not distinguish between the
|
||||
# "driver worker" vs other workers. Also, the rank 0 worker will
|
||||
# be executed in a remote Ray worker. Currently this requires
|
||||
# USE_RAY_COMPILED_DAG=True.
|
||||
self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
if self.use_ray_compiled_dag:
|
||||
assert self.use_ray_spmd_worker, (
|
||||
"VLLM_USE_RAY_COMPILED_DAG=1 requires "
|
||||
"VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
if self.use_ray_spmd_worker:
|
||||
# TODO: Support SPMD worker for non-DAG Ray executor.
|
||||
assert self.use_ray_compiled_dag, (
|
||||
"VLLM_USE_RAY_SPMD_WORKER=1 requires "
|
||||
"VLLM_USE_RAY_COMPILED_DAG=1")
|
||||
|
||||
assert not self.use_ray_compiled_dag and not self.use_ray_spmd_worker, \
"RayMLUExecutor does not support Ray compiled DAG or SPMD mode."
|
||||
|
||||
assert self.uses_ray
|
||||
placement_group = self.parallel_config.placement_group
|
||||
|
||||
# Disable Ray usage stats collection.
|
||||
ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
|
||||
if ray_usage != "1":
|
||||
os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
|
||||
|
||||
# Create the parallel GPU workers.
|
||||
self._init_workers_ray(placement_group)
|
||||
|
||||
self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
|
||||
self.output_decoder = msgspec.msgpack.Decoder(
|
||||
Optional[List[SamplerOutput]])
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if hasattr(self, "forward_dag") and self.forward_dag is not None:
|
||||
self.forward_dag.teardown()
|
||||
import ray
|
||||
for worker in self.workers:
|
||||
ray.kill(worker)
|
||||
self.forward_dag = None
|
||||
|
||||
def _configure_ray_workers_use_nsight(self,
|
||||
ray_remote_kwargs) -> Dict[str, Any]:
|
||||
# If nsight profiling is enabled, we need to set the profiling
|
||||
# configuration for the ray workers as runtime env.
|
||||
runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
|
||||
runtime_env.update({
|
||||
"nsight": {
|
||||
"o": os.getcwd(),
|
||||
"force_overwrite": "false"
|
||||
}
|
||||
})
|
||||
|
||||
return ray_remote_kwargs
|
||||
|
||||
def _get_worker_wrapper_args(self) -> Dict[str, Any]:
|
||||
(worker_module_name, worker_class_name,
|
||||
worker_class_fn) = self._get_worker_module_and_class()
|
||||
|
||||
return dict(
|
||||
worker_module_name=worker_module_name,
|
||||
worker_class_name=worker_class_name,
|
||||
worker_class_fn=worker_class_fn,
|
||||
trust_remote_code=self.model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
# Child classes can override this to return the actual env vars.
|
||||
def _get_env_vars_to_be_updated(self):
|
||||
return self._env_vars_for_all_workers
|
||||
|
||||
def _init_workers_ray(self, placement_group: "PlacementGroup",
|
||||
**ray_remote_kwargs):
|
||||
if (self.parallel_config.tensor_parallel_size == 1
|
||||
and self.parallel_config.pipeline_parallel_size == 1):
|
||||
# For single GPU case, we use a ray worker with constrained memory.
|
||||
num_gpus = self.cache_config.gpu_memory_utilization
|
||||
else:
|
||||
# Otherwise, the ray workers are allocated with a full GPU.
|
||||
num_gpus = 1
|
||||
|
||||
# The driver dummy worker does not actually use any resources.
|
||||
# It holds the resource for the driver worker.
|
||||
self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
|
||||
# The remaining workers are the actual ray actors.
|
||||
self.workers: List[RayWorkerWrapper] = []
|
||||
|
||||
# Used in ray compiled DAG: indexed first by PP rank,
|
||||
# and then TP rank. In other words, the inner list is
|
||||
# the TP group of workers for a PP rank.
|
||||
self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
|
||||
|
||||
if self.parallel_config.ray_workers_use_nsight:
|
||||
ray_remote_kwargs = self._configure_ray_workers_use_nsight(
|
||||
ray_remote_kwargs)
|
||||
|
||||
logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
|
||||
|
||||
# Create the workers.
|
||||
driver_ip = get_ip()
|
||||
worker_wrapper_kwargs = self._get_worker_wrapper_args()
|
||||
for bundle_id, bundle in enumerate(placement_group.bundle_specs):
|
||||
if not bundle.get("GPU", 0):
|
||||
continue
|
||||
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
||||
placement_group=placement_group,
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=bundle_id,
|
||||
)
|
||||
|
||||
worker = ray.remote(
|
||||
num_cpus=0,
|
||||
num_gpus=num_gpus,
|
||||
scheduling_strategy=scheduling_strategy,
|
||||
**ray_remote_kwargs,
|
||||
)(RayWorkerWrapper).remote(**worker_wrapper_kwargs)
|
||||
|
||||
if self.use_ray_spmd_worker:
|
||||
self.workers.append(worker)
|
||||
else:
|
||||
worker_ip = ray.get(worker.get_node_ip.remote())
|
||||
if worker_ip == driver_ip and self.driver_dummy_worker is None:
|
||||
# If the worker is on the same node as the driver, we use it
|
||||
# as the resource holder for the driver process.
|
||||
self.driver_dummy_worker = worker
|
||||
self.driver_worker = RayWorkerWrapper(
|
||||
**worker_wrapper_kwargs)
|
||||
else:
|
||||
# Else, added to the list of workers.
|
||||
self.workers.append(worker)
|
||||
|
||||
logger.debug("workers: %s", self.workers)
|
||||
logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
|
||||
if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
|
||||
raise ValueError(
|
||||
"Ray does not allocate any GPUs on the driver node. Consider "
|
||||
"adjusting the Ray placement group or running the driver on a "
|
||||
"GPU node.")
|
||||
|
||||
worker_ips = [
|
||||
ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined]
|
||||
for worker in self.workers
|
||||
]
|
||||
ip_counts: Dict[str, int] = {}
|
||||
for ip in worker_ips:
|
||||
ip_counts[ip] = ip_counts.get(ip, 0) + 1
|
||||
|
||||
def sort_by_driver_then_worker_ip(worker):
|
||||
"""
|
||||
Sort the workers based on 3 properties:
|
||||
1. If the worker is on the same node as the driver (vllm engine),
|
||||
it should be placed first.
|
||||
2. Then, if the worker is on a node with fewer workers, it should
|
||||
be placed first.
|
||||
3. Finally, if the worker is on a node with a smaller IP address, it
|
||||
should be placed first.
|
||||
"""
|
||||
ip = ray.get(worker.get_node_ip.remote())
|
||||
return (ip != driver_ip, ip_counts[ip], ip)
|
||||
|
||||
# After sorting, the workers on the same node will be
|
||||
# close to each other, and the workers on the driver
|
||||
# node will be placed first.
|
||||
self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
|
||||
|
||||
# Get the set of GPU IDs used on each node.
|
||||
worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
|
||||
use_dummy_driver=True)
|
||||
|
||||
node_workers = defaultdict(list) # node id -> list of worker ranks
|
||||
node_gpus = defaultdict(list) # node id -> list of gpu ids
|
||||
|
||||
for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
|
||||
node_workers[node_id].append(i)
|
||||
# `gpu_ids` can be a list of strings or integers.
|
||||
# convert them to integers for consistency.
|
||||
# NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
|
||||
# string sorting is not sufficient.
|
||||
# see https://github.com/vllm-project/vllm/issues/5590
|
||||
gpu_ids = [int(x) for x in gpu_ids]
|
||||
node_gpus[node_id].extend(gpu_ids)
|
||||
for node_id, gpu_ids in node_gpus.items():
|
||||
node_gpus[node_id] = sorted(gpu_ids)
|
||||
|
||||
all_ips = set(worker_ips + [driver_ip])
|
||||
n_ips = len(all_ips)
|
||||
n_nodes = len(node_workers)
|
||||
|
||||
if n_nodes != n_ips:
|
||||
raise RuntimeError(
|
||||
f"Every node should have a unique IP address. Got {n_nodes}"
|
||||
f" nodes with node ids {list(node_workers.keys())} and "
|
||||
f"{n_ips} unique IP addresses {all_ips}. Please check your"
|
||||
" network configuration. If you set `VLLM_HOST_IP` or "
|
||||
"`HOST_IP` environment variable, make sure it is unique for"
|
||||
" each node.")
|
||||
|
||||
VLLM_INSTANCE_ID = get_vllm_instance_id()
|
||||
|
||||
# Set environment variables for the driver and workers.
|
||||
all_args_to_update_environment_variables = [({
|
||||
"MLU_VISIBLE_DEVICES":
|
||||
",".join(map(str, node_gpus[node_id])),
|
||||
"VLLM_INSTANCE_ID":
|
||||
VLLM_INSTANCE_ID,
|
||||
"VLLM_TRACE_FUNCTION":
|
||||
str(envs.VLLM_TRACE_FUNCTION),
|
||||
**({
|
||||
"VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
|
||||
} if envs.VLLM_ATTENTION_BACKEND is not None else {})
|
||||
}, ) for (node_id, _) in worker_node_and_gpu_ids]
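# MLU_VISIBLE_DEVICES limits each worker to the MLU cards found on its node,
# playing the same role CUDA_VISIBLE_DEVICES plays in the GPU executor above.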
|
||||
|
||||
self._env_vars_for_all_workers = (
|
||||
all_args_to_update_environment_variables)
|
||||
|
||||
self._run_workers("update_environment_variables",
|
||||
all_args=self._get_env_vars_to_be_updated())
|
||||
|
||||
if len(node_gpus) == 1:
|
||||
# in single node case, we don't need to get the IP address.
|
||||
# the loopback address is sufficient
|
||||
# NOTE: a node may have several IP addresses, one for each
|
||||
# network interface. `get_ip()` might return any of them,
|
||||
# while they might not work for communication inside the node
|
||||
# if the network setup is complicated. Using the loopback address
|
||||
# solves this issue, as it always works for communication inside
|
||||
# the node.
|
||||
driver_ip = "127.0.0.1"
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
driver_ip, get_open_port())
|
||||
|
||||
# Initialize the actual workers inside worker wrapper.
|
||||
init_worker_all_kwargs = [
|
||||
self._get_worker_kwargs(
|
||||
local_rank=node_workers[node_id].index(rank),
|
||||
rank=rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
|
||||
]
|
||||
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
|
||||
|
||||
self._run_workers("init_device")
|
||||
self._run_workers("load_model",
|
||||
max_concurrent_workers=self.parallel_config.
|
||||
max_parallel_loading_workers)
|
||||
|
||||
if self.use_ray_spmd_worker:
|
||||
for pp_rank in range(self.parallel_config.pipeline_parallel_size):
|
||||
self.pp_tp_workers.append([])
|
||||
for tp_rank in range(
|
||||
self.parallel_config.tensor_parallel_size):
|
||||
# PP=2, TP=4
|
||||
# pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
|
||||
rank = (pp_rank * self.parallel_config.tensor_parallel_size
|
||||
) + tp_rank
|
||||
assert len(self.pp_tp_workers[pp_rank]) == tp_rank
|
||||
assert pp_rank < len(self.pp_tp_workers)
|
||||
self.pp_tp_workers[pp_rank].append(self.workers[rank])
|
||||
|
||||
# This is the list of workers that are rank 0 of each TP group EXCEPT
|
||||
# global rank 0. These are the workers that will broadcast to the
|
||||
# rest of the workers.
|
||||
self.tp_driver_workers: List[RayWorkerWrapper] = []
|
||||
# This is the list of workers that are not drivers and not the first
|
||||
# worker in a TP group. These are the workers that will be
|
||||
# broadcasted to.
|
||||
self.non_driver_workers: List[RayWorkerWrapper] = []
|
||||
|
||||
# Enforce rank order for correct rank to return final output.
|
||||
for index, worker in enumerate(self.workers):
|
||||
# The driver worker is rank 0 and not in self.workers.
|
||||
rank = index + 1
|
||||
if rank % self.parallel_config.tensor_parallel_size == 0:
|
||||
self.tp_driver_workers.append(worker)
|
||||
else:
|
||||
self.non_driver_workers.append(worker)
|
||||
|
||||
def _driver_execute_model(
|
||||
self, execute_model_req: Optional[ExecuteModelRequest]
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
"""Run execute_model in the driver worker.
|
||||
|
||||
Passing None will cause the driver to stop the model execution
|
||||
loop running in each of the remote workers.
|
||||
"""
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
return self.driver_worker.execute_method("execute_model",
|
||||
execute_model_req)
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"RayMLUExecutor is not supported for spmd mode.")
|
||||
return super().execute_model(execute_model_req)
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
method: str,
|
||||
*args,
|
||||
async_run_tensor_parallel_workers_only: bool = False,
|
||||
all_args: Optional[List[Tuple[Any, ...]]] = None,
|
||||
all_kwargs: Optional[List[Dict[str, Any]]] = None,
|
||||
use_dummy_driver: bool = False,
|
||||
max_concurrent_workers: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> Any:
|
||||
"""Runs the given method on all workers. Can be used in the following
|
||||
ways:
|
||||
|
||||
Args:
|
||||
- async_run_tensor_parallel_workers_only: If True the method will be
|
||||
run only in the remote TP workers, not the driver worker.
|
||||
It will also be run asynchronously and return a list of futures
|
||||
rather than blocking on the results.
|
||||
- args/kwargs: All workers share the same args/kwargs
|
||||
- all_args/all_kwargs: args/kwargs for each worker are specified
|
||||
individually
|
||||
"""
|
||||
if self.use_ray_spmd_worker:
|
||||
assert not async_run_tensor_parallel_workers_only, (
|
||||
"async_run_tensor_parallel_workers_only is not supported for "
|
||||
"spmd mode.")
|
||||
|
||||
if max_concurrent_workers:
|
||||
raise NotImplementedError(
|
||||
"max_concurrent_workers is not supported yet.")
|
||||
|
||||
count = len(self.workers) if not \
|
||||
async_run_tensor_parallel_workers_only \
|
||||
else len(self.non_driver_workers)
|
||||
# If using SPMD worker, all workers are the same, so we should execute
|
||||
# the args on all workers. Otherwise, we skip the first worker's args
|
||||
# because those args will go to the driver worker.
|
||||
first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
|
||||
all_worker_args = repeat(args, count) if all_args is None \
|
||||
else islice(all_args, first_worker_args_index, None)
|
||||
all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
|
||||
else islice(all_kwargs, first_worker_args_index, None)
|
||||
|
||||
# Start the ray workers first.
|
||||
ray_workers = self.workers
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
ray_workers = self.non_driver_workers
|
||||
ray_worker_outputs = [
|
||||
worker.execute_method.remote(method, *worker_args, **worker_kwargs)
|
||||
for (worker, worker_args, worker_kwargs
|
||||
) in zip(ray_workers, all_worker_args, all_worker_kwargs)
|
||||
]
|
||||
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
# Just return futures
|
||||
return ray_worker_outputs
|
||||
|
||||
driver_worker_output = []
|
||||
# In SPMD mode, the driver worker is the same as any other worker,
|
||||
# so we only explicitly execute on the driver worker if using a
|
||||
# non-SPMD worker class.
|
||||
if not self.use_ray_spmd_worker:
|
||||
driver_args = args if all_args is None else all_args[0]
|
||||
driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
|
||||
|
||||
# Start the driver worker after all the ray workers.
|
||||
if not use_dummy_driver:
|
||||
driver_worker_output = [
|
||||
self.driver_worker.execute_method(method, *driver_args,
|
||||
**driver_kwargs)
|
||||
]
|
||||
else:
|
||||
assert self.driver_dummy_worker is not None
|
||||
driver_worker_output = [
|
||||
ray.get(
|
||||
self.driver_dummy_worker.execute_method.remote(
|
||||
method, *driver_args, **driver_kwargs))
|
||||
]
|
||||
|
||||
# Get the results of the ray workers.
|
||||
if self.workers:
|
||||
ray_worker_outputs = ray.get(ray_worker_outputs)
|
||||
|
||||
return driver_worker_output + ray_worker_outputs
|
||||
|
||||
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
|
||||
"""Wait for futures returned from _run_workers() with
|
||||
async_run_remote_workers_only to complete."""
|
||||
ray.get(parallel_worker_tasks)
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
|
||||
|
||||
class RayMLUExecutorAsync(RayMLUExecutor, DistributedMLUExecutorAsync):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.pp_locks: Optional[List[asyncio.Lock]] = None
|
||||
self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
if not self.use_ray_compiled_dag:
|
||||
self.driver_exec_method = make_async(
|
||||
self.driver_worker.execute_method)
|
||||
|
||||
async def execute_model_async(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"RayMLUExecutorAsync is not supported for spmd mode.")
|
||||
return await super().execute_model_async(execute_model_req)
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None
|
||||
) -> List[SamplerOutput]:
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
if not self.tp_driver_workers:
|
||||
return await self.driver_exec_method("execute_model",
|
||||
execute_model_req)
|
||||
if self.pp_locks is None:
|
||||
# This locks each pipeline parallel stage so multiple virtual
|
||||
# engines can't execute on the same stage at the same time
|
||||
# We create the locks here to avoid creating them in the constructor
|
||||
# which uses a different asyncio loop.
|
||||
self.pp_locks = [
|
||||
asyncio.Lock()
|
||||
for _ in range(self.parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
|
||||
"execute_model", execute_model_req))
|
||||
]
|
||||
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
|
||||
start=1):
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(driver_worker.execute_method.remote,
|
||||
self.pp_locks[pp_rank],
|
||||
"execute_model", execute_model_req)))
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Only the last PP stage has the final results.
|
||||
return results[-1]
|
||||
|
||||
async def _start_worker_execution_loop(self):
|
||||
assert not self.use_ray_spmd_worker, (
|
||||
"worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
|
||||
coros = [
|
||||
worker.execute_method.remote("start_worker_execution_loop")
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
return await asyncio.gather(*coros)
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
363
vllm-v0.6.2/vllm/executor/ray_tpu_executor.py
Normal file
@@ -0,0 +1,363 @@
import asyncio
import os
from collections import defaultdict
from itertools import islice, repeat
from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
                    Union)

import vllm.envs as envs
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.executor.tpu_executor import TPUExecutor
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id, make_async)

if ray is not None:
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

logger = init_logger(__name__)


class RayTPUExecutor(TPUExecutor):

    uses_ray: bool = True

    def __init__(self, *args, **kwargs):
        # This is non-None when the execute model loop is running
        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
        # Updated by implementations that require additional args to be passed
        # to the _run_workers execute_model call
        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}

        super().__init__(*args, **kwargs)

    def _init_executor(self) -> None:
        assert self.parallel_config.distributed_executor_backend == "ray"
        placement_group = self.parallel_config.placement_group

        # Disable Ray usage stats collection.
        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
        if ray_usage != "1":
            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"

        # Create the parallel TPU workers.
        self._init_workers_ray(placement_group)

    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
        # The driver dummy worker does not actually use any resources.
        # It holds the resource for the driver worker.
        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
        # The remaining workers are the actual ray actors.
        self.workers: List[RayWorkerWrapper] = []

        # Create the workers.
        driver_ip = get_ip()
        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
            if not bundle.get("TPU", 0):
                continue
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_id,
            )

            assert self.speculative_config is None
            if self.scheduler_config.is_multi_step:
                worker_module_name = "vllm.worker.multi_step_tpu_worker"
                worker_class_name = "MultiStepTPUWorker"
            else:
                worker_module_name = "vllm.worker.tpu_worker"
                worker_class_name = "TPUWorker"

            # GKE does not fetch environment information from metadata server
            # and instead sets these from within the Ray process. Therefore we
            # need to override the Ray environment variables manually.
            override_env = {}
            if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ:
                override_env.update({
                    "TPU_CHIPS_PER_HOST_BOUNDS":
                    os.environ["TPU_CHIPS_PER_HOST_BOUNDS"]
                })
            if "TPU_HOST_BOUNDS" in os.environ:
                override_env.update(
                    {"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]})

            worker = ray.remote(
                num_cpus=0,
                resources={"TPU": 1},
                scheduling_strategy=scheduling_strategy,
                **ray_remote_kwargs,
            )(RayWorkerWrapper).remote(
                worker_module_name=worker_module_name,
                worker_class_name=worker_class_name,
                trust_remote_code=self.model_config.trust_remote_code,
            )
            if override_env:
                worker.override_env_vars.remote(override_env)

            worker_ip = ray.get(worker.get_node_ip.remote())
            if worker_ip == driver_ip and self.driver_dummy_worker is None:
                # If the worker is on the same node as the driver, we use it
                # as the resource holder for the driver process.
                self.driver_dummy_worker = worker
                self.driver_worker = RayWorkerWrapper(
                    worker_module_name=worker_module_name,
                    worker_class_name=worker_class_name,
                    trust_remote_code=self.model_config.trust_remote_code,
                )
            else:
                # Else, added to the list of workers.
                self.workers.append(worker)

        logger.debug("workers: %s", self.workers)
        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
        if self.driver_dummy_worker is None:
            raise ValueError(
                "Ray does not allocate any TPUs on the driver node. Consider "
                "adjusting the Ray placement group or running the driver on a "
                "TPU node.")

        worker_ips = [
            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
            for worker in self.workers
        ]
        ip_counts: Dict[str, int] = {}
        for ip in worker_ips:
            ip_counts[ip] = ip_counts.get(ip, 0) + 1

        def sort_by_driver_then_worker_ip(worker):
            """
            Sort the workers based on 3 properties:
            1. If the worker is on the same node as the driver (vllm engine),
               it should be placed first.
            2. Then, if the worker is on a node with fewer workers, it should
               be placed first.
            3. Finally, if the worker is on a node with a smaller IP address,
               it should be placed first.
            """
            ip = ray.get(worker.get_node_ip.remote())
            return (ip != driver_ip, ip_counts[ip], ip)

        # After sorting, the workers on the same node will be
        # close to each other, and the workers on the driver
        # node will be placed first.
        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)

        # Get the set of TPU IDs used on each node.
        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
                                                    use_dummy_driver=True)

        node_workers = defaultdict(list)
        for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
            node_workers[node_id].append(i)

        VLLM_INSTANCE_ID = get_vllm_instance_id()

        # Set environment variables for the driver and workers.
        all_args_to_update_environment_variables = [({
            "VLLM_INSTANCE_ID":
            VLLM_INSTANCE_ID,
            "VLLM_TRACE_FUNCTION":
            str(envs.VLLM_TRACE_FUNCTION),
        }, ) for _ in worker_node_and_gpu_ids]
        self._run_workers("update_environment_variables",
                          all_args=all_args_to_update_environment_variables)

        if len(node_workers) == 1:
            # In the single-node case, we don't need to get the IP address;
            # the loopback address is sufficient.
            # NOTE: a node may have several IP addresses, one for each
            # network interface. `get_ip()` might return any of them,
            # while they might not work for communication inside the node
            # if the network setup is complicated. Using the loopback address
            # solves this issue, as it always works for communication inside
            # the node.
            driver_ip = "127.0.0.1"
        distributed_init_method = get_distributed_init_method(
            driver_ip, get_open_port())

        # Initialize the actual workers inside worker wrapper.
        init_worker_all_kwargs = [
            self._get_worker_kwargs(
                local_rank=node_workers[node_id].index(rank),
                rank=rank,
                distributed_init_method=distributed_init_method,
            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
        ]
        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)

        self._run_workers("init_device")
        self._run_workers("load_model",
                          max_concurrent_workers=self.parallel_config.
                          max_parallel_loading_workers)

    def _driver_execute_model(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> List[SamplerOutput]:
        """Run execute_model in the driver worker.

        Passing None will cause the driver to stop the model execution
        loop running in each of the remote workers.
        """
        return self.driver_worker.execute_method("execute_model",
                                                 execute_model_req)

    def _run_workers(
        self,
        method: str,
        *args,
        async_run_remote_workers_only: bool = False,
        all_args: Optional[List[Tuple[Any, ...]]] = None,
        all_kwargs: Optional[List[Dict[str, Any]]] = None,
        use_dummy_driver: bool = False,
        max_concurrent_workers: Optional[int] = None,
        use_ray_compiled_dag: bool = False,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers. Can be used in the following
        ways:

        - async_run_remote_workers_only: If True the method will be run only
          in the remote workers, not the driver worker. It will also be
          run asynchronously and return a list of futures rather than blocking
          on the results.
        - args/kwargs: All workers share the same args/kwargs
        - all_args/all_kwargs: args/kwargs for each worker are specified
          individually
        """

        if max_concurrent_workers:
            raise NotImplementedError(
                "max_concurrent_workers is not supported yet.")

        count = len(self.workers)
        all_worker_args = repeat(args, count) if all_args is None \
            else islice(all_args, 1, None)
        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
            else islice(all_kwargs, 1, None)

        # Start the ray workers first.
        ray_worker_outputs = [
            worker.execute_method.remote(method, *worker_args, **worker_kwargs)
            for (worker, worker_args, worker_kwargs
                 ) in zip(self.workers, all_worker_args, all_worker_kwargs)
        ]

        if async_run_remote_workers_only:
            # Just return futures
            return ray_worker_outputs

        driver_args = args if all_args is None else all_args[0]
        driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]

        # Start the driver worker after all the ray workers.
        if not use_dummy_driver:
            driver_worker_output = self.driver_worker.execute_method(
                method, *driver_args, **driver_kwargs)
        else:
            assert self.driver_dummy_worker is not None
            driver_worker_output = ray.get(
                self.driver_dummy_worker.execute_method.remote(
                    method, *driver_args, **driver_kwargs))
        # Get the results of the ray workers.
        if self.workers:
            ray_worker_outputs = ray.get(ray_worker_outputs)

        return [driver_worker_output] + ray_worker_outputs

    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
        """Wait for futures returned from _run_workers() with
        async_run_remote_workers_only to complete."""
        ray.get(parallel_worker_tasks)

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        num_blocks = self._run_workers("determine_num_available_blocks", )
        num_tpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)
        return num_tpu_blocks, num_cpu_blocks

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks
        self._run_workers("initialize_cache",
                          num_gpu_blocks=num_gpu_blocks,
                          num_cpu_blocks=num_cpu_blocks)

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        if self.parallel_worker_tasks is None:
            self.parallel_worker_tasks = self._run_workers(
                "start_worker_execution_loop",
                async_run_remote_workers_only=True,
                **self.extra_execute_model_run_workers_kwargs)

        # Only the driver worker returns the sampling results.
        return self._driver_execute_model(execute_model_req)

    def stop_remote_worker_execution_loop(self) -> None:
        if self.parallel_worker_tasks is None:
            return

        self._driver_execute_model()
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        self._wait_for_tasks_completion(parallel_worker_tasks)


class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver_exec_method = make_async(self.driver_worker.execute_method)

    async def execute_model_async(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        if self.parallel_worker_tasks is None:
            # Start model execution loop running in the parallel workers
            self.parallel_worker_tasks = asyncio.create_task(
                self._start_worker_execution_loop())

        # Only the driver worker returns the sampling results.
        return await self._driver_execute_model_async(execute_model_req)

    async def stop_remote_worker_execution_loop_async(self) -> None:
        if self.parallel_worker_tasks is None:
            return

        await self._driver_execute_model_async()
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        await parallel_worker_tasks

    async def _driver_execute_model_async(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> List[SamplerOutput]:
        return await self.driver_exec_method("execute_model",
                                             execute_model_req)

    async def _start_worker_execution_loop(self):
        coros = [
            worker.execute_method.remote("start_worker_execution_loop")
            for worker in self.workers
        ]
        return await asyncio.gather(*coros)
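Note (illustration, not part of the file above): the ordering produced by `sort_by_driver_then_worker_ip` can be seen with made-up IPs. The key tuple sorts driver-node workers first, then nodes with fewer workers, then by IP string:

# Hypothetical values, only to show the sort key behavior.
driver_ip = "10.0.0.2"
ip_counts = {"10.0.0.2": 1, "10.0.0.3": 2, "10.0.0.5": 2}
worker_ips = ["10.0.0.5", "10.0.0.3", "10.0.0.2", "10.0.0.5", "10.0.0.3"]
ordered = sorted(worker_ips,
                 key=lambda ip: (ip != driver_ip, ip_counts[ip], ip))
# ordered == ['10.0.0.2', '10.0.0.3', '10.0.0.3', '10.0.0.5', '10.0.0.5']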
338
vllm-v0.6.2/vllm/executor/ray_utils.py
Normal file
@@ -0,0 +1,338 @@
import os
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import msgspec

from vllm.config import ParallelConfig
from vllm.executor.msgspec_utils import decode_hook, encode_hook
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
from vllm.utils import get_ip
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)
PG_WAIT_TIMEOUT = 1800

try:
    import ray
    from ray.util import placement_group_table
    from ray.util.placement_group import PlacementGroup
    try:
        from ray._private.state import available_resources_per_node
    except ImportError:
        # Ray 2.9.x doesn't expose `available_resources_per_node`
        from ray._private.state import state as _state
        available_resources_per_node = _state._available_resources_per_node

    class RayWorkerWrapper(WorkerWrapperBase):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, *args, **kwargs) -> None:
            super().__init__(*args, **kwargs)
            # The compiled DAG runs its main execution in a different thread
            # that calls cuda.set_device. This flag indicates whether
            # set_device has already been called on that thread.
            self.compiled_dag_cuda_device_set = False

            self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                                         dec_hook=decode_hook)
            self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)

        def get_node_ip(self) -> str:
            return get_ip()

        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
            node_id = ray.get_runtime_context().get_node_id()
            gpu_ids = ray.get_gpu_ids()
            return node_id, gpu_ids

        def execute_model_spmd(
            self, req_or_tuple: Union[bytes,
                                      Tuple[bytes,
                                            Optional[IntermediateTensors]]]
        ) -> bytes:
            """Execute model in SPMD fashion: used only when SPMD worker and
            compiled DAG are both enabled.

            Args:
                req_or_tuple: A request, or a tuple containing the request and
                    intermediate tensors. Intermediate tensors are only
                    provided to pipeline stages after the first one; otherwise
                    they are None. The request is serialized by msgspec.
            """
            if isinstance(req_or_tuple, bytes):
                serialized_req, intermediate_tensors = req_or_tuple, None
            else:
                serialized_req, intermediate_tensors = req_or_tuple

            execute_model_req = self.input_decoder.decode(serialized_req)

            # TODO(swang): This is needed right now because Ray aDAG executes
            # on a background thread, so we need to reset torch's current
            # device.
            import torch
            if not self.compiled_dag_cuda_device_set:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

            output = self.worker._execute_model_spmd(execute_model_req,
                                                     intermediate_tensors)
            # Pipeline model request and output to the next pipeline stage.
            if isinstance(output, IntermediateTensors):
                output = serialized_req, output
            else:
                output = self.output_encoder.encode(output)

            return output

        def override_env_vars(self, vars: Dict[str, str]):
            os.environ.update(vars)

    ray_import_err = None

except ImportError as e:
    ray = None  # type: ignore
    ray_import_err = e
    RayWorkerWrapper = None  # type: ignore


def ray_is_available() -> bool:
    """Returns True if Ray is available."""
    return ray is not None


def assert_ray_available():
    """Raise an exception if Ray is not available."""
    if ray is None:
        raise ValueError("Failed to import Ray, please install Ray with "
                         "`pip install ray`.") from ray_import_err


def _verify_bundles(placement_group: "PlacementGroup",
                    parallel_config: ParallelConfig, device_str: str):
    """Verify a given placement group has bundles located in the right place.

    There are 2 rules.
    - Warn if all tensor parallel workers cannot fit in a single node.
    - Fail if the driver node is not included in the placement group.
    """
    assert ray.is_initialized(), (
        "Ray is not initialized although distributed-executor-backend is ray.")
    pg_data = placement_group_table(placement_group)
    # bundle_idx -> node_id
    bundle_to_node_ids = pg_data["bundles_to_node_id"]
    # bundle_idx -> bundle (e.g., {"GPU": 1})
    bundles = pg_data["bundles"]
    # node_id -> List of bundle (e.g., {"GPU": 1})
    node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list)

    for bundle_idx, node_id in bundle_to_node_ids.items():
        node_id_to_bundle[node_id].append(bundles[bundle_idx])
    driver_node_id = ray.get_runtime_context().get_node_id()

    if driver_node_id not in node_id_to_bundle:
        raise RuntimeError(
            f"driver node id {driver_node_id} is not included in a placement "
            f"group {placement_group.id}. Node id -> bundles "
            f"{node_id_to_bundle}. "
            "You don't have enough GPUs available in the current node. Check "
            "`ray status` to see if you have available GPUs in the node "
            f"{driver_node_id} before starting a vLLM engine.")

    for node_id, bundles in node_id_to_bundle.items():
        if len(bundles) < parallel_config.tensor_parallel_size:
            logger.warning(
                "tensor_parallel_size=%d "
                "is bigger than a reserved number of %ss (%d "
                "%ss) in a node %s. Tensor parallel workers can be "
                "spread out to 2+ nodes which can degrade the performance "
                "unless you have fast interconnect across nodes, like "
                "Infiniband. To resolve this issue, make sure you have more "
                "than %d GPUs available at each node.",
                parallel_config.tensor_parallel_size, device_str, len(bundles),
                device_str, node_id, parallel_config.tensor_parallel_size)


def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
    """Wait until a placement group is ready.

    It prints informative log messages if the placement group is
    not created within a reasonable time.
    """
    # Wait until PG is ready - this will block until all
    # requested resources are available, and will timeout
    # if they cannot be provisioned.
    placement_group_specs = current_placement_group.bundle_specs

    s = time.time()
    pg_ready_ref = current_placement_group.ready()
    wait_interval = 10
    while time.time() - s < PG_WAIT_TIMEOUT:
        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
        if len(ready) > 0:
            break

        # Exponential backoff for warning print.
        wait_interval *= 2
        logger.info(
            "Waiting for creating a placement group of specs for "
            "%d seconds. specs=%s. Check "
            "`ray status` to see if you have enough resources.",
            int(time.time() - s), placement_group_specs)

    try:
        ray.get(pg_ready_ref, timeout=0)
    except ray.exceptions.GetTimeoutError:
        raise ValueError(
            "Cannot provide a placement group of "
            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
            "`ray status` to make sure the cluster has enough resources."
        ) from None


def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
    ray.util.remove_placement_group(current_placement_group)
    s = time.time()
    wait_interval = 10
    while time.time() - s < PG_WAIT_TIMEOUT:
        pg = ray.util.get_current_placement_group()
        if pg is None:
            break

        # Exponential backoff for warning print.
        wait_interval *= 2
        logger.info(
            "Waiting for removing a placement group of specs for "
            "%d seconds.", int(time.time() - s))
        time.sleep(wait_interval)


def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    It will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    assert_ray_available()

    # Connect to a ray cluster.
    if current_platform.is_rocm() or current_platform.is_xpu():
        # Try to connect to an existing Ray instance, and create a new one
        # if none is found.
        try:
            ray.init("auto")
        except ConnectionError:
            logger.warning(
                "No existing RAY instance detected. "
                "A new instance will be launched with current node resources.")
            ray.init(address=ray_address,
                     ignore_reinit_error=True,
                     num_gpus=parallel_config.world_size)
    else:
        ray.init(address=ray_address, ignore_reinit_error=True)

    if parallel_config.placement_group:
        # Placement group is already set.
        return

    device_str = "GPU"
    if current_platform.is_tpu():
        device_str = "TPU"
    elif current_platform.is_hpu():
        device_str = 'HPU'
    # Create placement group for worker processes
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        if parallel_config.world_size > num_devices_in_cluster:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the cluster.")
        # Create a new placement group
        placement_group_specs: List[Dict[str, float]] = ([{
            device_str: 1.0
        } for _ in range(parallel_config.world_size)])

        # vLLM engine is also a worker to execute model with an accelerator,
        # so it requires to have the device in a current node. Check if
        # the current node has at least one device.
        current_ip = get_ip()
        current_node_id = ray.get_runtime_context().get_node_id()
        current_node_resource = available_resources_per_node()[current_node_id]
        if current_node_resource.get(device_str, 0) < 1:
            raise ValueError(
                f"Current node has no {device_str} available. "
                f"{current_node_resource=}. vLLM engine cannot start without "
                f"{device_str}. Make sure you have at least 1 {device_str} "
                f"available in a node {current_node_id=} {current_ip=}.")
        # This way, at least one bundle is required to be created in the
        # current node.
        placement_group_specs[0][f"node:{current_ip}"] = 0.001

        # By default, Ray packs resources as much as possible.
        current_placement_group = ray.util.placement_group(
            placement_group_specs, strategy="PACK")
        _wait_until_pg_ready(current_placement_group)

    assert current_placement_group is not None
    _verify_bundles(current_placement_group, parallel_config, device_str)
    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group


def get_num_tpu_nodes() -> int:
    from ray._private.accelerators import TPUAcceleratorManager
    cluster_resources = ray.cluster_resources()
    total_tpus = int(cluster_resources["TPU"])
    tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators()
    assert total_tpus % tpus_per_node == 0
    return total_tpus // tpus_per_node


def get_num_nodes_in_placement_group() -> int:
    pg_table = ray.util.placement_group_table()
    current_pg = ray.util.get_current_placement_group()
    num_nodes = 0

    if current_pg:
        nodes_in_pg = set()
        for pg_key, pg in pg_table.items():
            if pg_key == current_pg.id.hex():
                for _, node in pg["bundles_to_node_id"].items():
                    nodes_in_pg.add(node)
        num_nodes = len(nodes_in_pg)

    return num_nodes
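Note (illustration, not part of the file above): the placement group spec built by `initialize_ray_cluster` has one bundle per worker, with the first bundle pinned to the driver node. A sketch with hypothetical values for a 4-way GPU setup and driver IP 10.0.0.2:

# Hypothetical shape of placement_group_specs for world_size=4.
placement_group_specs = [
    {"GPU": 1.0, "node:10.0.0.2": 0.001},  # first bundle pinned to driver node
    {"GPU": 1.0},
    {"GPU": 1.0},
    {"GPU": 1.0},
]
# ray.util.placement_group(placement_group_specs, strategy="PACK") then packs
# these bundles onto as few nodes as possible.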
37
vllm-v0.6.2/vllm/executor/ray_xpu_executor.py
Normal file
@@ -0,0 +1,37 @@
import asyncio
from typing import List, Optional

import vllm.envs as envs
from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
from vllm.executor.xpu_executor import XPUExecutor
from vllm.logger import init_logger
from vllm.utils import get_vllm_instance_id, make_async

logger = init_logger(__name__)


class RayXPUExecutor(RayGPUExecutor, XPUExecutor):

    def _get_env_vars_to_be_updated(self):
        # Get the set of GPU IDs used on each node.
        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
                                                    use_dummy_driver=True)

        VLLM_INSTANCE_ID = get_vllm_instance_id()

        # Set environment variables for the driver and workers.
        all_args_to_update_environment_variables = [({
            "VLLM_INSTANCE_ID":
            VLLM_INSTANCE_ID,
            "VLLM_TRACE_FUNCTION":
            str(envs.VLLM_TRACE_FUNCTION),
        }, ) for (_, _) in worker_node_and_gpu_ids]
        return all_args_to_update_environment_variables


class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver_exec_method = make_async(self.driver_worker.execute_method)
        self.pp_locks: Optional[List[asyncio.Lock]] = None
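Note (illustration, not part of the file above): each element returned by `_get_env_vars_to_be_updated` is a 1-tuple of an env-var dict, so it can be passed positionally through `_run_workers("update_environment_variables", all_args=...)`. A sketch with placeholder values for a 2-worker setup:

# Hypothetical return value (instance id and trace flag are placeholders):
# [
#     ({"VLLM_INSTANCE_ID": "vllm-abc123", "VLLM_TRACE_FUNCTION": "0"},),
#     ({"VLLM_INSTANCE_ID": "vllm-abc123", "VLLM_TRACE_FUNCTION": "0"},),
# ]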
142
vllm-v0.6.2/vllm/executor/tpu_executor.py
Normal file
@@ -0,0 +1,142 @@
from typing import Any, Dict, List, Optional, Set, Tuple

import torch

from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)

logger = init_logger(__name__)


class TPUExecutor(ExecutorBase):

    uses_ray: bool = False

    def _init_executor(self) -> None:
        assert not self.scheduler_config.chunked_prefill_enabled, (
            "Chunked prefill is not yet supported for TPU backend")
        assert not self.speculative_config, (
            "Speculative decoding is not yet supported for TPU backend")
        if self.model_config.dtype in (torch.float16, torch.float32):
            logger.warning(
                "The TPU backend currently does not support %s. "
                "Using bfloat16 instead.", self.model_config.dtype)
            self.model_config.dtype = torch.bfloat16

        # Instantiate the worker and load the model to the device.
        self.driver_worker = self._create_worker()
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def _get_worker_kwargs(
        self,
        local_rank: int = 0,
        rank: int = 0,
        distributed_init_method: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Return worker init args for a given rank."""
        if distributed_init_method is None:
            distributed_init_method = get_distributed_init_method(
                get_ip(), get_open_port())
        return dict(
            vllm_config=self.vllm_config,
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=distributed_init_method,
            is_driver_worker=rank == 0,
        )

    def _create_worker(
        self,
        local_rank: int = 0,
        rank: int = 0,
        distributed_init_method: Optional[str] = None,
    ):
        if self.scheduler_config.is_multi_step:
            from vllm.worker.multi_step_tpu_worker import MultiStepTPUWorker
            worker = MultiStepTPUWorker(**self._get_worker_kwargs(
                local_rank, rank, distributed_init_method))
            return worker
        else:
            from vllm.worker.tpu_worker import TPUWorker

            worker = TPUWorker(**self._get_worker_kwargs(
                local_rank, rank, distributed_init_method))
            return worker

    def initialize_cache(
        self,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
    ) -> None:
        """Initialize the KV cache by invoking the underlying worker."""
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker."""
        return self.driver_worker.determine_num_available_blocks()

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        output = self.driver_worker.execute_model(execute_model_req)
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        raise NotImplementedError(
            "LoRA is currently not supported by the TPU backend.")

    def remove_lora(self, lora_id: int) -> bool:
        raise NotImplementedError(
            "LoRA is currently not supported by the TPU backend.")

    def pin_lora(self, lora_id: int) -> bool:
        raise NotImplementedError(
            "LoRA is currently not supported by the TPU backend.")

    def list_loras(self) -> Set[int]:
        raise NotImplementedError(
            "LoRA is currently not supported by the TPU backend.")

    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
        raise NotImplementedError(
            "Soft prompt is currently not supported by the TPU backend.")

    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError(
            "Soft prompt is currently not supported by the TPU backend.")

    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError(
            "Soft prompt is currently not supported by the TPU backend.")

    def list_prompt_adapters(self) -> Set[int]:
        raise NotImplementedError(
            "Soft prompt is currently not supported by the TPU backend.")

    def check_health(self) -> None:
        # TPUExecutor will always be healthy as long as it's running.
        return


class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> SamplerOutput:
        output = await make_async(self.driver_worker.execute_model
                                  )(execute_model_req)
        return output
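Note (illustration, not part of the file above): `make_async` wraps the synchronous driver call so the event loop is not blocked while the TPU worker runs. A minimal sketch of that pattern, assuming the usual asyncio thread-pool approach rather than the exact vllm.utils implementation:

import asyncio
from functools import partial

def make_async(func):
    # Run the blocking callable in the default thread pool so the event
    # loop can keep serving other requests while the model executes.
    async def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, partial(func, *args, **kwargs))
    return _async_wrapper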
78
vllm-v0.6.2/vllm/executor/xpu_executor.py
Normal file
@@ -0,0 +1,78 @@
from typing import Callable, List, Optional, Tuple, Type, Union

import torch

from vllm.config import ModelConfig, ParallelConfig
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest, PoolerOutput
from vllm.utils import make_async
from vllm.worker.worker_base import WorkerBase

logger = init_logger(__name__)


class XPUExecutor(GPUExecutor):

    uses_ray: bool = False

    def _init_executor(self) -> None:
        assert self.device_config.device_type == "xpu"
        assert self.speculative_config is None, (
            "Speculative decoding not yet supported for XPU backend")

        self.model_config = _verify_and_get_model_config(self.model_config)
        GPUExecutor._init_executor(self)

    def _get_worker_module_and_class(
            self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]:
        worker_class_fn = None
        if self.speculative_config is not None:
            raise NotImplementedError(
                "XPU does not support speculative decoding")
        else:
            worker_module_name = "vllm.worker.xpu_worker"
            worker_class_name = "XPUWorker"
        return (worker_module_name, worker_class_name, worker_class_fn)

    def execute_model(
        self, execute_model_req: ExecuteModelRequest
    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
        output = self.driver_worker.execute_model(execute_model_req)
        return output


class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        output = await make_async(self.driver_worker.execute_model
                                  )(execute_model_req=execute_model_req)
        return output


def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
    if config.dtype == torch.bfloat16:
        logger.warning(
            "bfloat16 is not fully supported on XPU, casting to float16.")
        config.dtype = torch.float16
    if not config.enforce_eager:
        logger.warning(
            "CUDA graph is not supported on XPU, fallback to the eager "
            "mode.")
        config.enforce_eager = True
    return config


def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
    if (config.distributed_executor_backend is not None
            and config.distributed_executor_backend != "ray"):
        logger.warning(
            "%s is not supported on XPU, fallback to ray distributed executor "
            "backend.", config.distributed_executor_backend)
        config.distributed_executor_backend = "ray"
    return config
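Note (illustration, not part of the file above): a short usage sketch of the config normalization in `_verify_and_get_model_config`, using a stand-in object with only the two attributes the helper touches (a real ModelConfig takes many more constructor arguments):

import torch
from types import SimpleNamespace

# Stand-in for ModelConfig; only dtype and enforce_eager are inspected.
cfg = SimpleNamespace(dtype=torch.bfloat16, enforce_eager=False)
cfg = _verify_and_get_model_config(cfg)
assert cfg.dtype == torch.float16   # bfloat16 downgraded for XPU
assert cfg.enforce_eager is True    # CUDA graphs disabled, eager mode forced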
Block a user