@@ -62,7 +62,10 @@ try:
|
||||
python_create_and_map_offload as python_create_and_map,python_unmap_and_release_offload as python_unmap_and_release,
|
||||
python_get_mem_info_offload as python_get_mem_info,
|
||||
python_try_lock_gpu_offload as python_try_lock_gpu,
|
||||
python_unlock_gpu_offload as python_unlock_gpu
|
||||
python_unlock_gpu_offload as python_unlock_gpu,
|
||||
python_start_wait_offload as python_start_wait,
|
||||
python_cancel_wait_offload as python_cancel_wait,
|
||||
python_has_higher_priority_waiter_offload as python_has_higher_priority_waiter
|
||||
)
|
||||
else:
|
||||
from vllm_ascend.vllm_ascend_C import ( # type: ignore # noqa: F401
|
||||
@@ -73,6 +76,9 @@ try:
|
||||
python_get_mem_info = None
|
||||
python_try_lock_gpu = None
|
||||
python_unlock_gpu = None
|
||||
python_start_wait = None
|
||||
python_cancel_wait = None
|
||||
python_has_higher_priority_waiter = None
|
||||
|
||||
lib_name = find_loaded_library("vllm_ascend_C")
|
||||
camem_available = True
|
||||
@@ -84,6 +90,9 @@ except ImportError as e:
|
||||
python_get_mem_info = None
|
||||
python_try_lock_gpu = None
|
||||
python_unlock_gpu = None
|
||||
python_start_wait = None
|
||||
python_cancel_wait = None
|
||||
python_has_higher_priority_waiter = None
|
||||
lib_name = None
|
||||
libcudart = None
|
||||
|
||||
@@ -306,15 +315,37 @@ class CaMemAllocator:
|
||||
return False, False
|
||||
|
||||
def _vnpu_lock_gpu(self) -> bool:
    """Poll ``vnpu_try_lock_gpu`` until it reports success, then return True.

    After the first failed attempt ``vnpu_start_wait`` is called once; after
    every failed attempt ``vnpu_unlock_gpu(keep_wait=True)`` is called and the
    loop sleeps for 1 ms before retrying. If waiting was started,
    ``vnpu_cancel_wait`` is called once the lock attempt succeeds.

    Returns:
        Always True; the method only returns after a successful attempt.
    """
    registered_as_waiter = False
    acquired, _ = self.vnpu_try_lock_gpu()
    while not acquired:
        if not registered_as_waiter:
            # Only announce ourselves as a waiter once per acquisition.
            self.vnpu_start_wait()
            registered_as_waiter = True
        # Undo the failed attempt without dropping the waiter registration.
        self.vnpu_unlock_gpu(keep_wait=True)
        time.sleep(0.001)
        acquired, _ = self.vnpu_try_lock_gpu()

    if registered_as_waiter:
        self.vnpu_cancel_wait()
    return True
|
||||
|
||||
def vnpu_unlock_gpu(self, keep_wait: bool = False):
    """Call the native ``python_unlock_gpu`` binding when it is available.

    Does nothing when ``python_unlock_gpu`` is falsy (it is set to None when
    the native bindings are not loaded).

    Args:
        keep_wait: Forwarded unchanged to ``python_unlock_gpu``. Defaults to
            False so callers that pass no argument keep working.
            NOTE(review): presumably True keeps this process registered as a
            waiter after unlocking — confirm against the native code.
    """
    if python_unlock_gpu:
        python_unlock_gpu(keep_wait)
|
||||
|
||||
def vnpu_start_wait(self) -> None:
    """Invoke the native ``python_start_wait`` binding if it is available.

    Does nothing when ``python_start_wait`` is falsy (it is None when the
    native bindings are not loaded).
    """
    if not python_start_wait:
        return
    python_start_wait()
|
||||
|
||||
def vnpu_cancel_wait(self) -> None:
    """Invoke the native ``python_cancel_wait`` binding if it is available.

    Does nothing when ``python_cancel_wait`` is falsy (it is None when the
    native bindings are not loaded).
    """
    if not python_cancel_wait:
        return
    python_cancel_wait()
|
||||
|
||||
def vnpu_has_higher_priority_waiter(self) -> bool:
    """Return the result of the native ``python_has_higher_priority_waiter``.

    Returns:
        Whatever the native binding returns, or False when the binding is
        falsy (it is None when the native bindings are not loaded).
    """
    if not python_has_higher_priority_waiter:
        return False
    return python_has_higher_priority_waiter()
|
||||
|
||||
def get_pool_mem_info(self) -> tuple[int, int]:
|
||||
"""
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
from concurrent.futures import Future
|
||||
from logging import DEBUG
|
||||
import signal
|
||||
import queue
|
||||
import signal
|
||||
import time
|
||||
|
||||
from vllm.config import ParallelConfig, VllmConfig
|
||||
from vllm.logger import logger
|
||||
@@ -98,8 +100,13 @@ def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
|
||||
if engine_core is not None:
|
||||
engine_core.shutdown()
|
||||
|
||||
|
||||
def run_busy_loop(self):
|
||||
"""Core busy loop of the EngineCore."""
|
||||
# vnpu yield
|
||||
yield_probe_counter = 0
|
||||
prepared_yield = False
|
||||
|
||||
while self._handle_shutdown():
|
||||
# 1) Poll the input queue until there is work to do.
|
||||
self._process_input_queue()
|
||||
@@ -111,14 +118,52 @@ def run_busy_loop(self):
|
||||
prev_is_self = self.model_executor.reload_vram()
|
||||
if not prev_is_self:
|
||||
self.reset_prefix_cache()
|
||||
|
||||
# 2) Step the engine core and return the outputs.
|
||||
self._process_engine_step()
|
||||
if (
|
||||
envs_ascend.VLLM_ASCEND_ENABLE_VNPU
|
||||
and not self.has_work()
|
||||
and not self.model_executor.is_offloaded()
|
||||
):
|
||||
self.model_executor.offload_vram()
|
||||
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
if not self.has_work():
|
||||
if not self.model_executor.is_offloaded():
|
||||
self.model_executor.offload_vram()
|
||||
elif not prepared_yield:
|
||||
# check should yield every 10 steps
|
||||
yield_probe_counter = (yield_probe_counter + 1) % 10
|
||||
if yield_probe_counter == 0:
|
||||
should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
|
||||
if should_yield:
|
||||
logger.info(
|
||||
"Found other higher priority worker. Current engine will yield after finishing in-flight requests."
|
||||
)
|
||||
prepared_yield = True
|
||||
pause_future = self.pause_scheduler(
|
||||
mode="wait", clear_cache=True
|
||||
)
|
||||
|
||||
def pause_complete(f: Future):
|
||||
nonlocal prepared_yield
|
||||
try:
|
||||
if f:
|
||||
f.result()
|
||||
if not self.model_executor.is_offloaded():
|
||||
self.model_executor.offload_vram(is_yield=True)
|
||||
prepared_yield = False
|
||||
logger.info("Current engine has yielded.")
|
||||
# Scheduler should wake up itself after yielding.
|
||||
# Sleep some time to give chance to other worker.
|
||||
time.sleep(2)
|
||||
self.resume_scheduler()
|
||||
except Exception as e:
|
||||
logger.exception("Failed to yield: {e}.")
|
||||
raise e
|
||||
|
||||
if pause_future is None:
|
||||
# pause finished, no in-flight requests
|
||||
pause_complete(None)
|
||||
else:
|
||||
# pause_future will be set after all in-flight
|
||||
# requests are finished in _process_input_queue
|
||||
pause_future.add_done_callback(pause_complete)
|
||||
|
||||
raise SystemExit
|
||||
|
||||
@@ -142,7 +187,8 @@ def _process_input_queue(self):
|
||||
and not self.model_executor.is_offloaded()
|
||||
):
|
||||
self.model_executor.offload_vram()
|
||||
block = self.process_input_queue_block
|
||||
# vNPU: if scheduler is resumed and has work, should not block
|
||||
block = self.process_input_queue_block and not self.has_work()
|
||||
try:
|
||||
req = self.input_queue.get(block=block)
|
||||
self._handle_client_request(*req)
|
||||
@@ -169,6 +215,9 @@ EngineCoreProc._process_input_queue = _process_input_queue
|
||||
|
||||
def DPEngineCoreProc_run_busy_loop(self):
|
||||
"""Core busy loop of the EngineCore for data parallel case."""
|
||||
# vnpu yield
|
||||
yield_probe_counter = 0
|
||||
prepared_yield = False
|
||||
|
||||
# Loop until process is sent a SIGINT or SIGTERM
|
||||
while self._handle_shutdown():
|
||||
@@ -226,13 +275,49 @@ def DPEngineCoreProc_run_busy_loop(self):
|
||||
# Increment wave count and reset step counter.
|
||||
self.current_wave += 1
|
||||
self.step_counter = 0
|
||||
|
||||
if (
|
||||
envs_ascend.VLLM_ASCEND_ENABLE_VNPU
|
||||
and not self.has_work()
|
||||
and not self.model_executor.is_offloaded()
|
||||
):
|
||||
self.model_executor.offload_vram()
|
||||
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
if not self.has_work():
|
||||
if not self.model_executor.is_offloaded():
|
||||
self.model_executor.offload_vram()
|
||||
elif not prepared_yield:
|
||||
# check should yield every 10 steps
|
||||
yield_probe_counter = (yield_probe_counter + 1) % 10
|
||||
if yield_probe_counter == 0:
|
||||
should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
|
||||
if should_yield:
|
||||
logger.info(
|
||||
"Found other higher priority worker. Current engine will yield after finishing in-flight requests."
|
||||
)
|
||||
prepared_yield = True
|
||||
pause_future = self.pause_scheduler(
|
||||
mode="wait", clear_cache=True
|
||||
)
|
||||
|
||||
def pause_complete(f: Future):
|
||||
nonlocal prepared_yield
|
||||
try:
|
||||
if f:
|
||||
f.result()
|
||||
if not self.model_executor.is_offloaded():
|
||||
self.model_executor.offload_vram(is_yield=True)
|
||||
prepared_yield = False
|
||||
logger.info("Current engine has yielded.")
|
||||
# Scheduler should wake up itself after yielding.
|
||||
# Sleep some time to give chance to other worker.
|
||||
time.sleep(2)
|
||||
self.resume_scheduler()
|
||||
except Exception as e:
|
||||
logger.exception("Failed to yield: {e}.")
|
||||
raise e
|
||||
|
||||
if pause_future is None:
|
||||
# pause finished, no in-flight requests
|
||||
pause_complete(None)
|
||||
else:
|
||||
# pause_future will be set after all in-flight
|
||||
# requests are finished in _process_input_queue
|
||||
pause_future.add_done_callback(pause_complete)
|
||||
|
||||
raise SystemExit
|
||||
|
||||
|
||||
@@ -8,7 +8,12 @@ def is_offloaded(self) -> bool:
|
||||
self._is_offloaded = False
|
||||
return self._is_offloaded
|
||||
|
||||
def offload_vram(self):
|
||||
def is_yielded(self) -> bool:
    """Return the ``_is_yielded`` flag, creating it as False on first access."""
    try:
        return self._is_yielded
    except AttributeError:
        # First access on this object: initialise the flag lazily.
        self._is_yielded = False
        return self._is_yielded
|
||||
|
||||
def offload_vram(self, is_yield: bool = False):
|
||||
if self.is_offloaded():
|
||||
logger.warning("Executor is already offloaded.")
|
||||
return
|
||||
@@ -17,14 +22,18 @@ def offload_vram(self):
|
||||
time_after_offload = time.perf_counter()
|
||||
|
||||
self._is_offloaded = True
|
||||
logger.info(f"Offloading VRAM costs {time_after_offload - time_before_offload:.6f} seconds.")
|
||||
|
||||
if is_yield:
|
||||
self._is_yielded = True
|
||||
logger.info(
|
||||
f"Offloading VRAM costs {time_after_offload - time_before_offload:.3f} seconds."
|
||||
)
|
||||
|
||||
def reload_vram(self) -> bool:
|
||||
if not self.is_offloaded():
|
||||
logger.warning("Executor is not offloaded.")
|
||||
return True
|
||||
|
||||
is_waiting = False
|
||||
while True:
|
||||
time_before_reload = time.perf_counter()
|
||||
res = self.collective_rpc("try_reload_vram")
|
||||
@@ -33,15 +42,28 @@ def reload_vram(self) -> bool:
|
||||
succ = all(x[0] for x in res)
|
||||
if succ:
|
||||
self._is_offloaded = False
|
||||
logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
|
||||
self._is_yielded = False
|
||||
prev_is_self = all(x[1] for x in res)
|
||||
if is_waiting:
|
||||
self.collective_rpc("vnpu_cancel_wait")
|
||||
logger.info(
|
||||
f"Reloading VRAM costs {time_after_reload - time_before_reload:.3f} seconds."
|
||||
)
|
||||
return prev_is_self
|
||||
else:
|
||||
# some workers not get lock
|
||||
self.collective_rpc("vnpu_unlock_gpu")
|
||||
if not is_waiting:
|
||||
self.collective_rpc("vnpu_start_wait")
|
||||
is_waiting = True
|
||||
self.collective_rpc("vnpu_unlock_gpu", kwargs={"keep_wait": True})
|
||||
time.sleep(0.001)
|
||||
|
||||
def vnpu_has_higher_priority_waiter(self) -> bool:
    """Ask all workers, via ``collective_rpc``, for a higher-priority waiter.

    Returns:
        True if at least one worker reply is truthy, otherwise False.
    """
    return any(self.collective_rpc("vnpu_has_higher_priority_waiter"))
|
||||
|
||||
|
||||
Executor.is_offloaded = is_offloaded
|
||||
Executor.offload_vram = offload_vram
|
||||
Executor.reload_vram = reload_vram
|
||||
Executor.vnpu_has_higher_priority_waiter = vnpu_has_higher_priority_waiter
|
||||
|
||||
@@ -470,7 +470,7 @@ class NPUWorker(WorkerBase):
|
||||
# save memory to host with lock
|
||||
self.offload_vram()
|
||||
succ, _ = self.try_reload_vram()
|
||||
assert succ, "Failed to reload model weights after offloading."
|
||||
# assert succ, "Failed to reload model weights after offloading."
|
||||
|
||||
def offload_vram(self) -> None:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
@@ -480,9 +480,21 @@ class NPUWorker(WorkerBase):
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
return allocator.try_reload_vram(tags=None)
|
||||
|
||||
def vnpu_unlock_gpu(self, keep_wait: bool = False) -> None:
    """Forward the unlock request to the allocator.

    Args:
        keep_wait: Passed through to ``vnpu_unlock_gpu`` on the allocator
            returned by ``CaMemAllocator.get_instance()``. Defaults to False
            so callers that pass no argument keep working.
    """
    allocator = CaMemAllocator.get_instance()
    allocator.vnpu_unlock_gpu(keep_wait)
|
||||
|
||||
def vnpu_start_wait(self) -> None:
    """Call ``vnpu_start_wait`` on the allocator from ``CaMemAllocator.get_instance()``."""
    CaMemAllocator.get_instance().vnpu_start_wait()
|
||||
|
||||
def vnpu_cancel_wait(self) -> None:
    """Call ``vnpu_cancel_wait`` on the allocator from ``CaMemAllocator.get_instance()``."""
    CaMemAllocator.get_instance().vnpu_cancel_wait()
|
||||
|
||||
def vnpu_has_higher_priority_waiter(self) -> bool:
    """Return the allocator's ``vnpu_has_higher_priority_waiter`` result.

    The allocator is obtained from ``CaMemAllocator.get_instance()``.
    """
    return CaMemAllocator.get_instance().vnpu_has_higher_priority_waiter()
|
||||
|
||||
def compile_or_warm_up_model(self) -> float:
|
||||
# Note: need to adapt for graph mode.
|
||||
|
||||
Reference in New Issue
Block a user