add vxpu
This commit is contained in:
52
vllm_kunlun/patch/platform/patch_core.py
Normal file
52
vllm_kunlun/patch/platform/patch_core.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from logging import DEBUG
|
||||
|
||||
from vllm.logger import logger
|
||||
from vllm.v1.engine.core import EngineCoreProc
|
||||
import vllm_kunlun.platforms.envs as xenvs
|
||||
|
||||
|
||||
def run_busy_loop(self):
    """Core busy loop of the EngineCore (vXPU-aware patched version).

    Behaves like the upstream loop, except that when vXPU sharing is
    enabled the model weights are pulled back into VRAM right before a
    step that has work, and pushed out again once the engine is idle.
    """
    # Runs until the process receives a SIGINT or SIGTERM.
    while True:
        # 1) Block on the input queue until there is work to do.
        self._process_input_queue()

        # 2) Step the engine core and return the outputs.
        # Before stepping, make sure the weights are resident in VRAM.
        if xenvs.VLLM_KUNLUN_ENABLE_VXPU and self.scheduler.has_requests() \
                and self.model_executor.is_offloaded:
            # reload_vram() returning False presumably means another
            # tenant used the device while we were offloaded, so the
            # cached KV prefixes can no longer be trusted — TODO confirm
            # against the worker-side try_reload_vram semantics.
            if not self.model_executor.reload_vram():
                self.reset_prefix_cache()

        self._process_engine_step()

        # Release VRAM as soon as there is nothing left to run.
        if xenvs.VLLM_KUNLUN_ENABLE_VXPU and not self.scheduler.has_requests() \
                and not self.model_executor.is_offloaded:
            self.model_executor.offload_vram()
|
||||
|
||||
|
||||
def _process_input_queue(self):
    """Exits when an engine step needs to be performed.

    Patched version: while the engine is fully idle and vXPU sharing is
    enabled, the model weights are offloaded from VRAM before blocking
    on the (potentially long) queue wait, so other tenants can use the
    device in the meantime.
    """
    logged_waiting = False
    # Idle loop: nothing running, nothing scheduled, nothing batched.
    while (not self.engines_running
           and not self.scheduler.has_requests()
           and not self.batch_queue):
        if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
            logger.debug("EngineCore waiting for work.")
            logged_waiting = True
        # About to block with no work pending — give the VRAM back.
        if xenvs.VLLM_KUNLUN_ENABLE_VXPU and not self.model_executor.is_offloaded:
            self.model_executor.offload_vram()
        self._handle_client_request(*self.input_queue.get())

    if logged_waiting:
        logger.debug("EngineCore loop active.")

    # Drain any remaining client requests without blocking.
    while not self.input_queue.empty():
        self._handle_client_request(*self.input_queue.get_nowait())
|
||||
|
||||
|
||||
# Monkey-patch vLLM's v1 EngineCoreProc so its busy loop and input-queue
# wait become vXPU-aware (VRAM is offloaded while idle, reloaded on work).
EngineCoreProc.run_busy_loop = run_busy_loop
EngineCoreProc._process_input_queue = _process_input_queue
|
||||
|
||||
|
||||
# TODO:
|
||||
48
vllm_kunlun/patch/platform/patch_executor.py
Normal file
48
vllm_kunlun/patch/platform/patch_executor.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import time
|
||||
|
||||
from vllm.executor.executor_base import logger, ExecutorBase
|
||||
|
||||
|
||||
# Keep a reference to the unpatched constructor so the wrapper below can
# delegate to it after the patch assignment at the bottom of this file.
original_init = ExecutorBase.__init__


def init(self, *args, **kwargs):
    """Patched ExecutorBase.__init__: runs the original constructor and
    adds vXPU offload-state tracking."""
    original_init(self, *args, **kwargs)
    # True while the model weights have been moved out of VRAM.
    self.is_offloaded = False
|
||||
|
||||
|
||||
def offload_vram(self):
    """Move the model weights out of VRAM on every worker, timing the
    operation. A no-op (with a warning) when already offloaded."""
    if self.is_offloaded:
        logger.warning("Executor is already offloaded.")
        return

    start = time.perf_counter()
    self.collective_rpc("offload_vram")
    elapsed = time.perf_counter() - start

    self.is_offloaded = True
    logger.info(f"Offloading VRAM costs {elapsed:.6f} seconds.")
|
||||
|
||||
|
||||
def reload_vram(self) -> bool:
    """Bring the model weights back into VRAM on all workers.

    Spins until every worker reports a successful reload, releasing the
    device lock and pausing briefly between attempts. Returns the AND of
    the workers' second result field — presumably "the previous VRAM
    owner was this executor" (the caller resets the prefix cache when it
    is False); TODO confirm against the worker-side try_reload_vram.
    Returns True immediately when nothing is offloaded.
    """
    if not self.is_offloaded:
        logger.warning("Executor is not offloaded.")
        return True

    while True:
        start = time.perf_counter()
        results = self.collective_rpc("try_reload_vram")
        elapsed = time.perf_counter() - start

        if all(r[0] for r in results):
            # Every worker reloaded successfully.
            self.is_offloaded = False
            logger.info(f"Reloading VRAM costs {elapsed:.6f} seconds.")
            return all(r[1] for r in results)

        # Some workers did not get the device lock: undo any partial
        # acquisition, back off briefly, then retry.
        self.collective_rpc("vnpu_unlock_gpu")
        time.sleep(0.001)
|
||||
|
||||
|
||||
# Install the vXPU patches on ExecutorBase: the state-tracking constructor
# plus the offload/reload entry points used by the patched engine loop in
# patch_core.py.
ExecutorBase.__init__ = init
ExecutorBase.offload_vram = offload_vram
ExecutorBase.reload_vram = reload_vram
|
||||
Reference in New Issue
Block a user