add env vars & misc
@@ -84,12 +84,12 @@ def run_busy_loop(self):
             # 1) Poll the input queue until there is work to do.
             self._process_input_queue()
             # 2) Step the engine core and return the outputs.
-            if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD and self.scheduler.has_requests() and self.model_executor.is_offloaded:
+            if envs_ascend.VLLM_ASCEND_ENABLE_VNPU and self.scheduler.has_requests() and self.model_executor.is_offloaded:
                 prev_is_self = self.model_executor.reload_vram()
                 if not prev_is_self:
                     self.reset_prefix_cache()
             self._process_engine_step()
-            if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD and not self.scheduler.has_requests() and not self.model_executor.is_offloaded:
+            if envs_ascend.VLLM_ASCEND_ENABLE_VNPU and not self.scheduler.has_requests() and not self.model_executor.is_offloaded:
                 self.model_executor.offload_vram()

     def _process_input_queue(self):
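The hunk above toggles model weights between device and host memory based on whether the scheduler has work: reload before stepping if requests arrived while offloaded, offload again once the queue drains. A minimal standalone sketch of that state machine, assuming a hypothetical executor whose `is_offloaded` / `offload_vram` / `reload_vram` semantics match the calls in the diff (the classes below are illustrative stand-ins, not vllm-ascend code):

```python
# Illustrative sketch only: stand-in modelling the offload toggle
# seen in run_busy_loop(); not actual vllm-ascend code.
class FakeExecutor:
    def __init__(self):
        self.is_offloaded = False

    def offload_vram(self):
        # Move weights device -> host when the engine goes idle.
        self.is_offloaded = True

    def reload_vram(self) -> bool:
        # Move weights host -> device; return True if this process
        # still owned the weights (so cached state remains valid).
        self.is_offloaded = False
        return True


def busy_loop_step(executor, has_requests: bool, enable_vnpu: bool):
    # Mirror of the two guards added in the diff:
    # reload before stepping if work arrived while offloaded...
    if enable_vnpu and has_requests and executor.is_offloaded:
        if not executor.reload_vram():
            pass  # a real engine would call reset_prefix_cache() here
    # ...and offload again once the scheduler has drained.
    if enable_vnpu and not has_requests and not executor.is_offloaded:
        executor.offload_vram()
```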
@@ -101,7 +101,7 @@ def _process_input_queue(self):
             if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                 logger.debug("EngineCore waiting for work.")
                 waited = True
-                if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD and not self.model_executor.is_offloaded:
+                if envs_ascend.VLLM_ASCEND_ENABLE_VNPU and not self.model_executor.is_offloaded:
                     self.model_executor.offload_vram()
             req = self.input_queue.get()
             self._handle_client_request(*req)
@@ -128,10 +128,10 @@ def _initialize_kv_caches(

         has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
         if has_kv_cache:
-            if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
+            if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
                 # get available memory in idle offload mode
                 available_gpu_memory = (
-                    self.model_executor.determine_available_memory_idle_offload_mode())
+                    self.model_executor.determine_available_memory_vnpu_offload_mode())
                 self.available_gpu_memory_for_kv_cache = \
                     available_gpu_memory[0]
             elif os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
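The commit title mentions new environment variables, but the `envs_ascend` module itself is not part of this diff. Assuming it follows the same lazy-lookup pattern as vLLM's `envs.py`, registering `VLLM_ASCEND_ENABLE_VNPU` would look roughly like the sketch below; this is a guess at the shape of the module, not its actual contents:

```python
import os
from typing import Any, Callable

# Hypothetical registration table in the style of vLLM's envs.py;
# the real envs_ascend module is not shown in this diff.
env_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ASCEND_ENABLE_VNPU":
        lambda: os.getenv("VLLM_ASCEND_ENABLE_VNPU", "0") == "1",
}


def __getattr__(name: str) -> Any:
    # Evaluate lazily so changes to os.environ made after import
    # are still picked up on each attribute access.
    if name in env_variables:
        return env_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")
```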
@@ -43,11 +43,11 @@ def reload_vram(self) -> bool:
         time.sleep(0.001)


-def determine_available_memory_idle_offload_mode(self) -> int:
-    return self.collective_rpc("determine_available_memory_idle_offload_mode")
+def determine_available_memory_vnpu_offload_mode(self) -> int:
+    return self.collective_rpc("determine_available_memory_vnpu_offload_mode")


 ExecutorBase.__init__ = init
 ExecutorBase.offload_vram = offload_vram
 ExecutorBase.reload_vram = reload_vram
-ExecutorBase.determine_available_memory_idle_offload_mode = determine_available_memory_idle_offload_mode
+ExecutorBase.determine_available_memory_vnpu_offload_mode = determine_available_memory_vnpu_offload_mode
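The assignments at the end of this hunk patch free functions onto `ExecutorBase` at import time. A plain function assigned to a class attribute becomes an ordinary bound method on instances, which is why `self` resolves correctly; a tiny self-contained demonstration of the pattern, using a dummy class rather than the real executor:

```python
class Executor:
    """Dummy stand-in for ExecutorBase, just to show the pattern."""


def determine_available_memory_vnpu_offload_mode(self) -> list:
    # In the patched code this fans out over workers via collective_rpc;
    # here we return a dummy one-element per-worker list.
    return [42]


# Assigning the function to the class makes it a regular method.
Executor.determine_available_memory_vnpu_offload_mode = \
    determine_available_memory_vnpu_offload_mode

assert Executor().determine_available_memory_vnpu_offload_mode() == [42]
```

One detail worth noting: the patched function is annotated `-> int`, yet the caller in `_initialize_kv_caches` indexes the result (`available_gpu_memory[0]`), which suggests `collective_rpc` returns a per-worker list; the annotation may be worth revisiting.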