vllm-ascend vnpu v1
This commit is contained in:
@@ -258,6 +258,17 @@ class NPUWorker(WorkerBase):
|
||||
)
|
||||
return available_kv_cache_memory
|
||||
|
||||
def determine_available_memory_idle_offload_mode(self) -> int:
    """Return the memory available for the KV cache in idle offload mode.

    The figure is derived from the pool statistics reported by
    ``CaMemAllocator.get_pool_mem_info()``: the share of the pool total
    allowed by ``cache_config.gpu_memory_utilization``, minus what the
    pool already has in use (``total - free``), and never below zero.

    Returns:
        The non-negative amount of memory left for the KV cache, in the
        same unit the allocator reports.
    """
    free, total = CaMemAllocator.get_instance().get_pool_mem_info()
    used = total - free
    utilization = self.cache_config.gpu_memory_utilization
    # Truncate to an integer first, then clamp, so that an over-committed
    # pool yields 0 instead of a negative budget.
    available_kv_cache_memory = max(int(total * utilization - used), 0)
    logger.info(
        f"Available memory (idle offload mode): {available_kv_cache_memory}, total memory: {total}"
    )
    return available_kv_cache_memory
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
@@ -306,11 +317,46 @@ class NPUWorker(WorkerBase):
|
||||
"Sleep mode can only be "
|
||||
"used for one instance per process.")
|
||||
context = allocator.use_memory_pool(tag="weights")
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if not sleep_mode_enabled():
|
||||
raise ValueError(
|
||||
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
|
||||
)
|
||||
if is_enable_nz():
|
||||
raise ValueError(
|
||||
"FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "
|
||||
"in the RL scenarios. Please set VLLM_ASCEND_ENABLE_NZ=0.")
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
assert allocator.get_current_usage() == 0, (
|
||||
"Idle offload mode can only be "
|
||||
"used for one instance per process.")
|
||||
context = allocator.use_memory_pool(tag="weights")
|
||||
else:
|
||||
from contextlib import nullcontext
|
||||
context = nullcontext() # type: ignore
|
||||
with context:
|
||||
self.model_runner.load_model()
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
# save memory to host with lock
|
||||
self.offload_vram()
|
||||
self.reload_vram()
|
||||
|
||||
def offload_vram(self) -> None:
    """Ask the ``CaMemAllocator`` instance to offload the "weights" pool.

    Only allocations tagged ``"weights"`` are passed to the allocator's
    ``offload_vram``; what offloading does with them is defined by the
    allocator (presumably a copy to host memory -- confirm against the
    allocator implementation).
    """
    # TODO: reporting of freed / still-used memory (measured with
    # NPUPlatform.mem_get_info() before and after the offload, in GiB)
    # is currently disabled; restore it once the measurement is reliable.
    CaMemAllocator.get_instance().offload_vram(offload_tags=("weights", ))
|
||||
|
||||
def reload_vram(self) -> bool:
    """Ask the ``CaMemAllocator`` instance to reload offloaded memory.

    ``tags=None`` is forwarded unchanged (presumably meaning "every
    tag" -- confirm against the allocator implementation).

    Returns:
        The value returned by the allocator's ``reload_vram``.
    """
    return CaMemAllocator.get_instance().reload_vram(tags=None)
|
||||
|
||||
def compile_or_warm_up_model(self) -> None:
|
||||
# Note: need to adapt for graph mode.
|
||||
@@ -351,6 +397,9 @@ class NPUWorker(WorkerBase):
|
||||
if self.vllm_config.model_config.enable_sleep_mode:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
context = allocator.use_memory_pool(tag="kv_cache")
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
context = allocator.use_memory_pool(tag="kv_cache")
|
||||
else:
|
||||
from contextlib import nullcontext
|
||||
context = nullcontext() # type: ignore
|
||||
|
||||
Reference in New Issue
Block a user