vllm-ascend vnpu v1

This commit is contained in:
starkwj
2025-12-26 07:37:35 +00:00
parent 2f1aed98cc
commit 135cc0a505
168 changed files with 28337 additions and 9 deletions

View File

@@ -258,6 +258,17 @@ class NPUWorker(WorkerBase):
)
return available_kv_cache_memory
def determine_available_memory_idle_offload_mode(self) -> int:
    """Compute the KV-cache memory budget when idle offload mode is active.

    The budget is the configured share of the allocator pool
    (``total * gpu_memory_utilization``) minus what the pool already has
    in use (``total - free``), truncated to an int and never negative.

    Returns:
        The non-negative amount of memory available for the KV cache, in
        the same unit reported by ``get_pool_mem_info`` (presumably bytes;
        confirm against the allocator).
    """
    free, total = CaMemAllocator.get_instance().get_pool_mem_info()

    # Split the formula into its two named parts for readability.
    pool_budget = total * self.cache_config.gpu_memory_utilization
    pool_in_use = total - free

    available_kv_cache_memory = int(pool_budget - pool_in_use)
    # The pool may already use more than the budget; report zero then.
    if available_kv_cache_memory < 0:
        available_kv_cache_memory = 0

    logger.info(
        f"Available memory (idle offload mode): {available_kv_cache_memory}, total memory: {total}"
    )
    return available_kv_cache_memory
def execute_model(
self,
scheduler_output: "SchedulerOutput",
@@ -306,11 +317,46 @@ class NPUWorker(WorkerBase):
"Sleep mode can only be "
"used for one instance per process.")
context = allocator.use_memory_pool(tag="weights")
elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
if not sleep_mode_enabled():
raise ValueError(
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
)
if is_enable_nz():
raise ValueError(
"FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "
"in the RL scenarios. Please set VLLM_ASCEND_ENABLE_NZ=0.")
allocator = CaMemAllocator.get_instance()
assert allocator.get_current_usage() == 0, (
"Idle offload mode can only be "
"used for one instance per process.")
context = allocator.use_memory_pool(tag="weights")
else:
from contextlib import nullcontext
context = nullcontext() # type: ignore
with context:
self.model_runner.load_model()
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
# save memory to host with lock
self.offload_vram()
self.reload_vram()
def offload_vram(self) -> None:
    """Offload the allocations tagged ``"weights"`` via the CaMemAllocator.

    Delegates to ``offload_vram`` on the allocator returned by
    ``CaMemAllocator.get_instance()``, passing only the ``"weights"`` tag.
    """
    # NOTE: freed/used memory accounting around this call (sampling
    # NPUPlatform.mem_get_info() before and after, then logging the
    # difference in GiB) is intentionally not performed here.
    CaMemAllocator.get_instance().offload_vram(offload_tags=("weights", ))
def reload_vram(self) -> bool:
    """Ask the CaMemAllocator to reload previously offloaded memory.

    Calls the allocator's ``reload_vram`` with ``tags=None`` (presumably
    meaning "all tags"; confirm against the allocator implementation).

    Returns:
        Whatever the allocator's ``reload_vram`` call returns.
    """
    reloaded = CaMemAllocator.get_instance().reload_vram(tags=None)
    return reloaded
def compile_or_warm_up_model(self) -> None:
# Note: need to adapt for graph mode.
@@ -351,6 +397,9 @@ class NPUWorker(WorkerBase):
if self.vllm_config.model_config.enable_sleep_mode:
allocator = CaMemAllocator.get_instance()
context = allocator.use_memory_pool(tag="kv_cache")
elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
allocator = CaMemAllocator.get_instance()
context = allocator.use_memory_pool(tag="kv_cache")
else:
from contextlib import nullcontext
context = nullcontext() # type: ignore