add env vars & misc
This commit is contained in:
@@ -58,7 +58,7 @@ def find_loaded_library(lib_name) -> Optional[str]:
|
||||
|
||||
camem_available = False
|
||||
try:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
from vllm_ascend.vllm_ascend_C import ( # type: ignore # noqa: F401
|
||||
init_module_offload as init_module,
|
||||
python_create_and_map_offload as python_create_and_map,python_unmap_and_release_offload as python_unmap_and_release,
|
||||
@@ -109,7 +109,7 @@ def get_pluggable_allocator(
|
||||
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
|
||||
python_free_func: Callable[[int], tuple[int, int, int, int]]
|
||||
) -> torch.npu.memory.NPUPluggableAllocator:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
current_device = torch.npu.current_device()
|
||||
init_module(python_malloc_fn, python_free_func, current_device)
|
||||
new_alloc = torch.npu.memory.NPUPluggableAllocator(
|
||||
@@ -281,7 +281,7 @@ class CaMemAllocator:
|
||||
# see https://github.com/pytorch/pytorch/issues/146431 .
|
||||
self.allocator_and_pools[tag] = data
|
||||
# lock gpu
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
self._vnpu_lock_gpu()
|
||||
yield
|
||||
# PyTorch's bug, calling torch.cuda.empty_cache() will error
|
||||
@@ -294,7 +294,7 @@ class CaMemAllocator:
|
||||
# allocate memory.
|
||||
# TODO: we need to find a way to release the memory,
|
||||
# i.e. calling torch.cuda.empty_cache()
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
self.vnpu_unlock_gpu()
|
||||
self.current_tag = old_tag
|
||||
|
||||
@@ -321,12 +321,11 @@ class CaMemAllocator:
|
||||
return True
|
||||
time.sleep(0.001)
|
||||
|
||||
|
||||
def vnpu_unlock_gpu(self):
|
||||
if python_unlock_gpu:
|
||||
python_unlock_gpu()
|
||||
|
||||
def get_pool_mem_info(self) -> int:
|
||||
def get_pool_mem_info(self) -> tuple[int, int]:
|
||||
"""
|
||||
get available memory in reserved pool."""
|
||||
return python_get_mem_info()
|
||||
|
||||
@@ -167,7 +167,7 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# Whether to enable transpose weight and cast format to FRACTAL_NZ.
|
||||
"VLLM_ASCEND_ENABLE_NZ":
|
||||
lambda: int(os.getenv("VLLM_ASCEND_ENABLE_NZ", 0)),
|
||||
"VLLM_ASCEND_ENABLE_IDLE_OFFLOAD": lambda: int(os.getenv("VLLM_ASCEND_ENABLE_IDLE_OFFLOAD", 1)),
|
||||
"VLLM_ASCEND_ENABLE_VNPU": lambda: int(os.getenv("VLLM_ASCEND_ENABLE_VNPU", 1)),
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
@@ -84,12 +84,12 @@ def run_busy_loop(self):
|
||||
# 1) Poll the input queue until there is work to do.
|
||||
self._process_input_queue()
|
||||
# 2) Step the engine core and return the outputs.
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD and self.scheduler.has_requests() and self.model_executor.is_offloaded:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU and self.scheduler.has_requests() and self.model_executor.is_offloaded:
|
||||
prev_is_self = self.model_executor.reload_vram()
|
||||
if not prev_is_self:
|
||||
self.reset_prefix_cache()
|
||||
self._process_engine_step()
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD and not self.scheduler.has_requests() and not self.model_executor.is_offloaded:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU and not self.scheduler.has_requests() and not self.model_executor.is_offloaded:
|
||||
self.model_executor.offload_vram()
|
||||
|
||||
def _process_input_queue(self):
|
||||
@@ -101,7 +101,7 @@ def _process_input_queue(self):
|
||||
if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
|
||||
logger.debug("EngineCore waiting for work.")
|
||||
waited = True
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD and not self.model_executor.is_offloaded:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU and not self.model_executor.is_offloaded:
|
||||
self.model_executor.offload_vram()
|
||||
req = self.input_queue.get()
|
||||
self._handle_client_request(*req)
|
||||
@@ -128,10 +128,10 @@ def _initialize_kv_caches(
|
||||
|
||||
has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
|
||||
if has_kv_cache:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
# get available memory in idle offload mode
|
||||
available_gpu_memory = (
|
||||
self.model_executor.determine_available_memory_idle_offload_mode())
|
||||
self.model_executor.determine_available_memory_vnpu_offload_mode())
|
||||
self.available_gpu_memory_for_kv_cache = \
|
||||
available_gpu_memory[0]
|
||||
elif os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
|
||||
|
||||
@@ -43,11 +43,11 @@ def reload_vram(self) -> bool:
|
||||
time.sleep(0.001)
|
||||
|
||||
|
||||
def determine_available_memory_idle_offload_mode(self) -> int:
|
||||
return self.collective_rpc("determine_available_memory_idle_offload_mode")
|
||||
def determine_available_memory_vnpu_offload_mode(self) -> int:
|
||||
return self.collective_rpc("determine_available_memory_vnpu_offload_mode")
|
||||
|
||||
|
||||
ExecutorBase.__init__ = init
|
||||
ExecutorBase.offload_vram = offload_vram
|
||||
ExecutorBase.reload_vram = reload_vram
|
||||
ExecutorBase.determine_available_memory_idle_offload_mode = determine_available_memory_idle_offload_mode
|
||||
ExecutorBase.determine_available_memory_vnpu_offload_mode = determine_available_memory_vnpu_offload_mode
|
||||
|
||||
@@ -258,7 +258,7 @@ class NPUWorker(WorkerBase):
|
||||
)
|
||||
return available_kv_cache_memory
|
||||
|
||||
def determine_available_memory_idle_offload_mode(self) -> int:
|
||||
def determine_available_memory_vnpu_offload_mode(self) -> int:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
free, total = allocator.get_pool_mem_info()
|
||||
available_kv_cache_memory = int(
|
||||
@@ -317,7 +317,7 @@ class NPUWorker(WorkerBase):
|
||||
"Sleep mode can only be "
|
||||
"used for one instance per process.")
|
||||
context = allocator.use_memory_pool(tag="weights")
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
if not sleep_mode_enabled():
|
||||
raise ValueError(
|
||||
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
|
||||
@@ -336,7 +336,7 @@ class NPUWorker(WorkerBase):
|
||||
context = nullcontext() # type: ignore
|
||||
with context:
|
||||
self.model_runner.load_model()
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
# save memory to host with lock
|
||||
self.offload_vram()
|
||||
succ, _ = self.try_reload_vram()
|
||||
@@ -402,7 +402,7 @@ class NPUWorker(WorkerBase):
|
||||
if self.vllm_config.model_config.enable_sleep_mode:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
context = allocator.use_memory_pool(tag="kv_cache")
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
context = allocator.use_memory_pool(tag="kv_cache")
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user