[Refactor] Cleanup platform (#5566)
### What this PR does / why we need it?
1. add `COMPILATION_PASS_KEY` constant
2. clean up useless platform interface `empty_cache`, `synchronize`,
`mem_get_info`, `clear_npu_memory`
3. rename `CUSTOM_OP_REGISTERED` to `_CUSTOM_OP_REGISTERED`
4. remove uesless env `VLLM_ENABLE_CUDAGRAPH_GC`
NPUPlatform is the interface called by vLLM. Do not call it inner
vllm-ascend.
### Does this PR introduce _any_ user-facing change?
This PR is just a cleanup. All CI should pass.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
7157596103
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -28,6 +28,7 @@ from torch.fx import GraphModule
|
||||
from vllm.compilation.compiler_interface import CompilerInterface
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.utils import COMPILATION_PASS_KEY
|
||||
|
||||
|
||||
def compile_fx(graph: GraphModule, example_inputs: list,
|
||||
@@ -51,7 +52,7 @@ def fusion_pass_compile(
|
||||
) -> tuple[Optional[Callable], Optional[Any]]:
|
||||
|
||||
def compile_inner(graph, example_inputs):
|
||||
current_pass_manager = compiler_config["graph_fusion_manager"]
|
||||
current_pass_manager = compiler_config[COMPILATION_PASS_KEY]
|
||||
graph = current_pass_manager(graph, runtime_shape)
|
||||
return graph
|
||||
|
||||
|
||||
@@ -25,8 +25,6 @@ import torch
|
||||
from acl.rt import memcpy # type: ignore # noqa: F401
|
||||
from vllm.logger import logger
|
||||
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
|
||||
|
||||
def find_loaded_library(lib_name) -> Optional[str]:
|
||||
"""
|
||||
@@ -196,11 +194,10 @@ class CaMemAllocator:
|
||||
handle = data.handle
|
||||
if data.tag in offload_tags:
|
||||
size_in_bytes = handle[1]
|
||||
cpu_backup_tensor = torch.empty(
|
||||
size_in_bytes,
|
||||
dtype=torch.uint8,
|
||||
device='cpu',
|
||||
pin_memory=NPUPlatform.is_pin_memory_available())
|
||||
cpu_backup_tensor = torch.empty(size_in_bytes,
|
||||
dtype=torch.uint8,
|
||||
device='cpu',
|
||||
pin_memory=True)
|
||||
cpu_ptr = cpu_backup_tensor.data_ptr()
|
||||
ACL_MEMCPY_DEVICE_TO_HOST = 2
|
||||
dest_max = cpu_ptr + size_in_bytes * 2
|
||||
|
||||
@@ -15,9 +15,8 @@
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
import gc
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
import torch
|
||||
@@ -26,18 +25,16 @@ from vllm.platforms import Platform, PlatformEnum
|
||||
|
||||
# todo: please remove it when solve cuda hard code in vllm
|
||||
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
|
||||
# todo: please remove it when support controls garbage collection during CUDA graph capture.
|
||||
os.environ["VLLM_ENABLE_CUDAGRAPH_GC"] = "1"
|
||||
|
||||
from vllm_ascend.ascend_config import init_ascend_config
|
||||
from vllm_ascend.utils import refresh_block_size
|
||||
|
||||
# isort: off
|
||||
from vllm_ascend.utils import (
|
||||
ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType,
|
||||
enable_sp, get_ascend_device_type, is_vl_model, update_aclgraph_sizes,
|
||||
update_cudagraph_capture_sizes, update_default_aclgraph_sizes,
|
||||
check_kv_extra_config)
|
||||
ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD,
|
||||
COMPILATION_PASS_KEY, AscendDeviceType, enable_sp, get_ascend_device_type,
|
||||
is_vl_model, update_aclgraph_sizes, update_cudagraph_capture_sizes,
|
||||
update_default_aclgraph_sizes, check_kv_extra_config)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
@@ -47,7 +44,7 @@ else:
|
||||
VllmConfig = None
|
||||
FlexibleArgumentParser = None
|
||||
|
||||
CUSTOM_OP_REGISTERED = False
|
||||
_CUSTOM_OP_REGISTERED = False
|
||||
|
||||
|
||||
class NPUPlatform(Platform):
|
||||
@@ -74,7 +71,7 @@ class NPUPlatform(Platform):
|
||||
It is a parameter of inductor_config used to register custom passes.
|
||||
Currently, we only use Inductor's 'pattern matcher' functionality, so we define our own pass_key.
|
||||
"""
|
||||
return "graph_fusion_manager"
|
||||
return COMPILATION_PASS_KEY
|
||||
|
||||
@classmethod
|
||||
def get_pass_manager_cls(cls) -> str:
|
||||
@@ -131,24 +128,6 @@ class NPUPlatform(Platform):
|
||||
def set_device(cls, device: torch.device):
|
||||
torch.npu.set_device(device)
|
||||
|
||||
@classmethod
|
||||
def empty_cache(cls):
|
||||
torch.npu.empty_cache()
|
||||
|
||||
@classmethod
|
||||
def synchronize(cls):
|
||||
torch.npu.synchronize()
|
||||
|
||||
@classmethod
|
||||
def mem_get_info(cls) -> Tuple[int, int]:
|
||||
return torch.npu.mem_get_info()
|
||||
|
||||
@classmethod
|
||||
def clear_npu_memory(cls):
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
# initialize ascend config from vllm additional_config
|
||||
@@ -351,8 +330,8 @@ class NPUPlatform(Platform):
|
||||
# from vllm_ascend.utils import enable_custom_op
|
||||
# enable_custom_op()
|
||||
# set custom ops path
|
||||
global CUSTOM_OP_REGISTERED
|
||||
if CUSTOM_OP_REGISTERED:
|
||||
global _CUSTOM_OP_REGISTERED
|
||||
if _CUSTOM_OP_REGISTERED:
|
||||
return
|
||||
CUR_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
CUSTOM_OPP_PATH = os.path.join(CUR_DIR, "_cann_ops_custom", "vendors",
|
||||
@@ -365,7 +344,7 @@ class NPUPlatform(Platform):
|
||||
"ASCEND_CUSTOM_OPP_PATH"] = f"{CUSTOM_OPP_PATH}:{current_cust_opp_path}"
|
||||
else:
|
||||
os.environ["ASCEND_CUSTOM_OPP_PATH"] = CUSTOM_OPP_PATH
|
||||
CUSTOM_OP_REGISTERED = True
|
||||
_CUSTOM_OP_REGISTERED = True
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
|
||||
|
||||
@@ -41,6 +41,7 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
COMPILATION_PASS_KEY = "graph_fusion_manager"
|
||||
ASCEND_QUANTIZATION_METHOD = "ascend"
|
||||
COMPRESSED_TENSORS_METHOD = "compressed-tensors"
|
||||
SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"]
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#
|
||||
|
||||
import copy
|
||||
import gc
|
||||
from types import NoneType
|
||||
from typing import Optional
|
||||
|
||||
@@ -55,15 +56,19 @@ from vllm_ascend.cpu_binding import bind_cpus
|
||||
from vllm_ascend.device_allocator.camem import CaMemAllocator
|
||||
from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
|
||||
from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
from vllm_ascend.utils import (AscendDeviceType, check_ascend_device_type,
|
||||
enable_sp, get_ascend_device_type,
|
||||
register_ascend_customop)
|
||||
register_ascend_customop, vllm_version_is)
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402
|
||||
from torch._dynamo.variables import TorchInGraphFunctionVariable # noqa: E402
|
||||
|
||||
if vllm_version_is("0.13.0"):
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
else:
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
torch_non_c_binding_in_graph_functions_npu = dict.fromkeys(
|
||||
["torch.npu.current_stream"],
|
||||
TorchInGraphFunctionVariable,
|
||||
@@ -147,7 +152,7 @@ class NPUWorker(WorkerBase):
|
||||
self.use_v2_model_runner = envs_vllm.VLLM_USE_V2_MODEL_RUNNER
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
|
||||
free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
|
||||
free_bytes_before_sleep = torch.npu.mem_get_info()[0]
|
||||
# Save the buffers before level 2 sleep
|
||||
if level == 2:
|
||||
model = self.model_runner.model
|
||||
@@ -157,7 +162,7 @@ class NPUWorker(WorkerBase):
|
||||
}
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
||||
free_bytes_after_sleep, total = NPUPlatform.mem_get_info()
|
||||
free_bytes_after_sleep, total = torch.npu.mem_get_info()
|
||||
freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
|
||||
used_bytes = total - free_bytes_after_sleep
|
||||
assert freed_bytes >= 0, "Memory usage increased after sleeping."
|
||||
@@ -210,8 +215,8 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
def _init_device(self):
|
||||
device = torch.device(f"npu:{self.local_rank}")
|
||||
NPUPlatform.set_device(device)
|
||||
NPUPlatform.empty_cache()
|
||||
torch.npu.set_device(device)
|
||||
torch.npu.empty_cache()
|
||||
|
||||
if (self.parallel_config.data_parallel_size > 1
|
||||
and self.parallel_config.data_parallel_size_local > 0
|
||||
@@ -226,11 +231,11 @@ class NPUWorker(WorkerBase):
|
||||
f"be less than or equal to the number of visible devices "
|
||||
f"({visible_device_count}).")
|
||||
|
||||
self.init_npu_memory = NPUPlatform.mem_get_info()[0]
|
||||
self.init_npu_memory = torch.npu.mem_get_info()[0]
|
||||
# Initialize the distributed environment.
|
||||
self._init_worker_distributed_environment()
|
||||
# Set random seed.
|
||||
NPUPlatform.seed_everything(self.model_config.seed)
|
||||
set_random_seed(self.model_config.seed)
|
||||
# Initialize device properties used by triton kernels.
|
||||
init_device_properties_triton()
|
||||
return device
|
||||
@@ -258,16 +263,18 @@ class NPUWorker(WorkerBase):
|
||||
def determine_available_memory(self) -> int:
|
||||
# Profile the memory usage of the model and get the maximum number of
|
||||
# cache blocks that can be allocated with the remaining free memory.
|
||||
NPUPlatform.clear_npu_memory()
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
# Execute a forward pass with dummy inputs to profile the memory usage
|
||||
# of the model.
|
||||
_, total_npu_memory = NPUPlatform.mem_get_info()
|
||||
_, total_npu_memory = torch.npu.mem_get_info()
|
||||
self.model_runner.profile_run()
|
||||
|
||||
# Calculate the number of blocks that can be allocated with the
|
||||
# profiled peak memory.
|
||||
free_npu_memory, _ = NPUPlatform.mem_get_info()
|
||||
free_npu_memory, _ = torch.npu.mem_get_info()
|
||||
# NOTE(woosuk): Here we assume that the other processes using the same
|
||||
# GPU did not change their memory usage during the profiling.
|
||||
assert self.init_npu_memory > free_npu_memory, (
|
||||
@@ -280,7 +287,7 @@ class NPUWorker(WorkerBase):
|
||||
peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"]
|
||||
# TODO: don`t need impl this func after empty_cache in
|
||||
# Worker.determine_num_available_blocks() unified`
|
||||
NPUPlatform.empty_cache()
|
||||
torch.npu.empty_cache()
|
||||
torch_allocated_bytes = torch_npu.npu.memory_stats(
|
||||
)["allocated_bytes.all.current"]
|
||||
total_allocated_bytes = torch_npu.npu.mem_get_info(
|
||||
@@ -389,7 +396,7 @@ class NPUWorker(WorkerBase):
|
||||
self._warm_up_atb()
|
||||
# Reset the seed to ensure that the random state is not affected by
|
||||
# the model initialization and profiling.
|
||||
NPUPlatform.seed_everything(self.model_config.seed)
|
||||
set_random_seed(self.model_config.seed)
|
||||
|
||||
def _warm_up_atb(self):
|
||||
x = torch.rand((2, 4), dtype=torch.float16).npu()
|
||||
|
||||
Reference in New Issue
Block a user