Fix some CI issues and refactor modelrunner (#2445)

### What this PR does / why we need it?
Fix some CI issues and refactor modelrunner.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with the existing tests.

- vLLM version: v0.10.0
- vLLM main:
4d9c61993a

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
Mengqing Cao
2025-08-20 09:01:04 +08:00
committed by GitHub
parent 955411611c
commit 1327f9be1c
28 changed files with 1612 additions and 1020 deletions

View File

@@ -139,33 +139,53 @@ class NPUPlatform(Platform):
enforce_eager = getattr(model_config, "enforce_eager", False)
check_ascend_config(vllm_config, enforce_eager)
from vllm.config.compilation import CUDAGraphMode
# TODO(cmq): update the post init in vllmconfig
# if cudagraph_mode is not explicitly set by users, set default value
if envs_vllm.VLLM_USE_V1 and compilation_config.level \
== CompilationLevel.PIECEWISE:
compilation_config.cudagraph_mode = \
CUDAGraphMode.PIECEWISE
else:
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
vllm_config._set_cudagraph_sizes()
# TODO(cmq): update the compilation level config to be determined by CUDAGraphMode
if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
logger.info("Compilation disabled, using eager mode by default")
compilation_config.level = CompilationLevel.NO_COMPILATION
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif compilation_config.level != CompilationLevel.PIECEWISE:
logger.warning(
"NPU does not support %s compilation level. Setting level to NO_COMPILATION",
compilation_config.level)
compilation_config.level = CompilationLevel.NO_COMPILATION
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif ascend_config.torchair_graph_config.enabled:
logger.info(
"Torchair compilation enabled on NPU. Setting level to NO_COMPILATION"
)
compilation_config.level = CompilationLevel.NO_COMPILATION
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif parallel_config.distributed_executor_backend == "ray":
logger.warning(
"Ray distributed executor backend is not compatible with ACL Graph mode "
"right now. Setting level to NO_COMPILATION")
compilation_config.level = CompilationLevel.NO_COMPILATION
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
else:
logger.info(
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
"using only ACL Graph mode")
if envs_vllm.VLLM_USE_V1 and \
compilation_config.level == CompilationLevel.PIECEWISE:
compilation_config.set_splitting_ops_for_v1()
compilation_config.use_inductor = False
compilation_config.splitting_ops.extend(
["vllm.unified_ascend_attention_with_output"])
update_aclgraph_sizes(vllm_config)
compilation_config.cudagraph_num_of_warmups = 1
if parallel_config and parallel_config.worker_cls == "auto":
if ascend_config.torchair_graph_config.enabled:
@@ -249,11 +269,11 @@ class NPUPlatform(Platform):
return True
@classmethod
def get_piecewise_backend_cls(cls) -> str:
def get_static_graph_wrapper_cls(cls) -> str:
"""
Get piecewise backend class for piecewise graph.
"""
return "vllm_ascend.compilation.piecewise_backend.NPUPiecewiseBackend" # noqa
return "vllm_ascend.compilation.acl_graph.ACLGraphWrapper" # noqa
@classmethod
def stateless_init_device_torch_dist_pg(