Fix some ci issue and refactor modelrunner (#2445)
### What this PR does / why we need it?
Fix some ci issue and refactor modelrunner
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with existing test.
- vLLM version: v0.10.0
- vLLM main:
4d9c61993a
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
@@ -139,33 +139,53 @@ class NPUPlatform(Platform):
|
||||
enforce_eager = getattr(model_config, "enforce_eager", False)
|
||||
|
||||
check_ascend_config(vllm_config, enforce_eager)
|
||||
from vllm.config.compilation import CUDAGraphMode
|
||||
|
||||
# TODO(cmq): update the post init in vllmconfig
|
||||
# if cudagraph_mode is not explicitly set by users, set default value
|
||||
if envs_vllm.VLLM_USE_V1 and compilation_config.level \
|
||||
== CompilationLevel.PIECEWISE:
|
||||
compilation_config.cudagraph_mode = \
|
||||
CUDAGraphMode.PIECEWISE
|
||||
else:
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
vllm_config._set_cudagraph_sizes()
|
||||
|
||||
# TODO(cmq): update the compilation level config to be determined by CUDAGraphMode
|
||||
if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
|
||||
logger.info("Compilation disabled, using eager mode by default")
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
elif compilation_config.level != CompilationLevel.PIECEWISE:
|
||||
logger.warning(
|
||||
"NPU does not support %s compilation level. Setting level to NO_COMPILATION",
|
||||
compilation_config.level)
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
elif ascend_config.torchair_graph_config.enabled:
|
||||
logger.info(
|
||||
"Torchair compilation enabled on NPU. Setting level to NO_COMPILATION"
|
||||
)
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
elif parallel_config.distributed_executor_backend == "ray":
|
||||
logger.warning(
|
||||
"Ray distributed executor backend is not compatible with ACL Graph mode "
|
||||
"right now. Setting level to NO_COMPILATION")
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
else:
|
||||
logger.info(
|
||||
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
|
||||
"using only ACL Graph mode")
|
||||
if envs_vllm.VLLM_USE_V1 and \
|
||||
compilation_config.level == CompilationLevel.PIECEWISE:
|
||||
compilation_config.set_splitting_ops_for_v1()
|
||||
compilation_config.use_inductor = False
|
||||
compilation_config.splitting_ops.extend(
|
||||
["vllm.unified_ascend_attention_with_output"])
|
||||
update_aclgraph_sizes(vllm_config)
|
||||
compilation_config.cudagraph_num_of_warmups = 1
|
||||
|
||||
if parallel_config and parallel_config.worker_cls == "auto":
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
@@ -249,11 +269,11 @@ class NPUPlatform(Platform):
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def get_piecewise_backend_cls(cls) -> str:
|
||||
def get_static_graph_wrapper_cls(cls) -> str:
|
||||
"""
|
||||
Get piecewise backend class for piecewise graph.
|
||||
"""
|
||||
return "vllm_ascend.compilation.piecewise_backend.NPUPiecewiseBackend" # noqa
|
||||
return "vllm_ascend.compilation.acl_graph.ACLGraphWrapper" # noqa
|
||||
|
||||
@classmethod
|
||||
def stateless_init_device_torch_dist_pg(
|
||||
|
||||
Reference in New Issue
Block a user