diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 5be5a05..7f21f26 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -29,7 +29,7 @@ from vllm.platforms import Platform, PlatformEnum from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config, init_ascend_config) from vllm_ascend.utils import (ASCEND_QUATIZATION_METHOD, is_310p, - register_ascend_customop, update_aclgraph_sizes) + update_aclgraph_sizes) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -201,9 +201,6 @@ class NPUPlatform(Platform): "For better performance in Qwen3 MoE, SP only works exclusively with MC2, AllToAll, and AllToAllV." ) - # register Ascend CustomOp - register_ascend_customop() - @classmethod def get_attn_backend_cls(cls, selected_backend, diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 19ef2ef..4e75a7d 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -45,7 +45,8 @@ from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.device_allocator.camem import CaMemAllocator from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import (init_ascend_soc_version, sleep_mode_enabled, +from vllm_ascend.utils import (init_ascend_soc_version, + register_ascend_customop, sleep_mode_enabled, try_register_lib) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner @@ -69,7 +70,7 @@ class NPUWorker(WorkerBase): from vllm_ascend import ops ops.register_dummy_fusion_op() _register_atb_extensions() - + register_ascend_customop() # init ascend config and soc version init_ascend_config(vllm_config) init_ascend_soc_version()