diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 74e9c19..02ecd66 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -111,15 +111,6 @@ env_variables: Dict[str, Callable[[], Any]] = { # 1: enable moe_all2all_buffer. "MOE_ALL2ALL_BUFFER": lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))), - # VLLM_ASCEND_ACL_OP_INIT_MODE: - # 0: default, normal init. - # 1: delay init until launch aclops. - # 2: forbid aclops init and launch. - # Find more details at https://gitee.com/ascend/pytorch/pulls/18094 - # We set this var default to `1` in vllm-ascend to avoid segment fault when - # enable `pin_memory` while creating a tensor using `torch.tensor`. - "VLLM_ASCEND_ACL_OP_INIT_MODE": - lambda: os.getenv("VLLM_ASCEND_ACL_OP_INIT_MODE", '0'), # Some models are optimized by vllm ascend. While in some case, e.g. rlhf # training, the optimized model may not be suitable. In this case, set this # value to False to disable the optimized model. diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 5a45e9e..b9233da 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -27,7 +27,6 @@ from torch.distributed.distributed_c10d import PrefixStore from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum -import vllm_ascend.envs as ascend_envs from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes @@ -39,8 +38,6 @@ else: VllmConfig = None FlexibleArgumentParser = None -os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE - class NPUPlatform(Platform): @@ -188,6 +185,9 @@ class NPUPlatform(Platform): if envs.VLLM_USE_V1: parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" elif vllm_config.speculative_config: + # NOTE: We set this var to `1` in vllm-ascend to avoid a + # segmentation fault when using spec decode with V0 engine. 
+ os.environ["ACL_OP_INIT_MODE"] = "1" parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker" elif vllm_config.scheduler_config.is_multi_step: