From 2cd8ecdc4f1d9ee5e24a0fa70a2b4089785f8837 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Wed, 18 Jun 2025 17:50:20 +0800
Subject: [PATCH] [Bugfix][Spec Decode] Enable `ACL_OP_INIT_MODE=1` directly
 only when using V0 spec decode (#1258)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?

Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode.

For more details, see **mengwei805**'s comment in
https://github.com/vllm-project/vllm-ascend/pull/1123.

### Does this PR introduce _any_ user-facing change?

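Yes. The `VLLM_ASCEND_ACL_OP_INIT_MODE` environment variable is removed, so
users no longer see or configure it. `ACL_OP_INIT_MODE` is instead set to `1`
internally, and only when spec decode is used with the V0 engine.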

### How was this patch tested?

Test script:

```python
# Smoke test: offline text generation with n-gram speculative decoding
# enabled through `speculative_config`.
# NOTE(review): the script does not select the engine version itself;
# presumably it was run with the V0 engine so that the patched spec-decode
# branch is exercised -- confirm.
from vllm import LLM, SamplingParams

# A single short prompt is enough to drive one generate() call.
prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Passing a `speculative_config` is what turns speculative decoding on.
llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    tensor_parallel_size=1,
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 5,
        "prompt_lookup_max": 4,
    },
)
outputs = llm.generate(prompts, sampling_params)

# Print each prompt with the text of its first returned completion.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

Results:

```
Adding requests: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 76.70it/s]
Processed prompts: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.33it/s, est. speed input: 6.64 toks/s, output: 21.26 toks/s]
Prompt: 'The future of AI is', Generated text: ' bright\n\n04/15/2020\n\nBy: James'
```

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/envs.py     | 9 ---------
 vllm_ascend/platform.py | 6 +++---
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 74e9c19..02ecd66 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -111,15 +111,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     #   1: enable moe_all2all_buffer.
     "MOE_ALL2ALL_BUFFER":
     lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))),
-    # VLLM_ASCEND_ACL_OP_INIT_MODE:
-    #   0: default, normal init.
-    #   1: delay init until launch aclops.
-    #   2: forbid aclops init and launch.
-    # Find more details at https://gitee.com/ascend/pytorch/pulls/18094
-    # We set this var default to `1` in vllm-ascend to avoid segment fault when
-    # enable `pin_memory` while creating a tensor using `torch.tensor`.
-    "VLLM_ASCEND_ACL_OP_INIT_MODE":
-    lambda: os.getenv("VLLM_ASCEND_ACL_OP_INIT_MODE", '0'),
     # Some models are optimized by vllm ascend. While in some case, e.g. rlhf
     # training, the optimized model may not be suitable. In this case, set this
     # value to False to disable the optimized model.
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 5a45e9e..b9233da 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -27,7 +27,6 @@ from torch.distributed.distributed_c10d import PrefixStore
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
 
-import vllm_ascend.envs as ascend_envs
 from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
 from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
 
@@ -39,8 +38,6 @@ else:
     VllmConfig = None
     FlexibleArgumentParser = None
 
-os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE
-
 
 class NPUPlatform(Platform):
 
@@ -188,6 +185,9 @@ class NPUPlatform(Platform):
             if envs.VLLM_USE_V1:
                 parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
             elif vllm_config.speculative_config:
+                # NOTE: We set this var to `1` in vllm-ascend to avoid segment
+                # fault when using spec decode with V0 engine.
+                os.environ["ACL_OP_INIT_MODE"] = "1"
                 parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                 parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
             elif vllm_config.scheduler_config.is_multi_step: