[Bugfix][Spec Decode] Enable ACL_OP_INIT_MODE=1 directly only when using V0 spec decode (#1258)
### What this PR does / why we need it? Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode. See **mengwei805**'s comment in https://github.com/vllm-project/vllm-ascend/pull/1123 for more details. ### Does this PR introduce _any_ user-facing change? The user will not be aware of `VLLM_ASCEND_ACL_OP_INIT_MODE` (`ACL_OP_INIT_MODE`). ### How was this patch tested? Test scripts: ```python from vllm import LLM, SamplingParams prompts = [ "The future of AI is", ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", tensor_parallel_size=1, speculative_config={ "method": "ngram", "num_speculative_tokens": 5, "prompt_lookup_max": 4, }, ) outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` Results: ``` Adding requests: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 76.70it/s] Processed prompts: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.33it/s, est. speed input: 6.64 toks/s, output: 21.26 toks/s] Prompt: 'The future of AI is', Generated text: ' bright\n\n04/15/2020\n\nBy: James' ``` --------- Signed-off-by: shen-shanshan <467638484@qq.com>
This commit is contained in:
@@ -111,15 +111,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# 1: enable moe_all2all_buffer.
|
||||
"MOE_ALL2ALL_BUFFER":
|
||||
lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))),
|
||||
# VLLM_ASCEND_ACL_OP_INIT_MODE:
|
||||
# 0: default, normal init.
|
||||
# 1: delay init until launch aclops.
|
||||
# 2: forbid aclops init and launch.
|
||||
# Find more details at https://gitee.com/ascend/pytorch/pulls/18094
|
||||
# We set this var default to `1` in vllm-ascend to avoid segment fault when
|
||||
# enable `pin_memory` while creating a tensor using `torch.tensor`.
|
||||
"VLLM_ASCEND_ACL_OP_INIT_MODE":
|
||||
lambda: os.getenv("VLLM_ASCEND_ACL_OP_INIT_MODE", '0'),
|
||||
# Some models are optimized by vllm ascend. While in some case, e.g. rlhf
|
||||
# training, the optimized model may not be suitable. In this case, set this
|
||||
# value to False to disable the optimized model.
|
||||
|
||||
@@ -27,7 +27,6 @@ from torch.distributed.distributed_c10d import PrefixStore
|
||||
from vllm.logger import logger
|
||||
from vllm.platforms import Platform, PlatformEnum
|
||||
|
||||
import vllm_ascend.envs as ascend_envs
|
||||
from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
|
||||
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
|
||||
|
||||
@@ -39,8 +38,6 @@ else:
|
||||
VllmConfig = None
|
||||
FlexibleArgumentParser = None
|
||||
|
||||
os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE
|
||||
|
||||
|
||||
class NPUPlatform(Platform):
|
||||
|
||||
@@ -188,6 +185,9 @@ class NPUPlatform(Platform):
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
|
||||
elif vllm_config.speculative_config:
|
||||
# NOTE: We set this var to `1` in vllm-ascend to avoid segment
|
||||
# fault when using spec decode with V0 engine.
|
||||
os.environ["ACL_OP_INIT_MODE"] = "1"
|
||||
parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
|
||||
parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
|
||||
elif vllm_config.scheduler_config.is_multi_step:
|
||||
|
||||
Reference in New Issue
Block a user