[Bugfix] rename enable_flash_comm_v1 back to enable_sp (#6883)

### What this PR does / why we need it?

PR #5632 introduced a bug by replacing some branches gated by enable_sp with enable_flash_comm_v1. As a result, when enable_shared_expert_dp is enabled alone (i.e., VLLM_ASCEND_ENABLE_FLASHCOMM1=0 and VLLM_ASCEND_ENABLE_FLASHCOMM=0), the behavior becomes inconsistent with the previous logic and leads to accuracy issues.

This PR restores the original enable_sp-based branching to recover expected behavior and accuracy.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

#### 1. start server

```bash
vllm serve /home/weights/DeepSeek-V2-Lite-W8A8/ \
  --port 8001 \
  --served-model-name auto \
  --max-model-len 1024 \
  --enforce-eager \
  --tensor-parallel-size 2 \
  --data-parallel-size 2 \
  --gpu-memory-utilization 0.9 \
  --enable-expert-parallel \
  --additional-config '{"enable_shared_expert_dp": true}'
```

#### 2. curl

```bash
curl -s http://localhost:8001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "auto",
    "messages": [
      {"role": "user", "content": "Hello. I have a question. Who are you?"}
    ],
    "max_tokens": 10,
    "temperature": 0.0,
    "ignore_eos_token": true
  }'
```

- vLLM version: v0.16.0
- vLLM main: 15d76f74e2

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2026-03-01 20:22:50 +08:00
parent 8835236181
commit 5e24b26a54
7 changed files with 24 additions and 29 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -719,15 +719,6 @@ def matmul_allreduce_enable() -> bool:
    return envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE


-def enable_flash_comm_v1():
-    return (
-        envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
-        # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
-        # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
-        or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", "0")))
-    )
-
-
 def enable_sp_by_pass(vllm_config: VllmConfig):
    return not vllm_config.model_config.enforce_eager and vllm_config.compilation_config.pass_config.enable_sp

@@ -739,7 +730,12 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
            from vllm.config import get_current_vllm_config

            vllm_config = get_current_vllm_config()
-        _ENABLE_SP = enable_sp_by_pass(vllm_config) or enable_flash_comm_v1()
+        _ENABLE_SP = (
+            envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
+            # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
+            # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
+            or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", "0")))
+        )

        if not _ENABLE_SP and enable_shared_expert_dp:
            _ENABLE_SP = True
@@ -1104,7 +1100,7 @@ def enable_dsa_cp() -> bool:
    is_ds_v32 = hasattr(vllm_config.model_config, "hf_text_config") and hasattr(
        vllm_config.model_config.hf_text_config, "index_topk"
    )
-    return bool(is_ds_v32 and enable_flash_comm_v1())
+    return bool(is_ds_v32 and enable_sp())


@lru_cache(maxsize=1)