[Bugfix] rename enable_flash_comm_v1 back to enable_sp (#6883)
### What this PR does / why we need it?
PR #5632 introduced a regression by replacing several branches gated by `enable_sp` with `enable_flash_comm_v1`. As a result, when `enable_shared_expert_dp` is enabled on its own (i.e., `VLLM_ASCEND_ENABLE_FLASHCOMM1=0` and `VLLM_ASCEND_ENABLE_FLASHCOMM=0`), the branching diverges from the previous logic and causes accuracy issues. This PR restores the original `enable_sp`-based branching to recover the expected behavior and accuracy.
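
The gist of the regression can be sketched with two toy predicates. The names mirror the helpers in the diff, but their bodies here are simplifying assumptions for illustration only, not the actual `vllm_ascend.utils` implementations:

```python
# Hedged sketch of the regression described above. The predicate bodies are
# ASSUMPTIONS for illustration; the real helpers live in vllm_ascend.utils.

def enable_flash_comm_v1(flashcomm_v1_env: bool) -> bool:
    # Assumed: gated purely by the VLLM_ASCEND_ENABLE_FLASHCOMM1 env var.
    return flashcomm_v1_env

def enable_sp(flashcomm_v1_env: bool, shared_expert_dp: bool) -> bool:
    # Assumed: sequence parallelism is also required when shared-expert DP
    # is enabled, even if the FLASHCOMM env vars are off.
    return flashcomm_v1_env or shared_expert_dp

# Scenario from the PR: enable_shared_expert_dp=True, both FLASHCOMM vars = 0.
buggy = enable_flash_comm_v1(False)   # branch skipped -> accuracy issue
fixed = enable_sp(False, True)        # original branching restored
print(buggy, fixed)
```

Under that assumed semantics, the buggy gate skips the sequence-parallel path that `enable_shared_expert_dp` depends on, which matches the accuracy symptom reported above.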
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
#### 1. start server
```bash
vllm serve /home/weights/DeepSeek-V2-Lite-W8A8/ \
--port 8001 \
--served-model-name auto \
--max-model-len 1024 \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--gpu-memory-utilization 0.9 \
--enable-expert-parallel \
--additional-config '{"enable_shared_expert_dp": true}'
```
#### 2. curl
```bash
curl -s http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "auto",
"messages": [
{"role": "user", "content": "Hello. I have a question. Who are you?"}
],
"max_tokens": 10,
"temperature": 0.0,
"ignore_eos_token": true
}'
```
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
```diff
@@ -12,7 +12,7 @@ import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import (
     AscendDeviceType,
-    enable_flash_comm_v1,
+    enable_sp,
     flashcomm2_enable,
     get_ascend_device_type,
     has_layer_idx,
@@ -92,14 +92,14 @@ def set_ascend_forward_context(
     # main model and drafter model may have different architecture
     is_context_moe_model = is_drafter_moe_model(vllm_config) if is_draft_model else is_moe_model(vllm_config)
     if is_context_moe_model:
-        flash_comm_v1_enabled = enable_flash_comm_v1() and num_tokens is not None
+        flash_comm_v1_enabled = enable_sp(vllm_config) and num_tokens is not None
         mmrs_fusion = False
     elif is_draft_model:
         # TODO: for dense drafter, `sp` is redundant and is not compatible with `dp` and `graph`.
         # Disable it to avoid more problems.
         flash_comm_v1_enabled = False
     else:
-        flash_comm_v1_enabled = enable_flash_comm_v1() and num_tokens is not None and num_tokens > 1000
+        flash_comm_v1_enabled = enable_sp(vllm_config) and num_tokens is not None and num_tokens > 1000
     forward_context.mmrs_fusion = mmrs_fusion
     forward_context.num_tokens = num_tokens
     forward_context.flash_comm_v1_enabled = flash_comm_v1_enabled
```