[Bugfix] rename enable_flash_comm_v1 back to enable_sp (#6883)

### What this PR does / why we need it?

PR #5632 introduced a bug by replacing some branches gated by enable_sp with enable_flash_comm_v1. As a result, when enable_shared_expert_dp is enabled alone (i.e., VLLM_ASCEND_ENABLE_FLASHCOMM1=0 and VLLM_ASCEND_ENABLE_FLASHCOMM=0), the behavior becomes inconsistent with the previous logic and leads to accuracy issues. This PR restores the original enable_sp-based branching to recover expected behavior and accuracy.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

#### 1. start server

```bash
vllm serve /home/weights/DeepSeek-V2-Lite-W8A8/ \
    --port 8001 \
    --served-model-name auto \
    --max-model-len 1024 \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --gpu-memory-utilization 0.9 \
    --enable-expert-parallel \
    --additional-config '{"enable_shared_expert_dp": true}'
```

#### 2. curl

```bash
curl -s http://localhost:8001/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "auto",
        "messages": [
            {"role": "user", "content": "Hello. I have a question. Who are you?"}
        ],
        "max_tokens": 10,
        "temperature": 0.0,
        "ignore_eos_token": true
    }'
```

- vLLM version: v0.16.0
- vLLM main: 15d76f74e2

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2026-03-01 20:22:50 +08:00
parent 8835236181
commit 5e24b26a54
7 changed files with 24 additions and 29 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -113,8 +113,8 @@ from vllm_ascend.spec_decode.medusa_proposer import MedusaProposer
 from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (
    check_gdn_layer,
-    enable_flash_comm_v1,
    enable_sp,
+    enable_sp_by_pass,
    is_drafter_moe_model,
    is_moe_model,
    lmhead_tp_enable,
@@ -1745,7 +1745,7 @@ class NPUModelRunner(GPUModelRunner):
        # Pad tokens to multiple of tensor_parallel_size when
        # enabled collective fusion for SP
        tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-        if enable_sp(self.vllm_config):
+        if enable_sp(self.vllm_config) or enable_sp_by_pass(self.vllm_config):
            return round_up(num_scheduled_tokens, tp_size)
        return num_scheduled_tokens

@@ -2300,7 +2300,7 @@ class NPUModelRunner(GPUModelRunner):
                # tp_size; otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading
                # to incorrect memory estimation and potentially causing OOM.
                intermediate_tokens = num_tokens_padded
-                if enable_flash_comm_v1():
+                if enable_sp():
                    tp_size = get_tensor_model_parallel_world_size()
                    intermediate_tokens = (num_tokens_padded + tp_size - 1) // tp_size
                if self.intermediate_tensors is None:
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -55,7 +55,7 @@ from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton
 from vllm_ascend.utils import (
    AscendDeviceType,
    check_ascend_device_type,
-    enable_flash_comm_v1,
+    enable_sp,
    get_ascend_device_type,
    register_ascend_customop,
 )
@@ -376,7 +376,7 @@ class NPUWorker(WorkerBase):
        if forward_pass and not get_pp_group().is_first_rank:
            # If flashcomm1 is used, this all_gather_group parameter needs to be removed, otherwise
            # it will conflict with the all-gather operation in flashcomm1.
-            if enable_flash_comm_v1():
+            if enable_sp():
                all_gather_group = None
            else:
                all_gather_group = get_tp_group()
@@ -393,7 +393,7 @@ class NPUWorker(WorkerBase):
        assert parallel_config.distributed_executor_backend != ("external_launcher") and not get_pp_group().is_last_rank
        # If flashcomm1 is used, this all_gather_group parameter needs to be removed, otherwise
        # it will conflict with the all-gather operation in flashcomm1.
-        if enable_flash_comm_v1():
+        if enable_sp():
            all_gather_group = None
        else:
            all_gather_group = get_tp_group()