[Bugfix] rename enable_flash_comm_v1 back to enable_sp (#6883)
### What this PR does / why we need it?
PR #5632 introduced a bug by replacing some branches gated by enable_sp
with enable_flash_comm_v1. As a result, when enable_shared_expert_dp is
enabled alone (i.e., VLLM_ASCEND_ENABLE_FLASHCOMM1=0 and
VLLM_ASCEND_ENABLE_FLASHCOMM=0), the behavior becomes inconsistent with
the previous logic and leads to accuracy issues. This PR restores the
original enable_sp-based branching to recover expected behavior and
accuracy.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
#### 1. Start the server
```bash
vllm serve /home/weights/DeepSeek-V2-Lite-W8A8/ \
--port 8001 \
--served-model-name auto \
--max-model-len 1024 \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--gpu-memory-utilization 0.9 \
--enable-expert-parallel \
--additional-config '{"enable_shared_expert_dp": true}'
```
#### 2. Send a test request with curl
```bash
curl -s http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "auto",
"messages": [
{"role": "user", "content": "Hello. I have a question. Who are you?"}
],
"max_tokens": 10,
"temperature": 0.0,
"ignore_eos_token": true
}'
```
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -113,8 +113,8 @@ from vllm_ascend.spec_decode.medusa_proposer import MedusaProposer
|
||||
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
|
||||
from vllm_ascend.utils import (
|
||||
check_gdn_layer,
|
||||
enable_flash_comm_v1,
|
||||
enable_sp,
|
||||
enable_sp_by_pass,
|
||||
is_drafter_moe_model,
|
||||
is_moe_model,
|
||||
lmhead_tp_enable,
|
||||
@@ -1745,7 +1745,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
# Pad tokens to multiple of tensor_parallel_size when
|
||||
# enabled collective fusion for SP
|
||||
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
|
||||
if enable_sp(self.vllm_config):
|
||||
if enable_sp(self.vllm_config) or enable_sp_by_pass(self.vllm_config):
|
||||
return round_up(num_scheduled_tokens, tp_size)
|
||||
return num_scheduled_tokens
|
||||
|
||||
@@ -2300,7 +2300,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
# tp_size; otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading
|
||||
# to incorrect memory estimation and potentially causing OOM.
|
||||
intermediate_tokens = num_tokens_padded
|
||||
if enable_flash_comm_v1():
|
||||
if enable_sp():
|
||||
tp_size = get_tensor_model_parallel_world_size()
|
||||
intermediate_tokens = (num_tokens_padded + tp_size - 1) // tp_size
|
||||
if self.intermediate_tensors is None:
|
||||
|
||||
@@ -55,7 +55,7 @@ from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton
|
||||
from vllm_ascend.utils import (
|
||||
AscendDeviceType,
|
||||
check_ascend_device_type,
|
||||
enable_flash_comm_v1,
|
||||
enable_sp,
|
||||
get_ascend_device_type,
|
||||
register_ascend_customop,
|
||||
)
|
||||
@@ -376,7 +376,7 @@ class NPUWorker(WorkerBase):
|
||||
if forward_pass and not get_pp_group().is_first_rank:
|
||||
# If flashcomm1 is used, this all_gather_group parameter needs to be removed, otherwise
|
||||
# it will conflict with the all-gather operation in flashcomm1.
|
||||
if enable_flash_comm_v1():
|
||||
if enable_sp():
|
||||
all_gather_group = None
|
||||
else:
|
||||
all_gather_group = get_tp_group()
|
||||
@@ -393,7 +393,7 @@ class NPUWorker(WorkerBase):
|
||||
assert parallel_config.distributed_executor_backend != ("external_launcher") and not get_pp_group().is_last_rank
|
||||
# If flashcomm1 is used, this all_gather_group parameter needs to be removed, otherwise
|
||||
# it will conflict with the all-gather operation in flashcomm1.
|
||||
if enable_flash_comm_v1():
|
||||
if enable_sp():
|
||||
all_gather_group = None
|
||||
else:
|
||||
all_gather_group = get_tp_group()
|
||||
|
||||
Reference in New Issue
Block a user