From f1543d5e0d71de4fc4a649c1d20d2d2ac2eb5d7e Mon Sep 17 00:00:00 2001 From: zzzzwwjj <34335947+zzzzwwjj@users.noreply.github.com> Date: Sat, 7 Jun 2025 21:11:36 +0800 Subject: [PATCH] [bugfix] fix deepseek accuracy (#1118) ### What this PR does / why we need it? fix deepseek accuracy in mix-parallel case. Signed-off-by: zzzzwwjj <1183291235@qq.com> --- vllm_ascend/models/deepseek_v2.py | 41 ++++++++++++------------------- vllm_ascend/ops/fused_moe.py | 3 ++- vllm_ascend/platform.py | 6 ++++- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 8a1b8d2..96c7633 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -67,6 +67,7 @@ from vllm.sequence import IntermediateTensors import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor @@ -211,13 +212,15 @@ class CustomDeepseekV2MoE(nn.Module): self.tp_group = get_tp_group().device_group self.tp_rank = get_tp_group().rank_in_group + self.ep_group = get_ep_group() self.params_dtype = torch.get_default_dtype() ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert + ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 def forward( self, @@ -245,16 +248,12 @@ class CustomDeepseekV2MoE(nn.Module): old_hidden_states = hidden_states.clone() if self.tp_size > 1: - if envs_ascend.VLLM_ENABLE_MC2 and not is_prefill: - chunks = torch.chunk(hidden_states, self.tp_size,
dim=0) - hidden_states = chunks[self.tp_rank] - elif not self.torchair_graph_enabled: - num_padding_tokens = (self.tp_size - - num_tokens % self.tp_size) % self.tp_size - # Pad hidden_states to make it divisible by tp_size to avoid cross-ring AllGatherV on 910B2C - if num_padding_tokens > 0: + if (VLLM_ENABLE_MC2 + and not is_prefill) or not (self.torchair_graph_enabled or + self.ep_group.world_size == 1): + if num_tokens < self.tp_size: hidden_states = nn.functional.pad( - hidden_states, (0, 0, 0, num_padding_tokens)) + hidden_states, (0, 0, 0, self.tp_size - num_tokens)) chunk_hidden_states = torch.tensor_split(hidden_states, self.tp_size, dim=0) @@ -284,24 +283,16 @@ class CustomDeepseekV2MoE(nn.Module): hidden_states = hidden_states * self.routed_scaling_factor if self.tp_size > 1: - if self.torchair_graph_enabled: - if envs_ascend.VLLM_ENABLE_MC2 and not is_prefill: - final_hidden_states = torch.zeros( - [num_tokens, hidden_size], - dtype=self.params_dtype, - device="npu") - dist.all_gather_into_tensor(final_hidden_states, - hidden_states, self.tp_group) - hidden_states = final_hidden_states - else: - hidden_states = tensor_model_parallel_all_reduce( - hidden_states) - else: + if (VLLM_ENABLE_MC2 + and not is_prefill) or not (self.torchair_graph_enabled or + self.ep_group.world_size == 1): dist.all_gather(list(chunk_hidden_states), hidden_states, self.tp_group) hidden_states = torch.cat(chunk_hidden_states, dim=0) - if num_padding_tokens > 0: - hidden_states = hidden_states[:-num_padding_tokens] + if num_tokens < self.tp_size: + hidden_states = hidden_states[:num_tokens] + else: + hidden_states = tensor_model_parallel_all_reduce(hidden_states) if self.n_shared_experts is not None: if not multistream: diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 56df04e..c5f3178 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1027,8 +1027,9 @@ class AscendFusedMoE(FusedMoE): ascend_config = get_ascend_config() 
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert + ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index b5798d0..ff9a945 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -142,7 +142,11 @@ class NPUPlatform(Platform): # NOTE: When enable_expert_parallel is True, we follow vLLM convention: # ep_size = world_size, which means expert_tensor_parallel_size must be 1 - if ascend_config.expert_tensor_parallel_size > 0 and not parallel_config.enable_expert_parallel: + if parallel_config.enable_expert_parallel: + parallel_config.expert_tensor_parallel_size = 1 + # NOTE: When enable_expert_parallel is False and param `ascend_config.expert_tensor_parallel_size` + # is configured, use ascend_config + elif ascend_config.expert_tensor_parallel_size > 0: parallel_config.expert_tensor_parallel_size = ascend_config.expert_tensor_parallel_size # Calculate expert parallel size based on world size