[Bugfix] Fix deepseek precision issue and add acc ci for it (#905)

### What this PR does / why we need it?
Fix deepseek precision issue on V0 and add acc ci for it
Fixes https://github.com/vllm-project/vllm-ascend/issues/1062
### How was this patch tested?
CI passed with new added test.

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-06-04 20:26:44 +08:00
committed by GitHub
parent da9acfca60
commit afc4c0cd03
9 changed files with 121 additions and 43 deletions

View File

@@ -629,6 +629,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
is_prefill: bool = False,
enable_force_load_balance: bool = False,
**kwargs,
):
# NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
@@ -660,6 +661,13 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
e_score_correction_bias=e_score_correction_bias,
)
topk_weights = topk_weights.to(x.dtype)
# this is a naive implementation for experts load balance so as
# to avoid accumulating too much tokens on a single rank.
# currently it is only activated when doing profile runs.
if enable_force_load_balance:
topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
if VLLM_ENABLE_MC2 and not is_prefill:
return fused_experts_with_mc2(
hidden_states=x,