[perf] Support MoE multi-stream in DeepSeek (#947)

### What this PR does / why we need it?
Support inner multi-stream execution for the MoE layers in DeepSeek.
This feature requires graph mode with MC2 enabled.
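As a rough conceptual analogy only (not the actual NPU implementation), multi-stream MoE execution overlaps two independent pieces of work, for example running the shared expert concurrently with the routed-expert compute, then combining the results. The CPU sketch below emulates the two streams with threads; all function names are illustrative and do not come from this repository:

```python
# Conceptual CPU analogy of MoE multi-stream overlap using threads.
# Real streams are hardware command queues (NPU/CUDA streams); the
# names and the combine rule here are illustrative assumptions.
from concurrent.futures import ThreadPoolExecutor

def routed_experts(x):
    # Stand-in for the dispatched routed-expert compute.
    return [v * 2 for v in x]

def shared_expert(x):
    # Stand-in for the always-active shared expert.
    return [v + 1 for v in x]

def moe_forward(x):
    with ThreadPoolExecutor(max_workers=2) as pool:
        routed = pool.submit(routed_experts, x)   # "stream 1"
        shared = pool.submit(shared_expert, x)    # "stream 2"
        # Join both "streams" and combine, like a stream-sync event
        # followed by an element-wise add.
        return [r + s for r, s in zip(routed.result(), shared.result())]

print(moe_forward([1, 2, 3]))  # [4, 7, 10]
```

The point of the analogy: the two submissions are independent, so a second execution queue can hide one behind the other instead of serializing them.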

---------

Signed-off-by: David9857 <985700846@qq.com>
Author: David9857
Date: 2025-06-05 23:39:38 +08:00
Committed by: GitHub
Parent: 908a851a77
Commit: 78431b3469
6 changed files with 133 additions and 45 deletions


@@ -329,7 +329,7 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
             layer, x, router_logits, top_k, renormalize, use_grouped_topk,
             global_num_experts, expert_map, topk_group, num_expert_group,
             custom_routing_function, scoring_func, e_score_correction_bias,
-            is_prefill, enable_force_load_balance)
+            is_prefill, enable_force_load_balance, **kwargs)

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if hasattr(self.quant_method, "process_weights_after_loading"):
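The hunk above forwards `**kwargs` into the inner `apply` call, so optional keyword flags (such as a multi-stream toggle) can reach the underlying quant method without changing every intermediate signature. A minimal, hypothetical sketch of that forwarding pattern (class and flag names are illustrative, not from the repository):

```python
# Hypothetical sketch of forwarding **kwargs through a layered apply()
# chain. None of these names come from vllm-ascend; they only show why
# the diff appends ", **kwargs" to the inner call.

class QuantMethod:
    def apply(self, x, *, enable_multistream=False):
        # The innermost implementation is the one that actually
        # consumes the optional flag.
        return ("multistream" if enable_multistream else "single", x * 2)

class FusedMoEMethod:
    def __init__(self):
        self.quant_method = QuantMethod()

    def apply(self, x, **kwargs):
        # Forward unknown keyword args verbatim instead of enumerating
        # them, so adding a new flag does not force a signature change
        # at this layer.
        return self.quant_method.apply(x, **kwargs)

moe = FusedMoEMethod()
print(moe.apply(3))                           # ('single', 6)
print(moe.apply(3, enable_multistream=True))  # ('multistream', 6)
```

Without the forwarding, a caller-supplied flag would be silently dropped at this layer; with it, new options only need changes at the call site and the innermost method.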