support deepseek quant & mix-parallel with graphmode (#585)

### What this PR does / why we need it? 1. Support DeepSeek with W8A8 quantization; 2. Support DeepSeek with mix-parallel (multi-DP, EP+TP); 3. Support DeepSeek with graph mode. --------- Signed-off-by: wen-jie666 <wenjie39@huawei.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: wen-jie666 <wenjie39@huawei.com>
2025-04-23 16:23:25 +08:00
parent e74331a1ed
commit 5c6d05a59e
13 changed files with 520 additions and 221 deletions
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -310,21 +310,22 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
+        is_prefill: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        return self.quant_method.apply(layer, x, router_logits, top_k,
                                       renormalize, use_grouped_topk,
-                                       topk_group, num_expert_group,
                                       global_num_experts, expert_map,
+                                       topk_group, num_expert_group,
                                       custom_routing_function, scoring_func,
-                                       e_score_correction_bias)
+                                       e_score_correction_bias, is_prefill)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):