[Feat] Shared expert dp for deepseek and deepseek_mtp (#3495)

### What this PR does / why we need it?

Adds shared expert DP support for deepseek and deepseek_mtp. It can be combined with SP to improve performance.

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: zhaozx-cn <zhaozx2116@163.com>
Co-authored-by: realliujiaxu <realliujiaxu@163.com>
2025-10-17 15:06:37 +08:00
parent d9ee491f70
commit bf87606932
9 changed files with 57 additions and 10 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -273,7 +273,7 @@ class NPUPlatform(Platform):
        if parallel_config and parallel_config.worker_cls == "auto":
            # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
            os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
-            if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
+            if ascend_config.torchair_graph_config.enabled:
                parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
            else:
                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
@@ -320,8 +320,6 @@ class NPUPlatform(Platform):
        ascend_config = get_ascend_config()

        if use_mla and ascend_config.enable_shared_expert_dp:
-            if use_mla and not use_sparse:
-                return "vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend"
            if use_mla and use_sparse:
                return "vllm_ascend.torchair.torchair_sfa.AscendSFATorchairBackend"