[Quickfix] Fix dp+ep+tp error when sp chunked the hidden_states (#3246)
### What this PR does / why we need it?
Fix dp+ep+tp inplace copy error when sp chunked the `hidden_states`.
### How was this patch tested?
Tested locally with the following script:
```bash
python examples/offline_data_parallel.py \
--model="Qwen/Qwen3-30B-A3B" \
--dp-size=2 \
--tp-size=2 \
--enable-expert-parallel
```
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
```diff
@@ -295,6 +295,7 @@ class AscendFusedMoE(FusedMoE):
             in_dtype=params_dtype,
         )
         self.moe_config = moe
+        # TODO: The self.moe_config.tp_size here is not correct, fixme soon

         if quant_config is None:
             self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
```
```diff
@@ -16,6 +16,7 @@
 #

 import gc
+import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple

```
```diff
@@ -260,6 +261,8 @@ class NPUPlatform(Platform):
             compilation_config.level = CompilationLevel.NO_COMPILATION

         if parallel_config and parallel_config.worker_cls == "auto":
+            # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
+            os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
             if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
                 parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
             else:
```
Reference in New Issue
Block a user