[Patch] Patch the v1 executor when EPLB is enabled. (#3511)

### What this PR does / why we need it? When dynamic EPLB is used, patch the v1 executor so that creating child processes does not fail. ### How was this patch tested? Tested with DeepSeek in v3. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: offline0806 <3337230449@qq.com> Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-19 10:54:26 +08:00
parent 646c1db5d7
commit 6c9909c861
5 changed files with 192 additions and 10 deletions
--- a/vllm_ascend/torchair/ops/torchair_fused_moe.py
+++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py
@@ -1089,7 +1089,8 @@ class TorchairAscendFusedMoE(FusedMoE):
        local_num_experts = (torch.sum(self.expert_map != -1)
                             if self.expert_map is not None else num_experts)
        if self.dynamic_eplb:
-            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)
+            self.moe_load = torch.zeros(local_num_experts,
+                                        dtype=torch.int64).npu()

        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
        self.multistream_overlap_shared_expert = \
@@ -1311,17 +1312,26 @@ class TorchairAscendFusedMoE(FusedMoE):
                          tuple) and len(e_hidden_states) == 2:
                e_hidden_states, shared_hidden_states = e_hidden_states

-        if self.dynamic_eplb and isinstance(
+        if isinstance(e_hidden_states, tuple) and len(e_hidden_states) == 4:
+            e_hidden_states, shared_hidden_states, group_list_type, expert_tokens = e_hidden_states
+            if self.dynamic_eplb:
+                self.moe_load += expert_tokens if group_list_type else \
+                torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+
+        if shared_experts is None and isinstance(
                e_hidden_states, tuple) and len(e_hidden_states) == 3:
            e_hidden_states, group_list_type, expert_tokens = e_hidden_states
-            self.moe_load += expert_tokens if group_list_type else \
-                torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+            if self.dynamic_eplb:
+                self.moe_load += expert_tokens if group_list_type else \
+                    torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])

        if (fused_moe_state not in [
                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
                FusedMoEState.NaiveMulticast
        ] and not replace_allreduce and not self.enable_shared_expert_dp):
            if tp_size > 1:
+                if isinstance(e_hidden_states, tuple):
+                    e_hidden_states = e_hidden_states[0]
                dist.all_gather(list(chunk_hidden_states), e_hidden_states,
                                self.tp_group)
                final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
--- a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
@@ -365,17 +365,18 @@ def torchair_fused_experts_with_mc2(
    ) if enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine(
        **kwargs_mc2)

-    if dynamic_eplb:
-        return (hidden_states, 1, expert_token_nums)
-
    if shared_experts is None:
+        if dynamic_eplb:
+            return (hidden_states, 1, expert_token_nums)
        return hidden_states
    else:
        with npu_stream_switch("moe_secondary", 0):
            npu_wait_tensor(shared_act, down_out_list)
            shared_output, _ = shared_experts.down_proj(
                (shared_act, swiglu_out_scale))
-        return hidden_states, shared_output
+        if dynamic_eplb:
+            return (hidden_states, shared_output, 1, expert_token_nums)
+        return (hidden_states, shared_output)


 def torchair_init_routing_quant(hidden_states, top_k, topk_ids,