init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -19,13 +19,9 @@ import os
 from typing import Any, Callable, Optional

 import torch
-import torch.distributed as dist
 import torch_npu
-from torch import nn
 from vllm.config import get_current_vllm_config
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
                                             get_tp_group)
 from vllm.forward_context import get_forward_context
@@ -39,70 +35,16 @@ from vllm.model_executor.layers.quantization.base_config import \
    QuantizationConfig

 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.ascend_forward_context import FusedMoEState
-from vllm_ascend.distributed.communication_op import \
-    data_parallel_reduce_scatter
 from vllm_ascend.distributed.parallel_state import get_mc2_group
+from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
+                                              determine_default_log2phy_map)
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
-from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
-from vllm_ascend.ops.sequence_parallel import MetadataForPadding
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, dispose_tensor,
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                               get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p)
-
-
-def unified_fused_experts_eager(hidden_states: torch.Tensor,
-                                w1: torch.Tensor,
-                                w2: torch.Tensor,
-                                topk_weights: torch.Tensor,
-                                topk_ids: torch.Tensor,
-                                row_idx: torch.Tensor,
-                                expert_map: Optional[torch.Tensor] = None,
-                                log2phy: Optional[torch.Tensor] = None,
-                                global_redundant_expert_num: int = 0,
-                                w1_scale: Optional[torch.Tensor] = None,
-                                w1_scale_bias: Optional[torch.Tensor] = None,
-                                w2_scale: Optional[torch.Tensor] = None,
-                                w2_scale_bias: Optional[torch.Tensor] = None,
-                                shared_experts: Optional[torch.Tensor] = None,
-                                shared_gate_up: Optional[Any] = None,
-                                shared_dequant_scale: Optional[Any] = None,
-                                mc2_mask: Optional[torch.Tensor] = None,
-                                apply_router_weight_on_input: bool = False,
-                                with_quant: bool = False):
-    token_dispatcher = get_forward_context().token_dispatcher
-
-    results = token_dispatcher.token_dispatch(
-        hidden_states=hidden_states,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        row_idx=row_idx,
-        expert_map=expert_map,
-        log2phy=log2phy,
-        global_redundant_expert_num=global_redundant_expert_num,
-        shared_experts=shared_experts,
-        shared_gate_up=shared_gate_up,
-        shared_dequant_scale=shared_dequant_scale,
-        mc2_mask=mc2_mask,
-        apply_router_weight_on_input=apply_router_weight_on_input,
-        with_quant=with_quant)
-
-    expert_output = unified_apply_mlp(
-        hidden_states=results["hidden_states"],
-        w1=w1,
-        w1_scale=w1_scale,
-        w2=w2,
-        w2_scale=w2_scale,
-        group_list=results["group_list"],
-        dynamic_scale=results.get("dynamic_scale"),
-        group_list_type=results.get("group_list_type"),
-        w1_scale_bias=w1_scale_bias,
-        w2_scale_bias=w2_scale_bias,
-        topk_scales=results.get("topk_scales"),
-        with_quant=with_quant)
-    final_hidden_states = token_dispatcher.token_combine(expert_output)
-    return final_hidden_states
+                               get_rm_router_logits_state, is_310p,
+                               vllm_version_is)


 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -115,6 +57,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
        self.max_model_len = vllm_config.model_config.max_model_len
        get_ascend_config()
+        self.dynamic_eplb = get_ascend_config().dynamic_eplb

        try:
            device_group = get_mc2_group().device_group
@@ -182,17 +125,19 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        if enable_force_load_balance and not self.use_aclgraph:
            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

-        return unified_fused_experts_eager(hidden_states=x,
-                                           w1=layer.w13_weight,
-                                           w2=layer.w2_weight,
-                                           topk_weights=topk_weights,
-                                           topk_ids=topk_ids,
-                                           row_idx=row_idx,
-                                           expert_map=expert_map,
-                                           shared_experts=shared_experts,
-                                           mc2_mask=kwargs.get(
-                                               "mc2_mask", None),
-                                           with_quant=False)
+        moe_comm_method = get_forward_context().moe_comm_method
+        return moe_comm_method.fused_experts(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            shared_experts=shared_experts,
+            need_trans=True,
+            dynamic_eplb=self.dynamic_eplb)


 class AscendFusedMoE(FusedMoE):
@@ -290,42 +235,67 @@ class AscendFusedMoE(FusedMoE):
            self.moe_parallel_config.ep_size, is_deepseek_v3_r1)

        ascend_config = get_ascend_config()
-        expert_map_path = ascend_config.expert_map_path
-        if expert_map_path and os.path.exists(expert_map_path):
-            # moe expert load balance
-            expert_load_balancer = ExpertLoadBalancer(expert_map_path,
-                                                      self.global_num_experts)
-            self.local_num_experts, self.expert_map = \
-                                expert_load_balancer.get_rank_placement_map(
-                                                self.moe_instance_id,
-                                                get_ep_group().rank_in_group)
-            self.log2phy = expert_load_balancer.get_rank_log2phy_map(
-                self.moe_instance_id,
-                get_ep_group().rank_in_group)
-            self.global_redundant_expert_num = \
-                        expert_load_balancer.get_global_redundant_expert_num()
+        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.expert_map_path = ascend_config.expert_map_path
+        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+        self.global_num_experts = num_experts + self.global_redundant_expert_num
+        # static eplb initializing with expert_map_path
+        if self.expert_map_path and os.path.exists(
+                self.expert_map_path) and os.access(self.expert_map_path,
+                                                    os.R_OK):
+            self.expert_load_balancer = ExpertLoadBalancer(
+                self.expert_map_path, self.global_num_experts)
+            self.local_num_experts, self.expert_map = (
+                self.expert_load_balancer.get_rank_placement_map(
+                    self.moe_instance_id, self.ep_rank))
+            self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
+                self.moe_instance_id, self.ep_rank).npu()
+            self.global_redundant_expert_num = (
+                self.expert_load_balancer.get_global_redundant_expert_num())
        else:
-            # Create a tensor of size num_experts filled with -1
+            # init moe.
            self.local_num_experts, self.expert_map = determine_expert_map(
-                self.ep_size,
-                get_ep_group().rank_in_group, self.global_num_experts)
+                self.ep_size, self.ep_rank, self.global_num_experts)
+            # dynamic eplb initializing with not expert_map_path
+            if self.dynamic_eplb:
+                self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+                self.local_num_experts, self.expert_map = determine_default_expert_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+                self.log2phy = determine_default_log2phy_map(
+                    self.global_num_experts, self.ep_size, self.ep_rank,
+                    self.global_redundant_expert_num)
+        local_num_experts = (torch.sum(self.expert_map != -1)
+                             if self.expert_map is not None else num_experts)
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)

        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
-        moe = FusedMoEConfig.make(
-            num_experts=self.global_num_experts,
-            experts_per_token=top_k,
-            hidden_dim=hidden_size,
-            num_local_experts=self.local_num_experts,
-            moe_parallel_config=self.moe_parallel_config,
-            # TODO (bnell): this needs to be fixed for quantized types.
-            in_dtype=params_dtype,
-            quant_config=quant_config)
-
+        if vllm_version_is("0.10.2"):
+            moe = FusedMoEConfig.make(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                # TODO (bnell): this needs to be fixed for quantized types.
+                in_dtype=params_dtype,
+                quant_config=quant_config)
+        else:
+            moe = FusedMoEConfig(
+                num_experts=self.global_num_experts,
+                experts_per_token=top_k,
+                hidden_dim=hidden_size,
+                num_local_experts=self.local_num_experts,
+                moe_parallel_config=self.moe_parallel_config,
+                in_dtype=params_dtype,
+            )
        self.moe_config = moe
+        # TODO: The self.moe_config.tp_size here is not correct, fixme soon

        if quant_config is None:
            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
@@ -337,6 +307,11 @@ class AscendFusedMoE(FusedMoE):
        local_num_experts = torch.sum(self.expert_map != -1) \
            if self.expert_map is not None else num_experts

+        self.moe_load = None
+
+        if self.dynamic_eplb:
+            self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64)
+
        moe_quant_params = {
            "num_experts": local_num_experts,
            "hidden_size": hidden_size,
@@ -354,34 +329,27 @@ class AscendFusedMoE(FusedMoE):
        # NOTE: self.tp_group is not expert_tp_group
        self.tp_group = get_tp_group().device_group
        self.quant_method.create_weights(layer=self, **moe_quant_params)
-        self.token_dispatcher = None

-        ep_size = (get_ep_group().world_size if
-                   vllm_config.parallel_config.enable_expert_parallel else 1)
-        from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
-            setup_token_dispatchers
-        setup_token_dispatchers(
-            ep_size,
-            top_k=self.top_k,
-            num_experts=self.global_num_experts,
-            num_global_redundant_experts=self.global_redundant_expert_num,
-            num_local_experts=self.local_num_experts)
+        self.moe_config.tp_group = get_tp_group()
+        self.moe_config.dp_group = get_dp_group()
+        self.moe_config.ep_group = get_ep_group()
+        self.moe_config.mc2_group = get_mc2_group()
+        self.moe_config.num_global_redundant_experts = self.global_redundant_expert_num

-    def naive_multicast(self, x: torch.Tensor,
-                        cu_tokens_across_dp_cpu: torch.Tensor):
-        assert (len(x.shape) == 2)
-        buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
-                             device=x.device,
-                             dtype=x.dtype)
-        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-            self.dp_rank - 1]
-        end = cu_tokens_across_dp_cpu[self.dp_rank]
-        buffer[start:end, :].copy_(x)
-        for idx in range(self.dp_size):
-            start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
-            end = cu_tokens_across_dp_cpu[idx]
-            get_dp_group().broadcast(buffer[start:end, :], idx)
-        return buffer
+        setup_moe_comm_method(self.moe_config)
+
+    def update_expert_map(self, new_expert_map):
+        self.expert_map = new_expert_map
+
+    def get_map(self):
+        return self.expert_map
+
+    def get_log2phy_map(self):
+        return self.logical_to_physical_map
+
+    def clear_moe_load(self):
+        if self.moe_load is not None:
+            self.moe_load.zero_()

    def forward(self,
                hidden_states: torch.Tensor,
@@ -391,8 +359,7 @@ class AscendFusedMoE(FusedMoE):
                top_k: Optional[int] = None,
                shared_experts: Optional[Any] = None,
                gate=None,
-                replace_allreduce: bool = False,
-                _metadata_for_padding: Optional[MetadataForPadding] = None):
+                replace_allreduce: bool = False):

        assert self.quant_method is not None

@@ -401,10 +368,7 @@ class AscendFusedMoE(FusedMoE):
        else:
            real_top_k = self.top_k

-        num_tokens, hidden_size = hidden_states.shape
-
        forward_context = get_forward_context()
-        fused_moe_state = forward_context.fused_moe_state
        mc2_mask = forward_context.mc2_mask
        # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
        quantized_x_for_share, dynamic_scale_for_share = None, None
@@ -413,74 +377,16 @@ class AscendFusedMoE(FusedMoE):
            # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
            shared_hidden_states = shared_experts(hidden_states)

-        mc2_mask = forward_context.mc2_mask
-
-        enable_sp = _metadata_for_padding is not None and _metadata_for_padding.not_dummy_and_is_prefill
-        tp_size = get_tensor_model_parallel_world_size()
-        if enable_sp:
-            tp_rank = get_tensor_model_parallel_rank()
-            mc2_mask_sp = _metadata_for_padding.mc2_mask if _metadata_for_padding is not None else forward_context.mc2_mask
-            chunk_mc2_mask = torch.tensor_split(mc2_mask_sp, tp_size, dim=0)
-            mc2_mask = chunk_mc2_mask[tp_rank]
+        if forward_context.sp_enabled:
            replace_allreduce = True

-        if (fused_moe_state not in [
-                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
-                FusedMoEState.NaiveMulticast
-        ] and not replace_allreduce):
-            if fused_moe_state in {FusedMoEState.MC2}:
-                padding_size = forward_context.padded_num_tokens
-            else:
-                # TODO: Determine if we can remove the padding
-                padding_size = tp_size
-            if num_tokens < padding_size and not self.enable_shared_expert_dp:
-                hidden_states = nn.functional.pad(
-                    hidden_states, (0, 0, 0, padding_size - num_tokens))
-                router_logits = nn.functional.pad(
-                    router_logits, (0, 0, 0, padding_size - num_tokens))
-            if tp_size > 1:
-                tp_rank = get_tensor_model_parallel_rank()
-                if not self.enable_shared_expert_dp:
-                    chunk_hidden_states = torch.tensor_split(hidden_states,
-                                                             tp_size,
-                                                             dim=0)
-                    chunk_router_logits = torch.tensor_split(router_logits,
-                                                             tp_size,
-                                                             dim=0)
-                    hidden_states = chunk_hidden_states[tp_rank]
-                    router_logits = chunk_router_logits[tp_rank]
-
-                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
-                mc2_mask = chunk_mc2_mask[tp_rank]
-
-        if self.dp_size > 1:
-            if fused_moe_state == FusedMoEState.AllGather:
-                # NOTE: When in torchair graph, it has been padded in model_runner_v1
-                max_tokens_across_dp = forward_context.max_tokens_across_dp
-                if num_tokens < max_tokens_across_dp:
-                    hidden_states = nn.functional.pad(
-                        hidden_states,
-                        (0, 0, 0, max_tokens_across_dp - num_tokens))
-                    if not self.rm_router_logits:
-                        router_logits = nn.functional.pad(
-                            router_logits,
-                            (0, 0, 0, max_tokens_across_dp - num_tokens))
-                hidden_states = get_dp_group().all_gather(hidden_states, 0)
-                if self.rm_router_logits:
-                    router_logits, _ = gate(hidden_states)
-                else:
-                    router_logits = get_dp_group().all_gather(router_logits, 0)
-
-            elif fused_moe_state == FusedMoEState.NaiveMulticast:
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-                hidden_states = self.naive_multicast(hidden_states,
-                                                     cu_tokens_across_dp_cpu)
-                if self.rm_router_logits:
-                    router_logits, _ = gate(hidden_states)
-                else:
-                    router_logits = self.naive_multicast(
-                        router_logits, cu_tokens_across_dp_cpu)
+        hidden_states, router_logits = forward_context.moe_comm_method.prepare(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            enable_shared_expert_dp=self.enable_shared_expert_dp,
+            rm_router_logits=self.rm_router_logits,
+            replace_allreduce=replace_allreduce,
+            gate=gate)

        # Matrix multiply.
        e_hidden_states = self.quant_method.apply(
@@ -503,53 +409,27 @@ class AscendFusedMoE(FusedMoE):
            global_redundant_expert_num=self.global_redundant_expert_num,
            shared_experts=None,
            mc2_mask=mc2_mask,
-            token_dispatcher=self.token_dispatcher,
            quantized_x_for_share=quantized_x_for_share,
            dynamic_scale_for_share=dynamic_scale_for_share,
        )

+        group_list_type = None
+
        if shared_experts:
-            if isinstance(e_hidden_states, tuple):
+            if isinstance(e_hidden_states,
+                          tuple) and len(e_hidden_states) == 2:
                e_hidden_states, shared_hidden_states = e_hidden_states

-        if (fused_moe_state not in [
-                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
-                FusedMoEState.NaiveMulticast
-        ] and not replace_allreduce and not self.enable_shared_expert_dp):
-            if tp_size > 1:
-                dist.all_gather(list(chunk_hidden_states), e_hidden_states,
-                                self.tp_group)
-                final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
-                dispose_tensor(e_hidden_states)
-            else:
-                final_hidden_states = e_hidden_states
-            if num_tokens < padding_size:
-                final_hidden_states = final_hidden_states[:num_tokens]
-        elif self.dp_size > 1 and not self.enable_shared_expert_dp:
-            if fused_moe_state == FusedMoEState.NaiveMulticast:
-                start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-                    self.dp_rank - 1]
-                end = cu_tokens_across_dp_cpu[self.dp_rank]
-                final_hidden_states = get_dp_group().all_reduce(
-                    e_hidden_states)
-                final_hidden_states = final_hidden_states[start:end, :]
-                dispose_tensor(e_hidden_states)
-            elif fused_moe_state == FusedMoEState.AllGather:
-                final_hidden_states = data_parallel_reduce_scatter(
-                    e_hidden_states, dim=0)
-                final_hidden_states = final_hidden_states[:num_tokens]
-                dispose_tensor(e_hidden_states)
-            else:
-                final_hidden_states = e_hidden_states
-        else:
-            final_hidden_states = e_hidden_states
+        if isinstance(e_hidden_states, tuple) and len(e_hidden_states) == 3:
+            e_hidden_states, group_list_type, expert_tokens = e_hidden_states

-        if tp_size > 1 and not self.all_reduce_merge and fused_moe_state in [
-                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
-                FusedMoEState.NaiveMulticast
-        ]:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+        if self.dynamic_eplb and group_list_type is not None:
+            self.moe_load += expert_tokens if group_list_type else \
+                torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
+
+        final_hidden_states = forward_context.moe_comm_method.finalize(
+            hidden_states=e_hidden_states,
+            reduce_results=(not self.all_reduce_merge))

        if shared_experts:
            return final_hidden_states, shared_hidden_states