[Refactor][MoE] remove redundant code after refactoring fused_moe (#2612)

### What this PR does / why we need it?
There is a lot of redundant MoE-related code here, and the structure is not very clear.
We did the following:

- Moved the relatively independent `apply_mlp` code into a separate file (sketched below).
- Removed the `alltoall_buffer` and `alltoall_seq` environment variables (`MOE_ALL2ALL_BUFFER`, `VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ`).
- Removed the code paths tied to `alltoall_buffer` and `alltoall_seq`, retaining only a single `TokenDispatcher` subclass.
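
To make the new layout easier to picture, here is a minimal sketch of the relocated entry point, assuming the structure visible in the diff below: `vllm_ascend/ops/layers/moe_mlp.py` exposes a single `unified_apply_mlp` that routes to the unquantized or W8A8 path. The real code drives `torch_npu.npu_grouped_matmul`, `npu_swiglu` and the dynamic-quant kernels with `group_list`; this sketch swaps those for plain dense `torch` ops, so it only illustrates the control flow, not the actual NPU kernels.

```python
# Minimal sketch (not the real implementation): control flow of the relocated
# vllm_ascend/ops/layers/moe_mlp.py entry point, with plain torch ops standing
# in for torch_npu.npu_grouped_matmul / npu_swiglu / dynamic-quant kernels.
from typing import Optional

import torch
import torch.nn.functional as F


def unquant_apply_mlp(hidden_states: torch.Tensor,
                      w1: torch.Tensor,
                      w2: torch.Tensor,
                      group_list: torch.Tensor,
                      group_list_type: int = 1,
                      topk_scales: Optional[torch.Tensor] = None) -> torch.Tensor:
    # gate_up_proj -> swiglu -> optional per-token scaling -> down_proj.
    # group_list / group_list_type select per-expert token groups in the real
    # grouped matmul; they are unused by this dense stand-in.
    gate_up = hidden_states @ w1
    gate, up = gate_up.chunk(2, dim=-1)
    out = F.silu(gate) * up
    if topk_scales is not None:
        out = out * topk_scales
    return out @ w2


def quant_apply_mlp(hidden_states: torch.Tensor, w1: torch.Tensor,
                    w1_scale: torch.Tensor, w2: torch.Tensor,
                    w2_scale: torch.Tensor, group_list: torch.Tensor,
                    **kwargs) -> torch.Tensor:
    # W8A8 path: dynamic per-token quantization around the two grouped matmuls
    # (npu_dynamic_quant / npu_dequant_swiglu_quant in the real code); elided here.
    raise NotImplementedError("quantized path elided in this sketch")


def unified_apply_mlp(hidden_states: torch.Tensor, w1: torch.Tensor,
                      w1_scale: torch.Tensor, w2: torch.Tensor,
                      w2_scale: torch.Tensor, group_list: torch.Tensor,
                      group_list_type: int = 1,
                      topk_scales: Optional[torch.Tensor] = None,
                      with_quant: bool = False, **kwargs) -> torch.Tensor:
    # Single entry point used by the fused-MoE layer; callers no longer branch
    # on quantization themselves.
    if with_quant:
        return quant_apply_mlp(hidden_states, w1, w1_scale, w2, w2_scale,
                               group_list, **kwargs)
    return unquant_apply_mlp(hidden_states, w1, w2, group_list,
                             group_list_type, topk_scales)
```

With this split, `fused_moe.py` only needs the `from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp` import shown in the diff below instead of carrying both MLP implementations inline.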
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
e2e & ut

- vLLM version: v0.10.1.1
- vLLM main:
4071c76cf3

---------

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
Author: weichen
Date: 2025-08-30 22:28:50 +08:00 (committed by GitHub)
Parent: 20ae71291d
Commit: 3a5fc5ee01
13 changed files with 417 additions and 1237 deletions


@@ -38,7 +38,6 @@ from vllm.model_executor.layers.fused_moe.layer import (
from vllm.model_executor.layers.quantization.base_config import \
QuantizationConfig
import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import FusedMoEState
from vllm_ascend.distributed.communication_op import \
@@ -46,397 +45,12 @@ from vllm_ascend.distributed.communication_op import \
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig)
from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.sequence_parallel import MetadataForPadding
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, dispose_tensor,
get_all_reduce_merge_state,
get_rm_router_logits_state, is_310p)

MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER


def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int,
max_row_per_ep_rank: int, num_tokens: int,
top_k: int) -> tuple[torch.Tensor, torch.Tensor]:
original_total_elements = num_tokens * top_k
device = topk_ids.device
original_dtype = topk_ids.dtype
if original_total_elements == 0:
output_len = ep_size * max_row_per_ep_rank
topk_ids_pad = torch.full((output_len, ),
expert_num,
dtype=original_dtype,
device=device)
unpad_indices = torch.full((original_total_elements, ),
-1,
dtype=torch.long,
device=device)
return topk_ids_pad, unpad_indices
experts_per_ep_rank_val = expert_num // ep_size
if experts_per_ep_rank_val == 0:
raise ValueError(
"expert_num // ep_size is 0, which leads to division by zero in ep_rank calculation. "
"Ensure expert_num >= ep_size.")
assigned_ep_rank = (topk_ids.float() /
experts_per_ep_rank_val).to(original_dtype)
indices_arange = torch.arange(topk_ids.shape[0], device=device)
is_new_segment = torch.cat(
(torch.tensor([True], device=device), assigned_ep_rank[1:]
!= assigned_ep_rank[:-1]))
temp_start_markers = torch.full_like(indices_arange,
-1,
dtype=indices_arange.dtype)
temp_start_markers[is_new_segment] = indices_arange[is_new_segment]
start_offset_for_each_token = torch.cummax(temp_start_markers, dim=0)[0]
token_intra_ep_rank_idx = indices_arange - start_offset_for_each_token
is_kept_mask = token_intra_ep_rank_idx < max_row_per_ep_rank
cumsum_kept = torch.cumsum(is_kept_mask.float(), dim=0).to(torch.long)
indices_in_rec_cond_list_for_all = cumsum_kept - 1
unpad_indices = torch.where(
is_kept_mask, indices_in_rec_cond_list_for_all,
torch.tensor(-1, device=device, dtype=torch.long))
output_len = ep_size * max_row_per_ep_rank
topk_ids_pad = torch.full((output_len, ),
expert_num,
dtype=original_dtype,
device=device)
if topk_ids.shape[0] > 0:
all_destination_indices = assigned_ep_rank * max_row_per_ep_rank + token_intra_ep_rank_idx
temp_pad_buffer = torch.full((output_len + 1, ),
expert_num,
dtype=original_dtype,
device=device)
output_len_tensor = torch.tensor(output_len,
dtype=torch.long,
device=device)
scatter_indices = torch.where(is_kept_mask, all_destination_indices,
output_len_tensor)
temp_pad_buffer.scatter_(0, scatter_indices, topk_ids)
topk_ids_pad = temp_pad_buffer[:output_len]
    return topk_ids_pad, unpad_indices


def apply_mlp(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
group_list: torch.Tensor,
group_list_type: int = 1,
) -> torch.Tensor:
"""
apply MLP: gate_up_proj -> swiglu -> down_proj
Args:
        hidden_states: input hidden states with shape (num_tokens, hidden_size).
w1: expert weights1 with shape
(num_experts, hidden_size, intermediate_size * 2)
w2: expert weights2 with shape
(num_experts, intermediate_size, hidden_size)
group_list: number of tokens for each expert, follow cumsum mode, and
with shape (num_experts).
transpose_weight:
w1: (num_experts, intermediate_size * 2, hidden_size) ->
(num_experts, hidden_size, intermediate_size * 2)
w2: (num_experts, hidden_size, intermediate_size) ->
(num_experts, intermediate_size, hidden_size)
Returns:
hidden_states: output hidden states after MLP.
"""
w1 = w1.transpose(1, 2)
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w1],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
)[0]
hidden_states = torch_npu.npu_swiglu(hidden_states)
w2 = w2.transpose(1, 2)
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w2],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
)[0]
    return hidden_states


def fused_experts_moge(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
moe_parallel_config: FusedMoEParallelConfig,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
top_k: int,
global_num_experts: int,
expert_map: torch.Tensor = None,
apply_router_weight_on_input: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states: Hidden states of shape (num_tokens, hidden_size).
w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
topk_weights: Routing weights of shape (num_tokens, top_k).
topk_ids: Selected expert IDs of shape (num_tokens, top_k).
top_k: Number of experts to select.
expert_map: Expert mapping of shape (num_experts,).
Returns:
hidden_states: Hidden states after routing.
"""
ep_size = moe_parallel_config.ep_size
local_num_experts = global_num_experts // ep_size
local_num_group = top_k // ep_size
if apply_router_weight_on_input:
assert (topk_weights.dim() == 2
), "`topk_weights` should be in shape (num_tokens, topk)"
_, topk = topk_weights.shape
assert (
topk == 1
), "Only support topk=1 when `apply_router_weight_on_input` is True"
hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
bsz, _ = hidden_states.shape
flatten_topk_ids = topk_ids.view(-1)
sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
sorted_topk_ids = sorted_topk_ids.to(torch.int32)
sorted_hidden_states = hidden_states.index_select(
0, sorted_topk_ids // local_num_group)
experts_id = torch.arange(0,
local_num_experts,
dtype=topk_ids.dtype,
device=topk_ids.device)
num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
torch.float32).sum(0)
topk_scales = topk_weights.view(-1).index_select(
0, sorted_topk_ids).unsqueeze(-1)
group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)
gate_up_out = torch_npu.npu_grouped_matmul(
x=[sorted_hidden_states],
weight=[w1],
split_item=2,
group_list_type=0,
group_type=0,
group_list=group_list,
)[0]
if is_310p():
gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
torch.float16)
else:
gate_up_out = torch_npu.npu_swiglu(gate_up_out)
gate_up_out *= topk_scales
down_out_list = torch_npu.npu_grouped_matmul(
x=[gate_up_out],
weight=[w2],
split_item=2,
group_list_type=0,
group_type=0,
group_list=group_list,
)[0]
unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids)
final_hidden_states = unsorted_hidden_states.reshape(
bsz, top_k // ep_size, -1).sum(1)
    return final_hidden_states


def quant_apply_mlp(hidden_states: torch.Tensor,
w1: torch.Tensor,
w1_scale: torch.Tensor,
w2: torch.Tensor,
w2_scale: torch.Tensor,
group_list: torch.Tensor,
dynamic_scale: torch.Tensor = None,
group_list_type: int = 1,
w1_scale_bias: torch.Tensor = None,
w2_scale_bias: torch.Tensor = None) -> torch.Tensor:
if dynamic_scale is None:
unquantized_hidden_states = hidden_states
hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
hidden_states)
# Dispose the original unquantized hidden states
# to save npu memory because they're no longer used.
dispose_tensor(unquantized_hidden_states)
else:
pertoken_scale = dynamic_scale
bias1, bias2 = None, None
_output_dtype = w2_scale.dtype
is_mc2 = get_forward_context().fused_moe_state == FusedMoEState.MC2
if w1_scale_bias is None and is_mc2:
w1_scale = w1_scale.to(torch.float32)
# gmm1: gate_up_proj
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w1],
split_item=3,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
output_dtype=torch.int32)[0]
# act_fn: swiglu
hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
x=hidden_states,
weight_scale=w1_scale,
activation_scale=pertoken_scale,
bias=None,
quant_scale=None,
quant_offset=None,
group_index=group_list,
activate_left=True,
quant_mode=1,
)
# gmm2: down_proj
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w2],
scale=[w2_scale],
per_token_scale=[swiglu_out_scale],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
output_dtype=w2_scale.dtype)[0]
else:
if w1_scale_bias is not None:
if group_list_type == 0:
group_list = torch.cat(
[group_list[:1],
torch.diff(group_list, dim=0)])
group_list_type = 1
bias1 = [w1_scale_bias]
bias2 = [w2_scale_bias]
# TODO w4a8 scene: dynamic acquisition of dtype in the future
_output_dtype = torch.bfloat16
# gmm1: gate_up_proj
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w1],
scale=[w1_scale],
bias=bias1,
per_token_scale=[pertoken_scale],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
output_dtype=_output_dtype)[0]
# act_fn: swiglu
hidden_states = torch_npu.npu_swiglu(hidden_states)
hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
hidden_states)
# gmm2: down_proj
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w2],
scale=[w2_scale],
bias=bias2,
per_token_scale=[swiglu_out_scale],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
output_dtype=_output_dtype)[0]
    return hidden_states


def unquant_apply_mlp(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
group_list: torch.Tensor,
group_list_type: int = 1,
topk_scales: Optional[torch.Tensor] = None) -> torch.Tensor:
w1 = w1.transpose(1, 2)
gate_up_out = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w1],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
)[0]
if is_310p():
gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
torch.float16)
else:
gate_up_out = torch_npu.npu_swiglu(gate_up_out)
if topk_scales is not None:
gate_up_out *= topk_scales
w2 = w2.transpose(1, 2)
hidden_states = torch_npu.npu_grouped_matmul(
x=[gate_up_out],
weight=[w2],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
)[0]
    return hidden_states


def unified_apply_mlp(hidden_states: torch.Tensor,
w1: torch.Tensor,
w1_scale: torch.Tensor,
w2: torch.Tensor,
w2_scale: torch.Tensor,
group_list: torch.Tensor,
dynamic_scale: torch.Tensor = None,
group_list_type: int = 1,
w1_scale_bias: torch.Tensor = None,
w2_scale_bias: torch.Tensor = None,
topk_scales: Optional[torch.Tensor] = None,
with_quant: bool = False) -> torch.Tensor:
if with_quant:
return quant_apply_mlp(hidden_states=hidden_states,
w1=w1,
w1_scale=w1_scale,
w2=w2,
w2_scale=w2_scale,
group_list=group_list,
dynamic_scale=dynamic_scale,
group_list_type=group_list_type,
w1_scale_bias=w1_scale_bias,
w2_scale_bias=w2_scale_bias)
else:
return unquant_apply_mlp(hidden_states=hidden_states,
w1=w1,
w2=w2,
group_list=group_list,
group_list_type=group_list_type,
topk_scales=topk_scales)
def unified_fused_experts_eager(hidden_states: torch.Tensor,
w1: torch.Tensor,
@@ -742,24 +356,6 @@ class AscendFusedMoE(FusedMoE):
self.tp_group = get_tp_group().device_group
self.quant_method.create_weights(layer=self, **moe_quant_params)
self.token_dispatcher = None
if envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ and isinstance(
self.quant_method, AscendUnquantizedFusedMoEMethod):
self.reduce_results = False
moe_dispatcher_config = (
MoEDispatcherConfig().set_num_moe_experts(
self.global_num_experts).set_num_local_experts(
self.local_num_experts).set_moe_router_topk(
top_k).set_group_topk(topk_group).
set_num_groups(num_expert_group).set_expert_bias(
e_score_correction_bias).set_scaling_factor(1.0).build())
self.token_dispatcher = MoEAlltoAllSeqOverLapDispatcher(
moe_dispatcher_config)
if envs_ascend.VLLM_ASCEND_ENABLE_DBO:
token_dispatcher1 = MoEAlltoAllSeqOverLapDispatcher(
moe_dispatcher_config)
self.token_dispatchers = [
self.token_dispatcher, token_dispatcher1
]
ep_size = (get_ep_group().world_size if
vllm_config.parallel_config.enable_expert_parallel else 1)