[refactor] replace scattered business kwargs with typed request objects and explicit stage boundaries (#7024)
### What this PR does / why we need it?
Refactor `vllm_ascend/ops/fused_moe` to replace scattered MoE business `**kwargs` with typed request objects and explicit stage boundaries.
- Prepare, dispatch, MLP, and quant stages now have clearer ownership.
- Main MoE path no longer depends on business `kwargs.get(...)` lookups.
- Comm and dispatcher interfaces are request-only on the main path.
- UTs can assert stage-level fields directly instead of inferring behavior indirectly.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed.

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
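To make the pattern concrete, here is a minimal, self-contained sketch of the typed-request idea this refactor applies. It is illustrative only: `MlpComputeRequest` and `apply_mlp` are hypothetical stand-ins, and their field names simply mirror the keyword arguments visible in the diff below; the real `MoEMlpComputeInput` in `vllm_ascend/ops/fused_moe/moe_runtime_args` may be shaped differently.

```python
# Illustrative sketch only: MlpComputeRequest / apply_mlp are hypothetical
# stand-ins for MoEMlpComputeInput / unified_apply_mlp; real fields may differ.
from __future__ import annotations

from dataclasses import dataclass

import torch


@dataclass
class MlpComputeRequest:
    """Bundles the MLP-stage inputs that were previously passed as loose kwargs."""

    hidden_states: torch.Tensor
    w1: torch.Tensor
    w2: torch.Tensor
    group_list: torch.Tensor
    group_list_type: int
    w1_scale: torch.Tensor | None = None
    w2_scale: torch.Tensor | None = None
    with_quant: bool = False


def apply_mlp(request: MlpComputeRequest) -> torch.Tensor:
    """Consume one request object instead of scattered **kwargs lookups."""
    if request.with_quant:
        # The quant stage's requirements are explicit at the stage boundary.
        assert request.w1_scale is not None and request.w2_scale is not None
    return request.hidden_states  # placeholder for the real grouped MLP kernel
```

The payoff of this shape is that each stage boundary names its inputs once, so callers, comm implementations, and unit tests all see the same typed contract.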
```diff
@@ -17,8 +17,8 @@ from __future__ import annotations
 import torch
 
-from vllm_ascend.ascend_forward_context import _EXTRA_CTX
-from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl, FusedExpertsResult
+from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl
+from vllm_ascend.ops.fused_moe.moe_runtime_args import MoEMlpComputeInput
 
 from .moe_mlp import unified_apply_mlp
 from .token_dispatcher import TokenDispatcherWithAllGather310
@@ -35,52 +35,12 @@ class AllGatherCommImpl310(AllGatherCommImpl):
     to handle the token-to-expert mapping and communication efficiently.
     """
 
-    def fused_experts(  # type: ignore[override]
-        self,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        expert_map: torch.Tensor | None = None,
-        use_int8_w8a8: bool = False,
-        w1_scale: torch.Tensor | None = None,
-        w2_scale: torch.Tensor | None = None,
-        apply_router_weight_on_input: bool = False,
-    ) -> FusedExpertsResult:
-        # This method is overridden to use the 310p-specific unified_apply_mlp
-        # which provides optimized MLP computation for the 310p platform
-        moe_comm_method = _EXTRA_CTX.moe_comm_method
-        assert moe_comm_method is not None, "Missing communication context"
-
-        dispatch_results = self.token_dispatcher.token_dispatch(
-            hidden_states=hidden_states,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-        )
-
-        mlp_output = unified_apply_mlp(
-            hidden_states=dispatch_results.hidden_states,
-            w1=w1,
-            w2=w2,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            group_list=dispatch_results.group_list,
-            group_list_type=dispatch_results.group_list_type,
-            with_quant=use_int8_w8a8,
-        )
-
-        combine_results = self.token_dispatcher.token_combine(
-            hidden_states=mlp_output, context_metadata=dispatch_results.context_metadata
-        )
-
-        return FusedExpertsResult(
-            routed_out=combine_results.routed_out,
-            group_list_type=dispatch_results.group_list_type,
-            expert_tokens=dispatch_results.group_list,
-        )
+    def __init__(self, moe_config):
+        super().__init__(moe_config)
+        self.use_fusion_ops = False
+
+    def _apply_mlp(self, mlp_compute_input: MoEMlpComputeInput) -> torch.Tensor:
+        return unified_apply_mlp(mlp_compute_input=mlp_compute_input)
+
+    def _get_token_dispatcher(self):
+        return TokenDispatcherWithAllGather310(
```
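The PR description's last point, that UTs can assert stage-level fields directly, can be illustrated with a small test-style sketch. Everything below is a hypothetical test double built around an assumed request shape modeled on the diff above, not code from the repository.

```python
# Hypothetical test sketch: RecordingCommImpl and MlpComputeRequest are
# assumptions modeled on the diff above, not vllm_ascend's real classes.
from __future__ import annotations

from dataclasses import dataclass

import torch


@dataclass
class MlpComputeRequest:  # illustrative stand-in for MoEMlpComputeInput
    hidden_states: torch.Tensor
    group_list: torch.Tensor
    group_list_type: int
    with_quant: bool = False


class RecordingCommImpl:
    """Test double that records the request handed to the MLP stage."""

    def __init__(self) -> None:
        self.seen: list[MlpComputeRequest] = []

    def _apply_mlp(self, mlp_compute_input: MlpComputeRequest) -> torch.Tensor:
        self.seen.append(mlp_compute_input)
        return mlp_compute_input.hidden_states


def test_mlp_stage_sees_quant_fields() -> None:
    comm = RecordingCommImpl()
    comm._apply_mlp(
        MlpComputeRequest(
            hidden_states=torch.zeros(4, 8),
            group_list=torch.tensor([2, 2]),
            group_list_type=1,
            with_quant=True,
        )
    )
    # Assert on the stage boundary itself rather than inferring behavior
    # from **kwargs threaded through several layers.
    assert comm.seen[0].with_quant is True
    assert comm.seen[0].group_list_type == 1
```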