[Feat] 310p support MoE W8A8 quantization (#6641)

### What this PR does / why we need it?
This PR introduces support for W8A8 dynamic quantization for Mixture-of-Experts (MoE) models on Ascend 310P devices. This is achieved by:
- Implementing a new quantization scheme `AscendW8A8DynamicFusedMoEMethod310`.
- Adding a unified MLP implementation (`unified_apply_mlp`) for 310P that handles both quantized and unquantized paths.
- Refactoring the MoE and quantization configuration logic to correctly route to the new 310P-specific implementations.
- Adding new e2e and unit tests to verify the functionality of MoE W8A8 quantization.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Added a new e2e test `test_qwen3_moe_tp2_w8a8` to test MoE W8A8 quantization in a multi-card setup.
- Added several new unit tests for the 310P-specific MoE components, including `experts_selector`, `fused_moe`, `moe_comm_method`, `moe_mlp`, and the new `w8a8_dynamic` quantization method.

- vLLM version: v0.15.0
- vLLM main: d7e17aaacd

---------

Signed-off-by: pu-zhe <zpuaa@outlook.com>
2026-02-10 17:17:44 +08:00
parent 1eb07986bf
commit 02886e2641
15 changed files with 695 additions and 157 deletions
--- a/vllm_ascend/_310p/fused_moe/fused_moe.py
+++ b/vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -58,7 +58,6 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
        num_expert_group: int | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
@@ -67,7 +66,6 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
    ) -> torch.Tensor:
        zero_expert_num = getattr(layer, "zero_expert_num", 0)
        zero_expert_type = getattr(layer, "zero_expert_type", None)
-        assert routed_scaling_factor == 1.0

        topk_weights, topk_ids = select_experts(
            hidden_states=x,
@@ -195,44 +193,36 @@ class AscendFusedMoE310(FusedMoE):

        method = quant_method.quant_method
        quant_type = getattr(method, "quant_type", QuantType.NONE)
-        if quant_type != QuantType.NONE:
-            # TODO: w8a8 quantization will be supported soon, and only reject w4a8 here.
-            raise RuntimeError("W8A8 is not supported currently.")
-        return QuantType.NONE
+        if quant_type not in [QuantType.NONE, QuantType.W8A8]:
+            raise RuntimeError("Only Unquant and W8A8 is supported.")
+        return quant_type

    def forward_impl(  # type: ignore[override]
        self, hidden_states: torch.Tensor, router_logits: torch.Tensor
    ) -> torch.Tensor:
        assert self.quant_method is not None
+        assert self.routed_scaling_factor == 1.0, "routed_scaling_factor != 1.0 is not supported."
        forward_context = get_forward_context()

        hidden_states, router_logits, _, context_metadata = forward_context.moe_comm_method.prepare(
            hidden_states=hidden_states, router_logits=router_logits, quant_type=self.quant_type
        )

-        if isinstance(hidden_states, tuple):
-            hidden_states, pertoken_scale = hidden_states
-        else:
-            pertoken_scale = None
-
        # Matrix multiply.
        fused_experts_results: FusedExpertsResult = self.quant_method.apply(
            layer=self,
            x=hidden_states,
-            router_logits=router_logits,
-            pertoken_scale=pertoken_scale,
-            top_k=self.top_k,
-            renormalize=self.renormalize,
            use_grouped_topk=self.use_grouped_topk,
-            global_num_experts=self.global_num_experts,
-            expert_map=self.local_expert_map,
+            top_k=self.top_k,
+            router_logits=router_logits,
+            renormalize=self.renormalize,
            topk_group=self.topk_group,
            num_expert_group=self.num_expert_group,
            custom_routing_function=self.custom_routing_function,
            scoring_func=self.scoring_func,
-            routed_scaling_factor=self.routed_scaling_factor,
            e_score_correction_bias=self.e_score_correction_bias,
-            activation=self.activation,
+            global_num_experts=self.global_num_experts,
+            expert_map=self.local_expert_map,
            apply_router_weight_on_input=self.apply_router_weight_on_input,
        )