[MoE] [Refactor] Combine common_fused_moe and fused_moe (#3176)
### What this PR does / why we need it?
1. Move the additional functionality from `fused_moe.py` into `common_fused_moe.py` and remove `fused_moe.py`.
2. Remove unnecessary custom classes from `qwen3_moe.py`; the file will be removed completely after vllm-ascend v0.11.0 is released.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Tested with Qwen3-30B-A3B / Qwen3-30B-A3B-W8A8 / DeepSeek-V3-W4A8-Pruing / deepseek-mtp / pangu-pro-moe-pruing:
1. EP enabled/disabled
2. Aclgraph & eager mode
3. SP

- vLLM version: v0.11.0

---------

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
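In practice this means the Ascend MoE layer has a single import location after the refactor. A minimal sketch of the intended before/after; the `common_fused_moe` re-export path is an assumption inferred from the PR description, not confirmed by this diff:

```python
# Before this PR (module removed by the refactor):
# from vllm_ascend.ops.fused_moe import AscendFusedMoE

# After this PR (assumed path, inferred from the PR description):
from vllm_ascend.ops.common_fused_moe import AscendFusedMoE

# Torchair-specific code keeps its own variant, as the diff below shows:
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE
```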
```diff
@@ -53,9 +53,9 @@ from vllm.sequence import IntermediateTensors
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.ops.fused_moe import AscendFusedMoE
 from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
                                                         init_metadata_for_sp)
+from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE


 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -81,7 +81,7 @@ class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
             prefix=f"{prefix}.gate",
         )

-        self.experts = AscendFusedMoE(
+        self.experts = TorchairAscendFusedMoE(
             num_experts=config.num_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
```
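For orientation, a minimal sketch of the call site this hunk changes, reconstructed only from the context lines visible above; the base-class import path, the `__init__` signature, and any constructor arguments beyond the three shown are assumptions:

```python
from vllm.model_executor.models.qwen3_moe import Qwen3MoeSparseMoeBlock  # assumed path
from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE


class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
    """Ascend override of the upstream Qwen3 MoE block (slated for removal)."""

    def __init__(self, config, quant_config=None, prefix: str = ""):
        super().__init__(config)  # assumed; the real super() call is not visible in the diff
        # The torchair-specific MoE layer replaces the old AscendFusedMoE here.
        self.experts = TorchairAscendFusedMoE(
            num_experts=config.num_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            # remaining constructor arguments elided; see the full file in the PR
        )
```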