add mxfp8 moe quantization (#6670)

### What this PR does / why we need it? support mxfp8 quantization (Qwen MOE ) Using adaptor to make the hardware-specific behavior clearer and more maintainable ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: 13397841ab --------- Signed-off-by: fangrongcan <17343701736@163.com> Signed-off-by: wangyao-i <iwangyao@outlook.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Eric-dot <60131170+Eric-dot@users.noreply.github.com> Co-authored-by: fangrongcan <f00876277@china.huawei.com> Co-authored-by: wangyao-i <iwangyao@outlook.com> Co-authored-by: linfeng-yuan <1102311262@qq.com>
2026-03-02 11:04:06 +08:00
parent c324053b44
commit 3c66a970f2
10 changed files with 802 additions and 100 deletions
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -264,6 +264,11 @@ def select_moe_comm_method(num_tokens: int, vllm_config: VllmConfig, is_draft_mo
            moe_comm_type = MoECommType.FUSED_MC2 if fused_prefill_enable else MoECommType.ALLTOALL
    elif soc_version in {AscendDeviceType._310P}:
        moe_comm_type = MoECommType.ALLGATHER
+    elif soc_version in {AscendDeviceType.A5}:
+        if num_tokens <= mc2_tokens_capacity and vllm_config.parallel_config.world_size_across_dp > 1:
+            moe_comm_type = MoECommType.MC2
+        else:
+            moe_comm_type = MoECommType.ALLTOALL
    else:
        raise ValueError(f"Unsupported soc_version: {soc_version}")
    return moe_comm_type