[Feat] support dynamic quantization in allgather (#2841)
### What this PR does / why we need it?
Support dynamic quantization in the allgather token-dispatch path: `TokenDispatcherWithAllGather.token_dispatch(..., with_quant=True)` now returns a `dynamic_scale` alongside the dispatched hidden states, and `unified_apply_mlp(..., with_quant=True)` consumes it together with the per-channel weight scales.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Covered by the new e2e test `test_token_dispatcher_with_all_gather_quant` in the diff below.

- vLLM version: main
- vLLM main: 5931b7e5d9
Signed-off-by: withHades <244036962@qq.com>
Signed-off-by: WithHades <244036962@qq.com>
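For context: "dynamic" quantization means the int8 activation scale is computed at runtime from the data being quantized (per token row in the sketch below) instead of being calibrated offline, which is why the dispatcher can emit a `dynamic_scale` on the fly. A minimal plain-PyTorch sketch of the idea; the helper name `dynamic_per_token_quant` is illustrative, and the real path uses a fused NPU kernel:

```python
import torch

def dynamic_per_token_quant(x: torch.Tensor):
    """Quantize to int8 with one scale per token (row), computed on the fly."""
    # Per-row absolute maximum decides each token's scale.
    abs_max = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
    scale = abs_max / 127.0
    x_int8 = torch.round(x / scale).clamp(-128, 127).to(torch.int8)
    return x_int8, scale.squeeze(-1)  # scale has shape (num_tokens,)

a = torch.randn(4, 8) / 10
a_q, a_scale = dynamic_per_token_quant(a)
# Dequantizing recovers the input up to half a quantization step per element.
a_dq = a_q.to(torch.float32) * a_scale.unsqueeze(-1)
assert (a - a_dq).abs().max() <= a_scale.max() / 2 + 1e-6
```

Because each batch gets fresh activation scales, no calibration pass is needed for activations, while the weights keep static scale tensors (`w1_scale`, `w2_scale` in the test below).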
```diff
@@ -29,6 +29,7 @@ import torch_npu
 from vllm.model_executor.layers.activation import SiluAndMul
 
 from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
 from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
 
 NUM_EXPERTS = [8, 64]
```
```diff
@@ -165,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
     torch.npu.reset_peak_memory_stats()
 
 
+@pytest.mark.parametrize("m", [1, 33, 64])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("ep_size", EP_SIZE)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("device", DEVICE)
+def test_token_dispatcher_with_all_gather_quant(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    ep_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    context_mock = MagicMock()
+    context_mock.fused_moe_state = 0
+    with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
+               return_value=context_mock):
+        a = torch.randn((m, k), device=device, dtype=dtype) / 10
+        # int8 weights via randint (torch.randn does not support integer
+        # dtypes); the scale tensors are left uninitialized since the test
+        # only checks the output shape.
+        w1 = torch.randint(-128, 128, (e, k, 2 * n),
+                           device=device, dtype=torch.int8)
+        w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
+        w2 = torch.randint(-128, 128, (e, n, k),
+                           device=device, dtype=torch.int8)
+        w2_scale = torch.empty((e, k), device=device, dtype=dtype)
+
+        score = torch.randn((m, e), device=device, dtype=dtype)
+        expert_map = None
+        local_e = e
+
+        score = torch.softmax(score, dim=-1, dtype=dtype)
+        topk_weights, topk_ids = torch.topk(score, topk)
+        topk_ids = topk_ids.to(torch.int32)
+        row_idx = (torch.arange(
+            0,
+            m * topk,
+            device=device,
+            dtype=torch.int32,
+        ).view(topk, -1).permute(1, 0).contiguous())
+
+        dispatcher_kwargs = {
+            "num_experts": e,
+            "top_k": topk,
+            "num_local_experts": local_e,
+        }
+        dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
+
+        apply_router_weight_on_input = False
+        dispatch_output = dispatcher.token_dispatch(
+            hidden_states=a,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            with_quant=True)
+
+        sorted_hidden_states = dispatch_output["hidden_states"]
+        group_list = dispatch_output["group_list"]
+        group_list_type = dispatch_output.get("group_list_type", 1)
+        dynamic_scale = dispatch_output["dynamic_scale"]
+
+        expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
+                                          w1=w1,
+                                          w1_scale=w1_scale,
+                                          w2=w2,
+                                          w2_scale=w2_scale,
+                                          group_list=group_list,
+                                          group_list_type=group_list_type,
+                                          dynamic_scale=dynamic_scale,
+                                          with_quant=True)
+        combined_output = dispatcher.token_combine(hidden_states=expert_output,
+                                                   bias=None)
+        assert combined_output.shape == (m, k)
+
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
+
+
 @pytest.mark.parametrize("m", [1, 33, 64])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
```
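For reference, a hedged sketch of how the two kinds of scales combine downstream. `unified_apply_mlp(..., with_quant=True)` receives `dynamic_scale` together with the per-channel weight scales; conceptually, the int8 GEMM accumulates in int32 and the result is rescaled by the outer product of the two scale vectors. Plain-PyTorch illustration only; the names are illustrative and this is not the fused NPU grouped-matmul kernel:

```python
import torch

def w8a8_matmul(a_int8: torch.Tensor,   # (m, k) int8 activations
                a_scale: torch.Tensor,  # (m,)   per-token dynamic scales
                w_int8: torch.Tensor,   # (k, n) int8 weights
                w_scale: torch.Tensor): # (n,)   per-channel weight scales
    # Accumulate in int32 to avoid overflow, then rescale back to float.
    acc = a_int8.to(torch.int32) @ w_int8.to(torch.int32)
    return acc.to(torch.float32) * a_scale[:, None] * w_scale[None, :]

# Round-trip check against the float matmul.
m, k, n = 4, 8, 16
a = torch.randn(m, k)
a_scale = (a.abs().amax(dim=-1) / 127.0).clamp(min=1e-8)
a_int8 = torch.round(a / a_scale[:, None]).to(torch.int8)
w = torch.randn(k, n)
w_scale = (w.abs().amax(dim=0) / 127.0).clamp(min=1e-8)
w_int8 = torch.round(w / w_scale[None, :]).to(torch.int8)
out = w8a8_matmul(a_int8, a_scale, w_int8, w_scale)
assert (out - a @ w).abs().max() < 1.0  # very loose bound for the toy sizes
```

In the test above, `dispatch_output["dynamic_scale"]` plays the role of `a_scale`, while `w1_scale`/`w2_scale` play the role of `w_scale`.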