[Feat]support dynamic quantization in allgather (#2841)

### What this PR does / why we need it?
Support dynamic (per-token) quantization in the allgather token-dispatch path. `TokenDispatcherWithAllGather.token_dispatch` gains a `with_quant` flag: when set, `torch_npu.npu_moe_init_routing_v2` is called with `quant_mode=1` so it returns a per-token scale alongside the routed hidden states, and that scale is exposed to callers as `dynamic_scale` for the quantized grouped MLP (`unified_apply_mlp`).

### Does this PR introduce _any_ user-facing change?
No. The change is internal to the MoE token dispatcher and its tests.

### How was this patch tested?
With the new e2e test `test_token_dispatcher_with_all_gather_quant` and the new unit test `test_token_dispatch_with_quant` (see the diffs below).

- vLLM version: main
- vLLM main: 5931b7e5d9

Signed-off-by: withHades <244036962@qq.com>
Signed-off-by: WithHades <244036962@qq.com>
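For background: dynamic quantization computes each token's INT8 scale at runtime from the token itself, instead of using a calibration-time static scale. A minimal CPU sketch of the idea (illustrative only; in this PR the quantization is fused inside `npu_moe_init_routing_v2` when `quant_mode=1`):

```python
import torch

def dynamic_quant_per_token(x: torch.Tensor):
    """Quantize each token (row) to int8 with its own runtime scale."""
    # One scale per token, derived from that token's max magnitude.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / 127.0
    x_int8 = torch.round(x / scale).clamp(-128, 127).to(torch.int8)
    return x_int8, scale.squeeze(-1)  # scales play the role of `dynamic_scale`

x = torch.randn(4, 8)
x_int8, pertoken_scale = dynamic_quant_per_token(x)
print(x_int8.dtype, pertoken_scale.shape)  # torch.int8 torch.Size([4])
```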
The Qwen3 MoE W8A8 e2e test no longer forces eager mode:

```diff
@@ -66,7 +66,6 @@ def test_models_distributed_Qwen3_MOE_W8A8():
             max_model_len=8192,
             tensor_parallel_size=2,
             quantization="ascend",
-            enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
```
The token-dispatcher e2e test imports the grouped MLP entry point:

```diff
@@ -29,6 +29,7 @@ import torch_npu
 from vllm.model_executor.layers.activation import SiluAndMul
 
 from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
 from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
 
 NUM_EXPERTS = [8, 64]
```
and adds a quantized variant of the allgather dispatch test:

```diff
@@ -165,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
     torch.npu.reset_peak_memory_stats()
 
 
+@pytest.mark.parametrize("m", [1, 33, 64])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("ep_size", EP_SIZE)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("device", DEVICE)
+def test_token_dispatcher_with_all_gather_quant(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    ep_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    context_mock = MagicMock()
+    context_mock.fused_moe_state = 0
+    with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
+               return_value=context_mock):
+        a = torch.randn((m, k), device=device, dtype=dtype) / 10
+        # torch.randn cannot produce int8 tensors; use randint for the weights.
+        w1 = torch.randint(-128, 128, (e, k, 2 * n), device=device, dtype=torch.int8)
+        w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
+        w2 = torch.randint(-128, 128, (e, n, k), device=device, dtype=torch.int8)
+        w2_scale = torch.empty((e, k), device=device, dtype=dtype)
+
+        score = torch.randn((m, e), device=device, dtype=dtype)
+        expert_map = None
+        local_e = e
+
+        score = torch.softmax(score, dim=-1, dtype=dtype)
+        topk_weights, topk_ids = torch.topk(score, topk)
+        topk_ids = topk_ids.to(torch.int32)
+        row_idx = (torch.arange(
+            0,
+            m * topk,
+            device=device,
+            dtype=torch.int32,
+        ).view(topk, -1).permute(1, 0).contiguous())
+
+        dispatcher_kwargs = {
+            "num_experts": e,
+            "top_k": topk,
+            "num_local_experts": local_e,
+        }
+        dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
+
+        apply_router_weight_on_input = False
+        dispatch_output = dispatcher.token_dispatch(
+            hidden_states=a,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            row_idx=row_idx,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            with_quant=True)
+
+        sorted_hidden_states = dispatch_output["hidden_states"]
+        group_list = dispatch_output["group_list"]
+        group_list_type = dispatch_output.get("group_list_type", 1)
+        dynamic_scale = dispatch_output["dynamic_scale"]
+
+        expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
+                                          w1=w1,
+                                          w1_scale=w1_scale,
+                                          w2=w2,
+                                          w2_scale=w2_scale,
+                                          group_list=group_list,
+                                          group_list_type=group_list_type,
+                                          dynamic_scale=dynamic_scale,
+                                          with_quant=True)
+        combined_output = dispatcher.token_combine(hidden_states=expert_output,
+                                                   bias=None)
+        assert combined_output.shape == (m, k)
+        gc.collect()
+        torch.npu.empty_cache()
+        torch.npu.reset_peak_memory_stats()
+
+
 @pytest.mark.parametrize("m", [1, 33, 64])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
```
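As an aside, the `row_idx` built in the test above lays out the `m * topk` expanded rows column-major, so token `i`'s `k`-th routed copy gets index `i + k * m`. A plain-torch check (runnable anywhere):

```python
import torch

m, topk = 3, 2
row_idx = (torch.arange(0, m * topk, dtype=torch.int32)
           .view(topk, -1).permute(1, 0).contiguous())
print(row_idx)
# tensor([[0, 3],
#         [1, 4],
#         [2, 5]], dtype=torch.int32)
```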
In the unit tests, the existing dispatch case is renamed to reflect that it exercises the non-quantized path:

```diff
@@ -221,7 +221,7 @@ class TestTokenDispatcherWithAllGather(TestBase):
 
         self.assertEqual(results["group_list_type"], 1)
 
-    def test_token_dispatch_with_quant(self):
+    def test_token_dispatch_without_quant(self):
         kwargs = {
             "apply_router_weight_on_input": False,
             "top_k": 2,
```
and a quantized counterpart is added:

```diff
@@ -241,6 +241,32 @@ class TestTokenDispatcherWithAllGather(TestBase):
 
         self.assertEqual(results["group_list_type"], 1)
 
+    def test_token_dispatch_with_quant(self):
+        kwargs = {
+            "apply_router_weight_on_input": False,
+            "top_k": 2,
+            "max_num_tokens": 100,
+            "ep_size": 2,
+            "num_experts": 128,
+        }
+        self.dispatcher_quant = TokenDispatcherWithAllGather(**kwargs)
+
+        hidden_states = torch.randn(3, 128)
+        topk_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]])
+        topk_ids = torch.tensor([[0, 1], [1, 2], [2, 3]])
+
+        results = self.dispatcher_quant.token_dispatch(hidden_states,
+                                                       topk_weights,
+                                                       topk_ids,
+                                                       self.row_idx,
+                                                       None,
+                                                       with_quant=True)
+
+        self.assertIsNotNone(results["hidden_states"])
+        self.assertIsNotNone(results["group_list"])
+        self.assertIsNotNone(results["dynamic_scale"])
+        self.assertEqual(results["group_list_type"], 1)
+
     def test_token_combine_with_expert_map(self):
         self.dispatcher.expert_map = torch.tensor([0, 1, 2, 3])
         self.dispatcher.sorted_token_indices = torch.tensor([0, 1, 1, 1, 1, 1])
```
In `TokenDispatcherWithAllGather` itself, the routing call now captures the per-token scale and requests quantization on demand:

```diff
@@ -367,7 +367,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
         last_expert_idx = self.num_experts_local
         global_num_experts = self.num_experts_local
 
-        sorted_hidden_states, self.expanded_row_idx, expert_tokens, _ = (
+        sorted_hidden_states, self.expanded_row_idx, expert_tokens, pertoken_scale = (
             torch_npu.npu_moe_init_routing_v2(
                 hidden_states,
                 topk_ids,
@@ -376,7 +376,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
                 expert_tokens_num_type=1,
                 expert_tokens_num_flag=True,
                 active_expert_range=[first_expert_idx, last_expert_idx],
-                quant_mode=-1,
+                quant_mode=1 if self.with_quant else -1,
             ))
         expert_tokens = expert_tokens.to(torch.int64)
         group_list_type = 1  # `count` mode
```
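Per the naming above, with `quant_mode=1` the routing kernel also returns `pertoken_scale`. Downstream, that per-token scale combines with the per-channel weight scale to dequantize the INT8 matmul output. A conceptual CPU sketch of that W8A8 dequantization (not the fused Ascend grouped-matmul kernel):

```python
import torch

# INT8 activations with per-token scales; INT8 weights with per-channel scales.
x_int8 = torch.randint(-128, 128, (4, 64), dtype=torch.int8)
w_int8 = torch.randint(-128, 128, (64, 32), dtype=torch.int8)
pertoken_scale = torch.rand(4)   # one scale per dispatched token
w_scale = torch.rand(32)         # one scale per output channel

# Accumulate in int32, then apply both scales to recover a float result.
acc = x_int8.to(torch.int32) @ w_int8.to(torch.int32)
y = acc.to(torch.float32) * pertoken_scale[:, None] * w_scale[None, :]
print(y.shape)  # torch.Size([4, 32])
```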
Finally, the scale is surfaced in the dispatch result:

```diff
@@ -384,6 +384,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
             "group_list_type": group_list_type,
             "hidden_states": sorted_hidden_states,
             "group_list": expert_tokens,
+            "dynamic_scale": pertoken_scale if self.with_quant else None,
         }
 
     def token_combine(self,
```