[EPLB][bugfix] Bugfix for fused mc2 (#6794)

### What this PR does / why we need it?

This pull request addresses a bug related to the fused mc2 functionality within the EPLB (Expert Parallelism Load Balancing) system, specifically impacting quantization and MoE communication.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

Signed-off-by: Spicy-Stick <873805887@qq.com>
Signed-off-by: root <root@localhost.localdomain>
2026-03-09 11:26:57 +08:00
parent 06ec136f08
commit 23bf5d4d48
5 changed files with 50 additions and 28 deletions
--- a/tests/ut/eplb/adaptor/test_vllm_adaptor.py
+++ b/tests/ut/eplb/adaptor/test_vllm_adaptor.py
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock, patch
 import torch

 from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
+from vllm_ascend.quantization.methods.base import QuantType
 from transformers import DeepseekV2Config


@@ -17,6 +18,8 @@ class TestVllmAdaptor(unittest.TestCase):
        mock_model.get_expert_map.return_value = [i for i in range(n_routed_experts)]
        mock_model.get_log2phy_map.return_value = [i for i in range(n_routed_experts)]
        self.model = mock_model
+        num_dense_layers = getattr(config, "first_k_dense_replace", 0)
+        self.model.model.layers[num_dense_layers].mlp.experts.quant_type = QuantType.W8A8

        self.mock_rank = patch("vllm_ascend.eplb.adaptor.vllm_adaptor.dist.get_rank", return_value=0).start()
        self.mock_size = patch("vllm_ascend.eplb.adaptor.vllm_adaptor.dist.get_world_size", return_value=4).start()