[main] Fuse GroupedMatmul, Swiglu and DynamicQuant in `W8A8_DYNAMIC` quantized MoE layers (#2275)
### What this PR does / why we need it?
Fuse `GroupedMatmul`, `Swiglu` and `DynamicQuant` into a single fused
operation, `GroupedMatmulSwigluQuant`.
1. Extract the functions shared by `w4a8_dynamic.py` and `w8a8_dynamic.py`
into common helpers.
2. When the inputs are supported, call the fused operator
`npu_grouped_matmul_swiglu_quant` in place of the three separate kernels
(see the sketch below).
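
For orientation, here is a minimal sketch of the three steps being fused. This is a pure-PyTorch illustration of the pattern, not the vllm-ascend code: the single-expert shapes, the int8 scheme, and the final call (including its `torch_npu` namespace and argument list) are assumptions for illustration only.

```python
import torch


def unfused_reference(x: torch.Tensor, w13: torch.Tensor):
    """The three steps the fused op replaces, shown for a single expert.

    x: (num_tokens, hidden_size); w13: (2 * intermediate_size, hidden_size),
    i.e. the gate and up projections concatenated along the output dim.
    """
    h = x @ w13.T                                  # GroupedMatmul (one group)
    gate, up = h.chunk(2, dim=-1)                  # Swiglu: split gate/up,
    act = torch.nn.functional.silu(gate) * up      # then SiLU(gate) * up
    # DynamicQuant: per-token symmetric int8 quantization
    scale = act.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 127.0
    q = torch.clamp((act / scale).round(), -128, 127).to(torch.int8)
    return q, scale


# On supported inputs this PR collapses the three steps into one kernel call,
# roughly: torch_npu.npu_grouped_matmul_swiglu_quant(x, w13, ...)
# (hypothetical call shape; see torch_npu for the real signature).
```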
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Tested on a W8A8-quantized Qwen3-235B-A22B model with `bs=16`:
1. `tp=8`, `dp=1`, `moe_tp=8`, `moe_ep=1`: TPOP increased by 21.54%, Output
Token Throughput increased by 27.35%
<img width="3443" height="211" alt="image"
src="https://github.com/user-attachments/assets/a1a9c14d-2310-41be-9a03-36125dabae6e"
/>
2. `tp=8`, `dp=1`, `moe_tp=1`, `moe_ep=8`: TPOP increased by 17.38%, Output
Token Throughput increased by 6.86%
<img width="3443" height="211" alt="image"
src="https://github.com/user-attachments/assets/1ce92e92-720d-40c0-8b4d-c493e5cb10a6"
/>
- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/6997a25ac65ed6cc3c2be6d09ca45f633a345f63
---------
Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com>
Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
2025-09-04 11:37:32 +08:00
from unittest.mock import Mock, patch

import torch

from tests.ut.base import TestBase
from vllm_ascend.quantization.w8a8_dynamic import \
    AscendW8A8DynamicFusedMoEMethod


class TestAscendW8A8FusedMoEMethod(TestBase):
    num_experts = 8
    hidden_size = 128
    intermediate_size = 128

    @patch("torch.distributed.get_rank")
    @patch("vllm_ascend.quantization.w8a8_dynamic.get_mc2_group")
    @patch("vllm_ascend.quantization.w8a8_dynamic.get_ascend_config")
    @patch("vllm_ascend.quantization.w8a8_dynamic.get_ep_group")
    def setUp(self, mock_get_ep_group, mock_get_ascend_config,
              mock_get_mc2_group, mock_get_rank):
        with patch(
                'vllm_ascend.quantization.w8a8_dynamic.get_current_vllm_config'
        ) as mock_get_current_vllm_config:
            mock_vllm_config = Mock()
            mock_vllm_config.quant_config = Mock(
                quant_description={"group_size": 256})
            mock_vllm_config.scheduler_config = Mock(
                max_num_batched_tokens=2048,
                max_model_len=2048,
                enable_chunked_prefill=False)
            mock_get_current_vllm_config.return_value = mock_vllm_config
            mock_ep_group = Mock()
            mock_get_ep_group.return_value = mock_ep_group
            mock_ascend_config = Mock()

            # Build a Mock with concrete attributes to stand in for
            # ascend_scheduler_config
            mock_ascend_scheduler_config = Mock()
            mock_ascend_scheduler_config.enabled = False
            mock_ascend_scheduler_config.max_num_batched_tokens = 1024
            mock_ascend_scheduler_config.max_model_len = 2048
            mock_ascend_config.ascend_scheduler_config = mock_ascend_scheduler_config

            mock_ascend_config.torchair_graph_config = Mock(enabled=False)
            mock_ascend_config.enable_chunked_prefill = False
            mock_get_ascend_config.return_value = mock_ascend_config
            mock_mc2_group = Mock(device_group=0)
            mock_get_mc2_group.return_value = mock_mc2_group
            mock_rank = Mock()
            mock_get_rank.return_value = mock_rank

            self.quant_method = AscendW8A8DynamicFusedMoEMethod()
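
    # setUp is decorated with @patch so AscendW8A8DynamicFusedMoEMethod can
    # be constructed without real distributed process groups or an Ascend
    # config; the tests below only exercise its weight/param factory methods.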
    def test_get_weight(self):
        param_dict = self.quant_method.get_weight(self.num_experts,
                                                  self.intermediate_size,
                                                  self.hidden_size,
                                                  torch.bfloat16)
        self.assertEqual(param_dict["w13_weight"].dtype, torch.int8)
        self.assertEqual(
            param_dict["w13_weight"].shape,
            (self.num_experts, 2 * self.intermediate_size, self.hidden_size))

    def test_get_dynamic_quant_param(self):
        param_dict = self.quant_method.get_dynamic_quant_param(
            self.num_experts, self.intermediate_size, self.hidden_size,
            torch.bfloat16)
        self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
        self.assertEqual(param_dict["w13_weight_scale"].shape,
                         (self.num_experts, 2 * self.intermediate_size, 1))
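

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original tests: the shape assertions
# above expect w13_weight to be (num_experts, 2 * intermediate_size,
# hidden_size) because SwiGLU layers commonly store the gate and up
# projections concatenated along the output dimension, so one matmul yields
# both halves at once. All names below are local illustrations, not
# vllm-ascend APIs.
if __name__ == "__main__":
    num_tokens, hidden_size, intermediate_size, num_experts = 4, 128, 128, 8
    w13 = torch.randn(num_experts, 2 * intermediate_size, hidden_size)
    x = torch.randn(num_tokens, hidden_size)
    h = x @ w13[0].T                       # tokens routed to expert 0: (4, 256)
    gate, up = h.chunk(2, dim=-1)          # split the concatenated halves
    act = torch.nn.functional.silu(gate) * up  # SwiGLU output: (4, 128)
    print(act.shape)                       # torch.Size([4, 128])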