#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

from typing import TypedDict
from unittest.mock import MagicMock, patch

import pytest
import torch
import torch.nn as nn
import torch_npu
from pytest_mock import MockerFixture

from tests.ut.base import TestBase
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import select_experts
from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod
from vllm_ascend.ops.fused_moe.moe_mlp import cumsum_group_list, unified_apply_mlp
from vllm_ascend.ops.fused_moe.moe_runtime_args import (
    MoEMlpComputeInput,
    MoEPrepareOutput,
    MoEQuantParams,
    MoEWeights,
)
from vllm_ascend.quantization.quant_type import QuantType
from vllm_ascend.utils import AscendDeviceType, adapt_patch

adapt_patch(True)


def mock_ep_and_mc2_group(mocker):
    """Build a fake expert-parallel / MC2 communication group (world size 4)."""
    mock_group = mocker.MagicMock()
    mock_group.rank_in_group = 0
    mock_group.rank = 0
    mock_group.world_size = 4
    mock_group.device_group = "mock_group_ep"
    mock_group.all_to_all = MagicMock(return_value=torch.randn(8, 8))
    return mock_group


def mock_dp_and_tp_group(mocker):
    """Build a fake data-parallel / tensor-parallel group (world size 2)."""
    mock_group = mocker.MagicMock()
    mock_group.rank_in_group = 0
    mock_group.world_size = 2
    mock_group.device_group = "mock_group"
    mock_group.all_gather = MagicMock(return_value=torch.randn(10, 32))
    return mock_group


def mock_npu_format_cast(weight_data, format):
    """Pass-through stub for ``torch_npu.npu_format_cast``; returns the
    weight unchanged instead of converting its NPU storage format."""
    return weight_data


def build_mlp_compute_input_fixture(
    *,
    hidden_states: torch.Tensor,
    w1: torch.Tensor | list[torch.Tensor],
    w2: torch.Tensor | list[torch.Tensor],
    group_list: torch.Tensor,
    with_quant: bool,
    group_list_type: int = 1,
    dynamic_scale: torch.Tensor | None = None,
    topk_scales: torch.Tensor | None = None,
    w1_scale: torch.Tensor | list[torch.Tensor] | None = None,
    w2_scale: torch.Tensor | list[torch.Tensor] | None = None,
    w1_scale_bias: torch.Tensor | None = None,
    w2_scale_bias: torch.Tensor | None = None,
    w1_offset: torch.Tensor | None = None,
    w2_offset: torch.Tensor | None = None,
    fusion: bool = False,
    activation: str = "silu",
    need_trans: bool = True,
    dynamic_eplb: bool = False,
) -> MoEMlpComputeInput:
    """Bundle weights, optional quantization parameters and runtime flags into
    a ``MoEMlpComputeInput`` for the ``unified_apply_mlp`` tests below.

    ``with_quant=True`` selects ``QuantType.W8A8``; otherwise the quant type
    is ``QuantType.NONE``.
    """
    return MoEMlpComputeInput(
        hidden_states=hidden_states,
        group_list=group_list,
        group_list_type=group_list_type,
        dynamic_scale=dynamic_scale,
        topk_scales=topk_scales,
        weights=MoEWeights(
            w1=w1,
            w2=w2,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            w1_scale_bias=w1_scale_bias,
            w2_scale_bias=w2_scale_bias,
            w1_offset=w1_offset,
            w2_offset=w2_offset,
        ),
        quant=MoEQuantParams(quant_type=QuantType.W8A8 if with_quant else QuantType.NONE),
        fusion=fusion,
        activation=activation,
        need_trans=need_trans,
        dynamic_eplb=dynamic_eplb,
    )
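

# A minimal usage sketch of the builder above, mirroring the calls made in the
# tests further down. ``_example_mlp_compute_input`` is a hypothetical helper
# kept only for illustration and is never invoked by the test suite.
def _example_mlp_compute_input() -> MoEMlpComputeInput:
    return build_mlp_compute_input_fixture(
        hidden_states=torch.randn(10, 20, dtype=torch.float16),
        w1=torch.randn(5, 20, 40, dtype=torch.float16),
        w2=torch.randn(5, 40, 20, dtype=torch.float16),
        group_list=torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64),
        with_quant=False,
    )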


@pytest.fixture(autouse=True)
def setup_vllm_config_mock(mocker: MockerFixture):
    """Patch ``get_current_vllm_config`` so every test sees a minimal vLLM
    config (llama hf_config, tp=2, max_num_seqs=4, max_model_len=2048)."""
    mock_hf_config = MagicMock()
    mock_hf_config.model_type = "llama"

    mock_model_config = MagicMock()
    mock_model_config.hf_config = mock_hf_config

    mock_vllm_config = MagicMock()
    mock_vllm_config.model_config = mock_model_config
    mock_vllm_config.parallel_config = MagicMock(tensor_parallel_size=2)
    mock_vllm_config.scheduler_config = MagicMock(max_num_seqs=4)
    mock_vllm_config.model_config.max_model_len = 2048

    mocker.patch('vllm_ascend.ops.fused_moe.fused_moe.get_current_vllm_config',
                 return_value=mock_vllm_config)


@pytest.fixture
def mock_dist_env(mocker: MockerFixture):
    """Patch the distributed groups, forward context and token dispatchers so
    the fused-MoE code paths can run without an initialized NPU cluster."""
    mock_moe_comm_method = MagicMock()

    def mock_prepare(hidden_states, router_logits, **kwargs):
        return MoEPrepareOutput(
            hidden_states=hidden_states,
            router_logits=router_logits,
            mc2_mask=kwargs.get("mc2_mask"),
            padded_hidden_states_shape=None,
            pertoken_scale=None,
        )

    mock_moe_comm_method.prepare.side_effect = mock_prepare

    mock_fused_experts_result = torch.randn(16, 2)
    mock_moe_comm_method.fused_experts.return_value = mock_fused_experts_result

    def mock_finalize(hidden_states, **kwargs):
        return hidden_states

    mock_moe_comm_method.finalize.side_effect = mock_finalize

    dp_metadata = MagicMock(num_tokens_across_dp_cpu=[5, 5])
    mock_weight_prefetch_method = MagicMock()
    mock_forward_context_obj = MagicMock(moe_comm_method=mock_moe_comm_method,
                                         moe_comm_type=MoECommType.MC2,
                                         max_tokens_across_dp=10,
                                         dp_metadata=dp_metadata,
                                         mc2_mask=torch.zeros(
                                             16, dtype=torch.bool),
                                         padded_num_tokens=16,
                                         with_quant=False)

    with patch('torch.distributed.get_rank', return_value=0), \
         patch('torch.distributed.get_world_size', return_value=4), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.token_dispatcher.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
               return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.get_ascend_config',
               return_value=MagicMock(
                   enable_multistream_moe=False,
                   expert_map_path=None
               )), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.init_eplb_config',
               return_value=(torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]), None, 0)), \
         patch('vllm_ascend.ops.fused_moe.fused_moe.get_forward_context',
               return_value=mock_forward_context_obj), \
         patch('vllm_ascend.ascend_forward_context.get_forward_context',
               return_value=mock_forward_context_obj), \
         patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3), \
         patch('vllm_ascend.ops.fused_moe.moe_comm_method.MC2CommImpl._get_token_dispatcher',
               return_value=None), \
         patch('vllm_ascend.ops.fused_moe.moe_comm_method.AlltoAllCommImpl._get_token_dispatcher',
               return_value=None), \
         patch('vllm_ascend.ops.fused_moe.moe_comm_method.AllGatherCommImpl._get_token_dispatcher',
               return_value=None), \
         patch('vllm_ascend.ops.fused_moe.experts_selector.get_weight_prefetch_method',
               return_value=mock_weight_prefetch_method):

        yield {
            'mock_forward_context_obj': mock_forward_context_obj,
            'mock_moe_comm_method': mock_moe_comm_method,
        }


@pytest.fixture
def mock_moe_env(mocker: MockerFixture):
    """Patch the torch_npu MoE kernels with shape-compatible random tensors.

    The v2 dispatch/combine ops are patched only when the installed torch_npu
    provides them, hence the ``hasattr`` guard at the bottom.
    """
    with patch('torch_npu.npu_moe_gating_top_k',
               return_value=(torch.randn(8, 2),
                             torch.randint(0, 8, (8, 2)),
                             None)), \
         patch('torch_npu.npu_moe_init_routing',
               return_value=(torch.randn(8, 2),
                             torch.randint(0, 8, (8, 2)),
                             torch.tensor([0, 1, 2, 4, 6, 2, 7, 1]))), \
         patch("torch_npu.npu_moe_compute_expert_tokens",
               return_value=torch.randn(8, 2)), \
         patch("torch_npu.npu_moe_distribute_dispatch",
               return_value=torch.randn(16, 2)), \
         patch("torch_npu.npu_moe_distribute_combine",
               return_value=torch.randn(16, 2)), \
         patch("torch_npu.npu_grouped_matmul",
               return_value=[torch.randn(16, 2)]), \
         patch("torch_npu.npu_swiglu", return_value=torch.randn(16, 2)), \
         patch("torch_npu.npu_moe_gating_top_k_softmax",
               return_value=(torch.randn(8, 2),
                             torch.randint(0, 8, (8, 2)),
                             torch.tensor([0, 1, 2, 4, 6, 2, 7, 1]))), \
         patch("torch_npu.npu_moe_finalize_routing",
               return_value=torch.randn(16, 2)):
        if hasattr(torch_npu, 'npu_moe_distribute_dispatch_v2'):
            with patch("torch_npu.npu_moe_distribute_dispatch_v2",
                       return_value=torch.randn(16, 2)), \
                 patch("torch_npu.npu_moe_distribute_combine_v2",
                       return_value=torch.randn(16, 2)):
                yield
        else:
            yield


@pytest.fixture
def default_moe_config():
    return {
        'num_experts': 8,
        'top_k': 2,
        'hidden_size': 512,
        'intermediate_size': 1024
    }


@pytest.fixture
def moe_method(mock_dist_env):
    moe = MagicMock()
    moe.moe_parallel_config.return_value = MagicMock(ep_size=4)
    moe.moe_parallel_config.use_ep = False
    moe.moe_parallel_config.dp_size = 1
    return AscendUnquantizedFusedMoEMethod(moe)


class Device(TypedDict):
    device_id: int
    device_expert: list[int]


class Layer(TypedDict):
    layer_id: int
    device_count: int
    device_list: list[Device]


class MockData(TypedDict):
    moe_layer_count: int
    layer_list: list[Layer]
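

# An illustrative instance of the expert-map structure the TypedDicts above
# describe; the concrete values here are assumed, chosen only to match the
# declared field types.
_EXAMPLE_MOCK_DATA: MockData = {
    "moe_layer_count": 1,
    "layer_list": [{
        "layer_id": 0,
        "device_count": 2,
        "device_list": [
            {"device_id": 0, "device_expert": [0, 1]},
            {"device_id": 1, "device_expert": [2, 3]},
        ],
    }],
}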


class MockQuantMethod(nn.Module):

    def __init__(self, shared_experts, num_tokens):
        super().__init__()
        if shared_experts:
            self.apply = MagicMock(return_value=(torch.randn(num_tokens, 32),
                                                 torch.randn(num_tokens, 10)))
        else:
            self.apply = MagicMock(return_value=torch.randn(num_tokens, 32))


class TestExpertsSelector:

    @pytest.mark.parametrize("global_num_experts", [256, 128])
    def test_select_experts(self, mock_dist_env, mock_moe_env,
                            global_num_experts):

        x = torch.randn(8, 2)
        router_logits = torch.randn(8, 2)
        topk_weights, topk_ids = select_experts(
            hidden_states=x,
            router_logits=router_logits,
            top_k=2,
            use_grouped_topk=False,
            renormalize=True,
            topk_group=None,
            num_expert_group=None,
            custom_routing_function=None,
            scoring_func="softmax",
            e_score_correction_bias=None,
            global_num_experts=global_num_experts)

        assert topk_weights.shape == (8, 2)
        assert topk_ids.shape == (8, 2)


class TestCumsumGroupList(TestBase):

    def setUp(self):
        self.active_num = 8
        self.expert_num = 128
        self.experts = torch.zeros((self.expert_num, ), dtype=torch.int64)
        self.experts[:self.active_num] = 1
        self.experts = self.experts[torch.randperm(self.expert_num)]
        self.group_list = self.experts.cumsum(dim=0)
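
    # The three tests below cover the group_list encodings accepted by
    # cumsum_group_list (as constructed in each test body): type 0 is already
    # a cumulative sum, type 1 holds per-expert token counts, and type 2
    # holds (index, count) pairs. In every case the expected result is the
    # cumulative form computed in setUp.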

    def test_cumsum_group_list_with_type_0(self):
        group_list = self.experts.cumsum(dim=0)
        group_list_type = 0
        result = cumsum_group_list(group_list, group_list_type, 0)
        self.assertTrue(torch.equal(result, self.group_list))

    def test_cumsum_group_list_with_type_1(self):
        group_list = self.experts
        group_list_type = 1
        result = cumsum_group_list(group_list, group_list_type, 0)
        self.assertTrue(torch.equal(result, self.group_list))

    def test_cumsum_group_list_with_type_2(self):
        tokens = torch.arange(self.expert_num, dtype=torch.int64)
        group_list = torch.cat([
            tokens.reshape(self.expert_num, 1),
            self.experts.reshape(self.expert_num, 1)
        ],
                               dim=1)
        group_list_type = 2
        result = cumsum_group_list(group_list,
                                   group_list_type,
                                   0,
                                   active_num=self.active_num,
                                   expert_num=self.expert_num)
        self.assertTrue(torch.equal(result, self.group_list))


class TestUnifiedApplyMLP(TestBase):

    @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method',
           return_value=MagicMock())
    @patch('vllm_ascend.ascend_forward_context.get_forward_context')
    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_dynamic_quant')
    @patch('torch_npu.npu_dequant_swiglu_quant')
    def test_unified_apply_mlp_with_quantization_mc2(
            self, mock_npu_dequant, mock_npu_dynamic_quant,
            mock_npu_grouped_matmul, mock_soc_version,
            mock_get_forward_context, mock_get_weight_prefetch_method):

        mock_forward_context = MagicMock()
        mock_forward_context.moe_comm_type = MoECommType.MC2
        mock_get_forward_context.return_value = mock_forward_context

        mock_npu_dynamic_quant.return_value = (
            torch.randint(-128, 127, (10, 20), dtype=torch.int8),
            torch.rand(10, 1, dtype=torch.float32))

        mock_npu_grouped_matmul.side_effect = [
            [torch.randint(-2147483648, 2147483647, (10, 40), dtype=torch.int32)],
            [torch.randn(10, 20, dtype=torch.bfloat16)],
        ]

        mock_npu_dequant.return_value = (
            torch.randn(10, 40, dtype=torch.bfloat16),
            torch.randn(10, 1, dtype=torch.float32))

        hidden_states = torch.randn(10, 20, dtype=torch.bfloat16)
        w1 = torch.randint(-128, 127, (5, 20, 40), dtype=torch.int8)
        w1_scale = torch.randn(5, 40, dtype=torch.float32)
        w2 = torch.randint(-128, 127, (5, 40, 20), dtype=torch.int8)
        w2_scale = torch.randn(5, 20, dtype=torch.bfloat16)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)
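
        # Mocked W8A8 MC2 data flow, as set up above (a reading of the mocks,
        # not asserted op-by-op): npu_dynamic_quant int8-quantizes the bf16
        # activations, the first npu_grouped_matmul emits int32 accumulations,
        # npu_dequant_swiglu_quant dequantizes and applies the activation, and
        # the second npu_grouped_matmul projects back to bf16.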

        result = unified_apply_mlp(mlp_compute_input=build_mlp_compute_input_fixture(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            group_list=group_list,
            with_quant=True,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
        ))

        mock_get_forward_context.assert_called()
        mock_npu_dynamic_quant.assert_called()
        self.assertEqual(mock_npu_grouped_matmul.call_count, 2)
        mock_npu_dequant.assert_called_once()

        self.assertEqual(result.dtype, torch.bfloat16)

    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
    def test_unified_apply_mlp_without_quantization(
            self, mock_npu_dynamic_quant, mock_npu_swiglu,
            mock_npu_grouped_matmul, mock_soc_version):
        mock_npu_grouped_matmul.side_effect = [
            [torch.randn(10, 40, dtype=torch.float16)],
            [torch.randn(10, 20, dtype=torch.float16)],
        ]
        mock_npu_swiglu.return_value = torch.randn(10, 40, dtype=torch.float16)
        mock_npu_dynamic_quant.return_value = (MagicMock(), MagicMock())

        hidden_states = torch.randn(10, 20, dtype=torch.float16)
        w1 = torch.randn(5, 20, 40, dtype=torch.float16)
        w2 = torch.randn(5, 40, 20, dtype=torch.float16)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)
        topk_scales = torch.randn(10, 1, dtype=torch.float16)

        result = unified_apply_mlp(mlp_compute_input=build_mlp_compute_input_fixture(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            group_list=group_list,
            with_quant=False,
            topk_scales=topk_scales,
        ))

        self.assertEqual(mock_npu_grouped_matmul.call_count, 2)
        mock_npu_swiglu.assert_called_once()

        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

    @patch('vllm_ascend.ops.fused_moe.moe_mlp.HAS_TRITON', False)
    @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method',
           return_value=MagicMock())
    @patch('vllm_ascend.ascend_forward_context.get_forward_context')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
    def test_unified_apply_mlp_with_quantization_and_dynamic_scale(
            self, mock_npu_dynamic_quant, mock_npu_swiglu,
            mock_npu_grouped_matmul, mock_get_forward_context,
            mock_get_weight_prefetch_method):

        mock_forward_context = MagicMock()
        mock_forward_context.with_quant = True
        mock_forward_context.fused_moe_state = "NOT_MC2"
        mock_get_forward_context.return_value = mock_forward_context
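
        # HAS_TRITON is patched to False above so the test stays on the plain
        # torch_npu path. Supplying a precomputed dynamic_scale below should
        # leave exactly one npu_dynamic_quant call (a reading of the asserts
        # at the end of this test, not a claim about kernel internals).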

        mock_npu_grouped_matmul.side_effect = [
            [torch.randn(10, 40, dtype=torch.bfloat16)],
            [torch.randn(10, 20, dtype=torch.bfloat16)],
        ]

        mock_npu_swiglu.return_value = torch.randn(10, 40, dtype=torch.bfloat16)

        mock_npu_dynamic_quant.return_value = (
            torch.randint(-128, 127, (10, 40), dtype=torch.int8),
            torch.rand(10, 1, dtype=torch.float32))

        hidden_states = torch.randn(10, 20, dtype=torch.bfloat16)
        hidden_states_shape = hidden_states.shape
        w1 = torch.randn(5, 20, 40, dtype=torch.bfloat16)
        w1_scale = torch.randn(5, 40, dtype=torch.bfloat16)
        w2 = torch.randn(5, 40, 20, dtype=torch.bfloat16)
        w2_scale = torch.randn(5, 20, dtype=torch.bfloat16)
        w1_scale_bias = torch.randn(5, 40, dtype=torch.bfloat16)
        w2_scale_bias = torch.randn(5, 20, dtype=torch.bfloat16)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)
        provided_dynamic_scale = torch.rand(10, 1, dtype=torch.float32)

        result = unified_apply_mlp(mlp_compute_input=build_mlp_compute_input_fixture(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            group_list=group_list,
            with_quant=True,
            dynamic_scale=provided_dynamic_scale,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            w1_scale_bias=w1_scale_bias,
            w2_scale_bias=w2_scale_bias,
        ))

        mock_get_forward_context.assert_called()

        self.assertEqual(mock_npu_grouped_matmul.call_count, 2)
        mock_npu_swiglu.assert_called_once()
        mock_npu_dynamic_quant.assert_called_once()

        self.assertEqual(result.shape, hidden_states_shape)
        self.assertEqual(result.dtype, torch.bfloat16)

    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType._310P)
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
    def test_unified_apply_mlp_without_quantization_310p(
            self, mock_npu_dynamic_quant, mock_npu_swiglu,
            mock_npu_grouped_matmul, mock_soc_version):
        mock_gmm1_out = torch.randn(10, 40, dtype=torch.float16)
        mock_gmm2_out = torch.randn(10, 20, dtype=torch.float16)
        mock_npu_grouped_matmul.side_effect = [[mock_gmm1_out],
                                               [mock_gmm2_out]]

        mock_npu_swiglu.return_value = torch.randn(10, 40, dtype=torch.float16)

        mock_npu_dynamic_quant.return_value = (MagicMock(), MagicMock())

        hidden_states = torch.randn(10, 20, dtype=torch.float16)
        w1 = torch.randn(5, 20, 40, dtype=torch.float16)
        w2 = torch.randn(5, 40, 20, dtype=torch.float16)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)
        topk_scales = torch.randn(10, 1, dtype=torch.float16)

        result = unified_apply_mlp(mlp_compute_input=build_mlp_compute_input_fixture(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            group_list=group_list,
            with_quant=False,
            topk_scales=topk_scales,
        ))

        self.assertEqual(mock_npu_grouped_matmul.call_count, 2)
        mock_npu_swiglu.assert_called_once()

        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

    @patch("vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method",
           return_value=MagicMock())
    @patch("vllm_ascend.ascend_forward_context.get_forward_context")
    @patch("torch_npu.npu_grouped_matmul")
    @patch("torch_npu.npu_swiglu")
    @patch("torch_npu.npu_grouped_matmul_swiglu_quant")
    @patch("torch_npu.npu_dynamic_quant")
    def test_unified_apply_mlp_with_quantization_and_fusion_mlp(
            self, mock_npu_dynamic_quant, mock_npu_grouped_matmul_swiglu_quant,
            mock_npu_swiglu, mock_npu_grouped_matmul,
            mock_get_forward_context, mock_get_weight_prefetch_method):

        mock_forward_context = MagicMock()
        mock_forward_context.with_quant = True
        mock_forward_context.fused_moe_state = "NOT_MC2"
        mock_get_forward_context.return_value = mock_forward_context

        mock_npu_grouped_matmul_swiglu_quant.return_value = (
            torch.randint(-128, 127, (10, 40), dtype=torch.int8),
            torch.rand(10, 1, dtype=torch.float32),
            torch.rand(10, 1, dtype=torch.float32))
        mock_npu_grouped_matmul.side_effect = [
            [torch.randn(10, 20, dtype=torch.bfloat16)]
        ]
        mock_npu_swiglu.return_value = torch.randn(10, 40, dtype=torch.bfloat16)
        mock_npu_dynamic_quant.return_value = (
            torch.randint(-128, 127, (10, 40), dtype=torch.int8),
            torch.rand(10, 1, dtype=torch.float32))
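
        # With fusion=True, the fused npu_grouped_matmul_swiglu_quant kernel
        # stands in for the first grouped matmul + swiglu + dynamic-quant
        # sequence, leaving only the down-projection on the plain
        # npu_grouped_matmul mock; that is what the assert_called_once
        # checks below verify.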

        hidden_states = torch.randn(10, 20, dtype=torch.bfloat16)
        hidden_states_shape = hidden_states.shape
        w1 = torch.randn(5, 20, 40, dtype=torch.bfloat16)
        w1_scale = torch.randn(5, 40, dtype=torch.bfloat16)
        w2 = torch.randn(5, 40, 20, dtype=torch.bfloat16)
        w2_scale = torch.randn(5, 20, dtype=torch.bfloat16)
        w1_scale_bias = torch.randn(5, 40, dtype=torch.bfloat16)
        w2_scale_bias = torch.randn(5, 20, dtype=torch.bfloat16)
        group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64)
        provided_dynamic_scale = torch.rand(10, 1, dtype=torch.float32)

        result = unified_apply_mlp(mlp_compute_input=build_mlp_compute_input_fixture(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            group_list=group_list,
            with_quant=True,
            dynamic_scale=provided_dynamic_scale,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            w1_scale_bias=w1_scale_bias,
            w2_scale_bias=w2_scale_bias,
            fusion=True,
        ))

        mock_get_forward_context.assert_called()
        mock_npu_grouped_matmul.assert_called_once()
        mock_npu_grouped_matmul_swiglu_quant.assert_called_once()

        self.assertTrue(mock_forward_context.with_quant)
        self.assertEqual(result.shape, hidden_states_shape)
        self.assertEqual(result.dtype, torch.bfloat16)