### What this PR does / why we need it?
1. Move additional functionality from `fused_moe.py` to `common_fused_moe.py` and remove `fused_moe.py`.
2. Remove unnecessary custom classes from `qwen3_moe.py`; the file will be removed entirely after vllm-ascend v0.11.0 is released.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Tested with Qwen3-30B-A3B, Qwen3-30B-A3B-W8A8, DeepSeek-V3-W4A8-Pruing, deepseek-mtp, and pangu-pro-moe-pruing:
1. EP enabled/disabled
2. Aclgraph & eager
3. SP

- vLLM version: v0.11.0

---------

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
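For downstream code, the practical effect of item 1 is an import-path change. A minimal sketch, assuming `AscendFusedMoE` is among the moved symbols (the exact names may differ):

```python
# Before this PR (vllm_ascend/ops/fused_moe.py, now removed):
# from vllm_ascend.ops.fused_moe import AscendFusedMoE

# After this PR, the consolidated implementation lives in common_fused_moe:
from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
```

The test fixtures below patch `vllm_ascend.ops.common_fused_moe` accordingly.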
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, patch

import pytest
import torch
from transformers import PretrainedConfig
from vllm.config import CacheConfig, EPLBConfig, ParallelConfig
from vllm.distributed.parallel_state import GroupCoordinator

@pytest.fixture
def base_config():
    config = PretrainedConfig(
        hidden_size=128,
        num_attention_heads=8,
        num_hidden_layers=2,
        intermediate_size=256,
        hidden_act="silu",
        rms_norm_eps=1e-6,
        rope_theta=10000.0,
        max_position_embeddings=2048,
        n_routed_experts=4,
        n_shared_experts=1,
        moe_intermediate_size=256,
        num_experts_per_tok=2,
        routed_scaling_factor=1.0,
        first_k_dense_replace=0,
        moe_layer_freq=1,
        kv_lora_rank=16,
        qk_nope_head_dim=16,
        qk_rope_head_dim=16,
        v_head_dim=32,
        topk_method="noaux_tc",
        scoring_func="softmax",
        norm_topk_prob=True,
        n_group=1,
        topk_group=1,
        vocab_size=10000,
    )
    return config

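# Note on base_config: PretrainedConfig stores unrecognized kwargs as plain
# attributes, which is why DeepSeek-specific fields such as n_routed_experts
# or kv_lora_rank are readable from hf_config even though the base class does
# not declare them. For example:
#
#     cfg = PretrainedConfig(n_routed_experts=4)
#     assert cfg.n_routed_experts == 4
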
@pytest.fixture
def vllm_config(base_config):
    model_config = SimpleNamespace(
        hf_config=base_config,
        tensor_parallel_size=1,
        dtype=torch.float32,
        use_mla=True,
        quant_config=None,
        max_model_len=2048,
    )
    parallel_config = MagicMock(spec=ParallelConfig)
    eplb_config = MagicMock(spec=EPLBConfig)
    eplb_config.num_redundant_experts = 0
    parallel_config.eplb_config = eplb_config

    cache_config = CacheConfig()
    vllm_config = Mock()
    vllm_config.model_config = model_config
    vllm_config.cache_config = cache_config
    vllm_config.quant_config = None
    vllm_config.parallel_config = parallel_config
    return vllm_config

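# Design note: model_config is a SimpleNamespace rather than a Mock,
# presumably so that a mistyped attribute raises AttributeError instead of
# silently yielding a new child Mock; the remaining configs stay Mocks where
# auto-created attributes are convenient.
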
@pytest.fixture
def mock_distributed():
    tp_group = Mock(spec=GroupCoordinator)
    tp_group.rank_in_group = 0
    tp_group.world_size = 1
    tp_group.device_group = Mock()

    dp_group = Mock(spec=GroupCoordinator)
    dp_group.rank_in_group = 0
    dp_group.world_size = 1

    ep_group = Mock(spec=GroupCoordinator)
    ep_group.rank_in_group = 0
    ep_group.world_size = 1
    ep_group.device_group = Mock()
    ep_group.device_group.rank.return_value = 0
    ep_group.device_group.size.return_value = 1

    pp_group = Mock(spec=GroupCoordinator)
    pp_group.rank_in_group = 0
    pp_group.world_size = 1
    # The PP group reports itself as neither the first nor the last pipeline
    # rank, so get_pp_group() only needs to be patched once below.
    pp_group.is_first_rank = False
    pp_group.is_last_rank = False

    mock_vllm_config = Mock()
    mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
    mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)

    with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
            patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
            patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
            patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
            patch("vllm_ascend.ops.common_fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
            patch("vllm_ascend.ops.moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \
            patch("vllm_ascend.ops.moe.token_dispatcher.get_ascend_soc_version", return_value=None), \
            patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
                       _PP=pp_group), \
            patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
            patch("torch.npu.current_device", return_value=0):
        yield

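# Why patch.dict on module __dict__: vLLM's get_tp_group()/get_ep_group()
# helpers read the module-level globals _TP, _EP, _DP and _PP in
# vllm.distributed.parallel_state, so swapping those entries fakes an
# initialized single-rank distributed setup; patch.dict restores the original
# values when the fixture exits.
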
@pytest.fixture
def mock_forward_context():
    forward_context = Mock(in_profile_run=False, with_prefill=False)
    with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
               return_value=forward_context):
        yield

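# A minimal smoke-test sketch showing how the fixtures compose; the test name
# and assertions are illustrative. It touches only the mocked objects, so no
# NPU or real process group is required.
def test_fixtures_compose(vllm_config, mock_distributed, mock_forward_context):
    hf_config = vllm_config.model_config.hf_config
    assert hf_config.n_routed_experts == 4
    assert vllm_config.parallel_config.eplb_config.num_redundant_experts == 0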