[Refactor] Adjustments to moe_comm_method selection process (#3001)
### What this PR does / why we need it?
Fix issues mentioned in
https://github.com/vllm-project/vllm-ascend/pull/2791 and do some minor
refactoring.
1. Use Enum instead of string.
2. Avoid setting a new property on forward_context in
AscendFusedMoE.forward().
3. Enable TokenDispatcherWithMoge.
4. Remove redundant code.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruing/deepseek-mtp/pangu-pro-moe-pruing:
1. Enable/Disable EP
2. Aclgraph & eager
- vLLM version: v0.10.2
- vLLM main:
9607d5eb44
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
This commit is contained in:
@@ -15,6 +15,7 @@ from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm_ascend.ascend_forward_context import MoECommType
|
||||
from vllm_ascend.utils import AscendSocVersion
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
@@ -24,21 +25,21 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
"soc_version, enable_expert_parallel, world_size, num_tokens, mc2_tokens_capacity, quant_type, expected_method",
|
||||
[
|
||||
# Case 1: Expert parallel is disabled, should always be 'allgather'
|
||||
(AscendSocVersion.A2, False, 8, 100, 256, None, "allgather"),
|
||||
(AscendSocVersion.A3, False, 16, 500, 256, None, "allgather"),
|
||||
(AscendSocVersion.A2, False, 8, 100, 256, None, MoECommType.ALLGATHER),
|
||||
(AscendSocVersion.A3, False, 16, 500, 256, None, MoECommType.ALLGATHER),
|
||||
|
||||
# Case 2: A2 SOC with w4a8_dynamic -> use alltoall when not mc2
|
||||
(AscendSocVersion.A2, True, 8, 100, 256, "w4a8_dynamic", "alltoall"),
|
||||
(AscendSocVersion.A2, True, 16, 257, 256, "w4a8_dynamic", "alltoall"),
|
||||
(AscendSocVersion.A2, True, 16, 100, 256, "w4a8_dynamic", "mc2"), # meets mc2 condition
|
||||
(AscendSocVersion.A2, True, 8, 100, 256, "w4a8_dynamic", MoECommType.ALLTOALL),
|
||||
(AscendSocVersion.A2, True, 16, 257, 256, "w4a8_dynamic", MoECommType.ALLTOALL),
|
||||
(AscendSocVersion.A2, True, 16, 100, 256, "w4a8_dynamic", MoECommType.MC2), # meets mc2 condition
|
||||
|
||||
# Case 3: A2 SOC without w4a8_dynamic -> fallback to allgather
|
||||
(AscendSocVersion.A2, True, 8, 100, 256, None, "allgather"),
|
||||
(AscendSocVersion.A2, True, 16, 257, 256, None, "allgather"),
|
||||
(AscendSocVersion.A2, True, 8, 100, 256, None, MoECommType.ALLGATHER),
|
||||
(AscendSocVersion.A2, True, 16, 257, 256, None, MoECommType.ALLGATHER),
|
||||
|
||||
# Case 4: A3 SOC
|
||||
(AscendSocVersion.A3, True, 8, 100, 256, None, "mc2"),
|
||||
(AscendSocVersion.A3, True, 8, 257, 256, None, "alltoall"),
|
||||
(AscendSocVersion.A3, True, 8, 100, 256, None, MoECommType.MC2),
|
||||
(AscendSocVersion.A3, True, 8, 257, 256, None, MoECommType.ALLTOALL),
|
||||
])
|
||||
# yapf: enable
|
||||
def test_select_moe_comm_method(soc_version, enable_expert_parallel,
|
||||
|
||||
Reference in New Issue
Block a user