Reapply "[MoE] [Refactor] Remove manual memory cleanup (#3365)" (#3483) (#3512)

### What this PR does / why we need it?
1. Replace manual memory cleanup with passing parameter.
2. FusedMoEPrepareAndFinalizeWithMC2 now inherits from All2All to avoid
duplicated code.
3. Fix MC2 bug introduced in
https://github.com/vllm-project/vllm-ascend/pull/3365
4. Unify aclgraph & eager in W8A8_dynamic.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
e2e & ut

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
This commit is contained in:
weichen
2025-10-22 11:41:30 +08:00
committed by GitHub
parent 6ef62cb427
commit 2f1b9a7a64
13 changed files with 608 additions and 522 deletions

View File

@@ -22,6 +22,7 @@ Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
"""
import os
from unittest.mock import patch
from modelscope import snapshot_download # type: ignore
@@ -41,6 +42,7 @@ def test_models_distributed_Qwen3_MOE_TP2():
vllm_model.generate_greedy(example_prompts, max_tokens)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_models_distributed_Qwen3_MOE_TP2_WITH_EP():
example_prompts = [
"Hello, my name is",
@@ -70,6 +72,22 @@ def test_models_distributed_Qwen3_MOE_W8A8():
vllm_model.generate_greedy(example_prompts, max_tokens)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_models_distributed_Qwen3_MOE_W8A8_WITH_EP():
    """Smoke-test greedy decoding on the W8A8-quantized Qwen3-MoE model
    with tensor parallelism (TP=2) and expert parallelism enabled.

    HCCL_BUFFSIZE is patched into the environment for the duration of the
    test via the decorator.
    """
    prompts = ["Hello, my name is"]
    token_budget = 5
    # Resolve the quantized checkpoint from ModelScope before spinning up
    # the runner; the runner context manages model lifetime.
    model_path = snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8")
    runner = VllmRunner(
        model_path,
        max_model_len=8192,
        tensor_parallel_size=2,
        enable_expert_parallel=True,
        quantization="ascend",
    )
    with runner as vllm_model:
        vllm_model.generate_greedy(prompts, token_budget)
def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
example_prompts = [