[Bugfix]Support Qwen3-MOE on aclgraph mode in sizes capture and add new ut (#2511)
[Bugfix]Support Qwen3-MOE on aclgraph mode in sizes capture and add new
ut
What this PR does / why we need it?
This PR fixes the sizes-capture problem and the stream error that occur
when using ACL graph (aclgraph) mode with the Qwen3-30B MoE model.
It also adds new unit tests (ut).
Does this PR introduce any user-facing change?
no
How was this patch tested?
ut
- vLLM version: v0.10.1.1
- vLLM main:
6fad29b11b
Signed-off-by: lilinsiman <lilinsiman@gmail.com>
This commit is contained in:
@@ -21,6 +21,8 @@
|
||||
Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
@@ -72,3 +74,36 @@ def test_models_distributed_Qwen3_MOE_W8A8():
|
||||
enforce_eager=False,
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
    """Smoke-test greedy generation on Qwen3-30B-A3B with AIV expansion mode.

    Sets ``HCCL_OP_EXPANSION_MODE=AIV`` and runs the model through
    ``VllmRunner`` with ``tensor_parallel_size=2`` and
    ``enforce_eager=False``, generating five tokens for one prompt. The test
    passes if generation completes without raising; the output is not
    checked.

    The environment variable is restored to its previous state afterwards so
    the setting does not leak into tests that run later in the same process.
    """
    env_key = 'HCCL_OP_EXPANSION_MODE'
    # Remember whatever the caller had configured so it can be put back.
    previous_value = os.environ.get(env_key)
    os.environ[env_key] = 'AIV'
    try:
        example_prompts = [
            "Hello, my name is",
        ]
        dtype = "auto"
        max_tokens = 5
        with VllmRunner(
                "Qwen/Qwen3-30B-A3B",
                dtype=dtype,
                tensor_parallel_size=2,
                enforce_eager=False,
        ) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    finally:
        # Undo the override even when the run fails, so a failure here
        # cannot change the behaviour of unrelated tests.
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value
|
||||
|
||||
|
||||
def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH():
    """Smoke-test greedy generation on Qwen3-30B-A3B without an expansion mode.

    Removes ``HCCL_OP_EXPANSION_MODE`` from the environment if it is set,
    then runs the model through ``VllmRunner`` with
    ``tensor_parallel_size=2`` and ``enforce_eager=False``, generating five
    tokens for one prompt. The test passes if generation completes without
    raising; the output is not checked.
    """
    # Drop the variable when present; a no-op otherwise.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)

    prompts = ["Hello, my name is"]
    token_budget = 5

    with VllmRunner(
            "Qwen/Qwen3-30B-A3B",
            dtype="auto",
            tensor_parallel_size=2,
            enforce_eager=False,
    ) as runner:
        runner.generate_greedy(prompts, token_budget)
|
||||
Reference in New Issue
Block a user