[Bugfix]Support Qwen3-MOE on aclgraph mode in sizes capture and add new ut (#2511)

[Bugfix] Support Qwen3-MOE on aclgraph mode in sizes capture and add new UT. What this PR does / why we need it: This PR fixes the sizes-capture and stream errors that occur when using ACLgraph with the Qwen3-30B MOE model, and adds a new UT. Does this PR introduce any user-facing change? No. How was this patch tested? UT. - vLLM version: v0.10.1.1 - vLLM main: 6fad29b11b. Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-08-26 12:39:21 +08:00
parent b3fdd78a6b
commit cfe77e83ae
3 changed files with 80 additions and 7 deletions
--- a/tests/e2e/multicard/test_qwen3_moe.py
+++ b/tests/e2e/multicard/test_qwen3_moe.py
@@ -21,6 +21,8 @@
 Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
 """

+import os
+
 from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
@@ -72,3 +74,36 @@ def test_models_distributed_Qwen3_MOE_W8A8():
            enforce_eager=False,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
+    """Generate with Qwen3-30B-A3B (TP=2, non-eager), HCCL expansion mode AIV."""
+    from unittest import mock
+
+    # patch.dict restores os.environ on exit, so the AIV setting is visible while
+    # the runner is alive but cannot leak into tests that run afterwards.
+    with mock.patch.dict(os.environ, {'HCCL_OP_EXPANSION_MODE': 'AIV'}):
+        with VllmRunner(
+                "Qwen/Qwen3-30B-A3B",
+                dtype="auto",
+                tensor_parallel_size=2,
+                enforce_eager=False,
+        ) as vllm_model:
+            vllm_model.generate_greedy(["Hello, my name is"], 5)
+
+
+def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH():
+    """Generate with Qwen3-30B-A3B (TP=2, non-eager), expansion mode unset."""
+    from unittest import mock
+
+    # patch.dict snapshots os.environ and restores it on exit, so a value set by
+    # the caller or CI is removed only for the duration of this test.
+    with mock.patch.dict(os.environ):
+        os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
+        with VllmRunner(
+                "Qwen/Qwen3-30B-A3B",
+                dtype="auto",
+                tensor_parallel_size=2,
+                enforce_eager=False,
+        ) as vllm_model:
+            vllm_model.generate_greedy(["Hello, my name is"], 5)