[EPLB][CI] Add dynamic EPLB CI for qwen3-moe (#5179)
### What this PR does / why we need it?
Add a dynamic EPLB CI test for qwen3-moe-30B-W8A8. The existing offline TP2 expert-parallel test is folded into a new online test that launches the server twice, with and without dynamic EPLB enabled, and asserts that the greedy completions match.
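For context, dynamic EPLB rebalances experts across expert-parallel ranks at runtime, and vllm-ascend receives its settings through vLLM's `--additional-config` JSON flag. A minimal sketch of the configuration this test exercises, with values taken from the diff below (the per-key comments are best-effort readings of the key names, not documented semantics):

```python
import json

# Knobs the new CI test passes to the server (values from this diff).
additional_config = {
    "dynamic_eplb": True,               # enable dynamic expert rebalancing
    "num_iterations_eplb_update": 100,  # presumably: rebalance cadence in engine iterations
    "num_wait_worker_iterations": 20,   # presumably: iterations workers wait before applying a new map
}

# Serialized onto the server command line, exactly as the test does:
cli_args = ["--additional-config", json.dumps(additional_config)]
```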
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -21,12 +21,16 @@
 Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
 """
 
+import json
 import os
 from unittest.mock import patch
 
+import openai
 import pytest
 from modelscope import snapshot_download  # type: ignore
-from tests.e2e.conftest import VllmRunner
+from vllm.utils import get_open_port
+
+from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
 
 
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
```
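The import hunk above is the crux of the change: `RemoteOpenAIServer` lets the EPLB check run against a live server instead of the in-process `VllmRunner` (the comment in the new test below explains why offline mode does not work). A condensed sketch of that online pattern, assuming only the constructor arguments and the `get_async_client()` call this diff itself exercises:

```python
# Sketch of the online-serving pattern used by the new test; it relies only
# on the subset of tests.e2e.conftest.RemoteOpenAIServer visible in this
# diff (env_dict is omitted here and assumed optional).
from vllm.utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer


async def probe(model: str) -> str:
    port = get_open_port()  # reserve a free TCP port up front
    args = ["--max_model_len", "8192", "--port", str(port)]
    # The context manager starts the server and tears it down on exit.
    with RemoteOpenAIServer(model, args, server_port=port,
                            auto_port=False) as server:
        client = server.get_async_client()  # OpenAI-compatible async client
        out = await client.completions.create(
            model=model, prompt="Hello", max_tokens=5, temperature=0)
        return out.choices[0].text
```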
```diff
@@ -58,22 +62,6 @@ def test_qwen3_moe_w8a8_distributed_tp2():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
-def test_qwen3_moe_w8a8_distributed_tp2_ep():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
-            max_model_len=8192,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 def test_qwen3_moe_distributed_aiv_tp2():
     os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
     example_prompts = [
```
```diff
@@ -87,3 +75,54 @@ def test_qwen3_moe_distributed_aiv_tp2():
             tensor_parallel_size=2,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@pytest.mark.asyncio
+async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
+    model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
+    port = get_open_port()
+    server_args = [
+        "--max_model_len", "8192", "--tensor_parallel_size", "2",
+        "--enable_expert_parallel", "--quantization", "ascend", "--port",
+        str(port), "--enforce_eager"
+    ]
+    env_dict = {"HCCL_BUFFSIZE": "1024"}
+    with RemoteOpenAIServer(model,
+                            server_args,
+                            server_port=port,
+                            auto_port=False,
+                            env_dict=env_dict) as server:
+        client = server.get_async_client()
+        batch = await client.completions.create(model=model,
+                                                prompt="What is deeplearning?",
+                                                max_tokens=300,
+                                                temperature=0,
+                                                top_p=1.0,
+                                                n=1)
+        gt_choices: list[openai.types.CompletionChoice] = batch.choices
+
+    # dynamic eplb test
+    # Since pytest runs as a daemon, it conflicts with the dynamic eplb manager
+    # during initialization in offline mode, so the online mode is used instead.
+    env_dict.update({"DYNAMIC_EPLB": "true"})
+    additional_config = {
+        "dynamic_eplb": True,
+        "num_iterations_eplb_update": 100,
+        "num_wait_worker_iterations": 20
+    }
+    server_args.extend(["--additional-config", json.dumps(additional_config)])
+    with RemoteOpenAIServer(model,
+                            server_args,
+                            server_port=port,
+                            auto_port=False,
+                            env_dict=env_dict) as server:
+        client = server.get_async_client()
+        batch = await client.completions.create(model=model,
+                                                prompt="What is deeplearning?",
+                                                max_tokens=300,
+                                                temperature=0,
+                                                top_p=1.0,
+                                                n=1)
+        eplb_choices: list[openai.types.CompletionChoice] = batch.choices
+    assert gt_choices[0].text == eplb_choices[
+        0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
```
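The final assertion works because both runs use greedy decoding (`temperature=0`, `top_p=1.0`), so the baseline and dynamic-EPLB servers should return identical text if expert redistribution preserves the computation. For manual debugging, a standalone probe against an already-running server could look like the following sketch (the base URL is an assumption; the test itself picks a free port):

```python
# Hypothetical manual probe of a server started with the flags above.
import openai

client = openai.OpenAI(
    base_url="http://localhost:8000/v1",  # assumed address, not from the test
    api_key="EMPTY")  # any key works when the server has no --api-key set
out = client.completions.create(
    model="vllm-ascend/Qwen3-30B-A3B-W8A8",
    prompt="What is deeplearning?",
    max_tokens=300,
    temperature=0,  # greedy decoding, so repeated runs are comparable
    top_p=1.0,
    n=1)
print(out.choices[0].text)
```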