[EPLB][Nightly] Refactor UT (#6543)

### What this PR does / why we need it?

The basic configs are extracted into a shared `config()` helper and reused by the EPLB UT. This way, if the basic configs change later, the EPLB UT does not have to be updated separately each time.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: bigsir007 <xujiacheng12@huawei.com>
Co-authored-by: bigsir007 <xujiacheng12@huawei.com>
2026-02-14 10:56:29 +08:00
parent 1e77077788
commit 64aea60f2e
4 changed files with 61 additions and 121 deletions
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
@@ -63,10 +63,7 @@ aisbench_cases = [{
 }]


-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
    port = get_open_port()
    env_dict = {
        "OMP_NUM_THREADS": "10",
@@ -85,6 +82,13 @@ async def test_models(model: str, mode: str) -> None:
        "--speculative-config",
        json.dumps(speculative_config)
    ]
+    return port, env_dict, additional_config, server_args
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, env_dict, additional_config, server_args = config()
    if mode == "single":
        server_args.append("--enforce-eager")
    server_args.extend(["--additional-config", json.dumps(additional_config)])
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
@@ -23,18 +23,7 @@ from vllm.utils.network_utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/DeepSeek-R1-0528-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
+from .test_deepseek_r1_0528_w8a8 import *

 aisbench_cases = [{
    "case_type": "accuracy",
@@ -50,46 +39,23 @@ aisbench_cases = [{

@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "100",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "200",
-        "VLLM_RPC_TIMEOUT": "3600000",
-        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
-        "DISABLE_L2_CACHE": "1",
-        "DYNAMIC_EPLB": "true",
-    }
-    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
-    compilation_config = {
-        "cudagraph_capture_sizes": [24],
-        "cudagraph_mode": "FULL_DECODE_ONLY"
-    }
-    additional_config: dict[str, Any] = {
-        "enable_shared_expert_dp": False,
-        "multistream_overlap_shared_expert": False,
-        "eplb_config": {
-            "dynamic_eplb": True,
-            "expert_heat_collection_interval": 512,
-            "algorithm_execution_interval": 100,
-            "num_redundant_experts": 0
+async def test_models_eplb(model: str) -> None:
+    port, env_dict, additional_config, server_args = config()
+    additional_config.update(
+        {
+            "eplb_config": {
+                "dynamic_eplb": "true",
+                "expert_heat_collection_interval": 1000,
+                "algorithm_execution_interval": 50,
+                "eplb_policy_type": 3,
+            }
        }
-    }
-    server_args = [
-        "--quantization", "ascend", "--seed", "1024",
-        "--no-enable-prefix-caching", "--data-parallel-size", "4",
-        "--tensor-parallel-size", "4", "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40000", "--max-num-batched-tokens",
-        "4096", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.92"
-    ]
-    server_args.extend(
-        ["--speculative-config",
-         json.dumps(speculative_config)])
-    server_args.extend(
-        ["--compilation-config",
-         json.dumps(compilation_config)])
+    )
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
@@ -113,3 +79,4 @@ async def test_models(model: str) -> None:
                           port,
                           aisbench_cases,
                           server_args=server_args)
+
--- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -23,64 +23,30 @@ from vllm.utils.network_utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/Qwen3-235B-A22B-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
-
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
+from .test_qwen3_235b_w8a8 import *


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "10",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "1024",
-        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
-    }
+async def test_models_eplb(model: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
    additional_config: dict[str, Any] = {}
-    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
-    server_args = [
-        "--quantization", "ascend", "--async-scheduling",
-        "--data-parallel-size", "4", "--tensor-parallel-size", "4",
-        "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
-        "8192", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.9"
-    ]
-    env_dict["DYNAMIC_EPLB"] = "true"
    additional_config["eplb_config"] = {
-        "dynamic_eplb": True,
-        "expert_heat_collection_interval": 512,
-        "algorithm_execution_interval": 100,
-        "num_redundant_experts": 0
+        "dynamic_eplb": "true",
+        "expert_heat_collection_interval": 600,
+        "algorithm_execution_interval": 50,
+        "num_redundant_experts": 16,
+        "eplb_policy_type": 2,
    }
+    server_args.extend(["--additional-config", json.dumps(additional_config)])
    server_args.extend(
        ["--compilation-config",
         json.dumps(compilation_config)])
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
--- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py
@@ -38,24 +38,19 @@ api_keyword_args = {
    "max_tokens": 10,
 }

-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
    port = get_open_port()
+    aisbench_cases = [{
+        "case_type": "accuracy",
+        "dataset_path": "vllm-ascend/gsm8k-lite",
+        "request_conf": "vllm_api_general_chat",
+        "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
+        "max_out_len": 32768,
+        "batch_size": 32,
+        "top_k": 20,
+        "baseline": 95,
+        "threshold": 5
+    }]
    env_dict = {
        "OMP_NUM_THREADS": "10",
        "OMP_PROC_BIND": "false",
@@ -72,11 +67,19 @@ async def test_models(model: str, mode: str) -> None:
        "8192", "--max-num-seqs", "12", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9"
    ]
+    return port, aisbench_cases, env_dict, compilation_config, server_args
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
    if mode == "piecewise":
        compilation_config["cudagraph_mode"] = "PIECEWISE"
    server_args.extend(
        ["--compilation-config",
-         json.dumps(compilation_config)])
+            json.dumps(compilation_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }