From 64aea60f2e0256e6cdfd505cca9243972bcccc7c Mon Sep 17 00:00:00 2001
From: JIACHENG XU <56331162+Spicy-Stick@users.noreply.github.com>
Date: Sat, 14 Feb 2026 10:56:29 +0800
Subject: [PATCH] [EPLB][Nightly] Refactor UT (#6543)

### What this PR does / why we need it?
The basic configs are extracted into a shared `config()` helper and reused by the EPLB UTs: each EPLB test now imports the common settings from its base test module via `from ... import *` and applies only its EPLB-specific overrides. This way, when the basic configs change later, the EPLB UTs no longer need to be modified each time.
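For illustration, here is a minimal sketch of the reuse pattern (a simplified stand-in for the real test modules; the names `config()`, `env_dict`, `additional_config`, and `server_args` match the tests, but the bodies are abbreviated and this is not the exact test code):

```python
# Abbreviated stand-in for the base UT module
# (test_deepseek_r1_0528_w8a8.py): it owns the shared configs.
def config():
    """Return the basic configs shared by the base and EPLB UTs."""
    env_dict = {"OMP_NUM_THREADS": "10", "OMP_PROC_BIND": "false"}
    additional_config = {"enable_shared_expert_dp": False}
    server_args = ["--quantization", "ascend", "--enable-expert-parallel"]
    return env_dict, additional_config, server_args


# The EPLB UT (which in the real tests does
# `from .test_deepseek_r1_0528_w8a8 import *`) fetches the shared
# configs once and applies only its EPLB-specific deltas.
env_dict, additional_config, server_args = config()
env_dict.update({"DYNAMIC_EPLB": "true"})
additional_config.update({
    "eplb_config": {
        "dynamic_eplb": "true",
        "expert_heat_collection_interval": 1000,
        "algorithm_execution_interval": 50,
    }
})
```

A change inside `config()` now propagates to every EPLB variant automatically.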

### Does this PR introduce _any_ user-facing change?
No. This PR only refactors nightly e2e tests.

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: bigsir007
Co-authored-by: bigsir007
---
 .../models/test_deepseek_r1_0528_w8a8.py      | 12 ++--
 .../models/test_deepseek_r1_0528_w8a8_eplb.py | 69 +++++--------------
 .../models/test_qwen3_235b_a22b_w8a8_eplb.py  | 62 ++++-------------
 .../models/test_qwen3_235b_w8a8.py            | 39 ++++++-----
 4 files changed, 61 insertions(+), 121 deletions(-)

diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
index b71d8854..a16aca24 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
@@ -63,10 +63,7 @@ aisbench_cases = [{
 }]
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
     port = get_open_port()
     env_dict = {
         "OMP_NUM_THREADS": "10",
@@ -85,6 +82,13 @@ async def test_models(model: str, mode: str) -> None:
         "--speculative-config",
         json.dumps(speculative_config)
     ]
+    return port, env_dict, additional_config, server_args
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, env_dict, additional_config, server_args = config()
     if mode == "single":
         server_args.append("--enforce-eager")
     server_args.extend(["--additional-config", json.dumps(additional_config)])
diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
index 08db9a15..35c5b30f 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
@@ -23,18 +23,7 @@ from vllm.utils.network_utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/DeepSeek-R1-0528-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
+from .test_deepseek_r1_0528_w8a8 import *
 
 aisbench_cases = [{
     "case_type": "accuracy",
@@ -50,46 +39,23 @@
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "100",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "200",
-        "VLLM_RPC_TIMEOUT": "3600000",
-        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
-        "DISABLE_L2_CACHE": "1",
-        "DYNAMIC_EPLB": "true",
-    }
-    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
-    compilation_config = {
-        "cudagraph_capture_sizes": [24],
-        "cudagraph_mode": "FULL_DECODE_ONLY"
-    }
-    additional_config: dict[str, Any] = {
-        "enable_shared_expert_dp": False,
-        "multistream_overlap_shared_expert": False,
-        "eplb_config": {
-            "dynamic_eplb": True,
-            "expert_heat_collection_interval": 512,
-            "algorithm_execution_interval": 100,
-            "num_redundant_experts": 0
+async def test_models_eplb(model: str) -> None:
+    port, env_dict, additional_config, server_args = config()
+    additional_config.update(
+        {
+            "eplb_config": {
+                "dynamic_eplb": "true",
+                "expert_heat_collection_interval": 1000,
+                "algorithm_execution_interval": 50,
+                "eplb_policy_type": 3,
+            }
         }
-    }
-    server_args = [
-        "--quantization", "ascend", "--seed", "1024",
-        "--no-enable-prefix-caching", "--data-parallel-size", "4",
-        "--tensor-parallel-size", "4", "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40000", "--max-num-batched-tokens",
-        "4096", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.92"
-    ]
-    server_args.extend(
-        ["--speculative-config",
-         json.dumps(speculative_config)])
-    server_args.extend(
-        ["--compilation-config",
-         json.dumps(compilation_config)])
+    )
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
@@ -113,3 +79,4 @@ async def test_models(model: str) -> None:
             port,
             aisbench_cases,
             server_args=server_args)
+
diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
index 50a6ae53..2d54c275 100644
--- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -23,64 +23,30 @@ from vllm.utils.network_utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/Qwen3-235B-A22B-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
-
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
+from .test_qwen3_235b_w8a8 import *
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "10",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "1024",
-        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
-    }
+async def test_models_eplb(model: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     additional_config: dict[str, Any] = {}
-    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
-    server_args = [
-        "--quantization", "ascend", "--async-scheduling",
-        "--data-parallel-size", "4", "--tensor-parallel-size", "4",
-        "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
-        "8192", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.9"
-    ]
-    env_dict["DYNAMIC_EPLB"] = "true"
     additional_config["eplb_config"] = {
-        "dynamic_eplb": True,
-        "expert_heat_collection_interval": 512,
-        "algorithm_execution_interval": 100,
-        "num_redundant_experts": 0
+        "dynamic_eplb": "true",
+        "expert_heat_collection_interval": 600,
+        "algorithm_execution_interval": 50,
"num_redundant_experts": 16, + "eplb_policy_type": 2, } + server_args.extend(["--additional-config", json.dumps(additional_config)]) server_args.extend( ["--compilation-config", json.dumps(compilation_config)]) - server_args.extend(["--additional-config", json.dumps(additional_config)]) request_keyword_args: dict[str, Any] = { **api_keyword_args, } diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py b/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py index f97a50a8..c468a851 100644 --- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py +++ b/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py @@ -38,24 +38,19 @@ api_keyword_args = { "max_tokens": 10, } -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "top_k": 20, - "baseline": 95, - "threshold": 5 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -async def test_models(model: str, mode: str) -> None: +def config(): port = get_open_port() + aisbench_cases = [{ + "case_type": "accuracy", + "dataset_path": "vllm-ascend/gsm8k-lite", + "request_conf": "vllm_api_general_chat", + "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", + "max_out_len": 32768, + "batch_size": 32, + "top_k": 20, + "baseline": 95, + "threshold": 5 + }] env_dict = { "OMP_NUM_THREADS": "10", "OMP_PROC_BIND": "false", @@ -72,11 +67,19 @@ async def test_models(model: str, mode: str) -> None: "8192", "--max-num-seqs", "12", "--trust-remote-code", "--gpu-memory-utilization", "0.9" ] + return port, aisbench_cases, env_dict, compilation_config, server_args + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("mode", MODES) +async def test_models(model: str, mode: str) -> None: + port, aisbench_cases, env_dict, compilation_config, server_args = config() if mode == "piecewise": compilation_config["cudagraph_mode"] = "PIECEWISE" server_args.extend( ["--compilation-config", - json.dumps(compilation_config)]) + json.dumps(compilation_config)]) request_keyword_args: dict[str, Any] = { **api_keyword_args, }