diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
index b71d8854..a16aca24 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
@@ -63,10 +63,7 @@ aisbench_cases = [{
 }]
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
     port = get_open_port()
     env_dict = {
         "OMP_NUM_THREADS": "10",
@@ -85,6 +82,14 @@ async def test_models(model: str, mode: str) -> None:
         "--speculative-config",
         json.dumps(speculative_config)
     ]
+    return port, env_dict, additional_config, server_args
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, env_dict, additional_config, server_args = config()
     if mode == "single":
         server_args.append("--enforce-eager")
     server_args.extend(["--additional-config", json.dumps(additional_config)])
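Note on the refactor above: the shared setup moves into a plain config() helper rather than a pytest fixture, so every caller gets a fresh port and fresh mutable dicts and lists. Below is a minimal sketch of the intended reuse pattern; the names mirror the diff, but the helper body is abbreviated and test_models_variant is a hypothetical name used only for illustration:

    import json

    import pytest

    def config():
        # Shared defaults; the real helper also builds the env vars and
        # the speculative-decoding settings shown in the hunks above.
        port = 8000  # stand-in; the real code calls get_open_port()
        env_dict = {"OMP_NUM_THREADS": "10"}
        additional_config: dict = {}
        server_args = ["--port", str(port)]
        return port, env_dict, additional_config, server_args

    @pytest.mark.asyncio
    @pytest.mark.parametrize("model", ["vllm-ascend/DeepSeek-R1-0528-W8A8"])
    async def test_models_variant(model: str) -> None:
        # A variant test reuses the base setup, then layers overrides on top.
        port, env_dict, additional_config, server_args = config()
        additional_config["eplb_config"] = {"dynamic_eplb": True}
        server_args.extend(
            ["--additional-config", json.dumps(additional_config)])

Because config() returns fresh objects on each call, parametrized variants cannot leak mutations into one another, which is what lets the eplb tests below call update() on the returned dicts safely.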
diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
index 08db9a15..35c5b30f 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
@@ -23,18 +23,8 @@ from vllm.utils.network_utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/DeepSeek-R1-0528-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
+from .test_deepseek_r1_0528_w8a8 import (MODELS, api_keyword_args, config,
+                                         prompts)
 
 aisbench_cases = [{
     "case_type": "accuracy",
     "dataset_path": "vllm-ascend/gsm8k-lite",
     "request_conf": "vllm_api_general_chat",
     "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
     "max_out_len": 32768,
     "batch_size": 32,
     "top_k": 20,
     "baseline": 95,
     "threshold": 5
 }]
@@ -50,46 +40,23 @@
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "100",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "200",
-        "VLLM_RPC_TIMEOUT": "3600000",
-        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
-        "DISABLE_L2_CACHE": "1",
-        "DYNAMIC_EPLB": "true",
-    }
-    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
-    compilation_config = {
-        "cudagraph_capture_sizes": [24],
-        "cudagraph_mode": "FULL_DECODE_ONLY"
-    }
-    additional_config: dict[str, Any] = {
-        "enable_shared_expert_dp": False,
-        "multistream_overlap_shared_expert": False,
-        "eplb_config": {
-            "dynamic_eplb": True,
-            "expert_heat_collection_interval": 512,
-            "algorithm_execution_interval": 100,
-            "num_redundant_experts": 0
+async def test_models_eplb(model: str) -> None:
+    port, env_dict, additional_config, server_args = config()
+    additional_config.update(
+        {
+            "eplb_config": {
+                "dynamic_eplb": True,
+                "expert_heat_collection_interval": 1000,
+                "algorithm_execution_interval": 50,
+                "eplb_policy_type": 3,
+            }
         }
-    }
-    server_args = [
-        "--quantization", "ascend", "--seed", "1024",
-        "--no-enable-prefix-caching", "--data-parallel-size", "4",
-        "--tensor-parallel-size", "4", "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40000", "--max-num-batched-tokens",
-        "4096", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.92"
-    ]
-    server_args.extend(
-        ["--speculative-config",
-         json.dumps(speculative_config)])
-    server_args.extend(
-        ["--compilation-config",
-         json.dumps(compilation_config)])
+    )
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
index 50a6ae53..2d54c275 100644
--- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -23,64 +23,30 @@ from vllm.utils.network_utils import get_open_port
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/Qwen3-235B-A22B-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
-
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
+from .test_qwen3_235b_w8a8 import MODELS, api_keyword_args, config, prompts
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "10",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "1024",
-        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
-    }
+async def test_models_eplb(model: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     additional_config: dict[str, Any] = {}
-    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
-    server_args = [
-        "--quantization", "ascend", "--async-scheduling",
-        "--data-parallel-size", "4", "--tensor-parallel-size", "4",
-        "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
-        "8192", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.9"
-    ]
-    env_dict["DYNAMIC_EPLB"] = "true"
     additional_config["eplb_config"] = {
         "dynamic_eplb": True,
-        "expert_heat_collection_interval": 512,
-        "algorithm_execution_interval": 100,
-        "num_redundant_experts": 0
+        "expert_heat_collection_interval": 600,
+        "algorithm_execution_interval": 50,
+        "num_redundant_experts": 16,
+        "eplb_policy_type": 2,
     }
+    server_args.extend(["--additional-config", json.dumps(additional_config)])
     server_args.extend(
         ["--compilation-config",
          json.dumps(compilation_config)])
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
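A type detail worth keeping consistent across the eplb hunks above: environment variables are necessarily strings, so "DYNAMIC_EPLB": "true" is correct inside env_dict, but eplb_config is serialized to JSON for --additional-config, where a Python bool and the string "true" produce different payloads. A quick illustration with the standard json module (the key name is taken from the diff):

    import json

    # A JSON boolean and a JSON string are distinct values, so the
    # eplb_config entries above use Python True rather than "true".
    print(json.dumps({"dynamic_eplb": True}))    # {"dynamic_eplb": true}
    print(json.dumps({"dynamic_eplb": "true"}))  # {"dynamic_eplb": "true"}

Whether the server-side parser coerces the string form is not guaranteed, so the boolean form is the safer choice and matches the code these hunks replace.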
"top_k": 20, - "baseline": 95, - "threshold": 5 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -async def test_models(model: str, mode: str) -> None: +def config(): port = get_open_port() + aisbench_cases = [{ + "case_type": "accuracy", + "dataset_path": "vllm-ascend/gsm8k-lite", + "request_conf": "vllm_api_general_chat", + "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", + "max_out_len": 32768, + "batch_size": 32, + "top_k": 20, + "baseline": 95, + "threshold": 5 + }] env_dict = { "OMP_NUM_THREADS": "10", "OMP_PROC_BIND": "false", @@ -72,11 +67,19 @@ async def test_models(model: str, mode: str) -> None: "8192", "--max-num-seqs", "12", "--trust-remote-code", "--gpu-memory-utilization", "0.9" ] + return port, aisbench_cases, env_dict, compilation_config, server_args + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("mode", MODES) +async def test_models(model: str, mode: str) -> None: + port, aisbench_cases, env_dict, compilation_config, server_args = config() if mode == "piecewise": compilation_config["cudagraph_mode"] = "PIECEWISE" server_args.extend( ["--compilation-config", - json.dumps(compilation_config)]) + json.dumps(compilation_config)]) request_keyword_args: dict[str, Any] = { **api_keyword_args, }