diff --git a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
index 4782663f..de77145b 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -45,7 +45,7 @@ aisbench_cases = [{
     "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
     "max_out_len": 32768,
     "batch_size": 32,
-    "baseline": 95,
+    "baseline": 93,
     "threshold": 5
 }, {
     "case_type": "performance",
@@ -72,7 +72,8 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
         "OMP_PROC_BIND": "false",
         "HCCL_OP_EXPANSION_MODE": "AIV",
         "PAGED_ATTENTION_MASK_LEN": "5500",
-        "DYNAMIC_EPLB": "true"
+        "DYNAMIC_EPLB": "true",
+        "HCCL_BUFFSIZE": "1024"
     }
     server_args = [
         "--no-enable-prefix-caching", "--enable-expert-parallel",
@@ -84,7 +85,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
         "--quantization", "ascend", "--gpu-memory-utilization", "0.9",
         "--additional-config", '{"enable_weight_nz_layout":true, '
         '"torch_air_graph_config":{"enabled": true, "enable_multistream_mla": true, "graph_batch_size": [16], "use_cached_graph": true},'
-        '"dynamic_eplb": true, "num_iterations_eplb_update": 200, "num_wait_worker_iterations": 100, "init_redundancy_expert": 16}'
+        '"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, "init_redundancy_expert": 16}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
index b40a7859..6814d5d4 100644
--- a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -44,7 +44,7 @@ aisbench_cases = [{
     "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
     "max_out_len": 32768,
     "batch_size": 32,
-    "baseline": 95,
+    "baseline": 93,
     "threshold": 5
 }, {
     "case_type": "performance",
@@ -70,7 +70,8 @@ async def test_models(model: str, tp_size: int) -> None:
         "OMP_PROC_BIND": "false",
         "HCCL_OP_EXPANSION_MODE": "AIV",
         "PAGED_ATTENTION_MASK_LEN": "5500",
-        "DYNAMIC_EPLB": "true"
+        "DYNAMIC_EPLB": "true",
+        "HCCL_BUFFSIZE": "1024"
     }
     server_args = [
         "--no-enable-prefix-caching", "--enable-expert-parallel",
@@ -81,7 +82,7 @@ async def test_models(model: str, tp_size: int) -> None:
         "--quantization", "ascend",
         "--gpu-memory-utilization", "0.9",
         "--additional-config", '{"enable_weight_nz_layout":true, "dynamic_eplb": true, '
-        '"num_iterations_eplb_update": 200, "num_wait_worker_iterations": 100, '
+        '"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, '
        '"init_redundancy_expert": 16}'
     ]
     request_keyword_args: dict[str, Any] = {