[EPLB][Nightly] Refactor UT (#6543)
### What this PR does / why we need it?
The basic configs are extracted into a shared `config()` helper and reused by the EPLB UTs, so that if the basic configs change later, the EPLB UTs do not have to be modified again to match.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: bigsir007 <xujiacheng12@huawei.com>
Co-authored-by: bigsir007 <xujiacheng12@huawei.com>
This commit is contained in:
@@ -63,10 +63,7 @@ aisbench_cases = [{
|
||||
}]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("mode", MODES)
|
||||
async def test_models(model: str, mode: str) -> None:
|
||||
def config():
|
||||
port = get_open_port()
|
||||
env_dict = {
|
||||
"OMP_NUM_THREADS": "10",
|
||||
@@ -85,6 +82,13 @@ async def test_models(model: str, mode: str) -> None:
|
||||
"--speculative-config",
|
||||
json.dumps(speculative_config)
|
||||
]
|
||||
return port, env_dict, additional_config, server_args
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("mode", MODES)
|
||||
async def test_models(model: str, mode: str) -> None:
|
||||
port, env_dict, additional_config, server_args = config()
|
||||
if mode == "single":
|
||||
server_args.append("--enforce-eager")
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
|
||||
@@ -23,18 +23,7 @@ from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.conftest import RemoteOpenAIServer
|
||||
from tools.aisbench import run_aisbench_cases
|
||||
|
||||
MODELS = [
|
||||
"vllm-ascend/DeepSeek-R1-0528-W8A8",
|
||||
]
|
||||
|
||||
prompts = [
|
||||
"San Francisco is a",
|
||||
]
|
||||
|
||||
api_keyword_args = {
|
||||
"max_tokens": 10,
|
||||
}
|
||||
from .test_deepseek_r1_0528_w8a8 import *
|
||||
|
||||
aisbench_cases = [{
|
||||
"case_type": "accuracy",
|
||||
@@ -50,46 +39,23 @@ aisbench_cases = [{
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
async def test_models(model: str) -> None:
|
||||
port = get_open_port()
|
||||
env_dict = {
|
||||
"OMP_NUM_THREADS": "100",
|
||||
"OMP_PROC_BIND": "false",
|
||||
"HCCL_BUFFSIZE": "200",
|
||||
"VLLM_RPC_TIMEOUT": "3600000",
|
||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
|
||||
"DISABLE_L2_CACHE": "1",
|
||||
"DYNAMIC_EPLB": "true",
|
||||
}
|
||||
speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
|
||||
compilation_config = {
|
||||
"cudagraph_capture_sizes": [24],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
}
|
||||
additional_config: dict[str, Any] = {
|
||||
"enable_shared_expert_dp": False,
|
||||
"multistream_overlap_shared_expert": False,
|
||||
"eplb_config": {
|
||||
"dynamic_eplb": True,
|
||||
"expert_heat_collection_interval": 512,
|
||||
"algorithm_execution_interval": 100,
|
||||
"num_redundant_experts": 0
|
||||
async def test_models_eplb(model: str) -> None:
|
||||
port, env_dict, additional_config, server_args = config()
|
||||
additional_config.update(
|
||||
{
|
||||
"eplb_config": {
|
||||
"dynamic_eplb": "true",
|
||||
"expert_heat_collection_interval": 1000,
|
||||
"algorithm_execution_interval": 50,
|
||||
"eplb_policy_type": 3,
|
||||
}
|
||||
}
|
||||
}
|
||||
server_args = [
|
||||
"--quantization", "ascend", "--seed", "1024",
|
||||
"--no-enable-prefix-caching", "--data-parallel-size", "4",
|
||||
"--tensor-parallel-size", "4", "--enable-expert-parallel", "--port",
|
||||
str(port), "--max-model-len", "40000", "--max-num-batched-tokens",
|
||||
"4096", "--max-num-seqs", "12", "--trust-remote-code",
|
||||
"--gpu-memory-utilization", "0.92"
|
||||
]
|
||||
server_args.extend(
|
||||
["--speculative-config",
|
||||
json.dumps(speculative_config)])
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
)
|
||||
env_dict.update(
|
||||
{
|
||||
"DYNAMIC_EPLB": "true",
|
||||
}
|
||||
)
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
@@ -113,3 +79,4 @@ async def test_models(model: str) -> None:
|
||||
port,
|
||||
aisbench_cases,
|
||||
server_args=server_args)
|
||||
|
||||
|
||||
@@ -23,64 +23,30 @@ from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.conftest import RemoteOpenAIServer
|
||||
from tools.aisbench import run_aisbench_cases
|
||||
|
||||
MODELS = [
|
||||
"vllm-ascend/Qwen3-235B-A22B-W8A8",
|
||||
]
|
||||
|
||||
prompts = [
|
||||
"San Francisco is a",
|
||||
]
|
||||
|
||||
api_keyword_args = {
|
||||
"max_tokens": 10,
|
||||
}
|
||||
|
||||
aisbench_cases = [{
|
||||
"case_type": "accuracy",
|
||||
"dataset_path": "vllm-ascend/gsm8k-lite",
|
||||
"request_conf": "vllm_api_general_chat",
|
||||
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
|
||||
"max_out_len": 32768,
|
||||
"batch_size": 32,
|
||||
"top_k": 20,
|
||||
"baseline": 95,
|
||||
"threshold": 5
|
||||
}]
|
||||
from .test_qwen3_235b_w8a8 import *
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
async def test_models(model: str) -> None:
|
||||
port = get_open_port()
|
||||
env_dict = {
|
||||
"OMP_NUM_THREADS": "10",
|
||||
"OMP_PROC_BIND": "false",
|
||||
"HCCL_BUFFSIZE": "1024",
|
||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
||||
}
|
||||
async def test_models_eplb(model: str) -> None:
|
||||
port, aisbench_cases, env_dict, compilation_config, server_args = config()
|
||||
env_dict.update(
|
||||
{
|
||||
"DYNAMIC_EPLB": "true",
|
||||
}
|
||||
)
|
||||
additional_config: dict[str, Any] = {}
|
||||
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
||||
server_args = [
|
||||
"--quantization", "ascend", "--async-scheduling",
|
||||
"--data-parallel-size", "4", "--tensor-parallel-size", "4",
|
||||
"--enable-expert-parallel", "--port",
|
||||
str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
|
||||
"8192", "--max-num-seqs", "12", "--trust-remote-code",
|
||||
"--gpu-memory-utilization", "0.9"
|
||||
]
|
||||
env_dict["DYNAMIC_EPLB"] = "true"
|
||||
additional_config["eplb_config"] = {
|
||||
"dynamic_eplb": True,
|
||||
"expert_heat_collection_interval": 512,
|
||||
"algorithm_execution_interval": 100,
|
||||
"num_redundant_experts": 0
|
||||
"dynamic_eplb": "true",
|
||||
"expert_heat_collection_interval": 600,
|
||||
"algorithm_execution_interval": 50,
|
||||
"num_redundant_experts": 16,
|
||||
"eplb_policy_type": 2,
|
||||
}
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
}
|
||||
|
||||
@@ -38,24 +38,19 @@ api_keyword_args = {
|
||||
"max_tokens": 10,
|
||||
}
|
||||
|
||||
aisbench_cases = [{
|
||||
"case_type": "accuracy",
|
||||
"dataset_path": "vllm-ascend/gsm8k-lite",
|
||||
"request_conf": "vllm_api_general_chat",
|
||||
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
|
||||
"max_out_len": 32768,
|
||||
"batch_size": 32,
|
||||
"top_k": 20,
|
||||
"baseline": 95,
|
||||
"threshold": 5
|
||||
}]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("mode", MODES)
|
||||
async def test_models(model: str, mode: str) -> None:
|
||||
def config():
|
||||
port = get_open_port()
|
||||
aisbench_cases = [{
|
||||
"case_type": "accuracy",
|
||||
"dataset_path": "vllm-ascend/gsm8k-lite",
|
||||
"request_conf": "vllm_api_general_chat",
|
||||
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
|
||||
"max_out_len": 32768,
|
||||
"batch_size": 32,
|
||||
"top_k": 20,
|
||||
"baseline": 95,
|
||||
"threshold": 5
|
||||
}]
|
||||
env_dict = {
|
||||
"OMP_NUM_THREADS": "10",
|
||||
"OMP_PROC_BIND": "false",
|
||||
@@ -72,11 +67,19 @@ async def test_models(model: str, mode: str) -> None:
|
||||
"8192", "--max-num-seqs", "12", "--trust-remote-code",
|
||||
"--gpu-memory-utilization", "0.9"
|
||||
]
|
||||
return port, aisbench_cases, env_dict, compilation_config, server_args
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("mode", MODES)
|
||||
async def test_models(model: str, mode: str) -> None:
|
||||
port, aisbench_cases, env_dict, compilation_config, server_args = config()
|
||||
if mode == "piecewise":
|
||||
compilation_config["cudagraph_mode"] = "PIECEWISE"
|
||||
server_args.extend(
|
||||
["--compilation-config",
|
||||
json.dumps(compilation_config)])
|
||||
json.dumps(compilation_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user