[TEST]Add 2P1D multi node cases for nightly test (#3764)
### What this PR does / why we need it?
This PR adds the 2P1D multi-node func/acc/perf test cases; we need to
run them daily.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running the test.
- vLLM version: v0.11.0rc3
- vLLM main: c9461e05a4
---------
Signed-off-by: jiangyunfan1 <jiangyunfan1@h-partners.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -144,20 +144,21 @@ deployment:
|
|||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
perf:
|
||||||
case_type: performance
|
case_type: performance
|
||||||
dataset_path: vllm-ascend/GSM8K-in3500-bs400
|
dataset_path: vllm-ascend/GSM8K-in3500-bs2800
|
||||||
request_conf: vllm_api_stream_chat
|
request_conf: vllm_api_stream_chat
|
||||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
||||||
num_prompts: 1
|
num_prompts: 2800
|
||||||
max_out_len: 2
|
max_out_len: 1500
|
||||||
batch_size: 1
|
batch_size: 700
|
||||||
baseline: 5
|
request_rate: 11.2
|
||||||
|
baseline: 1
|
||||||
threshold: 0.97
|
threshold: 0.97
|
||||||
acc:
|
acc:
|
||||||
case_type: accuracy
|
case_type: accuracy
|
||||||
dataset_path: vllm-ascend/AIME2024
|
dataset_path: vllm-ascend/gsm8k
|
||||||
request_conf: vllm_api_general_chat
|
request_conf: vllm_api_general_chat
|
||||||
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
|
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
|
||||||
max_out_len: 10
|
max_out_len: 32768
|
||||||
batch_size: 32
|
batch_size: 512
|
||||||
baseline: 1
|
baseline: 95
|
||||||
threshold: 1
|
threshold: 5
|
||||||
|
|||||||
@@ -97,22 +97,3 @@ deployment:
|
|||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
|
||||||
case_type: performance
|
|
||||||
dataset_path: vllm-ascend/GSM8K-in3500-bs400
|
|
||||||
request_conf: vllm_api_stream_chat
|
|
||||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
|
||||||
num_prompts: 1
|
|
||||||
max_out_len: 2
|
|
||||||
batch_size: 1
|
|
||||||
baseline: 5
|
|
||||||
threshold: 0.97
|
|
||||||
acc:
|
|
||||||
case_type: accuracy
|
|
||||||
dataset_path: vllm-ascend/AIME2024
|
|
||||||
request_conf: vllm_api_general_chat
|
|
||||||
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
|
|
||||||
max_out_len: 10
|
|
||||||
batch_size: 32
|
|
||||||
baseline: 1
|
|
||||||
threshold: 1
|
|
||||||
|
|||||||
@@ -47,22 +47,3 @@ deployment:
|
|||||||
--no-enable-prefix-caching
|
--no-enable-prefix-caching
|
||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
|
||||||
case_type: performance
|
|
||||||
dataset_path: vllm-ascend/GSM8K-in3500-bs400
|
|
||||||
request_conf: vllm_api_stream_chat
|
|
||||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
|
||||||
num_prompts: 1
|
|
||||||
max_out_len: 2
|
|
||||||
batch_size: 1
|
|
||||||
baseline: 5
|
|
||||||
threshold: 0.97
|
|
||||||
acc:
|
|
||||||
case_type: accuracy
|
|
||||||
dataset_path: vllm-ascend/AIME2024
|
|
||||||
request_conf: vllm_api_general_chat
|
|
||||||
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
|
|
||||||
max_out_len: 10
|
|
||||||
batch_size: 32
|
|
||||||
baseline: 1
|
|
||||||
threshold: 1
|
|
||||||
|
|||||||
@@ -47,22 +47,3 @@ deployment:
|
|||||||
--no-enable-prefix-caching
|
--no-enable-prefix-caching
|
||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
|
||||||
case_type: performance
|
|
||||||
dataset_path: vllm-ascend/GSM8K-in3500-bs400
|
|
||||||
request_conf: vllm_api_stream_chat
|
|
||||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
|
||||||
num_prompts: 1
|
|
||||||
max_out_len: 2
|
|
||||||
batch_size: 1
|
|
||||||
baseline: 5
|
|
||||||
threshold: 0.97
|
|
||||||
acc:
|
|
||||||
case_type: accuracy
|
|
||||||
dataset_path: vllm-ascend/AIME2024
|
|
||||||
request_conf: vllm_api_general_chat
|
|
||||||
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
|
|
||||||
max_out_len: 10
|
|
||||||
batch_size: 32
|
|
||||||
baseline: 1
|
|
||||||
threshold: 1
|
|
||||||
|
|||||||
@@ -84,22 +84,3 @@ deployment:
|
|||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
|
||||||
case_type: performance
|
|
||||||
dataset_path: vllm-ascend/GSM8K-in3500-bs400
|
|
||||||
request_conf: vllm_api_stream_chat
|
|
||||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
|
|
||||||
num_prompts: 1
|
|
||||||
max_out_len: 2
|
|
||||||
batch_size: 1
|
|
||||||
baseline: 5
|
|
||||||
threshold: 0.97
|
|
||||||
acc:
|
|
||||||
case_type: accuracy
|
|
||||||
dataset_path: vllm-ascend/AIME2024
|
|
||||||
request_conf: vllm_api_general_chat
|
|
||||||
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
|
|
||||||
max_out_len: 10
|
|
||||||
batch_size: 32
|
|
||||||
baseline: 1
|
|
||||||
threshold: 1
|
|
||||||
|
|||||||
@@ -50,8 +50,6 @@ class MultiNodeConfig:
|
|||||||
self.proxy_port = get_avaliable_port()
|
self.proxy_port = get_avaliable_port()
|
||||||
self.perf_cmd = perf_cmd
|
self.perf_cmd = perf_cmd
|
||||||
self.acc_cmd = acc_cmd
|
self.acc_cmd = acc_cmd
|
||||||
assert perf_cmd is not None, "perf_cmd must be provided"
|
|
||||||
assert acc_cmd is not None, "acc_cmd must be provided"
|
|
||||||
|
|
||||||
self.cur_index = int(os.getenv("LWS_WORKER_INDEX", 0))
|
self.cur_index = int(os.getenv("LWS_WORKER_INDEX", 0))
|
||||||
self.cur_ip = get_cur_ip()
|
self.cur_ip = get_cur_ip()
|
||||||
@@ -220,10 +218,10 @@ class MultiNodeConfig:
|
|||||||
server_port=server_port,
|
server_port=server_port,
|
||||||
server_cmd=server_cmd))
|
server_cmd=server_cmd))
|
||||||
|
|
||||||
benchmarks = config_data.get("benchmarks", {})
|
benchmarks = config_data.get("benchmarks") or {}
|
||||||
assert benchmarks is not None, "benchmarks must be provided"
|
assert benchmarks is not None, "benchmarks must be provided"
|
||||||
perf_cmd = benchmarks["perf"]
|
perf_cmd = benchmarks.get("perf")
|
||||||
acc_cmd = benchmarks["acc"]
|
acc_cmd = benchmarks.get("acc")
|
||||||
|
|
||||||
return cls(model=model,
|
return cls(model=model,
|
||||||
test_name=test_name,
|
test_name=test_name,
|
||||||
@@ -290,3 +288,8 @@ class MultiNodeConfig:
|
|||||||
subprocess.run(cmd, env=env, check=True)
|
subprocess.run(cmd, env=env, check=True)
|
||||||
assert os.path.exists(
|
assert os.path.exists(
|
||||||
str(ranktable_path)), "failed generate ranktable.json"
|
str(ranktable_path)), "failed generate ranktable.json"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
config = MultiNodeConfig.from_yaml()
|
||||||
|
print(config.perf_cmd)
|
||||||
|
|||||||
@@ -120,6 +120,17 @@ download_go() {
|
|||||||
print_success "Go $GOVER installed successfully"
|
print_success "Go $GOVER installed successfully"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
install_ais_bench() {
|
||||||
|
local AIS_BENCH="$SRC_DIR/benchmark"
|
||||||
|
git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
|
||||||
|
cd $AIS_BENCH
|
||||||
|
git checkout v3.0-20250930-master
|
||||||
|
pip3 install -e ./
|
||||||
|
pip3 install -r requirements/api.txt
|
||||||
|
pip3 install -r requirements/extra.txt
|
||||||
|
cd -
|
||||||
|
}
|
||||||
|
|
||||||
install_go() {
|
install_go() {
|
||||||
# Check if Go is already installed
|
# Check if Go is already installed
|
||||||
if command -v go &> /dev/null; then
|
if command -v go &> /dev/null; then
|
||||||
@@ -167,6 +178,7 @@ main() {
|
|||||||
checkout_src
|
checkout_src
|
||||||
install_sys_dependencies
|
install_sys_dependencies
|
||||||
install_vllm
|
install_vllm
|
||||||
|
install_ais_bench
|
||||||
# to speed up mooncake build process, install Go here
|
# to speed up mooncake build process, install Go here
|
||||||
install_go
|
install_go
|
||||||
cd "$WORKSPACE/source_code"
|
cd "$WORKSPACE/source_code"
|
||||||
|
|||||||
@@ -1,13 +1,98 @@
|
|||||||
|
import time
|
||||||
|
from typing import Any, List, Optional, Union
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
from modelscope import snapshot_download # type: ignore
|
||||||
|
from requests.exceptions import ConnectionError, HTTPError, Timeout
|
||||||
|
|
||||||
from tests.e2e.conftest import RemoteOpenAIServer
|
from tests.e2e.conftest import RemoteOpenAIServer
|
||||||
from tests.e2e.nightly.multi_node.config.multi_node_config import (
|
from tests.e2e.nightly.multi_node.config.multi_node_config import (
|
||||||
DISAGGREGATED_PREFILL_PROXY_SCRIPT, MultiNodeConfig)
|
DISAGGREGATED_PREFILL_PROXY_SCRIPT, MultiNodeConfig)
|
||||||
|
from tools.aisbench import run_aisbench_cases
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"San Francisco is a",
|
||||||
|
]
|
||||||
|
|
||||||
|
api_keyword_args = {
|
||||||
|
"max_tokens": 10,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_multi_node() -> None:
|
def get_local_model_path_with_retry(
|
||||||
|
model: str,
|
||||||
|
revision: str = "master",
|
||||||
|
max_retries: int = 5,
|
||||||
|
delay: int = 5,
|
||||||
|
) -> Optional[str]:
|
||||||
|
for attempt in range(1, max_retries + 1):
|
||||||
|
try:
|
||||||
|
local_model_path = snapshot_download(
|
||||||
|
model_id=model,
|
||||||
|
revision=revision,
|
||||||
|
)
|
||||||
|
return local_model_path
|
||||||
|
|
||||||
|
except HTTPError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
except (ConnectionError, Timeout):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if attempt < max_retries:
|
||||||
|
time.sleep(delay)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
|
||||||
|
**api_kwargs: Any) -> List[str]:
|
||||||
|
"""
|
||||||
|
Asynchronously send HTTP requests to a /v1/completions endpoint.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
|
||||||
|
model: Model name or local model path
|
||||||
|
prompts: A single prompt string or a list of prompts
|
||||||
|
**api_kwargs: Additional parameters (e.g., max_tokens, temperature)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[str]: A list of generated texts corresponding to each prompt
|
||||||
|
"""
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
|
||||||
|
if isinstance(prompts, str):
|
||||||
|
prompts = [prompts]
|
||||||
|
|
||||||
|
results = []
|
||||||
|
async with httpx.AsyncClient(timeout=600.0) as client:
|
||||||
|
for prompt in prompts:
|
||||||
|
payload = {"model": model, "prompt": prompt, **api_kwargs}
|
||||||
|
|
||||||
|
response = await client.post(url, headers=headers, json=payload)
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Request failed with status {response.status_code}: {response.text}"
|
||||||
|
)
|
||||||
|
|
||||||
|
resp_json = response.json()
|
||||||
|
choices = resp_json.get("choices", [])
|
||||||
|
if not choices or not choices[0].get("text"):
|
||||||
|
raise ValueError("Empty response from server")
|
||||||
|
|
||||||
|
results.append(choices[0]["text"])
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_multi_node() -> None:
|
||||||
config = MultiNodeConfig.from_yaml()
|
config = MultiNodeConfig.from_yaml()
|
||||||
|
local_model_path = get_local_model_path_with_retry(config.model)
|
||||||
|
assert local_model_path is not None, "can not find any local weight for test"
|
||||||
env_dict = config.envs
|
env_dict = config.envs
|
||||||
# perf_cmd = config.perf_cmd
|
perf_cmd = config.perf_cmd
|
||||||
# acc_cmd = config.acc_cmd
|
acc_cmd = config.acc_cmd
|
||||||
nodes_info = config.nodes_info
|
nodes_info = config.nodes_info
|
||||||
disaggregated_prefill = config.disaggregated_prefill
|
disaggregated_prefill = config.disaggregated_prefill
|
||||||
server_port = config.server_port
|
server_port = config.server_port
|
||||||
@@ -15,7 +100,7 @@ def test_multi_node() -> None:
|
|||||||
server_host = config.cluster_ips[0]
|
server_host = config.cluster_ips[0]
|
||||||
with config.launch_server_proxy(DISAGGREGATED_PREFILL_PROXY_SCRIPT):
|
with config.launch_server_proxy(DISAGGREGATED_PREFILL_PROXY_SCRIPT):
|
||||||
with RemoteOpenAIServer(
|
with RemoteOpenAIServer(
|
||||||
model=config.model,
|
model=local_model_path,
|
||||||
vllm_serve_args=config.server_cmd,
|
vllm_serve_args=config.server_cmd,
|
||||||
server_port=server_port,
|
server_port=server_port,
|
||||||
server_host=server_host,
|
server_host=server_host,
|
||||||
@@ -26,11 +111,17 @@ def test_multi_node() -> None:
|
|||||||
nodes_info=nodes_info,
|
nodes_info=nodes_info,
|
||||||
max_wait_seconds=2000,
|
max_wait_seconds=2000,
|
||||||
) as remote_server:
|
) as remote_server:
|
||||||
# base_url = remote_server.url_root
|
|
||||||
if config.is_master:
|
if config.is_master:
|
||||||
pass
|
port = proxy_port if disaggregated_prefill else server_port
|
||||||
# TODO: enable perf and acc test
|
base_url = f"http://localhost:{port}/v1/completions"
|
||||||
# subprocess.run(perf_cmd, check=True)
|
_ = await get_completions(url=base_url,
|
||||||
# subprocess.run(acc_cmd, check=True)
|
model=local_model_path,
|
||||||
|
prompts=prompts,
|
||||||
|
api_kwargs=api_keyword_args)
|
||||||
|
# aisbench test
|
||||||
|
if acc_cmd:
|
||||||
|
run_aisbench_cases(local_model_path, port, acc_cmd)
|
||||||
|
if perf_cmd:
|
||||||
|
run_aisbench_cases(local_model_path, port, perf_cmd)
|
||||||
else:
|
else:
|
||||||
remote_server.hang_until_terminated()
|
remote_server.hang_until_terminated()
|
||||||
|
|||||||
@@ -231,6 +231,8 @@ class AisbenchRunner:
|
|||||||
|
|
||||||
|
|
||||||
def run_aisbench_cases(model, port, aisbench_cases):
|
def run_aisbench_cases(model, port, aisbench_cases):
|
||||||
|
if isinstance(aisbench_cases, dict):
|
||||||
|
aisbench_cases = [aisbench_cases]
|
||||||
aisbench_results = []
|
aisbench_results = []
|
||||||
aisbench_errors = []
|
aisbench_errors = []
|
||||||
for aisbench_case in aisbench_cases:
|
for aisbench_case in aisbench_cases:
|
||||||
|
|||||||
Reference in New Issue
Block a user