sglangv0.5.2 & support Qwen3-Next-80B-A3B-Instruct
This commit is contained in:
512
sgl-router/py_test/e2e/conftest.py
Normal file
512
sgl-router/py_test/e2e/conftest.py
Normal file
@@ -0,0 +1,512 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Callable, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _find_available_port() -> int:
    """Return a TCP port number that was free on 127.0.0.1 at call time.

    The socket is bound to port 0 so the OS picks the port, and it is closed
    again before returning, so the caller has to bind the port itself.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("127.0.0.1", 0))
        _host, chosen_port = probe.getsockname()
        return chosen_port
    finally:
        probe.close()
|
||||
|
||||
|
||||
def _parse_url(base_url: str) -> tuple[str, str]:
    """Parse a base URL and return (host, port) as strings.

    This is more robust than simple string splitting and supports different
    schemes and URL shapes like trailing paths.

    Args:
        base_url: URL such as ``http://127.0.0.1:30000`` or ``https://host/v1``.

    Returns:
        ``(host, port)``. The host falls back to ``"127.0.0.1"`` when the URL
        has none. When the URL carries no explicit port, the scheme default
        (80 for http, 443 for https) is used so the value handed to ``--port``
        matches the port a client of ``base_url`` would actually contact; the
        port is ``""`` only when the scheme has no known default.
    """
    parsed = urlparse(base_url)
    host = parsed.hostname or "127.0.0.1"
    port = parsed.port
    if port is None:
        # No explicit port: mirror what an HTTP client would connect to.
        port = {"http": 80, "https": 443}.get(parsed.scheme)
    return host, ("" if port is None else str(port))
|
||||
|
||||
|
||||
def _wait_router_health(base_url: str, timeout: float) -> None:
    """Poll ``{base_url}/health`` until it answers 200 or *timeout* elapses.

    Request errors are treated as "not ready yet"; the endpoint is retried
    every two seconds.

    Raises:
        TimeoutError: if no 200 response was seen within *timeout* seconds.
    """
    health_endpoint = f"{base_url}/health"
    began = time.perf_counter()
    with requests.Session() as http:
        while time.perf_counter() - began < timeout:
            try:
                response = http.get(health_endpoint, timeout=5)
            except requests.RequestException:
                response = None
            if response is not None and response.status_code == 200:
                return
            time.sleep(2)
    raise TimeoutError("Router failed to become healthy in time")
|
||||
|
||||
|
||||
def _popen_launch_router(
    model: str,
    base_url: str,
    dp_size: int,
    timeout: float,
    policy: str = "cache_aware",
) -> subprocess.Popen:
    """Start ``sglang_router.launch_server`` and wait until it is healthy.

    Args:
        model: value passed to ``--model-path``.
        base_url: URL the router should listen on; host and port are derived
            from it and it is also used for the health check.
        dp_size: value passed to ``--dp``.
        timeout: seconds to wait for ``/health`` to return 200.
        policy: value passed to ``--router-policy``.

    Returns:
        The running router process.

    Raises:
        TimeoutError: if the router did not become healthy in time. The
            spawned process is terminated before the error propagates.
    """
    host, port = _parse_url(base_url)

    prom_port = _find_available_port()

    cmd = [
        "python3",
        "-m",
        "sglang_router.launch_server",
        "--model-path",
        model,
        "--host",
        host,
        "--port",
        port,
        "--dp",
        str(dp_size),
        "--router-policy",
        policy,
        "--allow-auto-truncate",
        "--router-prometheus-port",
        str(prom_port),
        "--router-prometheus-host",
        "127.0.0.1",
    ]

    proc = subprocess.Popen(cmd)
    try:
        _wait_router_health(base_url, timeout)
    except BaseException:
        # The caller never receives `proc` on failure, so stop it here
        # instead of leaking a running server.
        _terminate(proc)
        raise
    return proc
|
||||
|
||||
|
||||
def _popen_launch_worker(
    model: str,
    base_url: str,
    *,
    dp_size: int | None = None,
    api_key: str | None = None,
    base_gpu_id: int | None = 0,
) -> subprocess.Popen:
    """Spawn an ``sglang.launch_server`` worker and return it immediately.

    Host and port come from *base_url*. ``--dp-size`` and ``--api-key`` are
    only added when the corresponding argument is not ``None``. A falsy
    *base_gpu_id* is passed as ``0``. This function does not wait for the
    worker to become healthy.
    """
    host, port = _parse_url(base_url)
    gpu_id = base_gpu_id if base_gpu_id else 0

    argv = ["python3", "-m", "sglang.launch_server"]
    argv.extend(("--model-path", model))
    argv.extend(("--host", host))
    argv.extend(("--port", port))
    argv.extend(("--base-gpu-id", str(gpu_id)))
    if dp_size is not None:
        argv.extend(("--dp-size", str(dp_size)))
    if api_key is not None:
        argv.extend(("--api-key", api_key))
    return subprocess.Popen(argv)
|
||||
|
||||
|
||||
def _popen_launch_router_only(
    base_url: str,
    policy: str = "round_robin",
    timeout: float = 120.0,
    *,
    dp_aware: bool = False,
    api_key: str | None = None,
) -> subprocess.Popen:
    """Start ``sglang_router.launch_router`` without workers and wait for health.

    Args:
        base_url: URL the router should listen on; also used for the health check.
        policy: value passed to ``--policy``.
        timeout: seconds to wait for ``/health`` to return 200.
        dp_aware: when true, ``--dp-aware`` is added to the command line.
        api_key: when not ``None``, passed via ``--api-key``.

    Returns:
        The running router process.

    Raises:
        TimeoutError: if the router did not become healthy in time. The
            spawned process is terminated before the error propagates.
    """
    host, port = _parse_url(base_url)

    prom_port = _find_available_port()
    cmd = [
        "python3",
        "-m",
        "sglang_router.launch_router",
        "--host",
        host,
        "--port",
        port,
        "--policy",
        policy,
    ]
    if dp_aware:
        cmd += ["--dp-aware"]
    if api_key is not None:
        cmd += ["--api-key", api_key]
    cmd += [
        "--prometheus-port",
        str(prom_port),
        "--prometheus-host",
        "127.0.0.1",
    ]
    proc = subprocess.Popen(cmd)
    try:
        _wait_router_health(base_url, timeout)
    except BaseException:
        # The caller never receives `proc` on failure, so stop it here
        # instead of leaking a running router.
        _terminate(proc)
        raise
    return proc
|
||||
|
||||
|
||||
def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
    """Stop *proc*: SIGTERM first, SIGKILL if it has not exited after *timeout*.

    Args:
        proc: process to stop; ``None`` is accepted and ignored.
        timeout: seconds to wait after ``terminate()`` before escalating.

    The process is waited on so that it is reaped rather than left as a
    zombie, and the call returns as soon as the process exits.
    """
    if proc is None:
        return
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
        return
    except subprocess.TimeoutExpired:
        proc.kill()
    try:
        # Bounded wait: a killed process normally exits at once, but do not
        # hang the test session if it does not.
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        logger.warning("Process %s did not exit after kill()", proc.pid)
|
||||
|
||||
|
||||
def _which(cmd: str) -> Optional[str]:
    """Look up *cmd* with ``shutil.which``.

    Returns the result of ``shutil.which``; if the lookup itself raises, a
    warning is logged and ``None`` is returned.
    """
    try:
        location = shutil.which(cmd)
    except Exception as e:
        logger.warning("shutil.which(%r) failed: %s", cmd, e)
        location = None
    return location
|
||||
|
||||
|
||||
def _graceful_stop_popen(p: subprocess.Popen) -> None:
    """Best-effort stop of a ``Popen``: terminate, wait up to 5 s, then kill.

    ``None`` and already-exited processes are ignored. Any exception raised
    while stopping is logged as a warning and not propagated. The process is
    waited on so it is reaped, and the call returns as soon as it exits.
    """
    if p is None:
        return
    try:
        if p.poll() is not None:
            return
        p.terminate()
        try:
            p.wait(timeout=5)
        except subprocess.TimeoutExpired:
            p.kill()
            # Bounded wait so a stuck process cannot hang teardown; a timeout
            # here is reported by the handler below.
            p.wait(timeout=5)
    except Exception as e:
        logger.warning("Exception during graceful stop of popen: %s", e)
|
||||
|
||||
|
||||
def _pid_alive(pid: int) -> bool:
    """Return whether a process with *pid* currently exists.

    Uses ``os.kill(pid, 0)``, which performs the existence and permission
    checks without delivering a signal.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The process exists but belongs to another user.
        return True
    except Exception:
        return False
    return True
|
||||
|
||||
|
||||
def _graceful_stop_pid(pid: int) -> None:
    """Best-effort stop of a process by PID; never raises ``Exception``.

    Sends SIGTERM if the PID is alive, checks up to five times at one-second
    intervals, and sends SIGKILL if it is still alive afterwards.
    """
    try:
        if not _pid_alive(pid):
            return

        try:
            os.kill(pid, signal.SIGTERM)
        except Exception:
            pass

        checks_done = 0
        while checks_done < 5 and _pid_alive(pid):
            time.sleep(1)
            checks_done += 1

        if not _pid_alive(pid):
            return
        try:
            os.kill(pid, signal.SIGKILL)
        except Exception:
            pass
    except Exception:
        pass
|
||||
|
||||
|
||||
def _graceful_stop_any(obj) -> None:
    """Stop whatever process *obj* refers to, ignoring any ``Exception``.

    Accepted forms: a ``subprocess.Popen``, an ``int`` PID, or any object
    whose ``proc`` attribute is a ``subprocess.Popen``. Anything else is
    silently ignored.
    """
    try:
        if isinstance(obj, subprocess.Popen):
            _graceful_stop_popen(obj)
        elif isinstance(obj, int):
            _graceful_stop_pid(obj)
        else:
            wrapped = getattr(obj, "proc", None)
            if isinstance(wrapped, subprocess.Popen):
                _graceful_stop_popen(wrapped)
    except Exception:
        pass
|
||||
|
||||
|
||||
def _find_experiment_dir(base: Path, experiment_folder: str) -> Optional[Path]:
    """Return the genai-bench output directory, or ``None`` if it is missing.

    The exact path ``base / experiment_folder`` is preferred; otherwise the
    first directory with that name found by a recursive search under *base*.
    """
    direct = base / experiment_folder
    if direct.is_dir():
        return direct
    for found in base.rglob(experiment_folder):
        if found.is_dir() and found.name == experiment_folder:
            return found
    return None


def _check_genai_bench_result(
    result_file: Path, experiment_folder: str, thresholds: dict | None
) -> None:
    """Log the aggregate metrics of one result file and validate them.

    Missing latency metrics default to ``inf`` and missing throughput metrics
    to ``0.0``, so an incomplete result file fails validation. When
    *thresholds* is ``None`` the metrics are logged only.
    """
    with result_file.open("r", encoding="utf-8") as f:
        data = json.load(f)
    stats = data.get("aggregated_metrics", {}).get("stats", {})
    ttft_mean = float(stats.get("ttft", {}).get("mean", float("inf")))
    e2e_latency_mean = float(stats.get("e2e_latency", {}).get("mean", float("inf")))
    input_tp_mean = float(stats.get("input_throughput", {}).get("mean", 0.0))
    output_tp_mean = float(stats.get("output_throughput", {}).get("mean", 0.0))

    logger.info(
        "genai-bench[%s] %s ttft_mean=%.3fs e2e_latency_mean=%.3fs input_tp_mean=%.1f tok/s output_tp_mean=%.1f tok/s",
        experiment_folder,
        result_file.name,
        ttft_mean,
        e2e_latency_mean,
        input_tp_mean,
        output_tp_mean,
    )

    th = thresholds
    if th is None:
        return
    assert (
        ttft_mean <= th["ttft_mean_max"]
    ), f"TTFT validation failed: {ttft_mean} > {th['ttft_mean_max']} (file={result_file.name})"
    assert (
        e2e_latency_mean <= th["e2e_latency_mean_max"]
    ), f"E2E latency validation failed: {e2e_latency_mean} > {th['e2e_latency_mean_max']} (file={result_file.name})"
    assert (
        input_tp_mean >= th["input_throughput_mean_min"]
    ), f"Input throughput validation failed: {input_tp_mean} < {th['input_throughput_mean_min']} (file={result_file.name})"
    assert (
        output_tp_mean >= th["output_throughput_mean_min"]
    ), f"Output throughput validation failed: {output_tp_mean} < {th['output_throughput_mean_min']} (file={result_file.name})"


@pytest.fixture(scope="session")
def genai_bench_runner() -> Callable[..., None]:
    """Provide a callable to run genai-bench and validate metrics.

    Usage in tests:
        def test(..., genai_bench_runner):
            genai_bench_runner(router_url=..., model_path=..., experiment_folder=...)
    """

    def _run(
        *,
        router_url: str,
        model_path: str,
        experiment_folder: str,
        timeout_sec: int | None = None,
        thresholds: dict | None = None,
        extra_env: dict | None = None,
        num_concurrency: int = 32,
        traffic_scenario: str = "D(4000,100)",
        max_requests_per_run: int | None = None,
        clean_experiment: bool = True,
        kill_procs: list | None = None,
        drain_delay_sec: int = 6,
    ) -> None:
        """Run one genai-bench benchmark against *router_url* and check results.

        Args:
            router_url: passed to ``--api-base``.
            model_path: used for both ``--api-model-name`` and ``--model-tokenizer``.
            experiment_folder: output folder name, created under the current
                working directory.
            timeout_sec: seconds before the CLI is killed; when falsy, the
                ``GENAI_BENCH_TEST_TIMEOUT`` environment variable (default 120)
                is used.
            thresholds: dict with keys ``ttft_mean_max``, ``e2e_latency_mean_max``,
                ``input_throughput_mean_min`` and ``output_throughput_mean_min``;
                ``None`` means "log only", no validation.
            extra_env: extra environment variables for the CLI process.
            num_concurrency: passed to ``--num-concurrency``.
            traffic_scenario: passed to ``--traffic-scenario``.
            max_requests_per_run: defaults to ``num_concurrency * 3``.
            clean_experiment: remove a previous output folder before running.
            kill_procs: processes to stop afterwards, whether or not the
                benchmark succeeded (Popen objects, PIDs, or objects with a
                ``proc`` attribute).
            drain_delay_sec: pause before stopping *kill_procs*.

        Raises:
            AssertionError: when no output folder or JSON result is found, or
                a threshold is violated.
        """
        cli = _which("genai-bench")
        if not cli:
            pytest.fail(
                "genai-bench CLI not found; please install it to run benchmarks"
            )

        # Clean previous experiment folder under current working directory
        if clean_experiment:
            exp_dir = Path.cwd() / experiment_folder
            if exp_dir.exists():
                shutil.rmtree(exp_dir, ignore_errors=True)

        # Default requests per run if not provided
        mrr = (
            max_requests_per_run
            if max_requests_per_run is not None
            else num_concurrency * 3
        )

        cmd = [
            cli,
            "benchmark",
            "--api-backend",
            "openai",
            "--api-base",
            router_url,
            "--api-key",
            "dummy-token",
            "--api-model-name",
            model_path,
            "--model-tokenizer",
            model_path,
            "--task",
            "text-to-text",
            "--num-concurrency",
            str(num_concurrency),
            "--traffic-scenario",
            traffic_scenario,
            "--max-requests-per-run",
            str(mrr),
            "--max-time-per-run",
            "2",
            "--experiment-folder-name",
            experiment_folder,
            "--experiment-base-dir",
            str(Path.cwd()),
        ]

        env = os.environ.copy()
        if extra_env:
            env.update(extra_env)

        to = timeout_sec or int(os.environ.get("GENAI_BENCH_TEST_TIMEOUT", "120"))
        proc = subprocess.Popen(
            cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        stdout = stderr = ""
        rc = None
        try:
            try:
                stdout, stderr = proc.communicate(timeout=to)
            except subprocess.TimeoutExpired:
                # Simple: kill the CLI process if it doesn't exit in time.
                # Results written so far are still validated below.
                logger.warning("genai-bench did not exit within %ss; killing it", to)
                try:
                    proc.kill()
                except OSError as e:
                    logger.warning("Failed to kill genai-bench: %s", e)
                stdout, stderr = proc.communicate()
            rc = proc.returncode

            details = (
                f"Exit code: {rc}\nSTDOUT (tail):\n{stdout[-1000:]}"
                f"\nSTDERR (tail):\n{stderr[-1000:]}"
            )

            # Prefer exact path under cwd; fallback to rglob search
            actual_folder = _find_experiment_dir(Path.cwd(), experiment_folder)
            if actual_folder is None:
                raise AssertionError(
                    "Benchmark failed: experiment folder not found: "
                    f"{experiment_folder}\n{details}"
                )

            json_files = [
                p
                for p in actual_folder.rglob("*.json")
                if "experiment_metadata" not in p.name
            ]
            if not json_files:
                raise AssertionError(
                    f"Benchmark failed: no JSON results found\n{details}"
                )

            for jf in json_files:
                _check_genai_bench_result(jf, experiment_folder, thresholds)

        finally:
            # Always attempt to stop workers to avoid resource leakage
            if kill_procs:
                # Give router/workers a small grace period to finish any last drains
                if drain_delay_sec > 0:
                    time.sleep(drain_delay_sec)
                for p in kill_procs:
                    _graceful_stop_any(p)
                time.sleep(2)

    return _run
|
||||
|
||||
|
||||
def pytest_configure(config):
    """Register the ``e2e`` marker in pytest's ``markers`` ini setting."""
    marker_line = "e2e: mark as end-to-end test"
    config.addinivalue_line("markers", marker_line)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def e2e_model() -> str:
    """Model used by every e2e test: always the default test model."""
    model_name = DEFAULT_MODEL_NAME_FOR_TEST
    return model_name
|
||||
|
||||
|
||||
@pytest.fixture
def e2e_router(e2e_model: str):
    """Yield a router started via ``_popen_launch_router`` with ``dp_size=2``.

    The yielded namespace has ``proc`` and ``url``. The process is terminated
    on teardown. Kept available, though the tests use the router-only
    fixtures to avoid GPU contention.
    """
    url = DEFAULT_URL_FOR_TEST
    router = _popen_launch_router(
        e2e_model,
        url,
        dp_size=2,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    )
    handle = SimpleNamespace(proc=router, url=url)
    try:
        yield handle
    finally:
        _terminate(router)
|
||||
|
||||
|
||||
@pytest.fixture
def e2e_router_only_rr():
    """Yield a fresh round-robin router with no workers attached.

    The router listens on a free loopback port; the yielded namespace has
    ``proc`` and ``url``. The process is terminated on teardown.
    """
    url = f"http://127.0.0.1:{_find_available_port()}"
    router = _popen_launch_router_only(url, policy="round_robin")
    handle = SimpleNamespace(proc=router, url=url)
    try:
        yield handle
    finally:
        _terminate(router)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def e2e_primary_worker(e2e_model: str):
    """Yield one worker shared by the whole test session.

    The worker is launched on a free loopback port without waiting for it to
    become healthy; the router health gate is expected to handle worker
    readiness. The yielded namespace has ``proc`` and ``url``.
    """
    url = f"http://127.0.0.1:{_find_available_port()}"
    worker = _popen_launch_worker(e2e_model, url)
    handle = SimpleNamespace(proc=worker, url=url)
    try:
        yield handle
    finally:
        _terminate(worker)
|
||||
|
||||
|
||||
@pytest.fixture
def e2e_router_only_rr_dp_aware_api():
    """Router-only with dp-aware enabled and an API key.

    The yielded namespace has ``proc``, ``url`` and ``api_key``. The process
    is terminated on teardown.
    """
    key = "secret"
    url = f"http://127.0.0.1:{_find_available_port()}"
    router = _popen_launch_router_only(
        url,
        policy="round_robin",
        timeout=180.0,
        dp_aware=True,
        api_key=key,
    )
    handle = SimpleNamespace(proc=router, url=url, api_key=key)
    try:
        yield handle
    finally:
        _terminate(router)
|
||||
|
||||
|
||||
@pytest.fixture
def e2e_worker_dp2_api(e2e_model: str, e2e_router_only_rr_dp_aware_api):
    """Worker with dp-size=2 and the same API key as the dp-aware router.

    The yielded namespace has ``proc`` and ``url``. The process is terminated
    on teardown.
    """
    url = f"http://127.0.0.1:{_find_available_port()}"
    shared_key = e2e_router_only_rr_dp_aware_api.api_key
    worker = _popen_launch_worker(e2e_model, url, dp_size=2, api_key=shared_key)
    handle = SimpleNamespace(proc=worker, url=url)
    try:
        yield handle
    finally:
        _terminate(worker)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def e2e_two_workers_dp2(e2e_model: str):
    """Launch two workers, each with dp_size=2, mapped to GPUs [0,1] and [2,3].

    Yields a list of namespaces (``proc``, ``url``) in launch order. Every
    worker that was started is terminated on teardown, including when a later
    launch fails.
    """
    launched = []
    try:
        # Worker A starts at GPU 0, worker B at GPU 2.
        for first_gpu in (0, 2):
            url = f"http://127.0.0.1:{_find_available_port()}"
            proc = _popen_launch_worker(
                e2e_model, url, dp_size=2, base_gpu_id=first_gpu
            )
            launched.append(SimpleNamespace(proc=proc, url=url))

        yield launched
    finally:
        for entry in launched:
            _terminate(entry.proc)
|
||||
262
sgl-router/py_test/e2e/test_pd_router.py
Normal file
262
sgl-router/py_test/e2e/test_pd_router.py
Normal file
@@ -0,0 +1,262 @@
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from types import SimpleNamespace
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from sglang.test.run_eval import run_eval
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _find_available_port() -> int:
    """Return a TCP port number that was free on 127.0.0.1 at call time.

    The socket is bound to port 0 so the OS picks the port, and it is closed
    again before returning, so the caller has to bind the port itself.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("127.0.0.1", 0))
        _host, chosen_port = probe.getsockname()
        return chosen_port
    finally:
        probe.close()
|
||||
|
||||
|
||||
def _wait_health(url: str, timeout: float = 180.0) -> None:
    """Poll ``{url}/health`` until it answers 200 or *timeout* elapses.

    Request errors are treated as "not ready yet"; the endpoint is retried
    once per second.

    Raises:
        TimeoutError: if no 200 response was seen within *timeout* seconds.
    """
    health_endpoint = f"{url}/health"
    began = time.perf_counter()
    with requests.Session() as http:
        while time.perf_counter() - began < timeout:
            try:
                response = http.get(health_endpoint, timeout=5)
            except requests.RequestException:
                response = None
            if response is not None and response.status_code == 200:
                return
            time.sleep(1)
    raise TimeoutError(f"Service at {url} failed to become healthy in time")
|
||||
|
||||
|
||||
def _detect_ib_device() -> Optional[str]:
    """Return first active IB device name (e.g., mlx5_0) or None if unavailable.

    Probes ``mlx5_0`` to ``mlx5_11`` with ``ibv_devinfo`` and returns the first
    device whose output has a ``state:`` line containing ``PORT_ACTIVE``.
    Any failure to run ``ibv_devinfo`` (missing binary, not executable,
    timeout) is treated as "no device" rather than raised.
    """
    # Fast check that ibv_devinfo exists and can be executed.
    try:
        subprocess.run(
            ["ibv_devinfo", "-l"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=1,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError as well as PermissionError etc.
        return None

    for i in range(12):
        dev = f"mlx5_{i}"
        try:
            res = subprocess.run(
                ["ibv_devinfo", dev],
                capture_output=True,
                text=True,
                timeout=2,
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            # ValueError covers undecodable output (UnicodeDecodeError).
            continue
        if res.returncode != 0:
            continue
        if any(
            "state:" in line and "PORT_ACTIVE" in line
            for line in res.stdout.splitlines()
        ):
            return dev
    return None
|
||||
|
||||
|
||||
def _popen_launch_prefill_worker(
    model: str,
    bootstrap_port: int,
    ib_device: Optional[str] = None,
    base_gpu_id: int = 0,
) -> SimpleNamespace:
    """Start a prefill-mode ``sglang.launch_server`` worker and wait for health.

    Args:
        model: value passed to ``--model-path``.
        bootstrap_port: value passed to ``--disaggregation-bootstrap-port``.
        ib_device: when truthy, passed via ``--disaggregation-ib-device``.
        base_gpu_id: value passed to ``--base-gpu-id``.

    Returns:
        Namespace with ``proc``, ``url`` and ``bootstrap_port``.

    Raises:
        TimeoutError: if the worker is not healthy within 300 s. The spawned
            process is terminated before the error propagates.
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        model,
        "--disaggregation-mode",
        "prefill",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "--disaggregation-bootstrap-port",
        str(bootstrap_port),
        "--base-gpu-id",
        str(base_gpu_id),
    ]
    if ib_device:
        cmd += ["--disaggregation-ib-device", ib_device]
    proc = subprocess.Popen(cmd)
    try:
        _wait_health(url, timeout=300.0)
    except BaseException:
        # The caller never receives the handle on failure, so stop the
        # process here instead of leaking it.
        _terminate(proc)
        raise
    return SimpleNamespace(proc=proc, url=url, bootstrap_port=bootstrap_port)
|
||||
|
||||
|
||||
def _popen_launch_decode_worker(
    model: str, ib_device: Optional[str] = None, base_gpu_id: int = 0
) -> SimpleNamespace:
    """Start a decode-mode ``sglang.launch_server`` worker and wait for health.

    Args:
        model: value passed to ``--model-path``.
        ib_device: when truthy, passed via ``--disaggregation-ib-device``.
        base_gpu_id: value passed to ``--base-gpu-id``.

    Returns:
        Namespace with ``proc`` and ``url``.

    Raises:
        TimeoutError: if the worker is not healthy within 300 s. The spawned
            process is terminated before the error propagates.
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        model,
        "--disaggregation-mode",
        "decode",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "--base-gpu-id",
        str(base_gpu_id),
    ]
    if ib_device:
        cmd += ["--disaggregation-ib-device", ib_device]
    proc = subprocess.Popen(cmd)
    try:
        _wait_health(url, timeout=300.0)
    except BaseException:
        # The caller never receives the handle on failure, so stop the
        # process here instead of leaking it.
        _terminate(proc)
        raise
    return SimpleNamespace(proc=proc, url=url)
|
||||
|
||||
|
||||
def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
    """Stop *proc*: SIGTERM first, SIGKILL if it has not exited after *timeout*.

    Args:
        proc: process to stop; ``None`` is accepted and ignored.
        timeout: seconds to wait after ``terminate()`` before escalating.

    The process is waited on so that it is reaped rather than left as a
    zombie, and the call returns as soon as the process exits.
    """
    if proc is None:
        return
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
        return
    except subprocess.TimeoutExpired:
        proc.kill()
    try:
        # Bounded wait: a killed process normally exits at once, but do not
        # hang the test session if it does not.
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        logger.warning("Process %s did not exit after kill()", proc.pid)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def pd_cluster(e2e_model: str):
    """Start 2 prefill + 2 decode workers and one PD router, once per module.

    Yields a namespace with ``router_url``, ``workers`` (prefill workers first,
    then decode workers) and ``router_proc``. The router and every worker that
    was started are terminated on teardown, including when a later launch
    fails part-way through setup.
    """
    # Environment capability checks: require sgl_kernel and GPU backend
    try:
        import sgl_kernel  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(f"PD e2e requires sgl_kernel but it is not available: {e}")

    try:
        import torch  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(
            f"PD e2e requires torch but it is not available or misconfigured: {e}"
        )

    if not torch.cuda.is_available():  # pragma: no cover - environment dependent
        pytest.fail("PD e2e requires CUDA backend, but CUDA is not available")

    workers: list[SimpleNamespace] = []
    router_proc = None
    try:
        ib_device = _detect_ib_device()

        # Launch 4 workers across 4 GPUs: prefill on 0,1 and decode on 2,3.
        # Each worker is registered in `workers` as soon as it is up so the
        # `finally` below can stop it even if a later launch raises.
        prefills = []
        for gpu_id in (0, 1):
            pf = _popen_launch_prefill_worker(
                e2e_model,
                bootstrap_port=_find_available_port(),
                ib_device=ib_device,
                base_gpu_id=gpu_id,
            )
            workers.append(pf)
            prefills.append(pf)

        decodes = []
        for gpu_id in (2, 3):
            dc = _popen_launch_decode_worker(
                e2e_model, ib_device=ib_device, base_gpu_id=gpu_id
            )
            workers.append(dc)
            decodes.append(dc)

        # PD router with two prefill and two decode endpoints
        rport = _find_available_port()
        router_url = f"http://127.0.0.1:{rport}"
        pport = _find_available_port()

        cmd = [
            "python3",
            "-m",
            "sglang_router.launch_router",
            "--host",
            "127.0.0.1",
            "--port",
            str(rport),
            "--policy",
            "round_robin",
            "--pd-disaggregation",
        ]
        for pf in prefills:
            cmd += ["--prefill", pf.url, str(pf.bootstrap_port)]
        for dc in decodes:
            cmd += ["--decode", dc.url]
        cmd += [
            "--prometheus-port",
            str(pport),
            "--prometheus-host",
            "127.0.0.1",
        ]

        router_proc = subprocess.Popen(cmd)
        _wait_health(router_url, timeout=180.0)

        yield SimpleNamespace(
            router_url=router_url, workers=workers, router_proc=router_proc
        )
    finally:
        if router_proc is not None:
            _terminate(router_proc)
        for w in workers:
            _terminate(w.proc)
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_pd_mmlu(e2e_model: str, pd_cluster):
    """
    Run MMLU through the shared PD router (2 prefill + 2 decode workers) and
    require a score of at least 0.65.
    """
    eval_args = SimpleNamespace(
        base_url=pd_cluster.router_url,
        model=e2e_model,
        eval_name="mmlu",
        num_examples=64,
        num_threads=32,
        temperature=0.1,
    )
    score = run_eval(eval_args)["score"]
    assert score >= 0.65
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_pd_genai_bench(e2e_model: str, pd_cluster, genai_bench_runner):
    """
    Run a short genai-bench benchmark against the shared PD router
    (2 prefill + 2 decode workers) and validate aggregate metrics.

    The cluster's workers are passed as ``kill_procs``, so the runner stops
    them once the benchmark has finished.
    """
    limits = {
        "ttft_mean_max": 12,
        "e2e_latency_mean_max": 15,
        "input_throughput_mean_min": 400,
        "output_throughput_mean_min": 20,
    }
    # Run genai-bench against the shared router
    genai_bench_runner(
        router_url=pd_cluster.router_url,
        model_path=e2e_model,
        experiment_folder="benchmark_round_robin_pd",
        thresholds=limits,
        kill_procs=pd_cluster.workers,
    )
|
||||
169
sgl-router/py_test/e2e/test_regular_router.py
Normal file
169
sgl-router/py_test/e2e/test_regular_router.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import threading
|
||||
import time
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from sglang.test.run_eval import run_eval
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_mmlu(e2e_router_only_rr, e2e_two_workers_dp2, e2e_model):
    """Attach two dp=2 workers (total 4 GPUs) to a fresh router and run MMLU."""
    router_url = e2e_router_only_rr.url
    add_worker_endpoint = f"{router_url}/add_worker"
    for worker in e2e_two_workers_dp2:
        resp = requests.post(
            add_worker_endpoint, params={"url": worker.url}, timeout=180
        )
        resp.raise_for_status()

    eval_args = SimpleNamespace(
        base_url=router_url,
        model=e2e_model,
        eval_name="mmlu",
        num_examples=64,
        num_threads=32,
        temperature=0.1,
    )
    score = run_eval(eval_args)["score"]
    assert score >= 0.65
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_genai_bench(
    e2e_router_only_rr, e2e_two_workers_dp2, e2e_model, genai_bench_runner
):
    """Attach the two dp=2 workers to the regular router and run a short genai-bench.

    The workers are passed as ``kill_procs``, so the runner stops them once
    the benchmark has finished.
    """
    router_url = e2e_router_only_rr.url
    add_worker_endpoint = f"{router_url}/add_worker"
    for worker in e2e_two_workers_dp2:
        resp = requests.post(
            add_worker_endpoint, params={"url": worker.url}, timeout=180
        )
        resp.raise_for_status()

    limits = {
        "ttft_mean_max": 6,
        "e2e_latency_mean_max": 14,
        "input_throughput_mean_min": 1000,
        "output_throughput_mean_min": 12,
    }
    genai_bench_runner(
        router_url=router_url,
        model_path=e2e_model,
        experiment_folder="benchmark_round_robin_regular",
        thresholds=limits,
        kill_procs=e2e_two_workers_dp2,
    )
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_add_and_remove_worker_live(e2e_router_only_rr, e2e_primary_worker, e2e_model):
    """Add a worker, send eight one-token completions, then remove the worker."""
    router_url = e2e_router_only_rr.url
    worker_params = {"url": e2e_primary_worker.url}

    added = requests.post(f"{router_url}/add_worker", params=worker_params, timeout=180)
    added.raise_for_status()

    completions_endpoint = f"{router_url}/v1/completions"
    with requests.Session() as http:
        for idx in range(8):
            payload = {
                "model": e2e_model,
                "prompt": f"x{idx}",
                "max_tokens": 1,
                "stream": False,
            }
            reply = http.post(completions_endpoint, json=payload, timeout=120)
            reply.raise_for_status()

    # Remove the worker
    removed = requests.post(
        f"{router_url}/remove_worker", params=worker_params, timeout=60
    )
    removed.raise_for_status()
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_lazy_fault_tolerance_live(e2e_router_only_rr, e2e_primary_worker, e2e_model):
    """Run an eval while the only worker is terminated ten seconds in.

    Only checks that the eval completes with a score in [0, 1].

    NOTE(review): the background thread is not joined, and the terminated
    worker comes from a session-scoped fixture, so later users of that
    fixture may find it stopped. Confirm this ordering is intended.
    """
    router_url = e2e_router_only_rr.url
    victim = e2e_primary_worker

    added = requests.post(
        f"{router_url}/add_worker", params={"url": victim.url}, timeout=180
    )
    added.raise_for_status()

    def _terminate_worker_later():
        time.sleep(10)
        try:
            victim.proc.terminate()
        except Exception:
            pass

    threading.Thread(target=_terminate_worker_later, daemon=True).start()

    eval_args = SimpleNamespace(
        base_url=router_url,
        model=e2e_model,
        eval_name="mmlu",
        num_examples=32,
        num_threads=16,
        temperature=0.0,
    )
    score = run_eval(eval_args)["score"]
    assert 0.0 <= score <= 1.0
|
||||
|
||||
|
||||
@pytest.mark.e2e
def test_dp_aware_worker_expansion_and_api_key(
    e2e_model,
    e2e_router_only_rr_dp_aware_api,
    e2e_worker_dp2_api,
):
    """
    Launch a router-only instance in dp_aware mode and a single worker with dp_size=2
    and API key protection. Verify expansion, auth enforcement, and basic eval.
    """
    import os

    router_url = e2e_router_only_rr_dp_aware_api.url
    worker_url = e2e_worker_dp2_api.url
    api_key = e2e_router_only_rr_dp_aware_api.api_key

    # Attach worker; router should expand to dp_size logical workers
    r = requests.post(
        f"{router_url}/add_worker", params={"url": worker_url}, timeout=180
    )
    r.raise_for_status()

    r = requests.get(f"{router_url}/list_workers", timeout=30)
    r.raise_for_status()
    urls = r.json().get("urls", [])
    assert len(urls) == 2
    assert set(urls) == {f"{worker_url}@0", f"{worker_url}@1"}

    # Verify API key enforcement path-through
    # 1) Without Authorization -> 401 from backend
    r = requests.post(
        f"{router_url}/v1/completions",
        json={"model": e2e_model, "prompt": "hi", "max_tokens": 1},
        timeout=60,
    )
    assert r.status_code == 401

    # 2) With correct Authorization -> 200
    r = requests.post(
        f"{router_url}/v1/completions",
        json={"model": e2e_model, "prompt": "hi", "max_tokens": 1},
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=60,
    )
    assert r.status_code == 200

    # Finally, run MMLU eval through the router with auth. The key is handed
    # over via the process environment, so restore the previous value
    # afterwards to avoid leaking it into later tests.
    previous_key = os.environ.get("OPENAI_API_KEY")
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        args = SimpleNamespace(
            base_url=router_url,
            model=e2e_model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
            temperature=0.1,
        )
        metrics = run_eval(args)
    finally:
        if previous_key is None:
            os.environ.pop("OPENAI_API_KEY", None)
        else:
            os.environ["OPENAI_API_KEY"] = previous_key
    assert metrics["score"] >= 0.65
|
||||
Reference in New Issue
Block a user