# File: sglang/sgl-router/py_test/e2e/test_pd_router.py
# End-to-end tests for the PD (prefill/decode disaggregation) router.
import logging
import os
import socket
import subprocess
import time
from types import SimpleNamespace
from typing import Optional
import pytest
import requests
from sglang.test.run_eval import run_eval
logger = logging.getLogger(__name__)
def _find_available_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
def _wait_health(url: str, timeout: float = 180.0) -> None:
    """Poll ``{url}/health`` once per second until it answers HTTP 200.

    Raises:
        TimeoutError: if the endpoint does not become healthy within
            *timeout* seconds.
    """
    deadline = time.perf_counter() + timeout
    with requests.Session() as session:
        while time.perf_counter() < deadline:
            try:
                resp = session.get(f"{url}/health", timeout=5)
            except requests.RequestException:
                # Server not accepting connections yet; retry after a pause.
                pass
            else:
                if resp.status_code == 200:
                    return
            time.sleep(1)
    raise TimeoutError(f"Service at {url} failed to become healthy in time")
def _detect_ib_device() -> Optional[str]:
"""Return first active IB device name (e.g., mlx5_0) or None if unavailable."""
# Fast check that ibv_devinfo exists
try:
subprocess.run(
["ibv_devinfo", "-l"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=1,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
return None
for i in range(12):
dev = f"mlx5_{i}"
try:
res = subprocess.run(
["ibv_devinfo", dev],
capture_output=True,
text=True,
timeout=2,
)
if res.returncode == 0 and ("state:" in res.stdout):
for line in res.stdout.splitlines():
if "state:" in line and "PORT_ACTIVE" in line:
return dev
except Exception:
pass
return None
def _popen_launch_prefill_worker(
    model: str,
    bootstrap_port: int,
    ib_device: Optional[str] = None,
    base_gpu_id: int = 0,
) -> SimpleNamespace:
    """Launch a prefill-mode sglang server and block until it reports healthy.

    Returns a namespace with the child process, its base URL, and the
    disaggregation bootstrap port it was given.
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3", "-m", "sglang.launch_server",
        "--model-path", model,
        "--disaggregation-mode", "prefill",
        "--host", "127.0.0.1",
        "--port", str(port),
        "--disaggregation-bootstrap-port", str(bootstrap_port),
        "--base-gpu-id", str(base_gpu_id),
    ]
    if ib_device:
        cmd.extend(["--disaggregation-ib-device", ib_device])
    proc = subprocess.Popen(cmd)
    _wait_health(url, timeout=300.0)
    return SimpleNamespace(proc=proc, url=url, bootstrap_port=bootstrap_port)
def _popen_launch_decode_worker(
    model: str, ib_device: Optional[str] = None, base_gpu_id: int = 0
) -> SimpleNamespace:
    """Launch a decode-mode sglang server and block until it reports healthy.

    Returns a namespace with the child process and its base URL.
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3", "-m", "sglang.launch_server",
        "--model-path", model,
        "--disaggregation-mode", "decode",
        "--host", "127.0.0.1",
        "--port", str(port),
        "--base-gpu-id", str(base_gpu_id),
    ]
    if ib_device:
        cmd.extend(["--disaggregation-ib-device", ib_device])
    proc = subprocess.Popen(cmd)
    _wait_health(url, timeout=300.0)
    return SimpleNamespace(proc=proc, url=url)
def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
if proc is None:
return
proc.terminate()
start = time.perf_counter()
while proc.poll() is None:
if time.perf_counter() - start > timeout:
proc.kill()
break
time.sleep(1)
@pytest.fixture(scope="module")
def pd_cluster(e2e_model: str):
    """Start 2 prefill + 2 decode workers and one PD router, once per module."""
    # Environment capability checks: require sgl_kernel and GPU backend
    try:
        import sgl_kernel  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(f"PD e2e requires sgl_kernel but it is not available: {e}")
    try:
        import torch  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(
            f"PD e2e requires torch but it is not available or misconfigured: {e}"
        )
    if not torch.cuda.is_available():  # pragma: no cover - environment dependent
        pytest.fail("PD e2e requires CUDA backend, but CUDA is not available")
    workers: list[SimpleNamespace] = []
    router_proc = None
    try:
        ib_device = _detect_ib_device()
        # Launch 4 workers across 4 GPUs: prefill on 0,1 and decode on 2,3.
        # Register each worker in `workers` immediately after launch so that
        # a failure while launching a later worker still cleans up the earlier
        # ones in the `finally` block (previously the list was populated only
        # after all four had launched, leaking processes on partial failure).
        prefills: list[SimpleNamespace] = []
        for gpu_id in (0, 1):
            pf = _popen_launch_prefill_worker(
                e2e_model,
                bootstrap_port=_find_available_port(),
                ib_device=ib_device,
                base_gpu_id=gpu_id,
            )
            workers.append(pf)
            prefills.append(pf)
        decodes: list[SimpleNamespace] = []
        for gpu_id in (2, 3):
            dc = _popen_launch_decode_worker(
                e2e_model, ib_device=ib_device, base_gpu_id=gpu_id
            )
            workers.append(dc)
            decodes.append(dc)
        # PD router with two prefill and two decode endpoints
        rport = _find_available_port()
        router_url = f"http://127.0.0.1:{rport}"
        pport = _find_available_port()
        cmd = [
            "python3",
            "-m",
            "sglang_router.launch_router",
            "--host",
            "127.0.0.1",
            "--port",
            str(rport),
            "--policy",
            "round_robin",
            "--pd-disaggregation",
        ]
        for pf in prefills:
            cmd += ["--prefill", pf.url, str(pf.bootstrap_port)]
        for dc in decodes:
            cmd += ["--decode", dc.url]
        cmd += [
            "--prometheus-port",
            str(pport),
            "--prometheus-host",
            "127.0.0.1",
        ]
        router_proc = subprocess.Popen(cmd)
        _wait_health(router_url, timeout=180.0)
        yield SimpleNamespace(
            router_url=router_url, workers=workers, router_proc=router_proc
        )
    finally:
        # Tear down in reverse dependency order: router first, then workers.
        if router_proc is not None:
            _terminate(router_proc)
        for w in workers:
            _terminate(w.proc)
@pytest.mark.e2e
def test_pd_mmlu(e2e_model: str, pd_cluster):
    """Run a 64-example MMLU eval through the shared PD router and check the
    accuracy stays at or above 0.65."""
    eval_args = SimpleNamespace(
        base_url=pd_cluster.router_url,
        model=e2e_model,
        eval_name="mmlu",
        num_examples=64,
        num_threads=32,
        temperature=0.1,
    )
    result = run_eval(eval_args)
    assert result["score"] >= 0.65
@pytest.mark.e2e
def test_pd_genai_bench(e2e_model: str, pd_cluster, genai_bench_runner):
    """Run a short genai-bench benchmark against the shared PD router
    (2 prefill + 2 decode workers) and validate aggregate metrics."""
    # Thresholds the aggregate benchmark metrics must satisfy.
    thresholds = {
        "ttft_mean_max": 12,
        "e2e_latency_mean_max": 15,
        "input_throughput_mean_min": 400,
        "output_throughput_mean_min": 20,
        "gpu_util_p50_min": 99,
    }
    genai_bench_runner(
        router_url=pd_cluster.router_url,
        model_path=e2e_model,
        experiment_folder="benchmark_round_robin_pd",
        thresholds=thresholds,
        kill_procs=pd_cluster.workers,
    )