# File: sglang/sgl-router/py_test/e2e/test_pd_router.py
# End-to-end tests for the PD (prefill/decode disaggregation) router.
import logging
import os
import socket
import subprocess
import time
from types import SimpleNamespace
from typing import Optional
import pytest
import requests
from sglang.test.run_eval import run_eval
logger = logging.getLogger(__name__)
def _find_available_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
def _wait_health(url: str, timeout: float = 180.0) -> None:
    """Poll ``{url}/health`` once per second until it answers HTTP 200.

    Raises:
        TimeoutError: if the endpoint does not become healthy within
            *timeout* seconds.
    """
    deadline = time.perf_counter() + timeout
    with requests.Session() as session:
        while time.perf_counter() < deadline:
            try:
                resp = session.get(f"{url}/health", timeout=5)
            except requests.RequestException:
                # Server not accepting connections yet; retry after a pause.
                pass
            else:
                if resp.status_code == 200:
                    return
            time.sleep(1)
    raise TimeoutError(f"Service at {url} failed to become healthy in time")
def _detect_ib_device() -> Optional[str]:
"""Return first active IB device name (e.g., mlx5_0) or None if unavailable."""
# Fast check that ibv_devinfo exists
try:
subprocess.run(
["ibv_devinfo", "-l"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=1,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
return None
for i in range(12):
dev = f"mlx5_{i}"
try:
res = subprocess.run(
["ibv_devinfo", dev],
capture_output=True,
text=True,
timeout=2,
)
if res.returncode == 0 and ("state:" in res.stdout):
for line in res.stdout.splitlines():
if "state:" in line and "PORT_ACTIVE" in line:
return dev
except Exception:
pass
return None
def _popen_launch_prefill_worker(
    model: str,
    bootstrap_port: int,
    ib_device: Optional[str] = None,
    base_gpu_id: int = 0,
) -> SimpleNamespace:
    """Launch a prefill-mode sglang server and block until it reports healthy.

    Returns a namespace with the child process, its base URL, and the
    disaggregation bootstrap port it was given.
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3", "-m", "sglang.launch_server",
        "--model-path", model,
        "--disaggregation-mode", "prefill",
        "--host", "127.0.0.1",
        "--port", str(port),
        "--disaggregation-bootstrap-port", str(bootstrap_port),
        "--base-gpu-id", str(base_gpu_id),
    ]
    if ib_device:
        cmd.extend(["--disaggregation-ib-device", ib_device])
    proc = subprocess.Popen(cmd)
    _wait_health(url, timeout=300.0)
    return SimpleNamespace(proc=proc, url=url, bootstrap_port=bootstrap_port)
def _popen_launch_decode_worker(
    model: str, ib_device: Optional[str] = None, base_gpu_id: int = 0
) -> SimpleNamespace:
    """Launch a decode-mode sglang server and block until it reports healthy.

    Returns a namespace with the child process and its base URL.
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3", "-m", "sglang.launch_server",
        "--model-path", model,
        "--disaggregation-mode", "decode",
        "--host", "127.0.0.1",
        "--port", str(port),
        "--base-gpu-id", str(base_gpu_id),
    ]
    if ib_device:
        cmd.extend(["--disaggregation-ib-device", ib_device])
    proc = subprocess.Popen(cmd)
    _wait_health(url, timeout=300.0)
    return SimpleNamespace(proc=proc, url=url)
def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
if proc is None:
return
proc.terminate()
start = time.perf_counter()
while proc.poll() is None:
if time.perf_counter() - start > timeout:
proc.kill()
break
time.sleep(1)
@pytest.fixture(scope="module")
def pd_cluster(e2e_model: str):
    """Start 2 prefill + 2 decode workers and one PD router, once per module."""
    # Environment capability checks: require sgl_kernel and GPU backend
    try:
        import sgl_kernel  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(f"PD e2e requires sgl_kernel but it is not available: {e}")
    try:
        import torch  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(
            f"PD e2e requires torch but it is not available or misconfigured: {e}"
        )
    if not torch.cuda.is_available():  # pragma: no cover - environment dependent
        pytest.fail("PD e2e requires CUDA backend, but CUDA is not available")
    workers: list[SimpleNamespace] = []
    router_proc = None
    try:
        ib_device = _detect_ib_device()
        # Launch 4 workers across 4 GPUs: prefill on 0,1 and decode on 2,3.
        # Register each worker in `workers` immediately after launch so that
        # a failure while launching a later worker still cleans up the earlier
        # ones in the `finally` block (previously the list was populated only
        # after all four had launched, leaking processes on partial failure).
        prefills: list[SimpleNamespace] = []
        for gpu_id in (0, 1):
            pf = _popen_launch_prefill_worker(
                e2e_model,
                bootstrap_port=_find_available_port(),
                ib_device=ib_device,
                base_gpu_id=gpu_id,
            )
            workers.append(pf)
            prefills.append(pf)
        decodes: list[SimpleNamespace] = []
        for gpu_id in (2, 3):
            dc = _popen_launch_decode_worker(
                e2e_model, ib_device=ib_device, base_gpu_id=gpu_id
            )
            workers.append(dc)
            decodes.append(dc)
        # PD router with two prefill and two decode endpoints
        rport = _find_available_port()
        router_url = f"http://127.0.0.1:{rport}"
        pport = _find_available_port()
        cmd = [
            "python3",
            "-m",
            "sglang_router.launch_router",
            "--host",
            "127.0.0.1",
            "--port",
            str(rport),
            "--policy",
            "round_robin",
            "--pd-disaggregation",
        ]
        for pf in prefills:
            cmd += ["--prefill", pf.url, str(pf.bootstrap_port)]
        for dc in decodes:
            cmd += ["--decode", dc.url]
        cmd += [
            "--prometheus-port",
            str(pport),
            "--prometheus-host",
            "127.0.0.1",
        ]
        router_proc = subprocess.Popen(cmd)
        _wait_health(router_url, timeout=180.0)
        yield SimpleNamespace(
            router_url=router_url, workers=workers, router_proc=router_proc
        )
    finally:
        # Tear down in reverse dependency order: router first, then workers.
        if router_proc is not None:
            _terminate(router_proc)
        for w in workers:
            _terminate(w.proc)
@pytest.mark.e2e
def test_pd_mmlu(e2e_model: str, pd_cluster):
    """Run a 64-example MMLU eval through the shared PD router and check the
    accuracy stays at or above 0.65."""
    eval_args = SimpleNamespace(
        base_url=pd_cluster.router_url,
        model=e2e_model,
        eval_name="mmlu",
        num_examples=64,
        num_threads=32,
        temperature=0.1,
    )
    result = run_eval(eval_args)
    assert result["score"] >= 0.65
@pytest.mark.e2e
def test_pd_genai_bench(e2e_model: str, pd_cluster, genai_bench_runner):
    """Run a short genai-bench benchmark against the shared PD router
    (2 prefill + 2 decode workers) and validate aggregate metrics."""
    # Thresholds the aggregate benchmark metrics must satisfy.
    thresholds = {
        "ttft_mean_max": 12,
        "e2e_latency_mean_max": 15,
        "input_throughput_mean_min": 400,
        "output_throughput_mean_min": 20,
        "gpu_util_p50_min": 99,
    }
    genai_bench_runner(
        router_url=pd_cluster.router_url,
        model_path=e2e_model,
        experiment_folder="benchmark_round_robin_pd",
        thresholds=thresholds,
        kill_procs=pd_cluster.workers,
    )