[router] Introduce router integration tests (#10086)
This commit is contained in:
1
sgl-router/py_test/fixtures/__init__.py
Normal file
1
sgl-router/py_test/fixtures/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Shared fixtures for router integration tests."""
|
||||
248
sgl-router/py_test/fixtures/mock_worker.py
Normal file
248
sgl-router/py_test/fixtures/mock_worker.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""
|
||||
Lightweight mock worker HTTP server for router integration tests.
|
||||
|
||||
Implements minimal endpoints used by the router:
|
||||
- GET /health, /health_generate
|
||||
- POST /generate, /v1/completions, /v1/chat/completions
|
||||
- POST /flush_cache
|
||||
- GET /get_server_info, /get_model_info, /v1/models
|
||||
|
||||
Behavior knobs are controlled via CLI flags to simulate failures, latency, and load.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
|
||||
|
||||
# Global state (per-process)
|
||||
_inflight = 0
|
||||
_failures_seen = 0
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--host", default="127.0.0.1")
|
||||
p.add_argument("--port", type=int, required=True)
|
||||
p.add_argument("--worker-id", default=None)
|
||||
p.add_argument("--latency-ms", type=int, default=0)
|
||||
p.add_argument("--timeout", action="store_true")
|
||||
p.add_argument("--status-code", type=int, default=200)
|
||||
p.add_argument("--fail-first-n", type=int, default=0)
|
||||
p.add_argument("--random-fail-rate", type=float, default=0.0)
|
||||
p.add_argument("--require-api-key", action="store_true")
|
||||
p.add_argument("--api-key", default=None)
|
||||
p.add_argument("--max-payload-bytes", type=int, default=10 * 1024 * 1024)
|
||||
p.add_argument("--stream", action="store_true")
|
||||
p.add_argument("--crash-on-request", action="store_true")
|
||||
p.add_argument("--health-fail-after-ms", type=int, default=0)
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def _extract_worker_id(args: argparse.Namespace) -> str:
|
||||
if args.worker_id:
|
||||
return str(args.worker_id)
|
||||
# default to port (unique enough for tests)
|
||||
return f"worker-{args.port}"
|
||||
|
||||
|
||||
def create_app(args: argparse.Namespace) -> FastAPI:
    """Build the FastAPI application that impersonates a worker.

    Args:
        args: Parsed CLI flags. The handlers read ``latency_ms``,
            ``timeout``, ``status_code``, ``fail_first_n``,
            ``random_fail_rate``, ``require_api_key``, ``api_key``,
            ``max_payload_bytes``, ``stream``, ``crash_on_request`` and
            ``health_fail_after_ms``; ``worker_id``/``port`` determine the
            worker id reported in responses.

    Returns:
        The configured application. The in-flight and injected-failure
        counters are module globals, so they are shared by every app
        created in the same process.
    """
    app = FastAPI()
    worker_id = _extract_worker_id(args)
    # Only elapsed time matters for --health-fail-after-ms, so use the
    # monotonic clock: wall-clock adjustments must not flip health status.
    started_at = time.monotonic()
    crashed = {"done": False}

    async def maybe_delay():
        """Sleep for --latency-ms when a positive latency is configured."""
        if args.latency_ms > 0:
            await asyncio.sleep(args.latency_ms / 1000.0)

    def should_fail() -> Optional[int]:
        """Return the status code to fail this request with, or ``None``.

        Checked in order: the first ``--fail-first-n`` requests (500),
        the ``--random-fail-rate`` lottery (500), then a non-200
        ``--status-code`` override.
        """
        global _failures_seen
        if args.fail_first_n > 0 and _failures_seen < args.fail_first_n:
            _failures_seen += 1
            return 500
        if args.random_fail_rate > 0.0 and random.random() < args.random_fail_rate:
            return 500
        if args.status_code != 200:
            return int(args.status_code)
        return None

    def check_api_key(request: Request):
        """Raise a 401 ``HTTPException`` unless the bearer token is acceptable.

        Does nothing unless ``--require-api-key`` is set. When it is set
        without ``--api-key``, any bearer token is accepted.
        """
        if not args.require_api_key:
            return
        auth = request.headers.get("Authorization")
        if not auth or not auth.startswith("Bearer "):
            raise HTTPException(status_code=401, detail="Unauthorized")
        key = auth.split(" ", 1)[1]
        if args.api_key and key != args.api_key:
            raise HTTPException(status_code=401, detail="Unauthorized")

    @asynccontextmanager
    async def track_inflight():
        """Count the enclosed work as one in-flight request."""
        global _inflight
        _inflight += 1
        try:
            yield
        finally:
            _inflight -= 1

    @app.get("/health")
    async def health():
        elapsed_ms = (time.monotonic() - started_at) * 1000.0
        if args.health_fail_after_ms and elapsed_ms >= args.health_fail_after_ms:
            return PlainTextResponse("bad", status_code=500)
        return PlainTextResponse("ok", status_code=200)

    @app.get("/health_generate")
    async def health_generate():
        return PlainTextResponse("ok", status_code=200)

    @app.post("/flush_cache")
    async def flush_cache():
        return PlainTextResponse("ok", status_code=200)

    @app.get("/get_model_info")
    async def get_model_info():
        return JSONResponse({"model": "mock", "vocab_size": 32000})

    @app.get("/v1/models")
    async def list_models():
        return JSONResponse({"data": [{"id": "mock", "object": "model"}]})

    @app.get("/get_server_info")
    async def get_server_info():
        return JSONResponse(
            {
                "worker_id": worker_id,
                "load_in_flight": _inflight,
                "cache": {"size": 0, "hit_rate": 0.0},
            }
        )

    @app.get("/get_load")
    async def get_load():
        return JSONResponse({"load": _inflight})

    def make_json_response(obj: dict, status_code: int = 200) -> JSONResponse:
        """Build a JSON response tagged with the ``X-Worker-Id`` header."""
        resp = JSONResponse(obj, status_code=status_code)
        resp.headers["X-Worker-Id"] = worker_id
        return resp

    async def handle_text_request(request: Request):
        """Serve a non-streaming completion, applying every behavior knob."""
        # Authorization
        check_api_key(request)

        # Payload limit
        body = await request.body()
        if len(body) > args.max_payload_bytes:
            return make_json_response({"error": "payload too large"}, status_code=413)

        # Simulate crash on first request
        if args.crash_on_request and not crashed["done"]:
            crashed["done"] = True
            os._exit(1)

        # Optional timeout (simulate hang)
        if args.timeout:
            await asyncio.sleep(3600)

        # Optional latency
        await maybe_delay()

        # Optional failures; should_fail() never returns 200.
        fail_code = should_fail()
        if fail_code is not None:
            return make_json_response(
                {"error": f"mock failure {fail_code}"}, status_code=fail_code
            )

        # Echo the request payload back; an unparsable body echoes as {}.
        try:
            data = await request.json()
        except (json.JSONDecodeError, ValueError):
            data = {}

        now = time.time()
        ret = {
            "id": f"cmpl-{int(now*1000)}",
            "object": "text_completion",
            "created": int(now),
            "model": "mock",
            "choices": [
                {
                    "text": "ok",
                    "index": 0,
                    "finish_reason": "stop",
                }
            ],
            "worker_id": worker_id,
            "echo": data,
        }
        return make_json_response(ret, status_code=200)

    async def handle_stream_request(request: Request):
        """Serve a minimal SSE stream: two content chunks, then ``[DONE]``.

        NOTE: stream mode only enforces the API-key check; the payload,
        crash, timeout, latency and failure knobs are not consulted here.
        """
        check_api_key(request)

        async def gen():
            # Hold the in-flight slot while the body is being produced.
            # The handler returns before any chunk is sent, so counting
            # around the handler call alone would report zero load during
            # streaming.
            async with track_inflight():
                for _ in range(2):
                    await asyncio.sleep(0.01)
                    chunk = {
                        "choices": [{"delta": {"content": "x"}}],
                        "worker_id": worker_id,
                    }
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"

        headers = {"X-Worker-Id": worker_id}
        return StreamingResponse(gen(), media_type="text/event-stream", headers=headers)

    async def serve_completion(request: Request):
        """Shared body of the three generation endpoints."""
        if args.stream:
            # In-flight accounting happens inside the stream generator.
            return await handle_stream_request(request)
        async with track_inflight():
            return await handle_text_request(request)

    @app.post("/generate")
    async def generate(request: Request):
        return await serve_completion(request)

    @app.post("/v1/completions")
    async def completions(request: Request):
        return await serve_completion(request)

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        return await serve_completion(request)

    return app
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the CLI flags and serve the mock worker with uvicorn."""
    cli_args = _parse_args()
    application = create_app(cli_args)

    def _exit_cleanly(*_unused) -> None:
        sys.exit(0)

    # Exit with status 0 on SIGTERM so test teardown stays fast.
    signal.signal(signal.SIGTERM, _exit_cleanly)
    uvicorn.run(
        application,
        host=cli_args.host,
        port=cli_args.port,
        log_level="warning",
    )


if __name__ == "__main__":
    main()
|
||||
8
sgl-router/py_test/fixtures/ports.py
Normal file
8
sgl-router/py_test/fixtures/ports.py
Normal file
@@ -0,0 +1,8 @@
|
||||
import socket
|
||||
|
||||
|
||||
def find_free_port() -> int:
    """Ask the OS for a currently unused TCP port on 127.0.0.1.

    The probing socket is closed before returning, so the port is free at
    the time of the call but is not reserved for the caller.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Port 0 lets the OS choose any free port.
        probe.bind(("127.0.0.1", 0))
        _host, chosen_port = probe.getsockname()
        return chosen_port
    finally:
        probe.close()
|
||||
158
sgl-router/py_test/fixtures/router_manager.py
Normal file
158
sgl-router/py_test/fixtures/router_manager.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from .ports import find_free_port
|
||||
|
||||
|
||||
@dataclass
class ProcHandle:
    """A spawned child process paired with the base URL it serves on."""

    # Handle to the child process.
    process: subprocess.Popen
    # Base URL (scheme, host and port) of the child's HTTP endpoint.
    url: str
|
||||
|
||||
|
||||
class RouterManager:
    """Helper to spawn router processes and interact with admin endpoints.

    Every process started through ``start_router`` is remembered so that
    ``stop_all`` can shut all of them down at test teardown.
    """

    # Keys accepted in ``extra`` mapped to launcher CLI flags (subset for
    # integration). A bool value toggles the bare flag; any other value is
    # passed as the flag's argument. Unknown keys are ignored.
    _EXTRA_FLAGS: Dict[str, str] = {
        "max_payload_size": "--max-payload-size",
        "dp_aware": "--dp-aware",
        "api_key": "--api-key",
        # Health/monitoring
        "worker_startup_check_interval": "--worker-startup-check-interval",
        # Cache-aware tuning
        "cache_threshold": "--cache-threshold",
        "balance_abs_threshold": "--balance-abs-threshold",
        "balance_rel_threshold": "--balance-rel-threshold",
        # Retry
        "retry_max_retries": "--retry-max-retries",
        "retry_initial_backoff_ms": "--retry-initial-backoff-ms",
        "retry_max_backoff_ms": "--retry-max-backoff-ms",
        "retry_backoff_multiplier": "--retry-backoff-multiplier",
        "retry_jitter_factor": "--retry-jitter-factor",
        "disable_retries": "--disable-retries",
        # Circuit breaker
        "cb_failure_threshold": "--cb-failure-threshold",
        "cb_success_threshold": "--cb-success-threshold",
        "cb_timeout_duration_secs": "--cb-timeout-duration-secs",
        "cb_window_duration_secs": "--cb-window-duration-secs",
        "disable_circuit_breaker": "--disable-circuit-breaker",
        # Rate limiting
        "max_concurrent_requests": "--max-concurrent-requests",
        "queue_size": "--queue-size",
        "queue_timeout_secs": "--queue-timeout-secs",
        "rate_limit_tokens_per_second": "--rate-limit-tokens-per-second",
    }

    def __init__(self):
        self._children: List[subprocess.Popen] = []

    def start_router(
        self,
        worker_urls: Optional[List[str]] = None,
        policy: str = "round_robin",
        port: Optional[int] = None,
        extra: Optional[Dict] = None,
        # PD options
        pd_disaggregation: bool = False,
        prefill_urls: Optional[List[tuple]] = None,
        decode_urls: Optional[List[str]] = None,
        prefill_policy: Optional[str] = None,
        decode_policy: Optional[str] = None,
    ) -> "ProcHandle":
        """Launch a router subprocess and wait until it answers ``/health``.

        Args:
            worker_urls: Worker base URLs passed via ``--worker-urls``.
            policy: Value for ``--policy``.
            port: Port to listen on; a free one is picked when omitted.
            extra: Optional settings translated to CLI flags through
                ``_EXTRA_FLAGS``; ``None`` values and unknown keys are
                skipped.
            pd_disaggregation: Add ``--pd-disaggregation`` and the
                prefill/decode options below.
            prefill_urls: ``(url, bootstrap_port)`` pairs; a ``None`` port
                is passed as the literal ``none``.
            decode_urls: Decode worker URLs.
            prefill_policy: Value for ``--prefill-policy``.
            decode_policy: Value for ``--decode-policy``.

        Returns:
            A handle holding the process and its ``http://127.0.0.1:<port>``
            base URL.

        Raises:
            TimeoutError: If the router does not become healthy in time.
                The process stays registered, so ``stop_all`` still cleans
                it up.
        """
        worker_urls = worker_urls or []
        port = port or find_free_port()
        # Avoid Prometheus port collisions by assigning a free port per
        # router. The probe socket for `port` is already closed, so make
        # sure the OS did not hand the same number back.
        prom_port = find_free_port()
        while prom_port == port:
            prom_port = find_free_port()

        cmd = [
            "python3",
            "-m",
            "sglang_router.launch_router",
            "--host",
            "127.0.0.1",
            "--port",
            str(port),
            "--policy",
            policy,
            "--prometheus-port",
            str(prom_port),
            "--prometheus-host",
            "127.0.0.1",
        ]
        if worker_urls:
            cmd.extend(["--worker-urls", *worker_urls])

        # PD routing configuration
        if pd_disaggregation:
            cmd.append("--pd-disaggregation")
            for url, bport in prefill_urls or []:
                bootstrap = "none" if bport is None else str(bport)
                cmd.extend(["--prefill", url, bootstrap])
            for url in decode_urls or []:
                cmd.extend(["--decode", url])
            if prefill_policy:
                cmd.extend(["--prefill-policy", prefill_policy])
            if decode_policy:
                cmd.extend(["--decode-policy", decode_policy])

        # Map supported extras to CLI flags
        for key, value in (extra or {}).items():
            flag = self._EXTRA_FLAGS.get(key)
            if value is None or flag is None:
                continue
            if isinstance(value, bool):
                if value:
                    cmd.append(flag)
            else:
                cmd.extend([flag, str(value)])

        proc = subprocess.Popen(cmd)
        self._children.append(proc)
        url = f"http://127.0.0.1:{port}"
        self._wait_health(url)
        return ProcHandle(process=proc, url=url)

    def _wait_health(self, base_url: str, timeout: float = 30.0):
        """Poll ``<base_url>/health`` until it returns 200.

        Raises:
            TimeoutError: If no 200 response arrives within ``timeout``
                seconds.
        """
        deadline = time.monotonic() + timeout
        with requests.Session() as session:
            while time.monotonic() < deadline:
                try:
                    resp = session.get(f"{base_url}/health", timeout=2)
                    if resp.status_code == 200:
                        return
                except requests.RequestException:
                    # Not accepting connections yet; retry until the deadline.
                    pass
                time.sleep(0.2)
        raise TimeoutError(f"Router at {base_url} did not become healthy")

    def add_worker(
        self, base_url: str, worker_url: str, timeout: float = 60.0
    ) -> None:
        """Register ``worker_url`` with the router; assert a 200 response.

        ``timeout`` (seconds) bounds the HTTP call so an unresponsive
        router cannot hang the test run.
        """
        r = requests.post(
            f"{base_url}/add_worker", params={"url": worker_url}, timeout=timeout
        )
        assert r.status_code == 200, f"add_worker failed: {r.status_code} {r.text}"

    def remove_worker(
        self, base_url: str, worker_url: str, timeout: float = 60.0
    ) -> None:
        """Unregister ``worker_url`` from the router; assert a 200 response.

        ``timeout`` (seconds) bounds the HTTP call.
        """
        r = requests.post(
            f"{base_url}/remove_worker", params={"url": worker_url}, timeout=timeout
        )
        assert r.status_code == 200, f"remove_worker failed: {r.status_code} {r.text}"

    def list_workers(self, base_url: str, timeout: float = 60.0) -> List[str]:
        """Return the ``urls`` list reported by ``/list_workers``.

        An absent ``urls`` key yields an empty list. ``timeout`` (seconds)
        bounds the HTTP call.
        """
        r = requests.get(f"{base_url}/list_workers", timeout=timeout)
        assert r.status_code == 200, f"list_workers failed: {r.status_code} {r.text}"
        data = r.json()
        return data.get("urls", [])

    def stop_all(self):
        """Terminate every router started by this manager and forget them.

        Each live child gets ``terminate()`` and five seconds to exit;
        after that it is killed and reaped.
        """
        for proc in self._children:
            if proc.poll() is not None:
                continue  # already exited
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()
                # Reap the killed child so it does not linger as a zombie.
                proc.wait()
        self._children.clear()
|
||||
Reference in New Issue
Block a user