[router] Introduce router integration tests (#10086)
This commit is contained in:
1
sgl-router/py_test/integration/__init__.py
Normal file
1
sgl-router/py_test/integration/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Integration test package for the router."""
|
||||
109
sgl-router/py_test/integration/conftest.py
Normal file
109
sgl-router/py_test/integration/conftest.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ..fixtures.ports import find_free_port
|
||||
from ..fixtures.router_manager import RouterManager
|
||||
|
||||
|
||||
def pytest_configure(config):
    """Register the ``integration`` marker so marked tests run without warnings."""
    config.addinivalue_line("markers", "integration: mark as router integration test")
|
||||
|
||||
|
||||
@pytest.fixture
def router_manager() -> Iterable[RouterManager]:
    """Yield a fresh :class:`RouterManager`; stop every router it started on teardown."""
    manager = RouterManager()
    try:
        yield manager
    finally:
        # Ensure no router process outlives the test, even on failure.
        manager.stop_all()
|
||||
|
||||
|
||||
def _spawn_mock_worker(args: List[str]) -> Tuple[subprocess.Popen, str, str]:
    """Launch one mock-worker subprocess and block until it reports healthy.

    Args:
        args: Extra CLI arguments appended to the mock_worker.py invocation.

    Returns:
        Tuple of (process handle, base URL, worker id).
    """
    # conftest.py lives in py_test/integration/, so parents[2] is the repo root.
    repo_root = Path(__file__).resolve().parents[2]
    script = repo_root / "py_test" / "fixtures" / "mock_worker.py"
    port = find_free_port()
    worker_id = f"worker-{port}"
    cmd = [
        "python3",
        str(script),
        "--port",
        str(port),
        "--worker-id",
        worker_id,
        *args,
    ]
    proc = subprocess.Popen(cmd)
    url = f"http://127.0.0.1:{port}"
    # Fail fast if the worker never comes up rather than time out per-test.
    _wait_health(url)
    return proc, url, worker_id
|
||||
|
||||
|
||||
def _wait_health(url: str, timeout: float = 10.0):
    """Poll ``{url}/health`` every 0.1s until it returns 200 or ``timeout`` elapses.

    Raises:
        TimeoutError: if the worker never becomes healthy within ``timeout`` seconds.
    """
    deadline = time.time() + timeout
    with requests.Session() as session:
        while time.time() < deadline:
            try:
                if session.get(f"{url}/health", timeout=1).status_code == 200:
                    return
            except requests.RequestException:
                # Connection refused while the worker is still booting — keep polling.
                pass
            time.sleep(0.1)
    raise TimeoutError(f"Mock worker at {url} did not become healthy")
|
||||
|
||||
|
||||
@pytest.fixture
def mock_worker():
    """Start a single healthy mock worker; yields (process, url, worker_id)."""
    proc, url, worker_id = _spawn_mock_worker([])
    try:
        yield proc, url, worker_id
    finally:
        # Terminate gracefully; escalate to SIGKILL if it does not exit in time.
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=3)
            except subprocess.TimeoutExpired:
                proc.kill()
|
||||
|
||||
|
||||
@pytest.fixture
def mock_workers():
    """Factory fixture that starts N mock workers with custom CLI args.

    Usage:
        procs, urls, ids = mock_workers(n=3, args=["--latency-ms", "5"])  # same args for all

    Every process spawned through the factory is terminated on teardown.
    """
    spawned: List[subprocess.Popen] = []

    def _start(n: int, args: List[str] | None = None):
        worker_args = args or []
        batch: List[subprocess.Popen] = []
        urls: List[str] = []
        ids: List[str] = []
        for _ in range(n):
            proc, url, wid = _spawn_mock_worker(worker_args)
            spawned.append(proc)  # track globally for teardown
            batch.append(proc)
            urls.append(url)
            ids.append(wid)
        return batch, urls, ids

    try:
        yield _start
    finally:
        for proc in spawned:
            if proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=3)
                except subprocess.TimeoutExpired:
                    proc.kill()
|
||||
@@ -0,0 +1 @@
|
||||
"""Load balancing integration tests."""
|
||||
@@ -0,0 +1,73 @@
|
||||
import collections
|
||||
import concurrent.futures
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cache_aware_affinity(mock_workers, router_manager):
    """A repeated identical prompt should stick to one worker via the cache tree."""
    _, urls, _ = mock_workers(n=2)
    rh = router_manager.start_router(worker_urls=urls, policy="cache_aware")

    payload = {
        "model": "test-model",
        "prompt": "repeated prompt for cache",
        "max_tokens": 1,
        "stream": False,
    }
    counts = collections.Counter()
    with requests.Session() as session:
        for _ in range(12):
            resp = session.post(f"{rh.url}/v1/completions", json=payload)
            assert resp.status_code == 200
            wid = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            counts[wid] += 1

    # Expect strong skew toward one worker (tree match); majority > 80%
    assert max(counts.values()) >= 10, counts
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cache_aware_diverse_prompts_balances(mock_workers, router_manager):
    """With no shared prefixes, cache-aware routing should spread load around."""
    # Add latency so concurrent requests overlap and influence load-based selection
    _, urls, _ = mock_workers(n=3, args=["--latency-ms", "30"])
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="cache_aware",
        extra={
            # Thresholds chosen so prefix matching effectively never wins
            # and load balancing drives the choice.
            "cache_threshold": 0.99,
            "balance_abs_threshold": 0,
            "balance_rel_threshold": 1.0,
        },
    )

    def call(_):
        # Use diverse, unrelated prompts to avoid prefix matches entirely
        resp = requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": str(uuid.uuid4()),
                "max_tokens": 1,
                "stream": False,
            },
            timeout=5,
        )
        assert resp.status_code == 200
        return resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")

    counts = collections.Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        for wid in pool.map(call, range(40)):
            counts[wid] += 1

    # Expect participation of at least two workers
    assert sum(1 for v in counts.values() if v > 0) >= 2, counts
|
||||
@@ -0,0 +1,89 @@
|
||||
import collections
|
||||
import concurrent.futures
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_power_of_two_prefers_less_loaded(mock_workers, router_manager):
    """Power-of-two should route fewer requests to the more loaded worker.

    The router monitors /get_load and power-of-two uses the cached loads when
    choosing between two sampled workers, so the slow (high-inflight) worker
    should end up serving fewer completions than the fast one.
    """
    # Start one slow and one fast worker using the fixture factory
    _, urls_slow, ids_slow = mock_workers(n=1, args=["--latency-ms", "200"])
    _, urls_fast, ids_fast = mock_workers(n=1, args=["--latency-ms", "0"])
    urls = urls_slow + urls_fast
    ids = ids_slow + ids_fast
    slow_id = ids_slow[0]
    # FIX: `slow_url` is used by _direct_load below but was never assigned,
    # so the background-load phase raised NameError in every run.
    slow_url = urls_slow[0]

    rh = router_manager.start_router(
        worker_urls=urls,
        policy="power_of_two",
        extra={"worker_startup_check_interval": 1},
    )

    # Prime: fire a burst to create measurable load on the slow worker,
    # then wait for a monitor tick so the router's cached loads update.
    def _prime_call(i):
        try:
            requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"warm-{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=5,
            )
        except Exception:
            pass  # best-effort load generation; failures are irrelevant here

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        list(ex.map(_prime_call, range(128)))
    time.sleep(2)

    # Apply direct background load on the slow worker to amplify the load diff
    def _direct_load(i):
        try:
            requests.post(
                f"{slow_url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"bg-{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=5,
            )
        except Exception:
            pass

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        list(ex.map(_direct_load, range(128)))
    time.sleep(1)

    def call(i):
        r = requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": f"p{i}",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=5,
        )
        assert r.status_code == 200
        return r.headers.get("X-Worker-Id") or r.json().get("worker_id")

    counts = collections.Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        for wid in ex.map(call, range(200)):
            counts[wid] += 1

    # Expect the slow worker (higher latency/inflight) to receive fewer requests
    fast_worker_id = next(i for i in ids if i != slow_id)
    assert counts[slow_id] < counts[fast_worker_id], counts
|
||||
33
sgl-router/py_test/integration/load_balancing/test_random.py
Normal file
33
sgl-router/py_test/integration/load_balancing/test_random.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import collections
|
||||
import math
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_random_distribution(mock_workers, router_manager):
    """Random policy should spread 200 requests roughly evenly over 4 workers."""
    _, urls, ids = mock_workers(n=4)
    rh = router_manager.start_router(worker_urls=urls, policy="random")

    total = 200
    counts = collections.Counter()
    with requests.Session() as session:
        for i in range(total):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"p{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            wid = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            counts[wid] += 1

    # simple statistical tolerance: each worker should be within ±50% of mean
    mean = total / len(ids)
    for wid in ids:
        assert 0.5 * mean <= counts[wid] <= 1.5 * mean, counts
|
||||
@@ -0,0 +1,34 @@
|
||||
import collections
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_round_robin_distribution(mock_workers, router_manager):
    """Round-robin should distribute 30 requests near-evenly across 3 workers."""
    _, urls, ids = mock_workers(n=3)

    rh = router_manager.start_router(worker_urls=urls, policy="round_robin")

    counts = collections.Counter()
    with requests.Session() as session:
        for i in range(30):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"hello {i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            wid = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            # Every response must be attributed to one of our workers
            assert wid in ids
            counts[wid] += 1

    # Expect near-even distribution across 3 workers:
    # 30 requests -> ideally 10 each; allow small tolerance ±3
    for wid in ids:
        assert 7 <= counts[wid] <= 13, counts
|
||||
38
sgl-router/py_test/integration/test_api_auth.py
Normal file
38
sgl-router/py_test/integration/test_api_auth.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_router_api_key_enforcement(router_manager, mock_workers):
    """The router must forward Authorization headers to a key-protected backend."""
    # Start a backend requiring an API key; the router should forward the
    # Authorization header transparently.
    _, urls, _ = mock_workers(
        n=1, args=["--require-api-key", "--api-key", "correct_api_key"]
    )
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="round_robin",
        extra={},
    )

    def post(headers=None):
        return requests.post(
            f"{rh.url}/v1/completions",
            json={"model": "test-model", "prompt": "x", "max_tokens": 1, "stream": False},
            headers=headers,
        )

    # No auth -> 401
    assert post().status_code == 401

    # Invalid auth -> 401
    assert post({"Authorization": "Bearer wrong"}).status_code == 401

    # Correct auth -> 200
    assert post({"Authorization": "Bearer correct_api_key"}).status_code == 200
|
||||
191
sgl-router/py_test/integration/test_circuit_breaker.py
Normal file
191
sgl-router/py_test/integration/test_circuit_breaker.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_circuit_breaker_opens_and_recovers(router_manager, mock_workers):
    """The breaker opens (503) after repeated failures and closes after recovery."""
    # A single worker that fails its first 3 requests, then succeeds
    _, [wurl], _ = mock_workers(n=1, args=["--fail-first-n", "3"])
    rh = router_manager.start_router(
        worker_urls=[wurl],
        policy="round_robin",
        extra={
            "cb_failure_threshold": 3,
            "cb_success_threshold": 2,
            "cb_timeout_duration_secs": 3,
            "cb_window_duration_secs": 10,
            "disable_retries": True,  # failures must surface, not be retried away
        },
    )

    def post_once():
        return requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": "trigger",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=3,
        )

    # Drive requests until the breaker opens and starts short-circuiting with 503
    saw_503 = any(post_once().status_code == 503 for _ in range(8))
    assert saw_503, "circuit breaker did not open to return 503"

    # After the timeout window the worker has healed; two successes close the breaker
    time.sleep(4)
    assert post_once().status_code == 200
    assert post_once().status_code == 200
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_circuit_breaker_half_open_failure_reopens(router_manager, mock_workers):
    """A failure during half-open probing must re-open the breaker."""
    _, [wurl], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    rh = router_manager.start_router(
        worker_urls=[wurl],
        policy="round_robin",
        extra={
            "cb_failure_threshold": 2,
            "cb_success_threshold": 2,
            "cb_timeout_duration_secs": 2,
            "cb_window_duration_secs": 5,
            "disable_retries": True,
        },
    )

    def post_once():
        return requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": "x",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=3,
        )

    # Drive failures until the breaker opens
    opened = any(post_once().status_code == 503 for _ in range(8))
    assert opened, "circuit breaker did not open"

    # After the timeout the breaker half-opens: the probe reaches the (still
    # failing) worker and surfaces its 500, which re-opens the breaker,
    # so the next request is short-circuited with 503 again.
    time.sleep(3)
    assert post_once().status_code == 500
    assert post_once().status_code == 503
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_circuit_breaker_disable_flag(router_manager, mock_workers):
    """With the breaker disabled, backend failures surface directly as 500s."""
    _, [wurl], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    rh = router_manager.start_router(
        worker_urls=[wurl],
        policy="round_robin",
        extra={
            "disable_circuit_breaker": True,
            "disable_retries": True,
        },
    )

    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "x",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=3,
    )
    # No 503 short-circuit: the worker's 500 passes straight through
    assert resp.status_code == 500
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_circuit_breaker_per_worker_isolation(router_manager, mock_workers):
    """Opening the breaker on one worker must not affect the healthy worker."""
    _, [fail_url], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    _, [ok_url], _ = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=[fail_url, ok_url],
        policy="round_robin",
        extra={
            "cb_failure_threshold": 2,
            "cb_success_threshold": 1,
            "cb_timeout_duration_secs": 2,
            "cb_window_duration_secs": 10,
            "disable_retries": True,
        },
    )

    def post_once():
        return requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": "y",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=3,
        )

    failures_seen = 0
    successes_after_open = 0
    breaker_opened = False
    for _ in range(30):
        resp = post_once()
        if breaker_opened:
            # Once the failing worker's breaker is open, every request should
            # be served (200) by the remaining healthy worker.
            assert resp.status_code == 200, (
                f"Unexpected non-200 after CB open: {resp.status_code}"
            )
            successes_after_open += 1
        elif resp.status_code == 500:
            failures_seen += 1
            if failures_seen >= 2:
                # Fire two extra requests — presumably to push the failing
                # worker past its threshold — then treat the breaker as open.
                post_once()
                post_once()
                breaker_opened = True
    assert breaker_opened and successes_after_open >= 5
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_circuit_breaker_with_retries(router_manager, mock_workers):
    """With retries enabled, a request hitting the failing worker still succeeds."""
    _, [fail_url], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    _, [ok_url], _ = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=[fail_url, ok_url],
        policy="round_robin",
        extra={
            "retry_max_retries": 3,
            "retry_initial_backoff_ms": 10,
            "retry_max_backoff_ms": 50,
            "cb_failure_threshold": 2,
            "cb_success_threshold": 1,
            "cb_timeout_duration_secs": 2,
            "cb_window_duration_secs": 10,
        },
    )

    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "z",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    # The retry layer reroutes onto the healthy worker
    assert resp.status_code == 200
|
||||
36
sgl-router/py_test/integration/test_fault_tolerance.py
Normal file
36
sgl-router/py_test/integration/test_fault_tolerance.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import concurrent.futures
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_worker_crash_reroute_with_retries(router_manager, mock_workers):
    """A worker crashing on its first request is retried onto the healthy one."""
    # Start one healthy worker and one that will crash on first request
    _, [ok_url], _ = mock_workers(n=1)
    _, [crash_url], _ = mock_workers(n=1, args=["--crash-on-request"])
    rh = router_manager.start_router(
        worker_urls=[crash_url, ok_url],
        policy="round_robin",
        extra={
            "retry_max_retries": 3,
            "retry_initial_backoff_ms": 10,
            "retry_max_backoff_ms": 50,
        },
    )

    # A single request should succeed via retry to the healthy worker
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "crash",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 200
    # mock_workers fixture handles cleanup
|
||||
127
sgl-router/py_test/integration/test_pd_routing.py
Normal file
127
sgl-router/py_test/integration/test_pd_routing.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import collections
|
||||
import concurrent.futures
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_pd_power_of_two_decode_attribution(router_manager, mock_workers):
    """In PD mode, responses are attributed to decode workers, spread over >= 2."""
    # Two prefill and three decode mock workers via the fixture factory
    _, prefill_urls_raw, _ = mock_workers(n=2)
    _, decode_urls_raw, decode_ids_list = mock_workers(n=3)
    decode_ids = set(decode_ids_list)

    rh = router_manager.start_router(
        policy="power_of_two",
        pd_disaggregation=True,
        prefill_urls=[(u, None) for u in prefill_urls_raw],
        decode_urls=list(decode_urls_raw),
        extra={"worker_startup_check_interval": 1},
    )

    counts = collections.Counter()
    with requests.Session() as session:
        for i in range(30):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"p{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            wid = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            # Attribution must point at a decode worker, never a prefill one
            assert wid in decode_ids
            counts[wid] += 1

    # At least two decode workers should have participated
    assert sum(1 for v in counts.values() if v > 0) >= 2
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_pd_power_of_two_skews_to_faster_decode(router_manager, mock_workers):
    """Power-of-two in PD mode should favor the faster of two decode workers."""
    # Start two prefill workers (fast)
    _, prefill_urls_raw, _ = mock_workers(n=2)

    # Start two decode workers: one slow, one fast
    _, [decode_slow_url], [slow_id] = mock_workers(n=1, args=["--latency-ms", "300"])
    _, [decode_fast_url], [fast_id] = mock_workers(n=1)

    rh = router_manager.start_router(
        policy="power_of_two",
        pd_disaggregation=True,
        prefill_urls=[(u, None) for u in prefill_urls_raw],
        decode_urls=[decode_slow_url, decode_fast_url],
        extra={"worker_startup_check_interval": 1},
    )

    def _post_quiet(base_url, prompt):
        # Best-effort load generation; transport errors are deliberately ignored
        try:
            requests.post(
                f"{base_url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": prompt,
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=8,
            )
        except Exception:
            pass

    # Prime through the router so the load monitor observes traffic,
    # then wait for a monitor tick
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as pool:
        list(pool.map(lambda i: _post_quiet(rh.url, f"warm-{i}"), range(128)))
    time.sleep(2)

    # Apply direct background load on the slow decode worker to amplify the diff
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as pool:
        list(pool.map(lambda i: _post_quiet(decode_slow_url, f"bg-{i}"), range(128)))
    time.sleep(1)

    def call(i):
        resp = requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": f"p{i}",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=8,
        )
        assert resp.status_code == 200
        return resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")

    counts = collections.Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as pool:
        for wid in pool.map(call, range(200)):
            counts[wid] += 1

    # The slow decode worker should receive fewer requests than the fast one
    assert counts[slow_id] < counts[fast_id], counts
|
||||
91
sgl-router/py_test/integration/test_rate_limiting.py
Normal file
91
sgl-router/py_test/integration/test_rate_limiting.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import concurrent.futures
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_rate_limit_and_queue(router_manager, mock_workers):
    """With no queue, requests beyond the concurrency limit fail fast with 429."""
    # One fast backend
    _, urls, _ = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="round_robin",
        extra={
            "max_concurrent_requests": 2,
            "queue_size": 0,  # no queue -> immediate 429 when limit exceeded
        },
    )

    def call_once(i):
        try:
            resp = requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"p{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=3,
            )
            return resp.status_code
        except Exception:
            # Sentinel for transport-level failures
            return 599

    # Fire a burst of concurrent requests
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        results = list(pool.map(call_once, range(16)))

    # Expect some to succeed and some to be rate limited (429)
    assert any(code == 200 for code in results)
    assert any(code == 429 for code in results)
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_rate_limit_queue_and_timeout(router_manager, mock_workers):
    """Queued requests time out with 408; queue overflow is rejected with 429.

    Note: the original body re-imported ``concurrent.futures`` inside the
    function; the module already imports it at the top, so the redundant
    local import has been removed.
    """
    # Slow backend: ~2s per request ensures queue wait > timeout
    _, urls, _ = mock_workers(n=1, args=["--latency-ms", "2000"])  # 2.0s per request

    # Allow 1 concurrent, queue up to 1, with 1s queue timeout
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="round_robin",
        extra={
            "max_concurrent_requests": 1,
            "queue_size": 1,
            "queue_timeout_secs": 1,
        },
    )

    def call_once(i):
        try:
            r = requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"q{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=5,
            )
            return r.status_code
        except Exception:
            # Sentinel for transport-level failures
            return 599

    # Fire 4 concurrent requests: 1 runs (~2s), 1 queued (times out at 1s -> 408),
    # 2 overflow -> 429
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
        results = list(ex.map(call_once, range(4)))

    # We expect:
    # - Some 200s (processed)
    # - At least one 408 (queued too long and timed out)
    # - Remaining non-200s are either 429 (queue overflow) or additional 408s
    #   depending on scheduling
    assert any(code == 200 for code in results)
    assert any(code == 408 for code in results), results
    non200 = [c for c in results if c != 200]
    assert len(non200) >= 2 and all(c in (408, 429) for c in non200), results
|
||||
65
sgl-router/py_test/integration/test_retries.py
Normal file
65
sgl-router/py_test/integration/test_retries.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import concurrent.futures
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_retry_reroutes_to_healthy_worker(router_manager, mock_workers):
    """A request first hitting the failing worker is retried onto a healthy one."""
    # Worker A always returns 500; workers B and C are healthy
    _, [url_a], [id_a] = mock_workers(n=1, args=["--status-code", "500"])  # fail
    _, [url_b], [id_b] = mock_workers(n=1)
    _, [url_c], [id_c] = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=[url_a, url_b, url_c],
        policy="round_robin",
        extra={
            "retry_max_retries": 3,
            "retry_initial_backoff_ms": 10,
            "retry_max_backoff_ms": 50,
        },
    )

    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "x",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 200
    served_by = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
    # Round-robin starts at A (which fails); the retry should land on B next
    assert served_by == id_b
    # mock_workers fixture handles cleanup
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_disable_retries_surfaces_failure(router_manager, mock_workers):
    """With retries disabled, a failing worker's 500 surfaces to the client."""
    # Single failing worker, retries disabled -> should return 500
    _, [url], [wid] = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    rh = router_manager.start_router(
        worker_urls=[url],
        policy="round_robin",
        extra={
            "disable_retries": True,
        },
    )

    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "x",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 500
    # mock_workers fixture handles cleanup
|
||||
@@ -0,0 +1,36 @@
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_discovery_shim_add_remove(router_manager, mock_workers):
    """Workers can be added and removed at runtime via the management API."""
    # Start router without workers
    rh = router_manager.start_router(worker_urls=[], policy="round_robin")

    # Initially empty
    assert router_manager.list_workers(rh.url) == []

    # Add a worker (simulate discovery event)
    _, [wurl], _ = mock_workers(n=1)
    router_manager.add_worker(rh.url, wurl)
    assert wurl in router_manager.list_workers(rh.url)

    # The newly added worker can serve a request
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "hi",
            "max_tokens": 1,
            "stream": False,
        },
    )
    assert resp.status_code == 200

    # Remove worker (simulate pod deletion)
    router_manager.remove_worker(rh.url, wurl)
    assert wurl not in router_manager.list_workers(rh.url)
    # mock_workers fixture handles cleanup
|
||||
61
sgl-router/py_test/integration/test_worker_management.py
Normal file
61
sgl-router/py_test/integration/test_worker_management.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import collections
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_add_and_remove_worker(mock_worker, router_manager, mock_workers):
    """Added workers start receiving traffic; removed workers stop receiving it."""
    # Start with a single worker
    proc1, url1, id1 = mock_worker
    rh = router_manager.start_router(worker_urls=[url1], policy="round_robin")

    # Add a second worker at runtime
    _, urls2, ids2 = mock_workers(n=1)
    url2, id2 = urls2[0], ids2[0]
    router_manager.add_worker(rh.url, url2)

    def _post(session, prompt):
        resp = session.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": prompt,
                "max_tokens": 1,
                "stream": False,
            },
        )
        assert resp.status_code == 200
        return resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")

    # Send some requests and ensure both workers are seen
    seen = set()
    with requests.Session() as session:
        for i in range(20):
            seen.add(_post(session, f"x{i}"))
            if len(seen) == 2:
                break

    assert id1 in seen and id2 in seen

    # Now remove the second worker
    router_manager.remove_worker(rh.url, url2)

    # After removal, subsequent requests should only come from the first worker
    with requests.Session() as session:
        for i in range(10):
            assert _post(session, f"y{i}") == id1
    # mock_workers fixture handles cleanup
|
||||
Reference in New Issue
Block a user