[router] Introduce router integration tests (#10086)

This commit is contained in:
Keyang Ru
2025-09-05 18:52:53 -07:00
committed by GitHub
parent db37422c92
commit 21b9a4b435
23 changed files with 1417 additions and 2 deletions

View File

@@ -0,0 +1 @@
"""Integration test package for the router."""

View File

@@ -0,0 +1,109 @@
import os
import subprocess
import time
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
import pytest
import requests
from ..fixtures.ports import find_free_port
from ..fixtures.router_manager import RouterManager
def pytest_configure(config):
    """Register the ``integration`` marker so pytest does not warn about it."""
    config.addinivalue_line("markers", "integration: mark as router integration test")
@pytest.fixture
def router_manager() -> Iterable[RouterManager]:
    """Yield a fresh RouterManager; every router it started is stopped on teardown."""
    manager = RouterManager()
    try:
        yield manager
    finally:
        manager.stop_all()
def _spawn_mock_worker(args: List[str]) -> Tuple[subprocess.Popen, str, str]:
    """Launch mock_worker.py on a free port and block until it reports healthy.

    Args:
        args: extra CLI flags appended to the base mock-worker command.

    Returns:
        Tuple of (process, base_url, worker_id).
    """
    repo_root = Path(__file__).resolve().parents[2]
    script = repo_root / "py_test" / "fixtures" / "mock_worker.py"
    port = find_free_port()
    worker_id = f"worker-{port}"
    cmd = [
        "python3",
        str(script),
        "--port",
        str(port),
        "--worker-id",
        worker_id,
        *args,
    ]
    proc = subprocess.Popen(cmd)
    url = f"http://127.0.0.1:{port}"
    _wait_health(url)
    return proc, url, worker_id
def _wait_health(url: str, timeout: float = 10.0):
    """Poll ``{url}/health`` until it returns 200 or *timeout* seconds elapse.

    Raises:
        TimeoutError: if the worker never becomes healthy in time.
    """
    deadline = time.time() + timeout
    with requests.Session() as session:
        while time.time() < deadline:
            try:
                if session.get(f"{url}/health", timeout=1).status_code == 200:
                    return
            except requests.RequestException:
                # Worker not accepting connections yet; keep polling.
                pass
            time.sleep(0.1)
    raise TimeoutError(f"Mock worker at {url} did not become healthy")
@pytest.fixture
def mock_worker():
    """Start a single healthy mock worker; yields (process, url, worker_id)."""
    process, url, worker_id = _spawn_mock_worker([])
    try:
        yield process, url, worker_id
    finally:
        # Terminate gracefully; escalate to SIGKILL if it does not exit in time.
        if process.poll() is None:
            process.terminate()
            try:
                process.wait(timeout=3)
            except subprocess.TimeoutExpired:
                process.kill()
@pytest.fixture
def mock_workers():
    """Factory fixture that starts N mock workers with custom CLI args.

    Usage:
        procs, urls, ids = mock_workers(n=3, args=["--latency-ms", "5"])  # same args for all

    Every process spawned through the factory is terminated (or killed) on
    fixture teardown, even across multiple factory calls.
    """
    spawned: List[subprocess.Popen] = []

    def _start(n: int, args: List[str] | None = None):
        # Each call starts a fresh batch; all batches share one cleanup list.
        extra_args = args or []
        batch_procs: List[subprocess.Popen] = []
        batch_urls: List[str] = []
        batch_ids: List[str] = []
        for _ in range(n):
            proc, url, worker_id = _spawn_mock_worker(extra_args)
            spawned.append(proc)
            batch_procs.append(proc)
            batch_urls.append(url)
            batch_ids.append(worker_id)
        return batch_procs, batch_urls, batch_ids

    try:
        yield _start
    finally:
        for proc in spawned:
            if proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=3)
                except subprocess.TimeoutExpired:
                    proc.kill()

View File

@@ -0,0 +1 @@
"""Load balancing integration tests."""

View File

@@ -0,0 +1,73 @@
import collections
import concurrent.futures
import uuid
import pytest
import requests
@pytest.mark.integration
def test_cache_aware_affinity(mock_workers, router_manager):
    """With cache-aware routing, a repeated prompt should stick to one worker."""
    # Two workers; the shared prefix should pin requests to one via the cache tree
    _, urls, ids = mock_workers(n=2)
    rh = router_manager.start_router(worker_urls=urls, policy="cache_aware")
    hits = collections.Counter()
    with requests.Session() as session:
        for _ in range(12):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": "repeated prompt for cache",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            worker = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            hits[worker] += 1
    # Expect strong skew toward one worker (tree match); majority > 80%
    assert max(hits.values()) >= 10, hits
@pytest.mark.integration
def test_cache_aware_diverse_prompts_balances(mock_workers, router_manager):
    """Diverse prompts under concurrent load should spread across workers."""
    # Add latency so concurrent requests overlap and influence load-based selection
    _, urls, ids = mock_workers(n=3, args=["--latency-ms", "30"])
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="cache_aware",
        extra={
            "cache_threshold": 0.99,
            "balance_abs_threshold": 0,
            "balance_rel_threshold": 1.0,
        },
    )

    def call(i):
        # Diverse, unrelated prompts avoid prefix matches entirely
        resp = requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": str(uuid.uuid4()),
                "max_tokens": 1,
                "stream": False,
            },
            timeout=5,
        )
        assert resp.status_code == 200
        return resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")

    hits = collections.Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        for worker in pool.map(call, range(40)):
            hits[worker] += 1
    # Expect participation of at least two workers
    assert sum(1 for v in hits.values() if v > 0) >= 2, hits

View File

@@ -0,0 +1,89 @@
import collections
import concurrent.futures
import time
import pytest
import requests
@pytest.mark.integration
def test_power_of_two_prefers_less_loaded(mock_workers, router_manager):
    """Power-of-two choice should route fewer requests to the more loaded worker.

    The router monitors /get_load and uses cached loads when choosing between
    two sampled workers, so we first prime the slow worker with load and wait
    for a monitor tick before measuring the request distribution.
    """
    # Start one slow worker (accumulates inflight) and one fast worker
    _, urls_slow, ids_slow = mock_workers(n=1, args=["--latency-ms", "200"])
    _, urls_fast, ids_fast = mock_workers(n=1, args=["--latency-ms", "0"])
    urls = urls_slow + urls_fast
    ids = ids_slow + ids_fast
    slow_url = urls_slow[0]  # bug fix: was referenced below but never assigned
    slow_id = ids_slow[0]
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="power_of_two",
        extra={"worker_startup_check_interval": 1},
    )

    # Prime: fire a burst to create measurable load on the slow worker, then
    # wait for the load monitor to observe it.
    def _prime_call(i):
        try:
            requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"warm-{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=5,
            )
        except Exception:
            # Best-effort warm-up; individual failures are irrelevant here.
            pass

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        list(ex.map(_prime_call, range(128)))
    time.sleep(2)

    # Apply direct background load on the slow worker to amplify the load diff
    def _direct_load(i):
        try:
            requests.post(
                f"{slow_url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"bg-{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=5,
            )
        except Exception:
            pass

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        list(ex.map(_direct_load, range(128)))
    time.sleep(1)

    # Measurement phase: sample 200 routed requests and count per-worker hits.
    def call(i):
        r = requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": f"p{i}",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=5,
        )
        assert r.status_code == 200
        return r.headers.get("X-Worker-Id") or r.json().get("worker_id")

    counts = collections.Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        for wid in ex.map(call, range(200)):
            counts[wid] += 1
    # Expect the slow worker (higher latency/inflight) to receive fewer requests
    fast_worker_id = next(i for i in ids if i != slow_id)
    assert counts[slow_id] < counts[fast_worker_id], counts

View File

@@ -0,0 +1,33 @@
import collections
import math
import pytest
import requests
@pytest.mark.integration
def test_random_distribution(mock_workers, router_manager):
    """Random policy should spread requests roughly evenly over four workers."""
    _, urls, ids = mock_workers(n=4)
    rh = router_manager.start_router(worker_urls=urls, policy="random")
    total = 200
    tally = collections.Counter()
    with requests.Session() as session:
        for i in range(total):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"p{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            worker = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            tally[worker] += 1
    # simple statistical tolerance: each worker should be within ±50% of mean
    expected = total / len(ids)
    for wid in ids:
        assert 0.5 * expected <= tally[wid] <= 1.5 * expected, tally

View File

@@ -0,0 +1,34 @@
import collections
import time
import pytest
import requests
@pytest.mark.integration
def test_round_robin_distribution(mock_workers, router_manager):
    """Round-robin over three workers should be near-uniform (10 ± 3 of 30)."""
    _, urls, ids = mock_workers(n=3)
    rh = router_manager.start_router(worker_urls=urls, policy="round_robin")
    tally = collections.Counter()
    with requests.Session() as session:
        for i in range(30):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"hello {i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            worker = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            assert worker in ids
            tally[worker] += 1
    # 30 requests -> ideally 10 each; allow small tolerance ±3
    for wid in ids:
        assert 7 <= tally[wid] <= 13, tally

View File

@@ -0,0 +1,38 @@
import pytest
import requests
@pytest.mark.integration
def test_router_api_key_enforcement(router_manager, mock_workers):
    """Router forwards the Authorization header untouched; backend enforces it."""
    # Backend requires an API key; the router itself adds no auth of its own
    _, urls, _ = mock_workers(
        n=1, args=["--require-api-key", "--api-key", "correct_api_key"]
    )
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="round_robin",
        extra={},
    )
    endpoint = f"{rh.url}/v1/completions"
    payload = {"model": "test-model", "prompt": "x", "max_tokens": 1, "stream": False}
    # Missing credentials -> 401
    resp = requests.post(endpoint, json=payload)
    assert resp.status_code == 401
    # Wrong credentials -> 401
    resp = requests.post(
        endpoint, json=payload, headers={"Authorization": "Bearer wrong"}
    )
    assert resp.status_code == 401
    # Correct key -> 200
    resp = requests.post(
        endpoint, json=payload, headers={"Authorization": "Bearer correct_api_key"}
    )
    assert resp.status_code == 200

View File

@@ -0,0 +1,191 @@
import time
import pytest
import requests
@pytest.mark.integration
def test_circuit_breaker_opens_and_recovers(router_manager, mock_workers):
    """Breaker opens after repeated failures and closes once the worker heals."""
    # A single worker that fails its first 3 requests, then succeeds
    _, [wurl], _ = mock_workers(n=1, args=["--fail-first-n", "3"])
    rh = router_manager.start_router(
        worker_urls=[wurl],
        policy="round_robin",
        extra={
            "cb_failure_threshold": 3,
            "cb_success_threshold": 2,
            "cb_timeout_duration_secs": 3,
            "cb_window_duration_secs": 10,
            "disable_retries": True,
        },
    )

    def post_once():
        return requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": "trigger",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=3,
        )

    # Drive requests until the router answers 503 (breaker open); any() short-circuits
    saw_503 = any(post_once().status_code == 503 for _ in range(8))
    assert saw_503, "circuit breaker did not open to return 503"
    # Wait past the breaker timeout; two consecutive successes should close it
    time.sleep(4)
    first = post_once()
    second = post_once()
    assert first.status_code == 200 and second.status_code == 200
@pytest.mark.integration
def test_circuit_breaker_half_open_failure_reopens(router_manager, mock_workers):
    """A failure during half-open probing should snap the breaker shut again."""
    _, [wurl], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    rh = router_manager.start_router(
        worker_urls=[wurl],
        policy="round_robin",
        extra={
            "cb_failure_threshold": 2,
            "cb_success_threshold": 2,
            "cb_timeout_duration_secs": 2,
            "cb_window_duration_secs": 5,
            "disable_retries": True,
        },
    )

    def post_once():
        return requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": "x",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=3,
        )

    # Drive the breaker open against the always-failing worker
    opened = any(post_once().status_code == 503 for _ in range(8))
    assert opened, "circuit breaker did not open"
    # After the timeout the breaker goes half-open: the probe reaches the
    # worker and fails (500), which re-opens the breaker (503 follows)
    time.sleep(3)
    probe = post_once()
    assert probe.status_code == 500
    followup = post_once()
    assert followup.status_code == 503
@pytest.mark.integration
def test_circuit_breaker_disable_flag(router_manager, mock_workers):
    """With the breaker disabled, backend failures pass straight through."""
    _, [wurl], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    rh = router_manager.start_router(
        worker_urls=[wurl],
        policy="round_robin",
        extra={
            "disable_circuit_breaker": True,
            "disable_retries": True,
        },
    )
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "x",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=3,
    )
    # The raw 500 surfaces instead of a breaker-generated 503
    assert resp.status_code == 500
@pytest.mark.integration
def test_circuit_breaker_per_worker_isolation(router_manager, mock_workers):
    """A breaker opened for the failing worker must not block the healthy one."""
    _, [fail_url], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    _, [ok_url], _ = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=[fail_url, ok_url],
        policy="round_robin",
        extra={
            # Two failures open the breaker; a single success closes it.
            "cb_failure_threshold": 2,
            "cb_success_threshold": 1,
            "cb_timeout_duration_secs": 2,
            "cb_window_duration_secs": 10,
            # Retries off so each 500 surfaces instead of being rerouted.
            "disable_retries": True,
        },
    )

    def post_once():
        # One non-streaming completion routed through the router.
        return requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": "y",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=3,
        )

    failures = 0  # 500s observed before the breaker is considered open
    successes_after_open = 0  # 200s served by the healthy worker afterwards
    opened = False
    for _ in range(30):
        r = post_once()
        if not opened:
            if r.status_code == 500:
                failures += 1
            if failures >= 2:
                # Threshold reached: burn two extra requests so the breaker
                # transition completes, then treat it as open from here on.
                _ = post_once()
                _ = post_once()
                opened = True
        else:
            if r.status_code == 200:
                successes_after_open += 1
            else:
                # With the failing worker's breaker open, everything should be
                # served by the healthy worker — any other status is a bug.
                assert False, f"Unexpected non-200 after CB open: {r.status_code}"
    assert opened and successes_after_open >= 5
@pytest.mark.integration
def test_circuit_breaker_with_retries(router_manager, mock_workers):
    """Retries mask the failing worker: the request succeeds via the healthy one."""
    _, [fail_url], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    _, [ok_url], _ = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=[fail_url, ok_url],
        policy="round_robin",
        extra={
            "retry_max_retries": 3,
            "retry_initial_backoff_ms": 10,
            "retry_max_backoff_ms": 50,
            "cb_failure_threshold": 2,
            "cb_success_threshold": 1,
            "cb_timeout_duration_secs": 2,
            "cb_window_duration_secs": 10,
        },
    )
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "z",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 200

View File

@@ -0,0 +1,36 @@
import concurrent.futures
import subprocess
import time
import pytest
import requests
@pytest.mark.integration
def test_worker_crash_reroute_with_retries(router_manager, mock_workers):
    """A crashing worker is transparently retried onto the healthy one."""
    # One healthy worker plus one that dies on its first request
    _, [ok_url], _ = mock_workers(n=1)
    _, [crash_url], _ = mock_workers(n=1, args=["--crash-on-request"])
    rh = router_manager.start_router(
        worker_urls=[crash_url, ok_url],
        policy="round_robin",
        extra={
            "retry_max_retries": 3,
            "retry_initial_backoff_ms": 10,
            "retry_max_backoff_ms": 50,
        },
    )
    # A single request should succeed via retry to the healthy worker
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "crash",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 200
    # mock_workers fixture handles cleanup

View File

@@ -0,0 +1,127 @@
import collections
import concurrent.futures
import subprocess
import time
import pytest
import requests
@pytest.mark.integration
def test_pd_power_of_two_decode_attribution(router_manager, mock_workers):
    """PD mode: responses are attributed to decode workers, spread over >= 2."""
    # Two prefill and three decode mock workers via the factory fixture
    _, prefill_urls_raw, prefill_ids = mock_workers(n=2)
    _, decode_urls_raw, decode_ids_list = mock_workers(n=3)
    prefill_urls = [(u, None) for u in prefill_urls_raw]
    decode_urls = list(decode_urls_raw)
    decode_ids = set(decode_ids_list)
    rh = router_manager.start_router(
        policy="power_of_two",
        pd_disaggregation=True,
        prefill_urls=prefill_urls,
        decode_urls=decode_urls,
        extra={"worker_startup_check_interval": 1},
    )
    tally = collections.Counter()
    with requests.Session() as session:
        for i in range(30):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"p{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            worker = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            # Attribution must point at a decode worker, never a prefill one
            assert worker in decode_ids
            tally[worker] += 1
    # At least two decode workers should have served traffic
    assert sum(1 for v in tally.values() if v > 0) >= 2
@pytest.mark.integration
def test_pd_power_of_two_skews_to_faster_decode(router_manager, mock_workers):
    """In PD mode, power-of-two should favor the less-loaded (fast) decode worker."""
    # Start two prefill workers (fast)
    _, prefill_urls_raw, _ = mock_workers(n=2)
    # Start two decode workers: one slow, one fast
    _, [decode_slow_url], [slow_id] = mock_workers(
        n=1, args=["--latency-ms", "300"]
    )  # slower decode
    _, [decode_fast_url], [fast_id] = mock_workers(n=1)
    decode_urls_raw = [decode_slow_url, decode_fast_url]
    # Prefill entries are (url, second-element) pairs; the second element is
    # presumably a bootstrap port, None here — TODO confirm against RouterManager.
    prefill_urls = [(u, None) for u in prefill_urls_raw]
    decode_urls = list(decode_urls_raw)
    rh = router_manager.start_router(
        policy="power_of_two",
        pd_disaggregation=True,
        prefill_urls=prefill_urls,
        decode_urls=decode_urls,
        extra={"worker_startup_check_interval": 1},
    )

    # Warm-up burst through the router so the load monitor observes traffic.
    def _prime_call(i):
        try:
            requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"warm-{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=8,
            )
        except Exception:
            # Best-effort priming; individual failures don't matter here.
            pass

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        list(ex.map(_prime_call, range(128)))
    time.sleep(2)  # allow a monitor tick to pick up the new load

    # Hit the slow decode worker directly to widen the load gap.
    def _direct_decode_load(i):
        try:
            requests.post(
                f"{decode_slow_url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"bg-{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=8,
            )
        except Exception:
            pass

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        list(ex.map(_direct_decode_load, range(128)))
    time.sleep(1)

    # Measurement phase: sample 200 routed requests, counting the decode
    # worker each response is attributed to.
    def call(i):
        r = requests.post(
            f"{rh.url}/v1/completions",
            json={
                "model": "test-model",
                "prompt": f"p{i}",
                "max_tokens": 1,
                "stream": False,
            },
            timeout=8,
        )
        assert r.status_code == 200
        return r.headers.get("X-Worker-Id") or r.json().get("worker_id")

    counts = collections.Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
        for wid in ex.map(call, range(200)):
            counts[wid] += 1
    # The slow decode worker should receive fewer requests than the fast one.
    assert counts[slow_id] < counts[fast_id], counts

View File

@@ -0,0 +1,91 @@
import concurrent.futures
import time
import pytest
import requests
@pytest.mark.integration
def test_rate_limit_and_queue(router_manager, mock_workers):
    """With no queue, bursts beyond the concurrency cap get immediate 429s."""
    # One fast backend
    _, urls, _ = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="round_robin",
        extra={
            "max_concurrent_requests": 2,
            "queue_size": 0,  # no queue -> immediate 429 when limit exceeded
        },
    )

    def call_once(i):
        # 599 stands in for a transport-level failure
        try:
            resp = requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"p{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=3,
            )
        except Exception:
            return 599
        return resp.status_code

    # Fire a burst of concurrent requests
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        statuses = list(pool.map(call_once, range(16)))
    # Expect some to succeed and some to be rate limited (429)
    assert any(code == 200 for code in statuses)
    assert any(code == 429 for code in statuses)
@pytest.mark.integration
def test_rate_limit_queue_and_timeout(router_manager, mock_workers):
    """Queued requests that exceed the queue timeout surface as 408s.

    One slow backend (~2s/request) with concurrency 1, queue size 1, and a 1s
    queue timeout: of 4 concurrent requests, one runs, one queues and times
    out, and the remainder overflow with 429.
    """
    # Slow backend: ~2s per request ensures queue wait > timeout
    _, urls, _ = mock_workers(n=1, args=["--latency-ms", "2000"])  # 2.0s per request
    # Allow 1 concurrent, queue up to 1, with 1s queue timeout
    rh = router_manager.start_router(
        worker_urls=urls,
        policy="round_robin",
        extra={
            "max_concurrent_requests": 1,
            "queue_size": 1,
            "queue_timeout_secs": 1,
        },
    )

    def call_once(i):
        # Returns the HTTP status, or 599 on a transport-level error.
        try:
            r = requests.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"q{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
                timeout=5,
            )
            return r.status_code
        except Exception:
            return 599

    # Fire 4 concurrent requests: 1 runs (~2s), 1 queued (times out at 1s -> 408),
    # 2 overflow -> 429. concurrent.futures is already imported at module scope;
    # the redundant function-local import was removed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
        results = list(ex.map(call_once, range(4)))
    # We expect:
    # - Some 200s (processed)
    # - At least one 408 (queued too long and timed out)
    # - Remaining non-200s are either 429 (queue overflow) or additional 408s depending on scheduling
    assert any(code == 200 for code in results)
    assert any(code == 408 for code in results), results
    non200 = [c for c in results if c != 200]
    assert len(non200) >= 2 and all(c in (408, 429) for c in non200), results

View File

@@ -0,0 +1,65 @@
import concurrent.futures
import subprocess
import time
import pytest
import requests
@pytest.mark.integration
def test_retry_reroutes_to_healthy_worker(router_manager, mock_workers):
    """A 500 from the first round-robin worker is retried onto a healthy one."""
    # Worker A always 500; workers B and C are healthy
    _, [url_a], [id_a] = mock_workers(n=1, args=["--status-code", "500"])
    _, [url_b], [id_b] = mock_workers(n=1)
    _, [url_c], [id_c] = mock_workers(n=1)
    rh = router_manager.start_router(
        worker_urls=[url_a, url_b, url_c],
        policy="round_robin",
        extra={
            "retry_max_retries": 3,
            "retry_initial_backoff_ms": 10,
            "retry_max_backoff_ms": 50,
        },
    )
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "x",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 200
    worker = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
    # Should have retried onto the healthy worker next in rotation
    assert worker == id_b
    # mock_workers fixture handles cleanup
@pytest.mark.integration
def test_disable_retries_surfaces_failure(router_manager, mock_workers):
    """With retries disabled, a failing worker's 500 reaches the client."""
    _, [url], [wid] = mock_workers(n=1, args=["--status-code", "500"])  # always fail
    rh = router_manager.start_router(
        worker_urls=[url],
        policy="round_robin",
        extra={
            "disable_retries": True,
        },
    )
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "x",
            "max_tokens": 1,
            "stream": False,
        },
        timeout=5,
    )
    assert resp.status_code == 500
    # mock_workers fixture handles cleanup

View File

@@ -0,0 +1,36 @@
import pytest
import requests
@pytest.mark.integration
def test_discovery_shim_add_remove(router_manager, mock_workers):
    """Workers can be added and removed at runtime, as service discovery would."""
    # Router starts with no workers
    rh = router_manager.start_router(worker_urls=[], policy="round_robin")
    assert router_manager.list_workers(rh.url) == []
    # Simulate a discovery "add" event
    _, [wurl], [wid] = mock_workers(n=1)
    router_manager.add_worker(rh.url, wurl)
    assert wurl in router_manager.list_workers(rh.url)
    # The newly added worker serves traffic
    resp = requests.post(
        f"{rh.url}/v1/completions",
        json={
            "model": "test-model",
            "prompt": "hi",
            "max_tokens": 1,
            "stream": False,
        },
    )
    assert resp.status_code == 200
    # Simulate pod deletion
    router_manager.remove_worker(rh.url, wurl)
    assert wurl not in router_manager.list_workers(rh.url)
    # mock_workers fixture handles cleanup

View File

@@ -0,0 +1,61 @@
import collections
import subprocess
import time
import pytest
import requests
@pytest.mark.integration
def test_add_and_remove_worker(mock_worker, router_manager, mock_workers):
    """Traffic reaches a newly added worker and stops once it is removed."""
    # Begin with a single worker
    proc1, url1, id1 = mock_worker
    rh = router_manager.start_router(worker_urls=[url1], policy="round_robin")
    # Register a second worker at runtime
    _, urls2, ids2 = mock_workers(n=1)
    url2, id2 = urls2[0], ids2[0]
    router_manager.add_worker(rh.url, url2)
    # Both workers should show up in the rotation
    seen = set()
    with requests.Session() as session:
        for i in range(20):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"x{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            seen.add(resp.headers.get("X-Worker-Id") or resp.json().get("worker_id"))
            if len(seen) == 2:
                break
    assert id1 in seen and id2 in seen
    # Drop the second worker; only the first should answer from now on
    router_manager.remove_worker(rh.url, url2)
    with requests.Session() as session:
        for i in range(10):
            resp = session.post(
                f"{rh.url}/v1/completions",
                json={
                    "model": "test-model",
                    "prompt": f"y{i}",
                    "max_tokens": 1,
                    "stream": False,
                },
            )
            assert resp.status_code == 200
            worker = resp.headers.get("X-Worker-Id") or resp.json().get("worker_id")
            assert worker == id1
    # mock_workers fixture handles cleanup