[router] regular router circuit breaker (#8997)

This commit is contained in:
Simo Lin
2025-08-10 21:19:30 -07:00
committed by GitHub
parent 6beeff41c5
commit 067068f271
22 changed files with 967 additions and 112 deletions

View File

@@ -53,6 +53,17 @@ class TestLaunchRouter(unittest.TestCase):
prefill=None,
decode=None,
worker_urls=[],
retry_max_retries=3,
retry_initial_backoff_ms=100,
retry_max_backoff_ms=10_000,
retry_backoff_multiplier=2.0,
retry_jitter_factor=0.1,
cb_failure_threshold=5,
cb_success_threshold=2,
cb_timeout_duration_secs=30,
cb_window_duration_secs=60,
disable_retries=False,
disable_circuit_breaker=False,
)
def create_router_args(self, **kwargs):

View File

@@ -31,6 +31,16 @@ def popen_launch_router(
prometheus_port: int = None,
prometheus_host: str = None,
dp_aware: bool = False,
# Router retry / circuit-breaker (CB) tuning (all optional; None means "not provided")
router_retry_max_retries: int = None,
router_retry_initial_backoff_ms: int = None,
router_retry_max_backoff_ms: int = None,
router_retry_backoff_multiplier: float = None,
router_retry_jitter_factor: float = None,
router_cb_failure_threshold: int = None,
router_cb_success_threshold: int = None,
router_cb_timeout_duration_secs: int = None,
router_cb_window_duration_secs: int = None,
):
"""
Launch the router server process.
@@ -107,6 +117,21 @@ def popen_launch_router(
if dp_aware:
command.append("--router-dp-aware")
# Append router retry/CB tuning flags if provided
def _add(flag: str, val):
    """Append ``flag`` followed by ``str(val)`` to ``command``.

    A ``val`` of ``None`` is treated as "not provided": nothing is appended.
    """
    if val is None:
        return
    # Stringify first so the flag and its value are added together.
    rendered = str(val)
    command.extend((flag, rendered))
_add("--router-retry-max-retries", router_retry_max_retries)
_add("--router-retry-initial-backoff-ms", router_retry_initial_backoff_ms)
_add("--router-retry-max-backoff-ms", router_retry_max_backoff_ms)
_add("--router-retry-backoff-multiplier", router_retry_backoff_multiplier)
_add("--router-retry-jitter-factor", router_retry_jitter_factor)
_add("--router-cb-failure-threshold", router_cb_failure_threshold)
_add("--router-cb-success-threshold", router_cb_success_threshold)
_add("--router-cb-timeout-duration-secs", router_cb_timeout_duration_secs)
_add("--router-cb-window-duration-secs", router_cb_window_duration_secs)
process = subprocess.Popen(command, stdout=None, stderr=None)
start_time = time.perf_counter()