[router] add token bucket rate limiter (#9656)

This commit is contained in:
Chang Su
2025-08-26 10:36:26 -07:00
committed by GitHub
parent 3578eb1e9b
commit 90313fb09a
15 changed files with 533 additions and 10 deletions

View File

@@ -72,6 +72,12 @@ class RouterArgs:
request_timeout_secs: int = 1800
# Max concurrent requests for rate limiting
max_concurrent_requests: int = 256
# Queue size for pending requests when max concurrent limit reached
queue_size: int = 100
# Maximum time (in seconds) a request can wait in queue before timing out
queue_timeout_secs: int = 60
# Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests
rate_limit_tokens_per_second: Optional[int] = None
# CORS allowed origins
cors_allowed_origins: List[str] = dataclasses.field(default_factory=list)
# Retry configuration
@@ -402,6 +408,24 @@ class RouterArgs:
default=RouterArgs.max_concurrent_requests,
help="Maximum number of concurrent requests allowed (for rate limiting)",
)
parser.add_argument(
f"--{prefix}queue-size",
type=int,
default=RouterArgs.queue_size,
help="Queue size for pending requests when max concurrent limit reached (0 = no queue, return 429 immediately)",
)
parser.add_argument(
f"--{prefix}queue-timeout-secs",
type=int,
default=RouterArgs.queue_timeout_secs,
help="Maximum time (in seconds) a request can wait in queue before timing out",
)
parser.add_argument(
f"--{prefix}rate-limit-tokens-per-second",
type=int,
default=RouterArgs.rate_limit_tokens_per_second,
help="Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests",
)
parser.add_argument(
f"--{prefix}cors-allowed-origins",
type=str,
@@ -478,6 +502,21 @@ class RouterArgs:
f"{prefix}max_concurrent_requests",
RouterArgs.max_concurrent_requests,
),
queue_size=getattr(
args,
f"{prefix}queue_size",
RouterArgs.queue_size,
),
queue_timeout_secs=getattr(
args,
f"{prefix}queue_timeout_secs",
RouterArgs.queue_timeout_secs,
),
rate_limit_tokens_per_second=getattr(
args,
f"{prefix}rate_limit_tokens_per_second",
RouterArgs.rate_limit_tokens_per_second,
),
cors_allowed_origins=getattr(args, f"{prefix}cors_allowed_origins", []),
retry_max_retries=getattr(args, f"{prefix}retry_max_retries"),
retry_initial_backoff_ms=getattr(args, f"{prefix}retry_initial_backoff_ms"),
@@ -700,6 +739,9 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
),
request_id_headers=router_args.request_id_headers,
max_concurrent_requests=router_args.max_concurrent_requests,
queue_size=router_args.queue_size,
queue_timeout_secs=router_args.queue_timeout_secs,
rate_limit_tokens_per_second=router_args.rate_limit_tokens_per_second,
cors_allowed_origins=router_args.cors_allowed_origins,
retry_max_retries=router_args.retry_max_retries,
retry_initial_backoff_ms=router_args.retry_initial_backoff_ms,