[router] add token bucket rate limiter (#9656)
This commit is contained in:
@@ -64,7 +64,10 @@ class Router:
|
||||
bootstrap_port_annotation: Kubernetes annotation name for bootstrap port (PD mode).
|
||||
Default: 'sglang.ai/bootstrap-port'
|
||||
request_timeout_secs: Request timeout in seconds. Default: 600
|
||||
max_concurrent_requests: Maximum number of concurrent requests allowed for rate limiting. Default: 64
|
||||
max_concurrent_requests: Maximum number of concurrent requests allowed for rate limiting. Default: 256
|
||||
queue_size: Queue size for pending requests when max concurrent limit reached (0 = no queue, return 429 immediately). Default: 100
|
||||
queue_timeout_secs: Maximum time (in seconds) a request can wait in queue before timing out. Default: 60
|
||||
rate_limit_tokens_per_second: Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests. Default: None
|
||||
cors_allowed_origins: List of allowed origins for CORS. Empty list allows all origins. Default: []
|
||||
health_failure_threshold: Number of consecutive health check failures before marking worker unhealthy. Default: 3
|
||||
health_success_threshold: Number of consecutive health check successes before marking worker healthy. Default: 2
|
||||
@@ -108,6 +111,9 @@ class Router:
|
||||
prefill_policy: Optional[PolicyType] = None,
|
||||
decode_policy: Optional[PolicyType] = None,
|
||||
max_concurrent_requests: int = 256,
|
||||
queue_size: int = 100,
|
||||
queue_timeout_secs: int = 60,
|
||||
rate_limit_tokens_per_second: Optional[int] = None,
|
||||
cors_allowed_origins: List[str] = None,
|
||||
retry_max_retries: int = 5,
|
||||
retry_initial_backoff_ms: int = 50,
|
||||
@@ -169,6 +175,9 @@ class Router:
|
||||
prefill_policy=prefill_policy,
|
||||
decode_policy=decode_policy,
|
||||
max_concurrent_requests=max_concurrent_requests,
|
||||
queue_size=queue_size,
|
||||
queue_timeout_secs=queue_timeout_secs,
|
||||
rate_limit_tokens_per_second=rate_limit_tokens_per_second,
|
||||
cors_allowed_origins=cors_allowed_origins,
|
||||
retry_max_retries=retry_max_retries,
|
||||
retry_initial_backoff_ms=retry_initial_backoff_ms,
|
||||
|
||||
Reference in New Issue
Block a user