From f4aa78801e683dbd71a13e6360d44b2ac3748930 Mon Sep 17 00:00:00 2001 From: Jonah Bernard <96398205+Jonahcb@users.noreply.github.com> Date: Mon, 13 Oct 2025 14:08:48 -0400 Subject: [PATCH] [router] Add Rust CLI flags for queue size, timeout, and rate limit for token bucket rate limiter (#11483) Co-authored-by: Simo Lin --- sgl-router/README.md | 5 +++++ sgl-router/src/config/validation.rs | 18 ++++++++++++++++++ sgl-router/src/main.rs | 15 ++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/sgl-router/README.md b/sgl-router/README.md index ead374e4a..eaf58187e 100644 --- a/sgl-router/README.md +++ b/sgl-router/README.md @@ -426,6 +426,11 @@ curl -X POST http://localhost:8080/add_worker?url=http://worker3:8000&api_key=wo #### Authentication - `--api-key`: API key for router authentication (clients must provide this as Bearer token) +#### Concurrency and Rate Limiting +- `--queue-size`: Size of the pending-request queue when concurrency limits are reached (default: 100; set to 0 to disable queuing) +- `--queue-timeout-secs`: Maximum time a request may wait in the queue before timing out (default: 60; must be > 0 when queue is enabled) +- `--rate-limit-tokens-per-second`: Override token bucket refill rate for rate limiting (defaults to `--max-concurrent-requests` when omitted) + ## Development ### Build Process diff --git a/sgl-router/src/config/validation.rs b/sgl-router/src/config/validation.rs index ba2f71984..97f825f5a 100644 --- a/sgl-router/src/config/validation.rs +++ b/sgl-router/src/config/validation.rs @@ -205,6 +205,24 @@ impl ConfigValidator { }); } + if config.queue_size > 0 && config.queue_timeout_secs == 0 { + return Err(ConfigError::InvalidValue { + field: "queue_timeout_secs".to_string(), + value: config.queue_timeout_secs.to_string(), + reason: "Must be > 0 when queue_size > 0".to_string(), + }); + } + + if let Some(tokens_per_second) = config.rate_limit_tokens_per_second { + if tokens_per_second <= 0 { + return Err(ConfigError::InvalidValue { + field: "rate_limit_tokens_per_second".to_string(), + value: tokens_per_second.to_string(), + reason: "Must be > 0 when specified".to_string(), + }); + } + } + if config.worker_startup_timeout_secs == 0 { return Err(ConfigError::InvalidValue { field: "worker_startup_timeout_secs".to_string(), diff --git a/sgl-router/src/main.rs b/sgl-router/src/main.rs index 95910046d..ba1dc0a4b 100644 --- a/sgl-router/src/main.rs +++ b/sgl-router/src/main.rs @@ -195,6 +195,15 @@ struct CliArgs { #[arg(long, default_value_t = -1)] max_concurrent_requests: i32, + #[arg(long, default_value_t = 100)] + queue_size: usize, + + #[arg(long, default_value_t = 60)] + queue_timeout_secs: u64, + + #[arg(long)] + rate_limit_tokens_per_second: Option, + #[arg(long, num_args = 0..)] cors_allowed_origins: Vec, @@ -535,8 +544,8 @@ impl CliArgs { Some(self.request_id_headers.clone()) }, max_concurrent_requests: self.max_concurrent_requests, - queue_size: 100, - queue_timeout_secs: 60, + queue_size: self.queue_size, + queue_timeout_secs: self.queue_timeout_secs, cors_allowed_origins: self.cors_allowed_origins.clone(), retry: RetryConfig { max_retries: self.retry_max_retries, @@ -561,7 +570,7 @@ impl CliArgs { endpoint: self.health_check_endpoint.clone(), }, enable_igw: self.enable_igw, - rate_limit_tokens_per_second: None, + rate_limit_tokens_per_second: self.rate_limit_tokens_per_second, model_path: self.model_path.clone(), tokenizer_path: self.tokenizer_path.clone(), chat_template: self.chat_template.clone(),