From f4aa78801e683dbd71a13e6360d44b2ac3748930 Mon Sep 17 00:00:00 2001
From: Jonah Bernard <96398205+Jonahcb@users.noreply.github.com>
Date: Mon, 13 Oct 2025 14:08:48 -0400
Subject: [PATCH] [router] Add Rust CLI flags for queue size, timeout, and rate
 limit for token bucket rate limiter (#11483)

Co-authored-by: Simo Lin <linsimo.mark@gmail.com>
---
 sgl-router/README.md                |  5 +++++
 sgl-router/src/config/validation.rs | 18 ++++++++++++++++++
 sgl-router/src/main.rs              | 15 ++++++++++++---
 3 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/sgl-router/README.md b/sgl-router/README.md
index ead374e4a..eaf58187e 100644
--- a/sgl-router/README.md
+++ b/sgl-router/README.md
@@ -426,6 +426,11 @@ curl -X POST http://localhost:8080/add_worker?url=http://worker3:8000&api_key=wo
 #### Authentication
 - `--api-key`: API key for router authentication (clients must provide this as Bearer token)
 
+#### Concurrency and Rate Limiting
+- `--queue-size`: Size of the pending-request queue when concurrency limits are reached (default: 100; set to 0 to disable queuing)
+- `--queue-timeout-secs`: Maximum time, in seconds, a request may wait in the queue before timing out (default: 60; must be > 0 when queuing is enabled, i.e. when `--queue-size` > 0)
+- `--rate-limit-tokens-per-second`: Override the token bucket refill rate used for rate limiting (must be > 0 when specified; defaults to `--max-concurrent-requests` when omitted)
+
 ## Development
 
 ### Build Process
diff --git a/sgl-router/src/config/validation.rs b/sgl-router/src/config/validation.rs
index ba2f71984..97f825f5a 100644
--- a/sgl-router/src/config/validation.rs
+++ b/sgl-router/src/config/validation.rs
@@ -205,6 +205,24 @@ impl ConfigValidator {
             });
         }
 
+        if config.queue_size > 0 && config.queue_timeout_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "queue_timeout_secs".to_string(),
+                value: config.queue_timeout_secs.to_string(),
+                reason: "Must be > 0 when queue_size > 0".to_string(),
+            });
+        }
+
+        if let Some(tokens_per_second) = config.rate_limit_tokens_per_second {
+            if tokens_per_second <= 0 {
+                return Err(ConfigError::InvalidValue {
+                    field: "rate_limit_tokens_per_second".to_string(),
+                    value: tokens_per_second.to_string(),
+                    reason: "Must be > 0 when specified".to_string(),
+                });
+            }
+        }
+
         if config.worker_startup_timeout_secs == 0 {
             return Err(ConfigError::InvalidValue {
                 field: "worker_startup_timeout_secs".to_string(),
diff --git a/sgl-router/src/main.rs b/sgl-router/src/main.rs
index 95910046d..ba1dc0a4b 100644
--- a/sgl-router/src/main.rs
+++ b/sgl-router/src/main.rs
@@ -195,6 +195,15 @@ struct CliArgs {
     #[arg(long, default_value_t = -1)]
     max_concurrent_requests: i32,
 
+    #[arg(long, default_value_t = 100)]
+    queue_size: usize,
+
+    #[arg(long, default_value_t = 60)]
+    queue_timeout_secs: u64,
+
+    #[arg(long)]
+    rate_limit_tokens_per_second: Option<i32>,
+
     #[arg(long, num_args = 0..)]
     cors_allowed_origins: Vec<String>,
 
@@ -535,8 +544,8 @@ impl CliArgs {
                 Some(self.request_id_headers.clone())
             },
             max_concurrent_requests: self.max_concurrent_requests,
-            queue_size: 100,
-            queue_timeout_secs: 60,
+            queue_size: self.queue_size,
+            queue_timeout_secs: self.queue_timeout_secs,
             cors_allowed_origins: self.cors_allowed_origins.clone(),
             retry: RetryConfig {
                 max_retries: self.retry_max_retries,
@@ -561,7 +570,7 @@ impl CliArgs {
                 endpoint: self.health_check_endpoint.clone(),
             },
             enable_igw: self.enable_igw,
-            rate_limit_tokens_per_second: None,
+            rate_limit_tokens_per_second: self.rate_limit_tokens_per_second,
             model_path: self.model_path.clone(),
             tokenizer_path: self.tokenizer_path.clone(),
             chat_template: self.chat_template.clone(),