feat: throttle requests at scheduler based on --max-queued-requests (#7565)

This commit is contained in:
harrisonlimh
2025-07-28 07:32:33 -07:00
committed by GitHub
parent b582159246
commit 747dd45077
10 changed files with 218 additions and 6 deletions

View File

@@ -19,6 +19,7 @@ import json
import logging
import os
import random
import sys
import tempfile
from typing import List, Literal, Optional, Union
@@ -74,6 +75,7 @@ class ServerArgs:
# Memory and scheduling
mem_fraction_static: Optional[float] = None
max_running_requests: Optional[int] = None
# Cap on the number of queued requests; exposed on the CLI as
# --max-queued-requests. The default is sys.maxsize (presumably meaning
# "effectively unlimited" -- confirm against the scheduler). Per the CLI
# help text, this option is ignored in disaggregation mode.
max_queued_requests: Optional[int] = sys.maxsize
max_total_tokens: Optional[int] = None
chunked_prefill_size: Optional[int] = None
max_prefill_tokens: int = 16384
@@ -805,6 +807,12 @@ class ServerArgs:
default=ServerArgs.max_running_requests,
help="The maximum number of running requests.",
)
# Register the --max-queued-requests CLI flag as an integer option.
# The default is read from the ServerArgs.max_queued_requests class
# attribute (sys.maxsize in the field definition), so the CLI default and
# the field default stay in sync.
# NOTE(review): the flag is spelled with hyphens; argparse derives the
# destination name max_queued_requests from it.
parser.add_argument(
"--max-queued-requests",
type=int,
default=ServerArgs.max_queued_requests,
help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
)
parser.add_argument(
"--max-total-tokens",
type=int,