feat: throttle requests at scheduler based on --max-queued-requests (#7565)
This commit is contained in:
@@ -19,6 +19,7 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import tempfile
|
||||
from typing import List, Literal, Optional, Union
|
||||
|
||||
@@ -74,6 +75,7 @@ class ServerArgs:
|
||||
# Memory and scheduling
|
||||
mem_fraction_static: Optional[float] = None
|
||||
max_running_requests: Optional[int] = None
|
||||
max_queued_requests: Optional[int] = sys.maxsize
|
||||
max_total_tokens: Optional[int] = None
|
||||
chunked_prefill_size: Optional[int] = None
|
||||
max_prefill_tokens: int = 16384
|
||||
@@ -805,6 +807,12 @@ class ServerArgs:
|
||||
default=ServerArgs.max_running_requests,
|
||||
help="The maximum number of running requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-queued-requests",
|
||||
type=int,
|
||||
default=ServerArgs.max_queued_requests,
|
||||
help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-total-tokens",
|
||||
type=int,
|
||||
|
||||
Reference in New Issue
Block a user