feat: add priority-based scheduling with priority-based request acceptance and preemption (#8746)
This commit is contained in:
@@ -172,11 +172,14 @@ class ServerArgs:
|
||||
# Memory and scheduling
|
||||
mem_fraction_static: Optional[float] = None
|
||||
max_running_requests: Optional[int] = None
|
||||
max_queued_requests: Optional[int] = sys.maxsize
|
||||
max_queued_requests: Optional[int] = None
|
||||
max_total_tokens: Optional[int] = None
|
||||
chunked_prefill_size: Optional[int] = None
|
||||
max_prefill_tokens: int = 16384
|
||||
schedule_policy: str = "fcfs"
|
||||
enable_priority_scheduling: bool = False
|
||||
schedule_low_priority_values_first: bool = False
|
||||
priority_scheduling_preemption_threshold: int = 10
|
||||
schedule_conservativeness: float = 1.0
|
||||
page_size: Optional[int] = None
|
||||
hybrid_kvcache_ratio: Optional[float] = None
|
||||
@@ -1166,6 +1169,24 @@ class ServerArgs:
|
||||
choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
|
||||
help="The scheduling policy of the requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-priority-scheduling",
|
||||
action="store_true",
|
||||
default=ServerArgs.enable_priority_scheduling,
|
||||
help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--schedule-low-priority-values-first",
|
||||
action="store_true",
|
||||
default=ServerArgs.schedule_low_priority_values_first,
|
||||
help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--priority-scheduling-preemption-threshold",
|
||||
type=int,
|
||||
default=ServerArgs.priority_scheduling_preemption_threshold,
|
||||
help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--schedule-conservativeness",
|
||||
type=float,
|
||||
@@ -2455,6 +2476,13 @@ class ServerArgs:
|
||||
"--generation-tokens-buckets", self.generation_tokens_buckets
|
||||
)
|
||||
|
||||
# Check scheduling policy
|
||||
if self.enable_priority_scheduling:
|
||||
assert self.schedule_policy in [
|
||||
"fcfs",
|
||||
"lof",
|
||||
], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
|
||||
|
||||
def check_lora_server_args(self):
|
||||
assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user