feat: add priority-based scheduling with priority-based request acceptance and preemption (#8746)

This commit is contained in:
harrisonlimh
2025-09-16 17:10:10 -07:00
committed by GitHub
parent f949ad5794
commit 14fdd52740
16 changed files with 822 additions and 71 deletions

View File

@@ -172,11 +172,14 @@ class ServerArgs:
# Memory and scheduling
mem_fraction_static: Optional[float] = None
max_running_requests: Optional[int] = None
max_queued_requests: Optional[int] = sys.maxsize
max_queued_requests: Optional[int] = None
max_total_tokens: Optional[int] = None
chunked_prefill_size: Optional[int] = None
max_prefill_tokens: int = 16384
schedule_policy: str = "fcfs"
enable_priority_scheduling: bool = False
schedule_low_priority_values_first: bool = False
priority_scheduling_preemption_threshold: int = 10
schedule_conservativeness: float = 1.0
page_size: Optional[int] = None
hybrid_kvcache_ratio: Optional[float] = None
@@ -1166,6 +1169,24 @@ class ServerArgs:
choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
help="The scheduling policy of the requests.",
)
parser.add_argument(
"--enable-priority-scheduling",
action="store_true",
default=ServerArgs.enable_priority_scheduling,
help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
)
parser.add_argument(
"--schedule-low-priority-values-first",
action="store_true",
default=ServerArgs.schedule_low_priority_values_first,
help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
)
parser.add_argument(
"--priority-scheduling-preemption-threshold",
type=int,
default=ServerArgs.priority_scheduling_preemption_threshold,
help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
)
parser.add_argument(
"--schedule-conservativeness",
type=float,
@@ -2455,6 +2476,13 @@ class ServerArgs:
"--generation-tokens-buckets", self.generation_tokens_buckets
)
# Check scheduling policy
if self.enable_priority_scheduling:
assert self.schedule_policy in [
"fcfs",
"lof",
], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
def check_lora_server_args(self):
assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"