feat: add priority based scheduling with priority based request acceptance and preemption (#8746)

2025-09-16 17:10:10 -07:00
parent f949ad5794
commit 14fdd52740
16 changed files with 822 additions and 71 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -172,11 +172,14 @@ class ServerArgs:
    # Memory and scheduling
    mem_fraction_static: Optional[float] = None
    max_running_requests: Optional[int] = None
-    max_queued_requests: Optional[int] = sys.maxsize
+    max_queued_requests: Optional[int] = None
    max_total_tokens: Optional[int] = None
    chunked_prefill_size: Optional[int] = None
    max_prefill_tokens: int = 16384
    schedule_policy: str = "fcfs"
+    enable_priority_scheduling: bool = False
+    schedule_low_priority_values_first: bool = False
+    priority_scheduling_preemption_threshold: int = 10
    schedule_conservativeness: float = 1.0
    page_size: Optional[int] = None
    hybrid_kvcache_ratio: Optional[float] = None
@@ -1166,6 +1169,24 @@ class ServerArgs:
            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
            help="The scheduling policy of the requests.",
        )
+        parser.add_argument(
+            "--enable-priority-scheduling",
+            action="store_true",
+            default=ServerArgs.enable_priority_scheduling,
+            help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
+        )
+        parser.add_argument(
+            "--schedule-low-priority-values-first",
+            action="store_true",
+            default=ServerArgs.schedule_low_priority_values_first,
+            help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
+        )
+        parser.add_argument(
+            "--priority-scheduling-preemption-threshold",
+            type=int,
+            default=ServerArgs.priority_scheduling_preemption_threshold,
+            help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
+        )
        parser.add_argument(
            "--schedule-conservativeness",
            type=float,
@@ -2455,6 +2476,13 @@ class ServerArgs:
            "--generation-tokens-buckets", self.generation_tokens_buckets
        )

+        # Check scheduling policy
+        if self.enable_priority_scheduling:
+            assert self.schedule_policy in [
+                "fcfs",
+                "lof",
+            ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
+
    def check_lora_server_args(self):
        assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"