Remove prefill-only-one-req (#4117)

2025-03-05 20:58:48 -08:00
parent 718c391fd7
commit 286e6540a6
3 changed files with 4 additions and 17 deletions
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -274,10 +274,8 @@ class Scheduler:
                target_worker=self.tp_worker,
                dp_rank=dp_rank,
            )
-            self.prefill_only_one_req = True
        else:
            self.draft_worker = None
-            self.prefill_only_one_req = False

        # Get token and memory info from the model worker
        (
@@ -1077,8 +1075,6 @@ class Scheduler:
                    else:
                        self.batch_is_full = True
                break
-            if self.prefill_only_one_req:
-                break

        # Update waiting queue
        can_run_list: List[Req] = adder.can_run_list
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -71,7 +71,6 @@ class ServerArgs:
    schedule_policy: str = "fcfs"
    schedule_conservativeness: float = 1.0
    cpu_offload_gb: int = 0
-    prefill_only_one_req: bool = False

    # Other runtime options
    tp_size: int = 1
@@ -277,19 +276,17 @@ class ServerArgs:
            self.speculative_algorithm = "EAGLE"

        if self.speculative_algorithm == "EAGLE":
-            self.disable_overlap_schedule = True
-            self.prefill_only_one_req = True
-            self.disable_cuda_graph_padding = True
            if self.max_running_requests is None:
                self.max_running_requests = 32
+            self.disable_overlap_schedule = True
+            self.disable_cuda_graph_padding = True
            logger.info(
                "Overlap scheduler are disabled because of using "
                "eagle speculative decoding."
-                "Max running request set to 32 because of using eagle speculative decoding."
            )
            # The token generated from the verify step is counted.
            # If speculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
-            assert self.speculative_num_steps < self.speculative_num_draft_tokens
+            # assert self.speculative_num_steps < self.speculative_num_draft_tokens

        # GGUF
        if (
@@ -509,12 +506,6 @@ class ServerArgs:
            default=ServerArgs.cpu_offload_gb,
            help="How many GBs of RAM to reserve for CPU offloading",
        )
-        parser.add_argument(
-            "--prefill-only-one-req",
-            type=bool,
-            help="If true, we only prefill one request at one prefill batch",
-            default=ServerArgs.prefill_only_one_req,
-        )

        # Other runtime options
        parser.add_argument(