Clean up readme and arguments of chunked prefill (#1022)

Author: Lianmin Zheng
Date: 2024-08-11 01:18:52 -07:00
Committed by: GitHub
Parent: 33d61356b8
Commit: a97df79124
3 changed files with 30 additions and 34 deletions

View File

@@ -118,11 +118,7 @@ class ModelTpServer:
             trust_remote_code=server_args.trust_remote_code,
         )
         self.max_total_num_tokens = self.model_runner.max_total_num_tokens
-        self.max_prefill_tokens = (
-            16384
-            if server_args.max_prefill_tokens is None
-            else server_args.max_prefill_tokens
-        )
+        self.max_prefill_tokens = server_args.max_prefill_tokens
         self.max_running_requests = min(
             (
                 self.max_total_num_tokens // 2
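
With the default moved onto the ServerArgs dataclass (next file), the server no longer needs an inline None fallback. A minimal sketch of the pattern, using stand-in names rather than the real sglang classes:

from dataclasses import dataclass

@dataclass
class Args:
    # The default now lives on the dataclass field itself.
    max_prefill_tokens: int = 16384

class Server:
    def __init__(self, args: Args):
        # No "16384 if args.max_prefill_tokens is None else ..." dance needed.
        self.max_prefill_tokens = args.max_prefill_tokens

assert Server(Args()).max_prefill_tokens == 16384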

View File

@@ -43,10 +43,11 @@ class ServerArgs:
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-    max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
     max_total_tokens: Optional[int] = None
+    chunked_prefill_size: int = -1
+    max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -69,9 +70,6 @@
     dp_size: int = 1
     load_balance_method: str = "round_robin"
-    # Chunked Prefill
-    chunked_prefill_size: Optional[int] = None
-
     # Optimization/debug options
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
@@ -97,6 +95,10 @@
         if self.served_model_name is None:
             self.served_model_name = self.model_path
+        if self.chunked_prefill_size <= 0:
+            # Disable chunked prefill
+            self.chunked_prefill_size = None
+
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
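
The new __post_init__ branch turns the -1 CLI sentinel into None, so downstream code checks "is None" instead of comparing against magic values. A self-contained sketch of that convention (stand-in class, not the sglang source):

from dataclasses import dataclass

@dataclass
class Cfg:
    # -1 (or any value <= 0) means "chunked prefill disabled".
    chunked_prefill_size: int = -1

    def __post_init__(self):
        if self.chunked_prefill_size <= 0:
            # Normalize the sentinel once so consumers can test "is None".
            self.chunked_prefill_size = None

assert Cfg().chunked_prefill_size is None
assert Cfg(chunked_prefill_size=8192).chunked_prefill_size == 8192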
@@ -108,6 +110,7 @@
                 self.mem_fraction_static = 0.87
             else:
                 self.mem_fraction_static = 0.88
+
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
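
The trailing context also shows additional_ports being normalized, though the elif body is cut off in this hunk. A sketch of the visible pattern, with an assumed empty-list default for the None branch:

def normalize_additional_ports(additional_ports):
    # A bare int becomes a one-element list.
    if isinstance(additional_ports, int):
        return [additional_ports]
    # Assumption: the real elif-None branch (truncated above) fills in a default.
    if additional_ports is None:
        return []
    return additional_ports

assert normalize_additional_ports(30001) == [30001]
assert normalize_additional_ports(None) == []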
@@ -232,12 +235,6 @@
             default=ServerArgs.mem_fraction_static,
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
-        parser.add_argument(
-            "--max-prefill-tokens",
-            type=int,
-            default=ServerArgs.max_prefill_tokens,
-            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
-        )
         parser.add_argument(
             "--max-running-requests",
             type=int,
@@ -256,6 +253,18 @@
             default=ServerArgs.max_total_tokens,
             help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
         )
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+        )
+        parser.add_argument(
+            "--max-prefill-tokens",
+            type=int,
+            default=ServerArgs.max_prefill_tokens,
+            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
+        )
         parser.add_argument(
             "--schedule-policy",
             type=str,
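
The two flags now sit side by side in the memory-and-scheduling group. A quick standalone check of how they parse; a hedged reconstruction whose defaults mirror the dataclass fields above, not an import of the real parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--chunked-prefill-size", type=int, default=-1,
    help="Max tokens per chunk for chunked prefill; -1 disables it.",
)
parser.add_argument(
    "--max-prefill-tokens", type=int, default=16384,
    help="Max tokens in a prefill batch.",
)

args = parser.parse_args(["--chunked-prefill-size", "4096"])
assert args.chunked_prefill_size == 4096
assert args.max_prefill_tokens == 16384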
@@ -353,14 +362,6 @@
         )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
-        # Chunked prefill
-        parser.add_argument(
-            "--chunked-prefill-size",
-            type=int,
-            default=ServerArgs.chunked_prefill_size,
-            help="The size of the chunked prefill.",
-        )
-
         # Optimization/debug options
         parser.add_argument(
             "--disable-flashinfer",