Chunked prefill support (#797)

2024-07-29 02:21:50 -07:00
parent 8f6274c82b
commit 2ec39ab712
5 changed files with 160 additions and 54 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -65,6 +65,9 @@ class ServerArgs:
    dp_size: int = 1
    load_balance_method: str = "round_robin"

+    # Chunked Prefill
+    chunked_prefill_size: Optional[int] = None
+
    # Optimization/debug options
    disable_flashinfer: bool = False
    disable_flashinfer_sampling: bool = False
@@ -83,6 +86,8 @@ class ServerArgs:
    node_rank: Optional[int] = None

    def __post_init__(self):
+        if self.chunked_prefill_size is None:
+            self.chunked_prefill_size = int(10**9)
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path
        if self.mem_fraction_static is None:
@@ -223,7 +228,7 @@ class ServerArgs:
        parser.add_argument(
            "--max-num-reqs",
            type=int,
-            default=None,
+            default=ServerArgs.max_num_reqs,
            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
        )
        parser.add_argument(
@@ -311,10 +316,18 @@ class ServerArgs:
            help="The nccl init address of multi-node server.",
        )
        parser.add_argument(
-            "--nnodes", type=int, default=1, help="The number of nodes."
+            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
        )
        parser.add_argument("--node-rank", type=int, help="The node rank.")

+        # Chunked prefill
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The size of the chunked prefill.",
+        )
+
        # Optimization/debug options
        parser.add_argument(
            "--disable-flashinfer",
@@ -393,6 +406,10 @@ class ServerArgs:
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"

+        assert not (
+            self.chunked_prefill_size is not None and self.disable_radix_cache
+        ), "chunked prefill is not supported with radix cache disabled currently"
+

@dataclasses.dataclass
 class PortArgs: