Revert "Chunked prefill support" (#799)

2024-07-29 02:38:31 -07:00
parent 2ec39ab712
commit 98111fbe3e
5 changed files with 54 additions and 160 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -65,9 +65,6 @@ class ServerArgs:
    dp_size: int = 1
    load_balance_method: str = "round_robin"

-    # Chunked Prefill
-    chunked_prefill_size: Optional[int] = None
-
    # Optimization/debug options
    disable_flashinfer: bool = False
    disable_flashinfer_sampling: bool = False
@@ -86,8 +83,6 @@ class ServerArgs:
    node_rank: Optional[int] = None

    def __post_init__(self):
-        if self.chunked_prefill_size is None:
-            self.chunked_prefill_size = int(10**9)
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path
        if self.mem_fraction_static is None:
@@ -228,7 +223,7 @@ class ServerArgs:
        parser.add_argument(
            "--max-num-reqs",
            type=int,
-            default=ServerArgs.max_num_reqs,
+            default=None,
            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
        )
        parser.add_argument(
@@ -316,18 +311,10 @@ class ServerArgs:
            help="The nccl init address of multi-node server.",
        )
        parser.add_argument(
-            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
+            "--nnodes", type=int, default=1, help="The number of nodes."
        )
        parser.add_argument("--node-rank", type=int, help="The node rank.")

-        # Chunked prefill
-        parser.add_argument(
-            "--chunked-prefill-size",
-            type=int,
-            default=ServerArgs.chunked_prefill_size,
-            help="The size of the chunked prefill.",
-        )
-
        # Optimization/debug options
        parser.add_argument(
            "--disable-flashinfer",
@@ -406,10 +393,6 @@ class ServerArgs:
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"

-        assert not (
-            self.chunked_prefill_size is not None and self.disable_radix_cache
-        ), "chunked prefill is not supported with radix cache disabled currently"
-

@dataclasses.dataclass
 class PortArgs: