Revert "Chunked prefill support" (#799)

This commit is contained in:
Ying Sheng
2024-07-29 02:38:31 -07:00
committed by GitHub
parent 2ec39ab712
commit 98111fbe3e
5 changed files with 54 additions and 160 deletions

View File

@@ -65,9 +65,6 @@ class ServerArgs:
dp_size: int = 1
load_balance_method: str = "round_robin"
# Chunked Prefill
chunked_prefill_size: Optional[int] = None
# Optimization/debug options
disable_flashinfer: bool = False
disable_flashinfer_sampling: bool = False
@@ -86,8 +83,6 @@ class ServerArgs:
node_rank: Optional[int] = None
def __post_init__(self):
if self.chunked_prefill_size is None:
self.chunked_prefill_size = int(10**9)
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
if self.mem_fraction_static is None:
@@ -228,7 +223,7 @@ class ServerArgs:
parser.add_argument(
"--max-num-reqs",
type=int,
default=ServerArgs.max_num_reqs,
default=None,
help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
)
parser.add_argument(
@@ -316,18 +311,10 @@ class ServerArgs:
help="The nccl init address of multi-node server.",
)
parser.add_argument(
"--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
"--nnodes", type=int, default=1, help="The number of nodes."
)
parser.add_argument("--node-rank", type=int, help="The node rank.")
# Chunked prefill
parser.add_argument(
"--chunked-prefill-size",
type=int,
default=ServerArgs.chunked_prefill_size,
help="The size of the chunked prefill.",
)
# Optimization/debug options
parser.add_argument(
"--disable-flashinfer",
@@ -406,10 +393,6 @@ class ServerArgs:
self.dp_size > 1 and self.node_rank is not None
), "multi-node data parallel is not supported"
assert not (
self.chunked_prefill_size is not None and self.disable_radix_cache
), "chunked prefill is not supported with radix cache disabled currently"
@dataclasses.dataclass
class PortArgs: