Add --max-total-tokens (#840)

2024-07-30 13:33:55 -07:00
parent 1edd4e07d6
commit 6b0f2e9088
2 changed files with 24 additions and 2 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -44,6 +44,7 @@ class ServerArgs:
    max_prefill_tokens: Optional[int] = None
    max_running_requests: Optional[int] = None
    max_num_reqs: Optional[int] = None
+    max_total_tokens: Optional[int] = None
    schedule_policy: str = "lpm"
    schedule_conservativeness: float = 1.0

@@ -231,6 +232,12 @@ class ServerArgs:
            default=ServerArgs.max_num_reqs,
            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
        )
+        parser.add_argument(
+            "--max-total-tokens",
+            type=int,
+            default=ServerArgs.max_total_tokens,
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+        )
        parser.add_argument(
            "--schedule-policy",
            type=str,