Simplify the event loop and expose --num-continuous-decode-steps as an argument (#1652)

2024-10-12 21:35:30 -07:00
parent 9610fcd469
commit 7ee6c259ff
5 changed files with 85 additions and 62 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -111,6 +111,7 @@ class ServerArgs:
    torchao_config: str = ""
    enable_p2p_check: bool = False
    triton_attention_reduce_in_fp32: bool = False
+    num_continuous_decode_steps: int = 1

    def __post_init__(self):
        # Set missing default values
@@ -559,6 +560,14 @@ class ServerArgs:
            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
            "This only affects Triton attention kernels.",
        )
+        parser.add_argument(
+            "--num-continuous-decode-steps",
+            type=int,
+            default=ServerArgs.num_continuous_decode_steps,
+            help="Run multiple continuous decoding steps to reduce scheduling overhead. "
+            "This can potentially increase throughput but may also increase time-to-first-token latency. "
+            "The default value is 1, meaning only run one decoding step at a time.",
+        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):