Simplify the event loop and expose --num-continuous-decode-steps as an argument (#1652)

This commit is contained in:
Lianmin Zheng
2024-10-12 21:35:30 -07:00
committed by GitHub
parent 9610fcd469
commit 7ee6c259ff
5 changed files with 85 additions and 62 deletions

View File

@@ -111,6 +111,7 @@ class ServerArgs:
torchao_config: str = ""
enable_p2p_check: bool = False
triton_attention_reduce_in_fp32: bool = False
# Number of continuous decoding steps to run; 1 (the default) means only one
# decoding step at a time. Exposed on the CLI as --num-continuous-decode-steps.
num_continuous_decode_steps: int = 1
def __post_init__(self):
# Set missing default values
@@ -559,6 +560,14 @@ class ServerArgs:
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
"This only affects Triton attention kernels.",
)
# Register the integer --num-continuous-decode-steps flag. The default is read
# from the ServerArgs.num_continuous_decode_steps class attribute rather than
# being repeated here as a literal.
parser.add_argument(
"--num-continuous-decode-steps",
type=int,
default=ServerArgs.num_continuous_decode_steps,
help="Run multiple continuous decoding steps to reduce scheduling overhead. "
"This can potentially increase throughput but may also increase time-to-first-token latency. "
"The default value is 1, meaning only run one decoding step at a time.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):