Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
Lianmin Zheng
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions

python/sglang/srt/server_args.py

@@ -15,15 +15,21 @@
import argparse
import dataclasses
import json
import logging
import os
import random
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import List, Optional
import torch
from sglang.srt.hf_transformers_utils import check_gguf_file
from sglang.srt.utils import (
create_checksum,
get_amdgpu_memory_capacity,
get_hpu_memory_capacity,
get_nvgpu_memory_capacity,
@@ -43,12 +49,13 @@ class ServerArgs:
model_path: str
tokenizer_path: Optional[str] = None
tokenizer_mode: str = "auto"
skip_tokenizer_init: bool = False
load_format: str = "auto"
trust_remote_code: bool = True
trust_remote_code: bool = False
dtype: str = "auto"
kv_cache_dtype: str = "auto"
quantization_param_path: nullable_str = None
quantization: Optional[str] = None
quantization_param_path: nullable_str = None
context_length: Optional[int] = None
device: str = "cuda"
served_model_name: Optional[str] = None
@@ -67,7 +74,7 @@ class ServerArgs:
max_total_tokens: Optional[int] = None
chunked_prefill_size: Optional[int] = None
max_prefill_tokens: int = 16384
schedule_policy: str = "lpm"
schedule_policy: str = "fcfs"
schedule_conservativeness: float = 1.0
cpu_offload_gb: int = 0
prefill_only_one_req: bool = False
@@ -88,6 +95,7 @@ class ServerArgs:
log_level: str = "info"
log_level_http: Optional[str] = None
log_requests: bool = False
log_requests_level: int = 0
show_time_cost: bool = False
enable_metrics: bool = False
decode_log_interval: int = 40
@@ -123,11 +131,13 @@ class ServerArgs:
grammar_backend: Optional[str] = "outlines"
# Speculative decoding
speculative_draft_model_path: Optional[str] = None
speculative_algorithm: Optional[str] = None
speculative_draft_model_path: Optional[str] = None
speculative_num_steps: int = 5
speculative_eagle_topk: int = 8
speculative_num_draft_tokens: int = 64
speculative_eagle_topk: int = 4
speculative_num_draft_tokens: int = 8
speculative_accept_threshold_single: float = 1.0
speculative_accept_threshold_acc: float = 1.0
speculative_token_map: Optional[str] = None
# Double Sparsity
@@ -169,6 +179,12 @@ class ServerArgs:
enable_hierarchical_cache: bool = False
enable_flashinfer_mla: bool = False
flashinfer_mla_disable_ragged: bool = False
warmups: Optional[str] = None
# Debug tensor dumps
debug_tensor_dump_output_folder: Optional[str] = None
debug_tensor_dump_input_file: Optional[str] = None
debug_tensor_dump_inject: bool = False
def __post_init__(self):
# Set missing default values
@@ -266,10 +282,10 @@ class ServerArgs:
self.speculative_algorithm == "EAGLE"
or self.speculative_algorithm == "NEXTN"
):
self.disable_overlap_schedule = True
self.prefill_only_one_req = True
self.disable_cuda_graph_padding = True
self.disable_radix_cache = True
self.disable_overlap_schedule = True
self.chunked_prefill_size = -1
logger.info(
f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
@@ -377,15 +393,6 @@ class ServerArgs:
choices=["auto", "fp8_e5m2", "fp8_e4m3"],
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--quantization-param-path",
type=nullable_str,
default=None,
help="Path to the JSON file containing the KV cache "
"scaling factors. This should generally be supplied, when "
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
"default to 1.0, which may cause accuracy issues. ",
)
parser.add_argument(
"--quantization",
type=str,
@@ -404,6 +411,15 @@ class ServerArgs:
],
help="The quantization method.",
)
parser.add_argument(
"--quantization-param-path",
type=nullable_str,
default=None,
help="Path to the JSON file containing the KV cache "
"scaling factors. This should generally be supplied, when "
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
"default to 1.0, which may cause accuracy issues. ",
)
parser.add_argument(
"--context-length",
type=int,
@@ -578,7 +594,14 @@ class ServerArgs:
parser.add_argument(
"--log-requests",
action="store_true",
help="Log the inputs and outputs of all requests.",
help="Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level",
)
parser.add_argument(
"--log-requests-level",
type=int,
default=0,
help="0: Log metadata. 1. Log metadata and partial input/output. 2. Log every input/output.",
choices=[0, 1, 2],
)
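
A hedged sketch of what the three levels might gate when a request is logged (the helper name and the 2048-character cap are illustrative assumptions, not code from this commit):

def format_request_payload(payload, log_requests_level: int, max_chars: int = 2048) -> str:
    # Level 0 logs metadata only, so the payload is omitted entirely.
    if log_requests_level == 0:
        return ""
    text = str(payload)
    # Level 1 logs a truncated payload; level 2 logs everything.
    if log_requests_level == 1 and len(text) > max_chars:
        return text[:max_chars] + "..."
    return text
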
parser.add_argument(
"--show-time-cost",
@@ -742,16 +765,28 @@ class ServerArgs:
parser.add_argument(
"--speculative-eagle-topk",
type=int,
help="The number of token sampled from draft model in eagle2 each step.",
help="The number of tokens sampled from the draft model in eagle2 each step.",
choices=[1, 2, 4, 8],
default=ServerArgs.speculative_eagle_topk,
)
parser.add_argument(
"--speculative-num-draft-tokens",
type=int,
help="The number of token sampled from draft model in Speculative Decoding.",
help="The number of tokens sampled from the draft model in Speculative Decoding.",
default=ServerArgs.speculative_num_draft_tokens,
)
parser.add_argument(
"--speculative-accept-threshold-single",
type=float,
help="Accept a draft token if its probability in the target model is greater than this threshold.",
default=ServerArgs.speculative_accept_threshold_single,
)
parser.add_argument(
"--speculative-accept-threshold-acc",
type=float,
help="The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).",
default=ServerArgs.speculative_accept_threshold_acc,
)
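
To make the two thresholds concrete, here is one reading of the acceptance rule the help strings describe (a sketch derived from the text above, not the sampling kernel that implements it):

def draft_accept_probability(p_target: float,
                             threshold_single: float = 1.0,
                             threshold_acc: float = 1.0) -> float:
    # --speculative-accept-threshold-single: accept outright once the
    # target-model probability exceeds this threshold.
    if p_target > threshold_single:
        return 1.0
    # --speculative-accept-threshold-acc: raise p to min(1, p / threshold_acc),
    # as stated in the help string.
    return min(1.0, p_target / threshold_acc)

# Example: with threshold_acc=0.5, a draft token whose target probability is
# 0.3 is accepted with probability min(1, 0.3 / 0.5) = 0.6.
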
parser.add_argument(
"--speculative-token-map",
type=str,
@@ -949,6 +984,35 @@ class ServerArgs:
help="Enable hierarchical cache",
)
# Server warmups
parser.add_argument(
"--warmups",
type=str,
required=False,
help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
"will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
)
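
A minimal sketch of how the named warmup functions could be resolved (the importlib lookup is an assumption; only the CSV flag format and the warmup.py file come from the help text above):

import importlib
from typing import Optional

def run_warmups(warmups_csv: Optional[str]) -> None:
    if not warmups_csv:
        return
    warmup_module = importlib.import_module("warmup")
    for name in warmups_csv.split(","):
        # Each CSV entry names a function defined in warmup.py.
        getattr(warmup_module, name.strip())()
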
# Debug tensor dumps
parser.add_argument(
"--debug-tensor-dump-output-folder",
type=str,
default=ServerArgs.debug_tensor_dump_output_folder,
help="The output folder for dumping tensors.",
)
parser.add_argument(
"--debug-tensor-dump-input-file",
type=str,
default=ServerArgs.debug_tensor_dump_input_file,
help="The input filename for dumping tensors",
)
parser.add_argument(
"--debug-tensor-dump-inject",
type=str,
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
)
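
A hedged sketch of the dump side of these flags (the per-layer file naming and torch.save format are assumptions for illustration; this hunk does not show them):

import os
import torch

def dump_tensor(folder: str, layer_idx: int, name: str, tensor: torch.Tensor) -> None:
    # Hypothetical layout: one file per (layer, tensor-name) pair under
    # --debug-tensor-dump-output-folder.
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f"layer_{layer_idx:03d}_{name}.pt")
    torch.save(tensor.detach().cpu(), path)
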
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
args.tp_size = args.tensor_parallel_size