diff --git a/benchmark/latency_throughput/bench_serving.py b/benchmark/latency_throughput/bench_serving.py
index 8566420ed..74fafc949 100644
--- a/benchmark/latency_throughput/bench_serving.py
+++ b/benchmark/latency_throughput/bench_serving.py
@@ -248,7 +248,7 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)
     np.random.seed(args.seed)
 
-    api_url = f"http://{args.host}:{args.port}/generate"
+    api_url = f"{args.host}:{args.port}/generate"
 
     if args.tokenizer.endswith(".json") or args.tokenizer.endswith(".model"):
         from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -333,7 +333,7 @@ if __name__ == "__main__":
         default="srt",
         choices=["vllm", "tgi", "srt", "lightllm", "ginfer"],
     )
-    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--host", type=str, default="http://localhost")
    parser.add_argument("--port", type=int, default=30000)
     parser.add_argument("--dataset", type=str, help="Path to the dataset.")
     parser.add_argument("--input-len", type=int, default=2048)
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index c7b73de80..e5f61b9a1 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -5,6 +5,9 @@ Benchmark online serving.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """
 
 import argparse
diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index baf509877..b04f0aa2d 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -116,13 +116,9 @@ class ModelTpServer:
             f"[gpu_id={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
+            f"max_running_requests={self.max_running_requests}, "
             f"context_len={self.model_config.context_len}"
         )
-        if self.tp_rank == 0:
-            logger.info(
-                f"[gpu_id={self.gpu_id}] "
-                f"server_args: {server_args.print_mode_args()}"
-            )
 
         # Init cache
         self.tree_cache = RadixCache(