From 11c8efff73fb869b728fbe75aa0ecd7387f814da Mon Sep 17 00:00:00 2001
From: Ying Sheng
Date: Fri, 19 Jul 2024 11:12:23 -0700
Subject: [PATCH] Add benchmark instructions (#663)

---
 README.md                      | 17 ++++++++++++++---
 python/sglang/bench_serving.py |  6 ++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7e895b862..3e0cdffc3 100644
--- a/README.md
+++ b/README.md
@@ -162,9 +162,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - LLaVA 1.5 / 1.6
-  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
 - LLaVA-NeXT-Video
   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
@@ -178,6 +178,17 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+### Benchmark Performance
+
+- Benchmark a single static batch. Run the following command without launching a server. The arguments are the same as those for `launch_server.py`.
+  ```
+  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
+  ```
+- Benchmark online serving. Launch a server first and run the following command.
+  ```
+  python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+  ```
+
 ## Frontend: Structured Generation Language (SGLang)
 
 The frontend language can be used with local models or API models.
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index e0686c45a..870cca543 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -1,5 +1,11 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+"""
+Benchmark online serving.
+
+Usage:
+python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+"""
 import argparse
 import asyncio