Organize public APIs (#809)
@@ -208,11 +208,11 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory at a batch size that a real server could handle: a real server truncates the prefill into several batches/chunks, while this unit test does not.
   ```
-  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
+  python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
   ```
 - Benchmark online serving. Launch a server first, then run the following command.
   ```
-  python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+  python3 -m sglang.benchmarks.bench_serving --backend sglang --num-prompt 10
   ```

 ## Frontend: Structured Generation Language (SGLang)
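The second bullet above assumes a server is already running. For reference, a typical launch looks like the following (a sketch: the model path is illustrative, and the port matches the `--port 30000` used in the profiling notes further down):

```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```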
@@ -9,16 +9,16 @@
 # python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87

 # offline
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
-python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21

 # online
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
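In the `# online` runs above, `--request-rate` paces request submission instead of firing all prompts at once. A common way such serving benchmarks pace traffic, and a minimal sketch of the idea (not necessarily this script's exact implementation), is to draw exponential inter-arrival gaps so arrivals form a Poisson process:

```python
import random
import time

def paced_requests(prompts, request_rate: float):
    """Yield prompts with exponential gaps, i.e. Poisson arrivals at `request_rate` req/s."""
    for prompt in prompts:
        yield prompt
        if request_rate != float("inf"):
            time.sleep(random.expovariate(request_rate))  # mean gap = 1 / request_rate

for p in paced_requests(["a", "b", "c"], request_rate=2.0):
    print(time.time(), p)  # submit the request here
```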
@@ -48,23 +48,23 @@ Please ensure you have the appropriate hardware before running the benchmarks.
 #### Offline benchmark

 ```bash
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
 cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
 ```

 #### Online benchmark

 ```bash
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
 cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
 ```
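The `cat ... | cut` one-liners above pick a metric out of the JSONL records by field position, which silently breaks if the record layout changes. A sturdier sketch (the `output_throughput` key is an assumption about the record schema; print one full record to find the real key names):

```bash
python3 -c '
import json, sys
for line in open(sys.argv[1]):
    record = json.loads(line)
    # key name is illustrative; adjust to whatever the file actually contains
    print(record.get("output_throughput"))
' offline.jsonl
```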
@@ -33,7 +33,7 @@ python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama
 ```

 ### Profile with Nsight
-1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512`
+1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.benchmarks.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512`
 2. To profile a server, use `nsys profile --cuda-graph-trace=node python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B`.
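After either `nsys profile` command above, the capture can be inspected from the command line; a sketch, assuming the default `report1.nsys-rep` output name (pass `-o <name>` to `nsys profile` to control it):

```bash
nsys stats report1.nsys-rep    # kernel/API summary tables in the terminal
nsys-ui report1.nsys-rep       # or open the timeline in the GUI
```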
@@ -12,5 +12,5 @@ To port a model from vLLM to SGLang, you can compare these two files [SGLang LLa
 - Add `EntryClass` at the end.
 - Test correctness by comparing the final logits and outputs of the following two commands:
   - `python3 playground/reference_hf.py --model [new model]`
-  - `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
+  - `python3 -m sglang.benchmarks.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
 - Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md).
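For reference, the `EntryClass` mentioned in the first bullet is the hook the model loader uses to discover the implementation. A minimal sketch of the tail of a new model file (the class and file names are hypothetical):

```python
# python/sglang/srt/models/new_model.py (path illustrative)
import torch.nn as nn

class NewModelForCausalLM(nn.Module):
    """Skeleton only; the real class implements weight loading, forward, etc."""
    ...

# The loader looks up this name when scanning model modules, so it goes at the end.
EntryClass = NewModelForCausalLM
```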
@@ -4,15 +4,15 @@
 Make sure your changes do not slow down the following benchmarks.
 ```
 # single gpu
-python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256
-python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256
+python -m sglang.benchmarks.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256
+python -m sglang.benchmarks.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256

 # multiple gpu
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32
+python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1
+python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32

 # moe model
-python -m sglang.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32
+python -m sglang.benchmarks.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32
 ```

 ### High-level API
@@ -22,46 +22,53 @@ from sglang.api import (
     video,
 )

+# SGLang DSL APIs
+__all__ = [
+    "Runtime",
+    "assistant",
+    "assistant_begin",
+    "assistant_end",
+    "flush_cache",
+    "function",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "get_server_args",
+    "image",
+    "select",
+    "set_default_backend",
+    "system",
+    "system_begin",
+    "system_end",
+    "user",
+    "user_begin",
+    "user_end",
+    "video",
+]
+
+
 # Global Configurations
 from sglang.global_config import global_config

 __all__ += ["global_config"]

+# SGL Backends
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import LazyImport
-from sglang.version import __version__

 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
 LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+__all__ += ["RuntimeEndpoint", "Anthropic", "LiteLLM", "OpenAI", "VertexAI"]

-# public APIs management
-__all__ = [
-    "global_config",
-    "Anthropic",
-    "LiteLLM",
-    "OpenAI",
-    "RuntimeEndpoint",
-    "VertexAI",
-    "function",
-    "Runtime",
-    "set_default_backend",
-    "flush_cache",
-    "get_server_args",
-    "gen",
-    "gen_int",
-    "gen_string",
-    "image",
-    "video",
-    "select",
-    "system",
-    "user",
-    "assistant",
-    "user_begin",
-    "user_end",
-    "assistant_begin",
-    "assistant_end",
-    "system_begin",
-    "system_end",
-]
+# Version
+from sglang.version import __version__
+
+__all__ += ["__version__"]
+
+# Core Benchmarks
+from sglang.benchmarks import bench_latency, bench_serving
+
+__all__ += ["bench_latency", "bench_serving"]
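A note on the `LazyImport` wrapper this diff keeps: it lets `import sglang` succeed even when the optional backend SDKs (anthropic, litellm, openai, vertexai) are not installed, by deferring the real import to first use. A minimal sketch of the idea, not sglang's actual implementation:

```python
import importlib

class LazyImport:
    """Defer `from module import name` until the target is first used."""

    def __init__(self, module: str, name: str):
        self.module, self.name = module, name
        self._target = None

    def _load(self):
        if self._target is None:
            self._target = getattr(importlib.import_module(self.module), self.name)
        return self._target

    def __call__(self, *args, **kwargs):
        # Instantiating e.g. OpenAI(...) triggers the real import.
        return self._load()(*args, **kwargs)

    def __getattr__(self, attr):
        return getattr(self._load(), attr)
```

The design keeps heavy or optional dependencies out of the package's import path while still exposing them as top-level names in `__all__`.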
python/sglang/benchmarks/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+"""SGLang core benchmarks."""
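With this package stub in place and the re-exports added to `sglang/__init__.py` above, both import spellings should resolve; a quick sanity-check sketch, assuming sglang is installed:

```python
import sglang
from sglang.benchmarks import bench_latency, bench_serving

# The top-level names are re-exported per the __all__ additions above.
assert sglang.bench_latency is bench_latency
assert sglang.bench_serving is bench_serving
print(sglang.__version__)
```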
@@ -2,10 +2,10 @@
 Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.

 # Usage (latency test):
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy

 # Usage (correctness test):
-python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
+python -m sglang.benchmarks.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

 ### Reference output:
 prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
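The correctness test prints logits (as in the reference output above) so they can be checked against the Hugging Face reference from `playground/reference_hf.py`. If one wants to script the comparison instead of eyeballing it, a hedged sketch (values truncated from the reference output; the tolerance is a judgment call for fp16 kernels):

```python
import torch

# First few prefill logits captured from the two runs (values from the output above).
logits_sglang = torch.tensor([-10.0312, -9.5000, 0.8936])
logits_hf = torch.tensor([-10.0312, -9.5000, 0.8936])

# Different kernel stacks give slightly different fp16 results, so allow slack.
assert torch.allclose(logits_sglang, logits_hf, atol=3e-2, rtol=0.0), "logits diverge"
```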
@@ -4,10 +4,10 @@
 Benchmark online serving.

 Usage:
-python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+python3 -m sglang.benchmarks.bench_serving --backend sglang --num-prompt 10

-python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """

 import argparse