From c8e9fed87a85241180cb83230c8407d5d96c5f85 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Mon, 29 Jul 2024 15:34:16 -0700 Subject: [PATCH] Organize public APIs (#809) --- README.md | 4 +- benchmark/blog_v0_2/405b_sglang.sh | 22 +++--- benchmark/blog_v0_2/README.md | 22 +++--- benchmark/latency_throughput/README.md | 2 +- docs/en/model_support.md | 2 +- docs/en/test_process.md | 10 +-- python/sglang/__init__.py | 67 ++++++++++--------- python/sglang/benchmarks/__init__.py | 1 + .../sglang/{ => benchmarks}/bench_latency.py | 4 +- .../sglang/{ => benchmarks}/bench_serving.py | 6 +- 10 files changed, 74 insertions(+), 66 deletions(-) create mode 100644 python/sglang/benchmarks/__init__.py rename python/sglang/{ => benchmarks}/bench_latency.py (98%) rename python/sglang/{ => benchmarks}/bench_serving.py (98%) diff --git a/README.md b/README.md index ee93680e6..bd2e65374 100644 --- a/README.md +++ b/README.md @@ -208,11 +208,11 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this. ``` - python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32 + python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32 ``` - Benchmark online serving. Launch a server first and run the following command. 
``` - python3 -m sglang.bench_serving --backend sglang --num-prompt 10 + python3 -m sglang.benchmarks.bench_serving --backend sglang --num-prompt 10 ``` ## Frontend: Structured Generation Language (SGLang) diff --git a/benchmark/blog_v0_2/405b_sglang.sh b/benchmark/blog_v0_2/405b_sglang.sh index eae5e2206..e5a107629 100644 --- a/benchmark/blog_v0_2/405b_sglang.sh +++ b/benchmark/blog_v0_2/405b_sglang.sh @@ -9,16 +9,16 @@ # python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87 # offline -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15 -python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 
--random-output 1024 > sglang_log14 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21 # online -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35 \ No newline at end of file +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 
--random-output 1024 > sglang_log35 \ No newline at end of file diff --git a/benchmark/blog_v0_2/README.md b/benchmark/blog_v0_2/README.md index 9355cd34b..1d753a1e1 100644 --- a/benchmark/blog_v0_2/README.md +++ b/benchmark/blog_v0_2/README.md @@ -48,23 +48,23 @@ Please ensure you have the appropriate hardware before running the benchmarks. #### Offline benchmark ```bash -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl +python3 -m 
sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1 ``` #### Online benchmark ```bash -python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl -python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl 
+python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl cat online.jsonl | cut -d':' -f9 | cut -d',' -f1 ``` diff --git a/benchmark/latency_throughput/README.md b/benchmark/latency_throughput/README.md index 31433a030..ce410bc89 100644 --- a/benchmark/latency_throughput/README.md +++ b/benchmark/latency_throughput/README.md @@ -33,7 +33,7 @@ python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama ``` ### Profile with Nsight -1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512` +1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.benchmarks.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512` 2. To profile a server, use `nsys profile --cuda-graph-trace=node python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B`. diff --git a/docs/en/model_support.md b/docs/en/model_support.md index 4cfa4c0da..f31429988 100644 --- a/docs/en/model_support.md +++ b/docs/en/model_support.md @@ -12,5 +12,5 @@ To port a model from vLLM to SGLang, you can compare these two files [SGLang LLa - Add `EntryClass` at the end. - Test correctness by comparing the final logits and outputs of the two following commands: - `python3 playground/reference_hf.py --model [new model]` - - `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code` + - `python3 -m sglang.benchmarks.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code` - Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md). 
diff --git a/docs/en/test_process.md b/docs/en/test_process.md index 99889f999..4b92f1507 100644 --- a/docs/en/test_process.md +++ b/docs/en/test_process.md @@ -4,15 +4,15 @@ Make sure your changes do not slow down the following benchmarks ``` # single gpu -python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256 -python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256 +python -m sglang.benchmarks.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256 +python -m sglang.benchmarks.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256 # multiple gpu -python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1 -python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32 +python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1 +python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32 # moe model -python -m sglang.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32 +python -m sglang.benchmarks.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32 ``` ### High-level API diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py index 413ab9e7c..0199f23d2 100644 --- a/python/sglang/__init__.py +++ b/python/sglang/__init__.py @@ -22,46 
+22,53 @@ from sglang.api import ( video, ) +# SGLang DSL APIs +__all__ = [ + "Runtime", + "assistant", + "assistant_begin", + "assistant_end", + "flush_cache", + "function", + "gen", + "gen_int", + "gen_string", + "get_server_args", + "image", + "select", + "set_default_backend", + "system", + "system_begin", + "system_end", + "user", + "user_begin", + "user_end", + "video", +] + + # Global Configurations from sglang.global_config import global_config +__all__ += ["global_config"] + # SGL Backends from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.utils import LazyImport -from sglang.version import __version__ Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic") LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM") OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI") VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI") +__all__ += ["RuntimeEndpoint", "Anthropic", "LiteLLM", "OpenAI", "VertexAI"] -# public APIs management -__all__ = [ - "global_config", - "Anthropic", - "LiteLLM", - "OpenAI", - "RuntimeEndpoint", - "VertexAI", - "function", - "Runtime", - "set_default_backend", - "flush_cache", - "get_server_args", - "gen", - "gen_int", - "gen_string", - "image", - "video", - "select", - "system", - "user", - "assistant", - "user_begin", - "user_end", - "assistant_begin", - "assistant_end", - "system_begin", - "system_end", -] +# Version +from sglang.version import __version__ + +__all__ += ["__version__"] + +# Core Benchmarks +from sglang.benchmarks import bench_latency, bench_serving + +__all__ += ["bench_latency", "bench_serving"] diff --git a/python/sglang/benchmarks/__init__.py b/python/sglang/benchmarks/__init__.py new file mode 100644 index 000000000..35ac61edd --- /dev/null +++ b/python/sglang/benchmarks/__init__.py @@ -0,0 +1 @@ +"""SGLang core benchmarks.""" diff --git a/python/sglang/bench_latency.py b/python/sglang/benchmarks/bench_latency.py similarity index 98% rename from 
python/sglang/bench_latency.py rename to python/sglang/benchmarks/bench_latency.py index c2eb93a24..2b90b992a 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/benchmarks/bench_latency.py @@ -2,10 +2,10 @@ Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py. # Usage (latency test): -python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy +python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy # Usage (correctness test): -python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct +python -m sglang.benchmarks.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct ### Reference output: prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633], diff --git a/python/sglang/bench_serving.py b/python/sglang/benchmarks/bench_serving.py similarity index 98% rename from python/sglang/bench_serving.py rename to python/sglang/benchmarks/bench_serving.py index 839f947c1..9fd9e7f49 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/benchmarks/bench_serving.py @@ -4,10 +4,10 @@ Benchmark online serving. 
Usage: -python3 -m sglang.bench_serving --backend sglang --num-prompt 10 +python3 -m sglang.benchmarks.bench_serving --backend sglang --num-prompts 10 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5 -python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5 +python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi """ import argparse