Revert "Organize public APIs" (#815)
@@ -208,11 +208,11 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
 ```
-python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
 - Benchmark online serving. Launch a server first and run the following command.
 ```
-python3 -m sglang.benchmarks.bench_serving --backend sglang --num-prompt 10
+python3 -m sglang.bench_serving --backend sglang --num-prompt 10
 ```
 
 ## Frontend: Structured Generation Language (SGLang)
@@ -9,16 +9,16 @@
 # python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87
 
 # offline
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
 
 # online
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
@@ -48,23 +48,23 @@ Please ensure you have the appropriate hardware before running the benchmarks.
 #### Offline benchmark
 
 ```bash
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
 cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
 ```
 
 #### Online benchmark
 
 ```bash
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
 cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
 ```
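An aside on the `cut -d':' -f12 | cut -d',' -f1` pipelines above: they pull a metric out of the result JSON lines by field *position*, which silently breaks if the key order or formatting of the output file changes. A more robust sketch parses the JSON instead. This is an illustration, not part of the commit; the metric key `output_throughput` is an assumption — substitute whichever key the position-based `cut` was targeting:

```python
import json


def read_metric(path, key):
    """Yield one metric value per benchmark run recorded in a .jsonl file."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines between runs
                yield json.loads(line)[key]


# Example with a synthetic results file (real files come from --output-file).
with open("offline.jsonl", "w") as f:
    f.write(json.dumps({"backend": "sglang", "output_throughput": 1234.5}) + "\n")

print(list(read_metric("offline.jsonl", "output_throughput")))  # [1234.5]
```

Unlike the positional `cut`, this keeps working when new fields are added to the result records.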
@@ -33,7 +33,7 @@ python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama
 ```
 
 ### Profile with Nsight
-1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.benchmarks.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512`
+1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512`
 2. To profile a server, use `nsys profile --cuda-graph-trace=node python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B`.
@@ -12,5 +12,5 @@ To port a model from vLLM to SGLang, you can compare these two files [SGLang LLa
 - Add `EntryClass` at the end.
 - Test correctness by comparing the final logits and outputs of the two following commands:
   - `python3 playground/reference_hf.py --model [new model]`
-  - `python3 -m sglang.benchmarks.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
+  - `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
 - Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md).
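An aside on the correctness step above: the two commands produce logits from different kernels, so they will rarely match bit-for-bit, and the usual practice is an approximate comparison within a tolerance. A minimal sketch of such a check, using NumPy arrays to stand in for the logit tensors (the tolerances here are illustrative assumptions, not values from the commit):

```python
import numpy as np


def logits_close(ref, test, rtol=1e-2, atol=1e-2):
    """Check that two logit arrays agree within tolerance.

    Different attention kernels and fp16 accumulation orders introduce
    small numerical noise, so an exact equality check would be too strict.
    """
    ref = np.asarray(ref, dtype=np.float64)
    test = np.asarray(test, dtype=np.float64)
    return ref.shape == test.shape and np.allclose(ref, test, rtol=rtol, atol=atol)


# Example: small numerical noise is accepted, a real mismatch is not.
ref = [-10.0312, -9.5000, 0.8936]
assert logits_close(ref, [-10.0310, -9.5003, 0.8935])
assert not logits_close(ref, [-10.0312, -9.5000, 2.0])
```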
@@ -4,15 +4,15 @@
 Make sure your changes do not slow down the following benchmarks
 ```
 # single gpu
-python -m sglang.benchmarks.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256
-python -m sglang.benchmarks.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256
+python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 32 --input-len 512 --output-len 256
+python -m sglang.bench_latency --model-path meta-llama/Llama-2-7b-chat-hf --mem-fraction-static 0.8 --batch 1 --input-len 512 --output-len 256
 
 # multiple gpu
-python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1
-python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 32 --input-len 8192 --output-len 1
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-70B --tp 8 --mem-fraction-static 0.6 --batch 1 --input-len 8100 --output-len 32
 
 # moe model
-python -m sglang.benchmarks.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32
+python -m sglang.bench_latency --model-path databricks/dbrx-base --tp 8 --mem-fraction-static 0.6 --batch 4 --input-len 1024 --output-len 32
 ```
 
 ### High-level API
@@ -22,53 +22,46 @@ from sglang.api import (
     video,
 )
 
-# SGLang DSL APIs
-__all__ = [
-    "Runtime",
-    "assistant",
-    "assistant_begin",
-    "assistant_end",
-    "flush_cache",
-    "function",
-    "gen",
-    "gen_int",
-    "gen_string",
-    "get_server_args",
-    "image",
-    "select",
-    "set_default_backend",
-    "system",
-    "system_begin",
-    "system_end",
-    "user",
-    "user_begin",
-    "user_end",
-    "video",
-]
-
-
 # Global Configurations
 from sglang.global_config import global_config
 
-__all__ += ["global_config"]
-
 # SGL Backends
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import LazyImport
+from sglang.version import __version__
 
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
 LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
 
-__all__ += ["RuntimeEndpoint", "Anthropic", "LiteLLM", "OpenAI", "VertexAI"]
-
-# Version
-from sglang.version import __version__
-
-__all__ += ["__version__"]
-
-# Core Benchmarks
-from sglang.benchmarks import bench_latency, bench_serving
-
-__all__ += ["bench_latency", "bench_serving"]
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "video",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+    "system_begin",
+    "system_end",
+]
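An aside on the `LazyImport` wrapper that this hunk leaves in place: it defers importing optional backend dependencies (e.g. the `anthropic` or `litellm` packages) until the exported name is first used, so `import sglang` stays fast and works even when those extras are not installed. A minimal sketch of how such a wrapper can be built with `importlib` — an illustration of the pattern, not SGLang's actual implementation:

```python
import importlib


class LazyImport:
    """Defer importing `module_name` until the named attribute is first used."""

    def __init__(self, module_name, class_name):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None  # resolved on first access

    def _load(self):
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __call__(self, *args, **kwargs):
        # The real import happens here, on first instantiation.
        return self._load()(*args, **kwargs)

    def __getattr__(self, name):
        # Forward attribute access (e.g. classmethods) to the real object.
        return getattr(self._load(), name)


# Example with a stdlib class: nothing is imported until the wrapper is called.
OrderedDict = LazyImport("collections", "OrderedDict")
d = OrderedDict([("a", 1)])  # resolves collections.OrderedDict here
```

The trade-off is that a missing dependency surfaces as an error at first use rather than at import time, which is exactly what makes the backends optional.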
@@ -2,10 +2,10 @@
 Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
 
 # Usage (latency test):
-python -m sglang.benchmarks.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 
 # Usage (correctness test):
-python -m sglang.benchmarks.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
+python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ### Reference output:
 prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
@@ -4,10 +4,10 @@
 Benchmark online serving.
 
 Usage:
-python3 -m sglang.benchmarks.bench_serving --backend sglang --num-prompt 10
+python3 -m sglang.bench_serving --backend sglang --num-prompt 10
 
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
-python3 -m sglang.benchmarks.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """
 
 import argparse
@@ -1 +0,0 @@
-"""SGLang core benchmarks."""