diff --git a/.github/workflows/pr-e2e-test.yml b/.github/workflows/pr-e2e-test.yml index fce30d488..f62855391 100644 --- a/.github/workflows/pr-e2e-test.yml +++ b/.github/workflows/pr-e2e-test.yml @@ -32,7 +32,13 @@ jobs: pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall pip install --upgrade transformers - - name: Launch server and run benchmark + - name: Test OpenAI Backend + run: | + export OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} + cd sglang/test/lang + python3 test_openai_backend.py + + - name: Benchmark Serving run: | python3 -m sglang.launch_server --model /home/lmzheng/zhyncs/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache & diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 8db50c6e4..504bcf760 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,4 +1,4 @@ -name: publish docker +name: Release Docker on: push: branches: diff --git a/.github/workflows/release-fake-tag.yml b/.github/workflows/release-fake-tag.yml index 98ab8e6b4..c4b1c338a 100644 --- a/.github/workflows/release-fake-tag.yml +++ b/.github/workflows/release-fake-tag.yml @@ -1,4 +1,4 @@ -name: fake tag +name: Release Fake Tag on: push: branches: diff --git a/.github/workflows/release-github.yml b/.github/workflows/release-github.yml index 5ad3414fd..12a2309a6 100644 --- a/.github/workflows/release-github.yml +++ b/.github/workflows/release-github.yml @@ -1,4 +1,4 @@ -name: release tag +name: Release GitHub on: workflow_dispatch: jobs: diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 4c278a8b8..c79e46cb7 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -1,4 +1,4 @@ -name: publish to pypi +name: Release PyPI on: push: branches: diff --git a/assets/llama_7b.jpg b/assets/llama_7b.jpg deleted file mode 100644 index e68960e01..000000000 Binary files a/assets/llama_7b.jpg and /dev/null 
differ diff --git a/assets/mixtral_8x7b.jpg b/assets/mixtral_8x7b.jpg deleted file mode 100644 index 755e4296e..000000000 Binary files a/assets/mixtral_8x7b.jpg and /dev/null differ diff --git a/docs/en/benchmark_results.md b/docs/en/benchmark_results.md deleted file mode 100644 index 2688c0c16..000000000 --- a/docs/en/benchmark_results.md +++ /dev/null @@ -1,22 +0,0 @@ -# Benchmark Results - -We tested our system on the following common LLM workloads and reported the achieved throughput: -- **[MMLU](https://arxiv.org/abs/2009.03300)**: A 5-shot, multi-choice, multi-task benchmark. -- **[HellaSwag](https://arxiv.org/abs/1905.07830)**: A 20-shot, multi-choice sentence completion benchmark. -- **[ReAct Agent](https://arxiv.org/abs/2210.03629)**: An agent task using prompt traces collected from the original ReAct paper. -- **[Tree-of-Thought](https://arxiv.org/pdf/2305.10601.pdf)**: A custom tree search-based prompt for solving GSM-8K problems. -- **JSON Decode**: Extracting information from a Wikipedia page and outputting it in JSON format. -- **Chat (short)**: A synthetic chat benchmark where each conversation includes 4 turns with short LLM outputs. -- **Chat (long)**: A synthetic chat benchmark where each conversation includes 4 turns with long LLM outputs. -- **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial. -- **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model on the LLaVA-in-the-wild benchmark. - -We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5. 
- -- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1 -![llama_7b](../assets/llama_7b.jpg) - -- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8 -![mixtral_8x7b](../assets/mixtral_8x7b.jpg) - -The benchmark code is available [here](https://github.com/sgl-project/sglang/tree/main/benchmark). diff --git a/docs/en/model_support.md b/docs/en/model_support.md index 4cfa4c0da..e46e99e85 100644 --- a/docs/en/model_support.md +++ b/docs/en/model_support.md @@ -11,6 +11,6 @@ To port a model from vLLM to SGLang, you can compare these two files [SGLang LLa - Change `forward()` functions, and add `input_metadata`. - Add `EntryClass` at the end. - Test correctness by comparing the final logits and outputs of the two following commands: - - `python3 playground/reference_hf.py --model [new model]` + - `python3 scripts/playground/reference_hf.py --model [new model]` - `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code` - Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md). 
diff --git a/playground/launch_tgi.sh b/scripts/playground/launch_tgi.sh similarity index 100% rename from playground/launch_tgi.sh rename to scripts/playground/launch_tgi.sh diff --git a/playground/load_tokenizer.py b/scripts/playground/load_tokenizer.py similarity index 100% rename from playground/load_tokenizer.py rename to scripts/playground/load_tokenizer.py diff --git a/playground/reference_hf.py b/scripts/playground/reference_hf.py similarity index 100% rename from playground/reference_hf.py rename to scripts/playground/reference_hf.py diff --git a/test/__init__.py b/test/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/lang/test_bind_cache.py b/test/lang/test_bind_cache.py index b2c6bfbe8..378378175 100644 --- a/test/lang/test_bind_cache.py +++ b/test/lang/test_bind_cache.py @@ -1,3 +1,9 @@ +""" +Usage: +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 +python3 test_bind_cache.py +""" + import unittest import sglang as sgl diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py index c92568c0b..9d2b8fd7a 100644 --- a/test/lang/test_srt_backend.py +++ b/test/lang/test_srt_backend.py @@ -1,5 +1,7 @@ """ -python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 +Usage: +python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 +python3 test_srt_backend.py """ import json diff --git a/test/lang/test_tracing.py b/test/lang/test_tracing.py index ae7a95cad..5f2bc1d04 100644 --- a/test/lang/test_tracing.py +++ b/test/lang/test_tracing.py @@ -16,7 +16,7 @@ class TestTracing(unittest.TestCase): s += "A:" + sgl.gen("answer", stop="\n") tracer = few_shot_qa.trace() - print(tracer.last_node.print_graph_dfs() + "\n") + # print(tracer.last_node.print_graph_dfs() + "\n") def test_select(self): @sgl.function @@ -26,7 +26,7 @@ class TestTracing(unittest.TestCase): s += "It is a city" + sgl.gen("description", 
stop=".") tracer = capital.trace() - print(tracer.last_node.print_graph_dfs() + "\n") + # print(tracer.last_node.print_graph_dfs() + "\n") def test_raise_warning(self): @sgl.function @@ -66,11 +66,11 @@ class TestTracing(unittest.TestCase): s += "In summary" + sgl.gen("summary") compiled = tip_suggestion.compile() - compiled.print_graph() + # compiled.print_graph() sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct")) state = compiled.run(topic="staying healthy") - print(state.text() + "\n") + # print(state.text() + "\n") states = compiled.run_batch( [ @@ -80,8 +80,8 @@ class TestTracing(unittest.TestCase): ], temperature=0, ) - for s in states: - print(s.text() + "\n") + # for s in states: + # print(s.text() + "\n") def test_role(self): @sgl.function @@ -95,7 +95,7 @@ class TestTracing(unittest.TestCase): backend.chat_template = get_chat_template("llama-2-chat") compiled = multi_turn_chat.compile(backend=backend) - compiled.print_graph() + # compiled.print_graph() def test_fork(self): @sgl.function @@ -118,10 +118,10 @@ class TestTracing(unittest.TestCase): s += "In summary" + sgl.gen("summary") tracer = tip_suggestion.trace() - print(tracer.last_node.print_graph_dfs()) + # print(tracer.last_node.print_graph_dfs()) a = tip_suggestion.run(backend=sgl.OpenAI("gpt-3.5-turbo-instruct")) - print(a.text()) + # print(a.text()) if __name__ == "__main__":