Add OpenAI backend to the CI test (#869)
This commit is contained in:
8
.github/workflows/pr-e2e-test.yml
vendored
8
.github/workflows/pr-e2e-test.yml
vendored
@@ -32,7 +32,13 @@ jobs:
|
|||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
|
||||||
pip install --upgrade transformers
|
pip install --upgrade transformers
|
||||||
|
|
||||||
- name: Launch server and run benchmark
|
- name: Test OpenAI Backend
|
||||||
|
run: |
|
||||||
|
export OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}
|
||||||
|
cd sglang/test/lang
|
||||||
|
python3 test_openai_backend.py
|
||||||
|
|
||||||
|
- name: Benchmark Serving
|
||||||
run: |
|
run: |
|
||||||
python3 -m sglang.launch_server --model /home/lmzheng/zhyncs/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache &
|
python3 -m sglang.launch_server --model /home/lmzheng/zhyncs/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache &
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/release-docker.yml
vendored
2
.github/workflows/release-docker.yml
vendored
@@ -1,4 +1,4 @@
|
|||||||
name: publish docker
|
name: Release Docker
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
|
|||||||
2
.github/workflows/release-fake-tag.yml
vendored
2
.github/workflows/release-fake-tag.yml
vendored
@@ -1,4 +1,4 @@
|
|||||||
name: fake tag
|
name: Release Fake Tag
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
|
|||||||
2
.github/workflows/release-github.yml
vendored
2
.github/workflows/release-github.yml
vendored
@@ -1,4 +1,4 @@
|
|||||||
name: release tag
|
name: Release GitHub
|
||||||
on:
|
on:
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
jobs:
|
jobs:
|
||||||
|
|||||||
2
.github/workflows/release-pypi.yml
vendored
2
.github/workflows/release-pypi.yml
vendored
@@ -1,4 +1,4 @@
|
|||||||
name: publish to pypi
|
name: Release PyPI
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 231 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 157 KiB |
@@ -1,22 +0,0 @@
|
|||||||
# Benchmark Results
|
|
||||||
|
|
||||||
We tested our system on the following common LLM workloads and reported the achieved throughput:
|
|
||||||
- **[MMLU](https://arxiv.org/abs/2009.03300)**: A 5-shot, multi-choice, multi-task benchmark.
|
|
||||||
- **[HellaSwag](https://arxiv.org/abs/1905.07830)**: A 20-shot, multi-choice sentence completion benchmark.
|
|
||||||
- **[ReAct Agent](https://arxiv.org/abs/2210.03629)**: An agent task using prompt traces collected from the original ReAct paper.
|
|
||||||
- **[Tree-of-Thought](https://arxiv.org/pdf/2305.10601.pdf)**: A custom tree search-based prompt for solving GSM-8K problems.
|
|
||||||
- **JSON Decode**: Extracting information from a Wikipedia page and outputting it in JSON format.
|
|
||||||
- **Chat (short)**: A synthetic chat benchmark where each conversation includes 4 turns with short LLM outputs.
|
|
||||||
- **Chat (long)**: A synthetic chat benchmark where each conversation includes 4 turns with long LLM outputs.
|
|
||||||
- **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial.
|
|
||||||
- **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model on the LLaVA-in-the-wild benchmark.
|
|
||||||
|
|
||||||
We tested Llama-7B on one NVIDIA A10G GPU (24 GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, both using FP16 precision. We used vLLM v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5.
|
|
||||||
|
|
||||||
- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
|
|
||||||

|
|
||||||
|
|
||||||
- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
|
|
||||||

|
|
||||||
|
|
||||||
The benchmark code is available [here](https://github.com/sgl-project/sglang/tree/main/benchmark).
|
|
||||||
@@ -11,6 +11,6 @@ To port a model from vLLM to SGLang, you can compare these two files [SGLang LLa
|
|||||||
- Change `forward()` functions, and add `input_metadata`.
|
- Change `forward()` functions, and add `input_metadata`.
|
||||||
- Add `EntryClass` at the end.
|
- Add `EntryClass` at the end.
|
||||||
- Test correctness by comparing the final logits and outputs of the two following commands:
|
- Test correctness by comparing the final logits and outputs of the two following commands:
|
||||||
- `python3 playground/reference_hf.py --model [new model]`
|
- `python3 scripts/playground/reference_hf.py --model [new model]`
|
||||||
- `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
|
- `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
|
||||||
- Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md).
|
- Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md).
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
"""
|
||||||
|
Usage:
|
||||||
|
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
|
||||||
|
python3 test_bind_cache.py
|
||||||
|
"""
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import sglang as sgl
|
import sglang as sgl
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
Usage:
|
||||||
|
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
|
||||||
|
python3 test_srt_backend.py
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ class TestTracing(unittest.TestCase):
|
|||||||
s += "A:" + sgl.gen("answer", stop="\n")
|
s += "A:" + sgl.gen("answer", stop="\n")
|
||||||
|
|
||||||
tracer = few_shot_qa.trace()
|
tracer = few_shot_qa.trace()
|
||||||
print(tracer.last_node.print_graph_dfs() + "\n")
|
# print(tracer.last_node.print_graph_dfs() + "\n")
|
||||||
|
|
||||||
def test_select(self):
|
def test_select(self):
|
||||||
@sgl.function
|
@sgl.function
|
||||||
@@ -26,7 +26,7 @@ class TestTracing(unittest.TestCase):
|
|||||||
s += "It is a city" + sgl.gen("description", stop=".")
|
s += "It is a city" + sgl.gen("description", stop=".")
|
||||||
|
|
||||||
tracer = capital.trace()
|
tracer = capital.trace()
|
||||||
print(tracer.last_node.print_graph_dfs() + "\n")
|
# print(tracer.last_node.print_graph_dfs() + "\n")
|
||||||
|
|
||||||
def test_raise_warning(self):
|
def test_raise_warning(self):
|
||||||
@sgl.function
|
@sgl.function
|
||||||
@@ -66,11 +66,11 @@ class TestTracing(unittest.TestCase):
|
|||||||
s += "In summary" + sgl.gen("summary")
|
s += "In summary" + sgl.gen("summary")
|
||||||
|
|
||||||
compiled = tip_suggestion.compile()
|
compiled = tip_suggestion.compile()
|
||||||
compiled.print_graph()
|
# compiled.print_graph()
|
||||||
|
|
||||||
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
||||||
state = compiled.run(topic="staying healthy")
|
state = compiled.run(topic="staying healthy")
|
||||||
print(state.text() + "\n")
|
# print(state.text() + "\n")
|
||||||
|
|
||||||
states = compiled.run_batch(
|
states = compiled.run_batch(
|
||||||
[
|
[
|
||||||
@@ -80,8 +80,8 @@ class TestTracing(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
temperature=0,
|
temperature=0,
|
||||||
)
|
)
|
||||||
for s in states:
|
# for s in states:
|
||||||
print(s.text() + "\n")
|
# print(s.text() + "\n")
|
||||||
|
|
||||||
def test_role(self):
|
def test_role(self):
|
||||||
@sgl.function
|
@sgl.function
|
||||||
@@ -95,7 +95,7 @@ class TestTracing(unittest.TestCase):
|
|||||||
backend.chat_template = get_chat_template("llama-2-chat")
|
backend.chat_template = get_chat_template("llama-2-chat")
|
||||||
|
|
||||||
compiled = multi_turn_chat.compile(backend=backend)
|
compiled = multi_turn_chat.compile(backend=backend)
|
||||||
compiled.print_graph()
|
# compiled.print_graph()
|
||||||
|
|
||||||
def test_fork(self):
|
def test_fork(self):
|
||||||
@sgl.function
|
@sgl.function
|
||||||
@@ -118,10 +118,10 @@ class TestTracing(unittest.TestCase):
|
|||||||
s += "In summary" + sgl.gen("summary")
|
s += "In summary" + sgl.gen("summary")
|
||||||
|
|
||||||
tracer = tip_suggestion.trace()
|
tracer = tip_suggestion.trace()
|
||||||
print(tracer.last_node.print_graph_dfs())
|
# print(tracer.last_node.print_graph_dfs())
|
||||||
|
|
||||||
a = tip_suggestion.run(backend=sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
a = tip_suggestion.run(backend=sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
||||||
print(a.text())
|
# print(a.text())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user