[engine] support async and streaming (#1614)

Byron Hsu
2024-10-11 15:26:25 -07:00
committed by GitHub
parent 00c7e6368b
commit 862cd265e5
7 changed files with 384 additions and 8 deletions

custom_server.py

@@ -0,0 +1,53 @@
from sanic import Sanic, text
from sanic.response import json

import sglang as sgl

engine = None

# Create an instance of the Sanic app
app = Sanic("sanic-server")


# Define an asynchronous route handler
@app.route("/generate", methods=["POST"])
async def generate(request):
    prompt = request.json.get("prompt")
    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # async_generate returns a dict
    result = await engine.async_generate(prompt)
    return text(result["text"])


@app.route("/generate_stream", methods=["POST"])
async def generate_stream(request):
    prompt = request.json.get("prompt")
    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # with stream=True, async_generate returns an async generator
    result = await engine.async_generate(prompt, stream=True)

    # https://sanic.dev/en/guide/advanced/streaming.md#streaming
    # init the streaming response
    response = await request.respond()

    # result is an async generator of chunk dicts
    async for chunk in result:
        await response.send(chunk["text"])
    await response.eof()


def run_server():
    global engine
    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
    app.run(host="0.0.0.0", port=8000, single_process=True)


if __name__ == "__main__":
    run_server()

offline_batch_inference.py

@@ -0,0 +1,28 @@
import sglang as sgl


def main():
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = {"temperature": 0.8, "top_p": 0.95}

    # Create an LLM.
    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for prompt, output in zip(prompts, outputs):
        print("===============================")
        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")


# The __main__ guard is necessary here because we use "spawn" to create subprocesses.
# "spawn" starts a fresh interpreter for each subprocess; without the guard, importing
# this module would call sgl.Engine again and keep spawning processes in an infinite loop.
if __name__ == "__main__":
    main()

README.md

@@ -0,0 +1,40 @@
# SGLang Engine
## Introduction
SGLang provides a direct inference engine that can be used without launching an HTTP server. There are generally two use cases:
1. **Offline Batch Inference**
2. **Custom Server on Top of the Engine**
## Examples
### 1. [Offline Batch Inference](./offline_batch_inference.py)
In this example, we launch an SGLang engine and feed it a batch of inputs for inference. If you provide a very large batch, the engine schedules the requests intelligently so they are processed efficiently without running out of memory (OOM).
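
The linked example uses the blocking `generate` call. With the async support added in this commit, a similar batch can also be driven from an event loop; here is a minimal sketch, assuming `async_generate` can be awaited concurrently from a single event loop and resolves to a dict with a `"text"` key, as in `custom_server.py` above:

```python
import asyncio

import sglang as sgl


async def main():
    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]
    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    # Issue all requests concurrently and let the engine batch them internally.
    # (Assumes each async_generate call awaits to a dict with a "text" key.)
    results = await asyncio.gather(*(engine.async_generate(p) for p in prompts))
    for prompt, result in zip(prompts, results):
        print(f"Prompt: {prompt}\nGenerated text: {result['text']}")


# The __main__ guard matters here too, since sgl.Engine spawns subprocesses.
if __name__ == "__main__":
    asyncio.run(main())
```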
### 2. [Custom Server](./custom_server.py)
This example demonstrates how to build a custom server on top of the SGLang Engine, using [Sanic](https://sanic.dev/en/) as the web framework. The server supports both non-streaming and streaming endpoints.
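
The part worth noting is the return shape of `engine.async_generate`, as seen in the server code above: without `stream=True` it awaits to a single dict, while with `stream=True` it awaits to an async generator of chunk dicts. A minimal sketch isolating the two call shapes (the helper names are hypothetical):

```python
import sglang as sgl


async def complete(engine: sgl.Engine, prompt: str) -> str:
    # Non-streaming: awaiting async_generate yields one dict with the full text.
    result = await engine.async_generate(prompt)
    return result["text"]


async def stream_to_stdout(engine: sgl.Engine, prompt: str) -> None:
    # Streaming: with stream=True, the awaited value is an async generator
    # of incremental chunk dicts.
    stream = await engine.async_generate(prompt, stream=True)
    async for chunk in stream:
        print(chunk["text"], end="", flush=True)
```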
#### Steps:
1. Install Sanic:
```bash
pip install sanic
```
2. Run the server:
```bash
python custom_server.py
```
3. Send requests:
```bash
curl -X POST http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}'
curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
```
This will send both non-streaming and streaming requests to the server.
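
The same requests can be issued from Python. Here is a minimal client sketch using the `requests` library (an assumption of this sketch, not something the server depends on):

```python
import requests

# Non-streaming: the response body is the full completion.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "The Transformer architecture is..."},
)
print(resp.text)

# Streaming: read chunks as the server sends them.
with requests.post(
    "http://localhost:8000/generate_stream",
    json={"prompt": "The Transformer architecture is..."},
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```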