[engine] support async and streaming (#1614)
examples/runtime/engine/custom_server.py (new file)
@@ -0,0 +1,53 @@
from sanic import Sanic, text
from sanic.response import json

import sglang as sgl

# Global engine handle; initialized in run_server()
engine = None

# Create an instance of the Sanic app
app = Sanic("sanic-server")


# Define an asynchronous route handler
@app.route("/generate", methods=["POST"])
async def generate(request):
    prompt = request.json.get("prompt")
    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # async_generate returns a dict
    result = await engine.async_generate(prompt)

    return text(result["text"])


@app.route("/generate_stream", methods=["POST"])
async def generate_stream(request):
    prompt = request.json.get("prompt")

    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # with stream=True, async_generate returns an async generator
    result = await engine.async_generate(prompt, stream=True)

    # https://sanic.dev/en/guide/advanced/streaming.md#streaming
    # initialize the streamed response
    response = await request.respond()

    # result is an async generator; forward each chunk as it arrives
    async for chunk in result:
        await response.send(chunk["text"])

    await response.eof()


def run_server():
    global engine
    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
    app.run(host="0.0.0.0", port=8000, single_process=True)


if __name__ == "__main__":
    run_server()
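As a side note, the async API added here can also be driven directly, without a web framework. The following is a minimal sketch based on the calls used in `custom_server.py` above; the model path and the `"text"` output key are taken from that example, and the snippet is illustrative rather than canonical:

```python
import asyncio

import sglang as sgl


async def demo(engine):
    # Non-streaming: awaiting async_generate returns a dict.
    result = await engine.async_generate("The capital of France is")
    print(result["text"])

    # Streaming: with stream=True, awaiting async_generate
    # produces an async generator of chunk dicts.
    stream = await engine.async_generate("The capital of France is", stream=True)
    async for chunk in stream:
        print(chunk["text"], end="", flush=True)
    print()


if __name__ == "__main__":
    # The __main__ guard matters because sgl.Engine spawns subprocesses.
    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
    asyncio.run(demo(engine))
```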
examples/runtime/engine/offline_batch_inference.py (new file)
@@ -0,0 +1,28 @@
import sglang as sgl


def main():
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = {"temperature": 0.8, "top_p": 0.95}

    # Create an LLM.
    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for prompt, output in zip(prompts, outputs):
        print("===============================")
        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")


# The __main__ guard is necessary because sgl.Engine uses "spawn" to create subprocesses.
# Spawn starts a fresh interpreter for every subprocess; without the guard, each spawned
# process would re-run the module top level and keep spawning new processes indefinitely.
if __name__ == "__main__":
    main()
examples/runtime/engine/readme.md (new file)
@@ -0,0 +1,40 @@
# SGLang Engine

## Introduction

SGLang provides a direct inference engine without the need for an HTTP server. There are generally two use cases:
1. **Offline Batch Inference**
2. **Custom Server on Top of the Engine**
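Both use cases go through the same `sgl.Engine` entry point. As a quick orientation, here is a minimal sketch distilled from the examples below (the model path and the `"text"` output key come from those files):

```python
import sglang as sgl

if __name__ == "__main__":
    # Build the engine once; this loads the model.
    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    # generate() takes a batch of prompts and a sampling-params dict,
    # returning one output dict per prompt.
    outputs = llm.generate(["The capital of France is"], {"temperature": 0.8, "top_p": 0.95})
    print(outputs[0]["text"])
```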
## Examples

### 1. [Offline Batch Inference](./offline_batch_inference.py)

In this example, we launch an SGLang engine and feed it a batch of inputs for inference. If you provide a very large batch, the engine intelligently schedules the requests so they are processed efficiently without running into OOM (Out of Memory) errors.
### 2. [Custom Server](./custom_server.py)

This example demonstrates how to create a custom server on top of the SGLang Engine. We use [Sanic](https://sanic.dev/en/) as an example. The server supports both non-streaming and streaming endpoints.

#### Steps:
1. Install Sanic:

   ```bash
   pip install sanic
   ```
2. Run the server:

   ```bash
   python custom_server.py
   ```
3. Send requests:

   ```bash
   curl -X POST http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}'
   curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
   ```

This will send both non-streaming and streaming requests to the server.
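If you prefer a Python client over `curl`, here is a small sketch that exercises both endpoints using the third-party `requests` package (an assumption; install it with `pip install requests`):

```python
import requests

# Non-streaming: the response body is the full generated text.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "The Transformer architecture is..."},
)
print(resp.text)

# Streaming: read the body incrementally as the server sends chunks.
with requests.post(
    "http://localhost:8000/generate_stream",
    json={"prompt": "The Transformer architecture is..."},
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
print()
```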