[engine] support async and streaming (#1614)

Byron Hsu
2024-10-11 15:26:25 -07:00
committed by GitHub
parent 00c7e6368b
commit 862cd265e5
7 changed files with 384 additions and 8 deletions

custom_server.py

@@ -0,0 +1,53 @@
from sanic import Sanic, text
from sanic.response import json

import sglang as sgl

engine = None

# Create an instance of the Sanic app
app = Sanic("sanic-server")


# Define an asynchronous route handler
@app.route("/generate", methods=["POST"])
async def generate(request):
    prompt = request.json.get("prompt")
    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # async_generate returns a dict
    result = await engine.async_generate(prompt)
    return text(result["text"])


@app.route("/generate_stream", methods=["POST"])
async def generate_stream(request):
    prompt = request.json.get("prompt")
    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # with stream=True, async_generate returns an async generator
    result = await engine.async_generate(prompt, stream=True)

    # https://sanic.dev/en/guide/advanced/streaming.md#streaming
    # init the streaming response
    response = await request.respond()

    # result is an async generator of chunk dicts
    async for chunk in result:
        await response.send(chunk["text"])
    await response.eof()


def run_server():
    global engine
    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
    app.run(host="0.0.0.0", port=8000, single_process=True)


if __name__ == "__main__":
    run_server()

offline_batch_inference.py

@@ -0,0 +1,28 @@
import sglang as sgl


def main():
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = {"temperature": 0.8, "top_p": 0.95}

    # Create an LLM.
    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for prompt, output in zip(prompts, outputs):
        print("===============================")
        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")


# The __main__ guard is necessary here because we use "spawn" to create subprocesses.
# "spawn" starts a fresh interpreter for each subprocess; without the guard, importing
# this module would call sgl.Engine again and keep spawning processes in an infinite loop.
if __name__ == "__main__":
    main()

README.md

@@ -0,0 +1,40 @@
# SGLang Engine
## Introduction
SGLang provides a direct inference engine that can be used without launching an HTTP server. There are generally two use cases:
1. **Offline Batch Inference**
2. **Custom Server on Top of the Engine**
## Examples
### 1. [Offline Batch Inference](./offline_batch_inference.py)
In this example, we launch an SGLang engine and feed it a batch of inputs for inference. If you provide a very large batch, the engine schedules the requests intelligently so they are processed efficiently without running out of memory (OOM).
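
The linked example uses the blocking `generate` call. With the async support added in this commit, a similar batch can also be driven from an event loop; here is a minimal sketch, assuming `async_generate` can be awaited concurrently from a single event loop and resolves to a dict with a `"text"` key, as in `custom_server.py` above:

```python
import asyncio

import sglang as sgl


async def main():
    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]
    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    # Issue all requests concurrently and let the engine batch them internally.
    # (Assumes each async_generate call awaits to a dict with a "text" key.)
    results = await asyncio.gather(*(engine.async_generate(p) for p in prompts))
    for prompt, result in zip(prompts, results):
        print(f"Prompt: {prompt}\nGenerated text: {result['text']}")


# The __main__ guard matters here too, since sgl.Engine spawns subprocesses.
if __name__ == "__main__":
    asyncio.run(main())
```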
### 2. [Custom Server](./custom_server.py)
This example demonstrates how to build a custom server on top of the SGLang Engine, using [Sanic](https://sanic.dev/en/) as the web framework. The server supports both non-streaming and streaming endpoints.
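
The part worth noting is the return shape of `engine.async_generate`, as seen in the server code above: without `stream=True` it awaits to a single dict, while with `stream=True` it awaits to an async generator of chunk dicts. A minimal sketch isolating the two call shapes (the helper names are hypothetical):

```python
import sglang as sgl


async def complete(engine: sgl.Engine, prompt: str) -> str:
    # Non-streaming: awaiting async_generate yields one dict with the full text.
    result = await engine.async_generate(prompt)
    return result["text"]


async def stream_to_stdout(engine: sgl.Engine, prompt: str) -> None:
    # Streaming: with stream=True, the awaited value is an async generator
    # of incremental chunk dicts.
    stream = await engine.async_generate(prompt, stream=True)
    async for chunk in stream:
        print(chunk["text"], end="", flush=True)
```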
#### Steps:
1. Install Sanic:
```bash
pip install sanic
```
2. Run the server:
```bash
python custom_server.py
```
3. Send requests:
```bash
curl -X POST http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}'
curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
```
This will send both non-streaming and streaming requests to the server.
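
The same requests can be issued from Python. Here is a minimal client sketch using the `requests` library (an assumption of this sketch, not something the server depends on):

```python
import requests

# Non-streaming: the response body is the full completion.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "The Transformer architecture is..."},
)
print(resp.text)

# Streaming: read chunks as the server sends them.
with requests.post(
    "http://localhost:8000/generate_stream",
    json={"prompt": "The Transformer architecture is..."},
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```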