Add engine api (#1894)
@@ -41,7 +41,7 @@
")\n",
"\n",
"server_process = execute_shell_command(\n",
"\"\"\"\n",
"    \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n",
"\"\"\"\n",
")\n",

210  docs/backend/offline_engine_api.ipynb  Normal file
@@ -0,0 +1,210 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Offline Engine API\n",
"\n",
"SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where an additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n",
"\n",
"- Offline Batch Inference\n",
"- Custom Server on Top of the Engine\n",
"\n",
"This document focuses on offline batch inference, demonstrating four different inference modes:\n",
"\n",
"- Non-streaming synchronous generation\n",
"- Streaming synchronous generation\n",
"- Non-streaming asynchronous generation\n",
"- Streaming asynchronous generation\n",
"\n",
"Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed working example in a Python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py); an illustrative sketch is also included near the end of this notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Offline Batch Inference\n",
"\n",
"The SGLang offline engine supports batch inference with efficient scheduling to prevent OOM (out-of-memory) errors for large batches. For details on this cache-aware scheduling algorithm, see our [paper](https://arxiv.org/pdf/2312.07104)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# launch the offline engine\n",
"\n",
"import sglang as sgl\n",
"from sglang.utils import print_highlight\n",
"import asyncio\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Non-streaming Synchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The president of the United States is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"for prompt, output in zip(prompts, outputs):\n",
"    print_highlight(\"===============================\")\n",
"    print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Streaming Synchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print_highlight(\"\\n=== Testing synchronous streaming generation ===\")\n",
"\n",
"for prompt in prompts:\n",
"    print_highlight(f\"\\nPrompt: {prompt}\")\n",
"    print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
"    for chunk in llm.generate(prompt, sampling_params, stream=True):\n",
"        print(chunk[\"text\"], end=\"\", flush=True)\n",
"    print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Non-streaming Asynchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print_highlight(\"\\n=== Testing asynchronous batch generation ===\")\n",
"\n",
"\n",
"async def main():\n",
"    outputs = await llm.async_generate(prompts, sampling_params)\n",
"\n",
"    for prompt, output in zip(prompts, outputs):\n",
"        print_highlight(f\"\\nPrompt: {prompt}\")\n",
"        print_highlight(f\"Generated text: {output['text']}\")\n",
"\n",
"\n",
"asyncio.run(main())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Streaming Asynchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print_highlight(\"\\n=== Testing asynchronous streaming generation ===\")\n",
"\n",
"\n",
"async def main():\n",
"    for prompt in prompts:\n",
"        print_highlight(f\"\\nPrompt: {prompt}\")\n",
"        print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
"        generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
"        async for chunk in generator:\n",
"            print(chunk[\"text\"], end=\"\", flush=True)\n",
"        print()\n",
"\n",
"\n",
"asyncio.run(main())"
]
},
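{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Custom Server Sketch (Illustrative)\n",
"\n",
"The cell below is a minimal, hypothetical sketch of wrapping the engine in an HTTP endpoint, as mentioned in the introduction. It assumes `fastapi` is installed, the route name is arbitrary, and the server is intentionally not started inside the notebook; see [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py) for the complete example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: wraps the existing engine in a FastAPI app.\n",
"# Assumes `fastapi` (and `uvicorn` for serving) are installed; the server is not started here.\n",
"\n",
"from fastapi import FastAPI\n",
"\n",
"app = FastAPI()\n",
"\n",
"\n",
"@app.post(\"/generate\")\n",
"async def generate(request: dict):\n",
"    # Reuse the engine created above to serve one prompt per request.\n",
"    prompt = request[\"prompt\"]\n",
"    params = request.get(\"sampling_params\", {\"temperature\": 0.8, \"top_p\": 0.95})\n",
"    output = await llm.async_generate(prompt, params)\n",
"    return {\"text\": output[\"text\"]}\n",
"\n",
"\n",
"# To actually serve, run in a standalone script:\n",
"# import uvicorn\n",
"# uvicorn.run(app, host=\"0.0.0.0\", port=8000)"
]
},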
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm.shutdown()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "AlphaMeemory",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -113,10 +113,7 @@
"\n",
"response = requests.post(\n",
"    \"http://localhost:30000/v1/embeddings\",\n",
"    json={\n",
"        \"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
"        \"input\": text\n",
"    }\n",
"    json={\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": text},\n",
")\n",
"\n",
"text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",

@@ -126,20 +126,17 @@
"        {\n",
"            \"role\": \"user\",\n",
"            \"content\": [\n",
"                {\n",
"                    \"type\": \"text\",\n",
"                    \"text\": \"What’s in this image?\"\n",
"                },\n",
"                {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n",
"                {\n",
"                    \"type\": \"image_url\",\n",
"                    \"image_url\": {\n",
"                        \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
"                    }\n",
"                }\n",
"            ]\n",
"        },\n",
"                },\n",
"            ],\n",
"        }\n",
"    ],\n",
"    \"max_tokens\": 300\n",
"    \"max_tokens\": 300,\n",
"}\n",
"\n",
"response = requests.post(url, json=data)\n",

@@ -27,6 +27,7 @@ The core features include:
backend/openai_api_vision.ipynb
backend/openai_api_embeddings.ipynb
backend/native_api.ipynb
backend/offline_engine_api.ipynb
backend/backend.md