Add engine api (#1894)
@@ -41,7 +41,7 @@
")\n",
"\n",
"server_process = execute_shell_command(\n",
"\"\"\"\n",
"    \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n",
"\"\"\"\n",
")\n",

210  docs/backend/offline_engine_api.ipynb  Normal file
@@ -0,0 +1,210 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Offline Engine API\n",
"\n",
"SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where an additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n",
"\n",
"- Offline Batch Inference\n",
"- Custom Server on Top of the Engine\n",
"\n",
"This document focuses on offline batch inference, demonstrating four different inference modes:\n",
"\n",
"- Non-streaming synchronous generation\n",
"- Streaming synchronous generation\n",
"- Non-streaming asynchronous generation\n",
"- Streaming asynchronous generation\n",
"\n",
"Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed working example in a Python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py); an illustrative sketch is also included near the end of this notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Offline Batch Inference\n",
"\n",
"The SGLang offline engine supports batch inference with efficient scheduling to prevent OOM (out-of-memory) errors for large batches. For details on this cache-aware scheduling algorithm, see our [paper](https://arxiv.org/pdf/2312.07104)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# launch the offline engine\n",
"\n",
"import sglang as sgl\n",
"from sglang.utils import print_highlight\n",
"import asyncio\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Non-streaming Synchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The president of the United States is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"for prompt, output in zip(prompts, outputs):\n",
"    print_highlight(\"===============================\")\n",
"    print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Streaming Synchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print_highlight(\"\\n=== Testing synchronous streaming generation ===\")\n",
"\n",
"for prompt in prompts:\n",
"    print_highlight(f\"\\nPrompt: {prompt}\")\n",
"    print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
"    for chunk in llm.generate(prompt, sampling_params, stream=True):\n",
"        print(chunk[\"text\"], end=\"\", flush=True)\n",
"    print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Non-streaming Asynchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print_highlight(\"\\n=== Testing asynchronous batch generation ===\")\n",
"\n",
"\n",
"async def main():\n",
"    outputs = await llm.async_generate(prompts, sampling_params)\n",
"\n",
"    for prompt, output in zip(prompts, outputs):\n",
"        print_highlight(f\"\\nPrompt: {prompt}\")\n",
"        print_highlight(f\"Generated text: {output['text']}\")\n",
"\n",
"\n",
"asyncio.run(main())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Streaming Asynchronous Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
"    \"Hello, my name is\",\n",
"    \"The capital of France is\",\n",
"    \"The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print_highlight(\"\\n=== Testing asynchronous streaming generation ===\")\n",
"\n",
"\n",
"async def main():\n",
"    for prompt in prompts:\n",
"        print_highlight(f\"\\nPrompt: {prompt}\")\n",
"        print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
"        generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
"        async for chunk in generator:\n",
"            print(chunk[\"text\"], end=\"\", flush=True)\n",
"        print()\n",
"\n",
"\n",
"asyncio.run(main())"
]
},
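{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Custom Server Sketch (Illustrative)\n",
"\n",
"The cell below is a minimal, hypothetical sketch of wrapping the engine in an HTTP endpoint, as mentioned in the introduction. It assumes `fastapi` is installed, the route name is arbitrary, and the server is intentionally not started inside the notebook; see [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py) for the complete example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: wraps the existing engine in a FastAPI app.\n",
"# Assumes `fastapi` (and `uvicorn` for serving) are installed; the server is not started here.\n",
"\n",
"from fastapi import FastAPI\n",
"\n",
"app = FastAPI()\n",
"\n",
"\n",
"@app.post(\"/generate\")\n",
"async def generate(request: dict):\n",
"    # Reuse the engine created above to serve one prompt per request.\n",
"    prompt = request[\"prompt\"]\n",
"    params = request.get(\"sampling_params\", {\"temperature\": 0.8, \"top_p\": 0.95})\n",
"    output = await llm.async_generate(prompt, params)\n",
"    return {\"text\": output[\"text\"]}\n",
"\n",
"\n",
"# To actually serve, run in a standalone script:\n",
"# import uvicorn\n",
"# uvicorn.run(app, host=\"0.0.0.0\", port=8000)"
]
},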
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm.shutdown()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "AlphaMeemory",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -113,10 +113,7 @@
"\n",
"response = requests.post(\n",
"    \"http://localhost:30000/v1/embeddings\",\n",
"    json={\n",
"        \"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
"        \"input\": text\n",
"    }\n",
"    json={\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": text},\n",
")\n",
"\n",
"text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",

@@ -126,20 +126,17 @@
"        {\n",
"            \"role\": \"user\",\n",
"            \"content\": [\n",
"                {\n",
"                    \"type\": \"text\",\n",
"                    \"text\": \"What’s in this image?\"\n",
"                },\n",
"                {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n",
"                {\n",
"                    \"type\": \"image_url\",\n",
"                    \"image_url\": {\n",
"                        \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
"                    }\n",
"                }\n",
"            ]\n",
"        },\n",
"                },\n",
"            ],\n",
"        }\n",
"    ],\n",
"    \"max_tokens\": 300\n",
"    \"max_tokens\": 300,\n",
"}\n",
"\n",
"response = requests.post(url, json=data)\n",

@@ -27,6 +27,7 @@ The core features include:
backend/openai_api_vision.ipynb
backend/openai_api_embeddings.ipynb
backend/native_api.ipynb
backend/offline_engine_api.ipynb
backend/backend.md