diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 775917c00..39cd12cd3 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -41,7 +41,7 @@ ")\n", "\n", "server_process = execute_shell_command(\n", - "\"\"\"\n", + " \"\"\"\n", "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n", "\"\"\"\n", ")\n", diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb new file mode 100644 index 000000000..63a175ffa --- /dev/null +++ b/docs/backend/offline_engine_api.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Offline Engine API\n", + "\n", + "SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where an additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n", + "\n", + "- Offline Batch Inference\n", + "- Custom Server on Top of the Engine\n", + "\n", + "This document focuses on offline batch inference, demonstrating four different inference modes:\n", + "\n", + "- Non-streaming synchronous generation\n", + "- Streaming synchronous generation\n", + "- Non-streaming asynchronous generation\n", + "- Streaming asynchronous generation\n", + "\n", + "Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example, written as a Python script, can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Offline Batch Inference\n", + "\n", + "The SGLang offline engine supports batch inference with efficient scheduling to prevent out-of-memory (OOM) errors for large batches. For details on this cache-aware scheduling algorithm, see our [paper](https://arxiv.org/pdf/2312.07104)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# launch the offline engine\n", + "\n", + "import sglang as sgl\n", + "from sglang.utils import print_highlight\n", + "import asyncio\n", + "\n", + "llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Non-streaming Synchronous Generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Hello, my name is\",\n", + " \"The president of the United States is\",\n", + " \"The capital of France is\",\n", + " \"The future of AI is\",\n", + "]\n", + "\n", + "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", + "\n", + "outputs = llm.generate(prompts, sampling_params)\n", + "for prompt, output in zip(prompts, outputs):\n", + " print_highlight(\"===============================\")\n", + " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Streaming Synchronous Generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Hello, my name is\",\n", + " \"The capital of France is\",\n", + " \"The future of AI is\",\n", + "]\n", + "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", + "\n", + "print_highlight(\"\\n=== Testing synchronous streaming generation ===\")\n", + "\n", + "for prompt in prompts:\n", + " print_highlight(f\"\\nPrompt: {prompt}\")\n", + " print(\"Generated text: \", end=\"\", flush=True)\n", + "\n", + " for chunk in llm.generate(prompt, sampling_params, stream=True):\n", + " print(chunk[\"text\"], end=\"\", flush=True)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Non-streaming Asynchronous Generation" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Hello, my name is\",\n", + " \"The capital of France is\",\n", + " \"The future of AI is\",\n", + "]\n", + "\n", + "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", + "\n", + "print_highlight(\"\\n=== Testing asynchronous batch generation ===\")\n", + "\n", + "\n", + "async def main():\n", + " outputs = await llm.async_generate(prompts, sampling_params)\n", + "\n", + " for prompt, output in zip(prompts, outputs):\n", + " print_highlight(f\"\\nPrompt: {prompt}\")\n", + " print_highlight(f\"Generated text: {output['text']}\")\n", + "\n", + "\n", + "await main()  # notebook kernels already run an event loop, so use top-level await instead of asyncio.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Streaming Asynchronous Generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Hello, my name is\",\n", + " \"The capital of France is\",\n", + " \"The future of AI is\",\n", + "]\n", + "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", + "\n", + "print_highlight(\"\\n=== Testing asynchronous streaming generation ===\")\n", + "\n", + "\n", + "async def main():\n", + " for prompt in prompts:\n", + " print_highlight(f\"\\nPrompt: {prompt}\")\n", + " print(\"Generated text: \", end=\"\", flush=True)\n", + "\n", + " generator = await llm.async_generate(prompt, sampling_params, stream=True)\n", + " async for chunk in generator:\n", + " print(chunk[\"text\"], end=\"\", flush=True)\n", + " print()\n", + "\n", + "\n", + "await main()  # notebook kernels already run an event loop, so use top-level await instead of asyncio.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm.shutdown()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 0a40b0a0d..54b48d60c 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -113,10 +113,7 @@ "\n", "response = requests.post(\n", " \"http://localhost:30000/v1/embeddings\",\n", - " json={\n", - " \"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n", - " \"input\": text\n", - " }\n", + " json={\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": text},\n", ")\n", "\n", "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n", diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index ecddf6c30..eb06e55ed 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -126,20 +126,17 @@ " {\n", " \"role\": \"user\",\n", " \"content\": [\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": \"What’s in this image?\"\n", - " },\n", + " {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n", - " }\n", - " }\n", - " ]\n", + " },\n", + " },\n", + " ],\n", " }\n", " ],\n", - " \"max_tokens\": 300\n", + " \"max_tokens\": 300,\n", "}\n", "\n", "response = requests.post(url, json=data)\n", diff --git a/docs/index.rst b/docs/index.rst index 55d3e81be..1f83acfb4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,6 +27,7 @@ The core features include: backend/openai_api_vision.ipynb backend/openai_api_embeddings.ipynb backend/native_api.ipynb + backend/offline_engine_api.ipynb backend/backend.md