diff --git a/docs/backend/constrained_decoding.ipynb b/docs/backend/constrained_decoding.ipynb new file mode 100644 index 000000000..a0ea915f3 --- /dev/null +++ b/docs/backend/constrained_decoding.ipynb @@ -0,0 +1,525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Constrained Decoding Tutorial\n", + "\n", + "This tutorial shows how to format model outputs using constrained decoding in SGLang." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Constrained Decoding\n", + "\n", + "With SGLang, you can define a JSON schema, EBNF grammar, or regular expression to constrain the model's output.\n", + "\n", + "[JSON Schema](https://json-schema.org/): Formats output into structured JSON objects with validation rules.\n", + "\n", + "[EBNF (Extended Backus-Naur Form)](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form): Defines complex syntax rules, especially for recursive patterns like nested structures.\n", + "\n", + "[Regular Expressions](https://en.wikipedia.org/wiki/Regular_expression): Matches text patterns for simple validation and formatting.\n", + "\n", + "### Constrained Decoding Backends\n", + "\n", + "SGLang has two backends: [Outlines](https://github.com/dottxt-ai/outlines) (default) and [XGrammar](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar). We suggest using XGrammar whenever possible for its better performance. For more details, see [XGrammar technical overview](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar).\n", + "\n", + "* XGrammar Backend: JSON and EBNF\n", + "* Outlines Backend: JSON and regular expressions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## OpenAI Compatible API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use XGrammar, simply add `--grammar-backend xgrammar` when launching the server. 
If no backend is specified, Outlines will be used as the default." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sglang.utils import (\n", + " execute_shell_command,\n", + " wait_for_server,\n", + " terminate_process,\n", + " print_highlight,\n", + ")\n", + "import openai\n", + "\n", + "server_process = execute_shell_command(\n", + " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0 --grammar-backend xgrammar\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30000\")\n", + "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "json_schema = json.dumps(\n", + " {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n", + " \"population\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"population\"],\n", + " }\n", + ")\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Give me the information of the capital of France in the JSON format.\",\n", + " },\n", + " ],\n", + " temperature=0,\n", + " max_tokens=128,\n", + " response_format={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n", + " },\n", + ")\n", + "\n", + "print_highlight(response.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EBNF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"ebnf_grammar = \"\"\"\n", + "root ::= city | description\n", + "city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n", + "description ::= city \" is \" status\n", + "status ::= \"the capital of \" country\n", + "country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n", + "\"\"\"\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Give me the information of the capital of France.\",\n", + " },\n", + " ],\n", + " temperature=0,\n", + " max_tokens=32,\n", + " extra_body={\"ebnf\": ebnf_grammar},\n", + ")\n", + "\n", + "print_highlight(response.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)\n", + "server_process = execute_shell_command(\n", + " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30000\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = client.chat.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n", + " ],\n", + " temperature=0,\n", + " max_tokens=128,\n", + " extra_body={\"regex\": \"(Paris|London)\"},\n", + ")\n", + "\n", + "print_highlight(response.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Native API and SGLang Runtime (SRT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sglang.utils import (\n", + " execute_shell_command,\n", + " wait_for_server,\n", + " terminate_process,\n", + " print_highlight,\n", + ")\n", + "\n", + "import requests\n", + "\n", + "server_process = execute_shell_command(\n", + " \"\"\"\n", + "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010 --grammar-backend xgrammar\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30010\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import requests\n", + "\n", + "json_schema = json.dumps(\n", + " {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n", + " \"population\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"population\"],\n", + " }\n", + ")\n", + "\n", + "# JSON\n", + "response = requests.post(\n", + " \"http://localhost:30010/generate\",\n", + " json={\n", + " \"text\": \"Here is the information of the capital of France in the JSON format.\\n\",\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 64,\n", + " \"json_schema\": json_schema,\n", + " },\n", + " },\n", + ")\n", + "\n", + "print_highlight(response.json())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EBNF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:30010/generate\",\n", + " json={\n", + " \"text\": \"Give me the information of the capital of France.\",\n", + " 
\"sampling_params\": {\n", + " \"max_new_tokens\": 128,\n", + " \"temperature\": 0,\n", + " \"n\": 3,\n", + " \"ebnf\": (\n", + " \"root ::= city | description\\n\"\n", + " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n", + " 'description ::= city \" is \" status\\n'\n", + " 'status ::= \"the capital of \" country\\n'\n", + " 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n", + " ),\n", + " },\n", + " \"stream\": False,\n", + " \"return_logprob\": False,\n", + " },\n", + ")\n", + "\n", + "print_highlight(response.json())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)\n", + "server_process = execute_shell_command(\n", + " \"\"\"\n", + "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30010\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = requests.post(\n", + " \"http://localhost:30010/generate\",\n", + " json={\n", + " \"text\": \"Paris is the capital of\",\n", + " \"sampling_params\": {\n", + " \"temperature\": 0,\n", + " \"max_new_tokens\": 64,\n", + " \"regex\": \"(France|England)\",\n", + " },\n", + " },\n", + ")\n", + "print_highlight(response.json())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Offline Engine API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sglang as sgl\n", + "\n", + "llm_xgrammar = sgl.Engine(\n", + " model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", 
grammar_backend=\"xgrammar\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "prompts = [\n", + " \"Give me the information of the capital of China in the JSON format.\",\n", + " \"Give me the information of the capital of France in the JSON format.\",\n", + " \"Give me the information of the capital of Ireland in the JSON format.\",\n", + "]\n", + "\n", + "json_schema = json.dumps(\n", + " {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n", + " \"population\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"population\"],\n", + " }\n", + ")\n", + "\n", + "sampling_params = {\"temperature\": 0.1, \"top_p\": 0.95, \"json_schema\": json_schema}\n", + "\n", + "outputs = llm_xgrammar.generate(prompts, sampling_params)\n", + "for prompt, output in zip(prompts, outputs):\n", + " print_highlight(\"===============================\")\n", + " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EBNF\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Give me the information of the capital of France.\",\n", + " \"Give me the information of the capital of Germany.\",\n", + " \"Give me the information of the capital of Italy.\",\n", + "]\n", + "\n", + "sampling_params = {\n", + " \"temperature\": 0.8,\n", + " \"top_p\": 0.95,\n", + " \"ebnf\": (\n", + " \"root ::= city | description\\n\"\n", + " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n", + " 'description ::= city \" is \" status\\n'\n", + " 'status ::= \"the capital of \" country\\n'\n", + " 'country ::= \"England\" | \"France\" | 
\"Germany\" | \"Italy\"'\n", + " ),\n", + "}\n", + "\n", + "outputs = llm_xgrammar.generate(prompts, sampling_params)\n", + "for prompt, output in zip(prompts, outputs):\n", + " print_highlight(\"===============================\")\n", + " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_xgrammar.shutdown()\n", + "llm_outlines = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Please provide information about London as a major global city:\",\n", + " \"Please provide information about Paris as a major global city:\",\n", + "]\n", + "\n", + "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n", + "\n", + "outputs = llm_outlines.generate(prompts, sampling_params)\n", + "for prompt, output in zip(prompts, outputs):\n", + " print_highlight(\"===============================\")\n", + " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_outlines.shutdown()" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/index.rst b/docs/index.rst index 8c6c018c4..d3ec65229 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,6 +29,7 @@ The core features include: backend/native_api.ipynb backend/offline_engine_api.ipynb backend/backend.md + 
backend/constrained_decoding.ipynb .. toctree::