From 3b60558dd79e1f4aeadc34ed5dbae45cb75e5a00 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sat, 2 Nov 2024 01:02:17 -0700 Subject: [PATCH] Native api (#1886) Co-authored-by: Chayenne --- .github/workflows/release-docs.yml | 4 + docs/Makefile | 2 +- docs/backend/native_api.ipynb | 4 +- docs/backend/openai_api_completions.ipynb | 36 +++++--- docs/backend/openai_api_vision.ipynb | 77 ++++++++++++++--- ...model.ipynb => openai_embedding_api.ipynb} | 42 ++++++++-- docs/index.rst | 1 + docs/start/send_request.ipynb | 84 ++++++++++++------- 8 files changed, 184 insertions(+), 66 deletions(-) rename docs/backend/{embedding_model.ipynb => openai_embedding_api.ipynb} (87%) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 7abcf5768..bca6df5ca 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -9,6 +9,10 @@ on: - 'python/sglang/version.py' workflow_dispatch: +concurrency: + group: execute-notebook-${{ github.ref }} + cancel-in-progress: true + jobs: execute-and-deploy: runs-on: 1-gpu-runner diff --git a/docs/Makefile b/docs/Makefile index b439c4fe2..51446dc38 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,7 +1,7 @@ # Minimal makefile for Sphinx documentation # -# You can set these variables from the command line, and also +# You can set these variables from the terminal, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 65cbbab18..57ffa14af 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Native Server API\n", + "# Native API\n", "\n", "Apart from the OpenAI compatible API, the SGLang Runtime also provides its native server API. We introduce these following API:\n", "\n", @@ -254,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 13ea0acdb..2f4b988d5 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:16.624550Z", @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:18.090228Z", @@ -151,12 +151,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Streaming mode is also supported" + "Streaming mode is also supported." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:21.195226Z", @@ -190,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:21.676813Z", @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:23.186337Z", @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:26.772016Z", @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:26.796422Z", @@ -389,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:29.812339Z", @@ -472,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:54.854018Z", @@ -567,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:07.896114Z", @@ -587,6 +587,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 6b006606b..4707a9e65 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -63,21 +63,19 @@ "source": [ "## Using cURL\n", "\n", - "Once the server is up, you can send test requests using curl." + "Once the server is up, you can send test requests using curl or requests." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import subprocess\n", "\n", "curl_command = \"\"\"\n", - "curl http://localhost:30010/v1/chat/completions \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", + "curl -s http://localhost:30010/v1/chat/completions \\\n", " -d '{\n", " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", " \"messages\": [\n", @@ -109,14 +107,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using OpenAI Python Client\n", - "\n", - "You can use the OpenAI Python API library to send requests." 
+ "## Using OpenAI Compatible API w/ Requests" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "url = \"http://localhost:30010/v1/chat/completions\"\n", + "\n", + "data = {\n", + " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": \"What’s in this image?\"\n", + " },\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " \"max_tokens\": 300\n", + "}\n", + "\n", + "response = requests.post(url, json=data)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using OpenAI Python Client\n", + "\n", + "Also, you can use the OpenAI Python API library to send requests." + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -202,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,6 +274,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/embedding_model.ipynb b/docs/backend/openai_embedding_api.ipynb similarity index 87% rename from docs/backend/embedding_model.ipynb rename to docs/backend/openai_embedding_api.ipynb index 45928587b..356a57121 100644 --- a/docs/backend/embedding_model.ipynb +++ b/docs/backend/openai_embedding_api.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Embedding Model\n", + "# OpenAI APIs - Embedding\n", "\n", "SGLang supports embedding models in the same way as completion models. 
Here are some example models:\n", "\n", @@ -62,7 +62,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Use Curl" + "## Using cURL" ] }, { @@ -83,8 +83,6 @@ "text = \"Once upon a time\"\n", "\n", "curl_text = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n", "\n", "text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n", @@ -98,7 +96,37 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using OpenAI Compatible API" + "## Using OpenAI Compatible API w/ Requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "text = \"Once upon a time\"\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:30010/v1/embeddings\",\n", + " json={\n", + " \"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n", + " \"input\": text\n", + " }\n", + ")\n", + "\n", + "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n", + "\n", + "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using OpenAI Python Client" ] }, { @@ -160,8 +188,6 @@ "input_ids = tokenizer.encode(text)\n", "\n", "curl_ids = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n", "\n", "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n", @@ -173,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:48:01.875204Z", diff --git a/docs/index.rst b/docs/index.rst index 7d4935a8f..d73ce8ac1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,6 +25,7 @@ The core features include: backend/openai_api_completions.ipynb backend/openai_api_vision.ipynb + backend/openai_embedding_api.ipynb backend/native_api.ipynb backend/backend.md diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 9a2a8555b..99c22332f 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -22,12 +22,12 @@ "--port 30000 --host 0.0.0.0\n", "```\n", "\n", - "in your terminal and wait for the server to be ready." + "in your terminal and wait for the server to be ready. Once the server is running, you can send test requests using curl or requests. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:13.611212Z", @@ -59,14 +59,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Send a Request\n", - "\n", - "Once the server is up, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/)." 
+ "## Using cURL\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess, json\n", + "\n", + "curl_command = \"\"\"\n", + "curl -s http://localhost:30000/v1/chat/completions \\\n", + " -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'\n", + "\"\"\"\n", + "\n", + "response = json.loads(subprocess.check_output(curl_command, shell=True))\n", + "print_highlight(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using OpenAI Compatible API w/ Requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:42.813656Z", @@ -77,30 +99,20 @@ }, "outputs": [], "source": [ - "import subprocess\n", + "import requests\n", "\n", - "curl_command = \"\"\"\n", - "curl http://localhost:30000/v1/chat/completions \\\\\n", - " -H \"Content-Type: application/json\" \\\\\n", - " -H \"Authorization: Bearer None\" \\\\\n", - " -d '{\n", + "url = \"http://localhost:30000/v1/chat/completions\"\n", + "\n", + "data = {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": \"You are a helpful assistant.\"\n", - " },\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"What is an LLM? Tell me in one sentence.\"\n", - " }\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"What is a LLM?\"}\n", " ]\n", - " }'\n", - "\"\"\"\n", + "}\n", "\n", - "response = subprocess.check_output(curl_command, shell=True).decode()\n", - "\n", - "print_highlight(response)" + "response = requests.post(url, json=data)\n", + "print_highlight(response.json())" ] }, { @@ -109,12 +121,12 @@ "source": [ "## Using OpenAI Python Client\n", "\n", - "You can use the OpenAI Python API library to send requests." + "You can also use the OpenAI Python API library to send requests." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:51.439372Z", @@ -138,7 +150,6 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "\n", "print_highlight(response)" ] }, @@ -148,13 +159,12 @@ "source": [ "## Using Native Generation APIs\n", "\n", - "You can also use the native `/generate` endpoint. It provides more flexiblity.\n", - "An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)." + "You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -196,6 +206,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4,