From 3b60558dd79e1f4aeadc34ed5dbae45cb75e5a00 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sat, 2 Nov 2024 01:02:17 -0700 Subject: [PATCH] Native api (#1886) Co-authored-by: Chayenne --- .github/workflows/release-docs.yml | 4 + docs/Makefile | 2 +- docs/backend/native_api.ipynb | 4 +- docs/backend/openai_api_completions.ipynb | 36 +++++--- docs/backend/openai_api_vision.ipynb | 77 ++++++++++++++--- ...model.ipynb => openai_embedding_api.ipynb} | 42 ++++++++-- docs/index.rst | 1 + docs/start/send_request.ipynb | 84 ++++++++++++------- 8 files changed, 184 insertions(+), 66 deletions(-) rename docs/backend/{embedding_model.ipynb => openai_embedding_api.ipynb} (87%) diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 7abcf5768..bca6df5ca 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -9,6 +9,10 @@ on: - 'python/sglang/version.py' workflow_dispatch: +concurrency: + group: execute-notebook-${{ github.ref }} + cancel-in-progress: true + jobs: execute-and-deploy: runs-on: 1-gpu-runner diff --git a/docs/Makefile b/docs/Makefile index b439c4fe2..51446dc38 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,7 +1,7 @@ # Minimal makefile for Sphinx documentation # -# You can set these variables from the command line, and also +# You can set these variables from the terminal, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 65cbbab18..57ffa14af 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Native Server API\n", + "# Native API\n", "\n", "Apart from the OpenAI compatible API, the SGLang Runtime also provides its native server API. We introduce these following API:\n", "\n", @@ -254,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 13ea0acdb..2f4b988d5 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:16.624550Z", @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:18.090228Z", @@ -151,12 +151,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Streaming mode is also supported" + "Streaming mode is also supported." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:21.195226Z", @@ -190,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:21.676813Z", @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:23.186337Z", @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:26.772016Z", @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:26.796422Z", @@ -389,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:29.812339Z", @@ -472,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:45:54.854018Z", @@ -567,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:07.896114Z", @@ -587,6 +587,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 6b006606b..4707a9e65 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -63,21 +63,19 @@ "source": [ "## Using cURL\n", "\n", - "Once the server is up, you can send test requests using curl." + "Once the server is up, you can send test requests using curl or requests." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import subprocess\n", "\n", "curl_command = \"\"\"\n", - "curl http://localhost:30010/v1/chat/completions \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", + "curl -s http://localhost:30010/v1/chat/completions \\\n", " -d '{\n", " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", " \"messages\": [\n", @@ -109,14 +107,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using OpenAI Python Client\n", - "\n", - "You can use the OpenAI Python API library to send requests." 
+ "## Using OpenAI Compatible API w/ Requests" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "url = \"http://localhost:30010/v1/chat/completions\"\n", + "\n", + "data = {\n", + " \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": \"What’s in this image?\"\n", + " },\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " \"max_tokens\": 300\n", + "}\n", + "\n", + "response = requests.post(url, json=data)\n", + "print_highlight(response.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using OpenAI Python Client\n", + "\n", + "Also, you can use the OpenAI Python API library to send requests." + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -202,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,6 +274,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/embedding_model.ipynb b/docs/backend/openai_embedding_api.ipynb similarity index 87% rename from docs/backend/embedding_model.ipynb rename to docs/backend/openai_embedding_api.ipynb index 45928587b..356a57121 100644 --- a/docs/backend/embedding_model.ipynb +++ b/docs/backend/openai_embedding_api.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Embedding Model\n", + "# OpenAI APIs - Embedding\n", "\n", "SGLang supports embedding models in the same way as completion models. 
Here are some example models:\n", "\n", @@ -62,7 +62,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Use Curl" + "## Using cURL" ] }, { @@ -83,8 +83,6 @@ "text = \"Once upon a time\"\n", "\n", "curl_text = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n", "\n", "text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n", @@ -98,7 +96,37 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using OpenAI Compatible API" + "## Using OpenAI Compatible API w/ Requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "text = \"Once upon a time\"\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:30010/v1/embeddings\",\n", + " json={\n", + " \"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n", + " \"input\": text\n", + " }\n", + ")\n", + "\n", + "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n", + "\n", + "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using OpenAI Python Client" ] }, { @@ -160,8 +188,6 @@ "input_ids = tokenizer.encode(text)\n", "\n", "curl_ids = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", - " -H \"Content-Type: application/json\" \\\n", - " -H \"Authorization: Bearer None\" \\\n", " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n", "\n", "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n", @@ -173,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:48:01.875204Z", diff --git a/docs/index.rst b/docs/index.rst index 7d4935a8f..d73ce8ac1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,6 +25,7 @@ The core features include: backend/openai_api_completions.ipynb backend/openai_api_vision.ipynb + backend/openai_embedding_api.ipynb backend/native_api.ipynb backend/backend.md diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 9a2a8555b..99c22332f 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -22,12 +22,12 @@ "--port 30000 --host 0.0.0.0\n", "```\n", "\n", - "in your terminal and wait for the server to be ready." + "in your terminal and wait for the server to be ready. Once the server is running, you can send test requests using curl or requests. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:13.611212Z", @@ -59,14 +59,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Send a Request\n", - "\n", - "Once the server is up, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/)." 
+ "## Using cURL\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess, json\n", + "\n", + "curl_command = \"\"\"\n", + "curl -s http://localhost:30000/v1/chat/completions \\\n", + " -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'\n", + "\"\"\"\n", + "\n", + "response = json.loads(subprocess.check_output(curl_command, shell=True))\n", + "print_highlight(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using OpenAI Compatible API w/ Requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:42.813656Z", @@ -77,30 +99,20 @@ }, "outputs": [], "source": [ - "import subprocess\n", + "import requests\n", "\n", - "curl_command = \"\"\"\n", - "curl http://localhost:30000/v1/chat/completions \\\\\n", - " -H \"Content-Type: application/json\" \\\\\n", - " -H \"Authorization: Bearer None\" \\\\\n", - " -d '{\n", + "url = \"http://localhost:30000/v1/chat/completions\"\n", + "\n", + "data = {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": \"You are a helpful assistant.\"\n", - " },\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"What is an LLM? Tell me in one sentence.\"\n", - " }\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"What is a LLM?\"}\n", " ]\n", - " }'\n", - "\"\"\"\n", + "}\n", "\n", - "response = subprocess.check_output(curl_command, shell=True).decode()\n", - "\n", - "print_highlight(response)" + "response = requests.post(url, json=data)\n", + "print_highlight(response.json())" ] }, { @@ -109,12 +121,12 @@ "source": [ "## Using OpenAI Python Client\n", "\n", - "You can use the OpenAI Python API library to send requests." + "You can also use the OpenAI Python API library to send requests." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:51.439372Z", @@ -138,7 +150,6 @@ " temperature=0,\n", " max_tokens=64,\n", ")\n", - "\n", "print_highlight(response)" ] }, @@ -148,13 +159,12 @@ "source": [ "## Using Native Generation APIs\n", "\n", - "You can also use the native `/generate` endpoint. It provides more flexiblity.\n", - "An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)." + "You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -196,6 +206,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4,