diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 5f941973b..e556cd02c 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -10,12 +10,17 @@ on: workflow_dispatch: jobs: - execute-notebooks: + execute-and-deploy: runs-on: 1-gpu-runner if: github.repository == 'sgl-project/sglang' + defaults: + run: + working-directory: docs steps: - name: Checkout code uses: actions/checkout@v3 + with: + path: . - name: Set up Python uses: actions/setup-python@v4 @@ -25,7 +30,9 @@ jobs: - name: Install dependencies run: | - bash scripts/ci_install_dependency.sh + bash ../scripts/ci_install_dependency.sh - pip install -r docs/requirements.txt + pip install -r requirements.txt + apt-get update + apt-get install -y pandoc - name: Setup Jupyter Kernel run: | @@ -33,7 +40,6 @@ jobs: - name: Execute notebooks run: | - cd docs for nb in *.ipynb; do if [ -f "$nb" ]; then echo "Executing $nb" @@ -43,36 +49,15 @@ jobs: fi done - build-and-deploy: - needs: execute-notebooks - if: github.repository == 'sgl-project/sglang' - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - pip install -r docs/requirements.txt - apt-get update - apt-get install -y pandoc - - name: Build documentation run: | - cd docs make html - name: Push to sgl-project.github.io env: GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }} run: | - cd docs/_build/html + cd _build/html git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io cp -r * ../sgl-project.github.io cd ../sgl-project.github.io diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml index 48578bf73..ebc73bac1 100644 --- a/.github/workflows/execute-notebook.yml +++ b/.github/workflows/execute-notebook.yml @@ -1,12 +1,24 @@ name: Execute Notebooks on: - 
pull_request: push: - branches: - - main + branches: [ main ] + paths: + - "python/sglang/**" + - "docs/**" + pull_request: + branches: [ main ] + paths: + - "python/sglang/**" + - "docs/**" workflow_dispatch: + +concurrency: + group: execute-notebook-${{ github.ref }} + cancel-in-progress: true + + jobs: run-all-notebooks: runs-on: 1-gpu-runner diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62dcd455a..8c1de4d99 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,8 @@ repos: rev: 24.10.0 hooks: - id: black + additional_dependencies: ['.[jupyter]'] + types_or: [python, jupyter] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 diff --git a/docs/embedding_model.ipynb b/docs/embedding_model.ipynb index bfa44aa11..1e0b72d1d 100644 --- a/docs/embedding_model.ipynb +++ b/docs/embedding_model.ipynb @@ -4,19 +4,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Embedding Model" + "# Embedding Model\n", + "\n", + "SGLang supports embedding models in the same way as completion models. Here are some example models:\n", + "\n", + "- [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct)\n", + "- [Alibaba-NLP/gte-Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Launch A Server" + "## Launch A Server\n", + "\n", + "The following code is equivalent to running this in the shell:\n", + "```bash\n", + "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", + " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", + "```\n", + "\n", + "Remember to add `--is-embedding` to the command."
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -28,14 +41,14 @@ } ], "source": [ - "# Equivalent to running this in the shell:\n", - "# python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", "\n", - "embedding_process = execute_shell_command(\"\"\"\n", + "embedding_process = execute_shell_command(\n", + " \"\"\"\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n", " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", "\n", "wait_for_server(\"http://localhost:30010\")\n", "\n", @@ -51,25 +64,32 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n" + "Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n" ] } ], "source": [ - "# Get the first 10 elements of the embedding\n", + "import subprocess, json\n", "\n", - "! 
curl -s http://localhost:30010/v1/embeddings \\\n", + "text = \"Once upon a time\"\n", + "\n", + "curl_text = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", " -H \"Content-Type: application/json\" \\\n", " -H \"Authorization: Bearer None\" \\\n", - " -d '{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"Once upon a time\"}' \\\n", - " | python3 -c \"import sys, json; print(json.load(sys.stdin)['data'][0]['embedding'][:10])\"" + " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n", + "\n", + "text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n", + " \"embedding\"\n", + "]\n", + "\n", + "print(f\"Text embedding (first 10): {text_embedding[:10]}\")" ] }, { @@ -81,37 +101,79 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[0.00603485107421875, -0.0190582275390625, -0.01273345947265625, 0.01552581787109375, 0.0066680908203125, -0.0135955810546875, 0.01131439208984375, 0.0013713836669921875, -0.0089874267578125, 0.021759033203125]\n" + "Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n" ] } ], "source": [ "import openai\n", "\n", - "client = openai.Client(\n", - " base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\"\n", - ")\n", + "client = openai.Client(base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\")\n", "\n", "# Text embedding example\n", "response = client.embeddings.create(\n", " model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n", - " input=\"How are you today\",\n", + " input=text,\n", ")\n", "\n", "embedding = response.data[0].embedding[:10]\n", - "print(embedding)" + "print(f\"Text embedding (first 10): {embedding}\")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Input IDs\n", + "\n", + "SGLang also supports `input_ids` as input to get the embedding." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n" + ] + } + ], + "source": [ + "import json\n", + "import os\n", + "from transformers import AutoTokenizer\n", + "\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n", + "input_ids = tokenizer.encode(text)\n", + "\n", + "curl_ids = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n", + " -H \"Content-Type: application/json\" \\\n", + " -H \"Authorization: Bearer None\" \\\n", + " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n", + "\n", + "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n", + " 0\n", + "][\"embedding\"]\n", + "\n", + "print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/index.rst b/docs/index.rst index b21236e3d..ab54a3901 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,12 +16,14 @@ The core features include: :caption: Getting Started install.md + send_request.ipynb .. 
toctree:: :maxdepth: 1 :caption: Backend Tutorial + openai_api.ipynb backend.md @@ -43,3 +45,4 @@ The core features include: choices_methods.md benchmark_and_profiling.md troubleshooting.md + embedding_model.ipynb diff --git a/docs/openai_api.ipynb b/docs/openai_api.ipynb new file mode 100644 index 000000000..cc413238d --- /dev/null +++ b/docs/openai_api.ipynb @@ -0,0 +1,676 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OpenAI Compatible API\n", + "\n", + "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n", + "\n", + "- `chat/completions`\n", + "- `completions`\n", + "- `batches`\n", + "- `embeddings`(refer to [embedding_model.ipynb](embedding_model.ipynb))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Chat Completions\n", + "\n", + "### Usage\n", + "\n", + "Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to SGLang server with OpenAI API format." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Server is ready. Proceeding with the next steps.\n" + ] + } + ], + "source": [ + "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", + "\n", + "server_process = execute_shell_command(\n", + " \"\"\"\n", + "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", + "--port 30000 --host 0.0.0.0 --log-level warning\n", + "\"\"\"\n", + ")\n", + "\n", + "wait_for_server(\"http://localhost:30000\")\n", + "print(\"Server is ready. 
Proceeding with the next steps.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChatCompletion(id='e854540ec7914b2d8c712f16fd9ed2ca', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730012326, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n" + ] + } + ], + "source": [ + "import openai\n", + "\n", + "# Always assign an api_key, even if not specified during server initialization.\n", + "# Setting an API key during server initialization is strongly recommended.\n", + "\n", + "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", + "\n", + "# Chat completion example\n", + "\n", + "response = client.chat.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n", + " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", + " ],\n", + " temperature=0,\n", + " max_tokens=64,\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parameters\n", + "\n", + "The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n", + "\n", + "- `messages`: List of messages in the conversation, each 
containing `role` and `content`\n", + "- `model`: The model identifier to use for completion\n", + "- `max_tokens`: Maximum number of tokens to generate in the response\n", + "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n", + "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", + "- `n`: Number of chat completion choices to generate\n", + "- `stream`: If true, partial message deltas will be sent as they become available\n", + "- `stop`: Sequences where the API will stop generating further tokens\n", + "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", + "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", + "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", + "- `logprobs`: Include log probabilities of tokens in the response\n", + "- `top_logprobs`: Number of most likely tokens to return probabilities for\n", + "- `seed`: Random seed for deterministic results\n", + "- `response_format`: Specify the format of the response (e.g., JSON)\n", + "- `stream_options`: Additional options for streaming responses\n", + "- `user`: A unique identifier representing your end-user\n", + "\n", + "Here is an example of a detailed chat completion request:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ancient Rome's major achievements include:" + ] + } + ], + "source": [ + "response = client.chat.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n", + " },\n", + " {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n", + " {\n", + " \"role\": 
\"assistant\",\n", + " \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n", + " },\n", + " {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n", + " ],\n", + " temperature=0.3, # Lower temperature for more focused responses\n", + " max_tokens=100, # Reasonable length for a concise response\n", + " top_p=0.95, # Slightly higher for better fluency\n", + " stop=[\"\\n\\n\"], # Simple stop sequence\n", + " presence_penalty=0.2, # Mild penalty to avoid repetition\n", + " frequency_penalty=0.2, # Mild penalty for more natural language\n", + " n=1, # Single response is usually more stable\n", + " seed=42, # Keep for reproducibility\n", + " stream=True, # Keep streaming for real-time output\n", + ")\n", + "\n", + "for chunk in response:\n", + " print(chunk.choices[0].delta.content or \"\", end=\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Completions\n", + "\n", + "### Usage\n", + "\n", + "Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completion(id='a6e07198f4b445baa0fb08a2178ceb59', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. 
Brazil - Bras', matched_stop=None)], created=1730012328, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))\n" + ] + } + ], + "source": [ + "response = client.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " prompt=\"List 3 countries and their capitals.\",\n", + " temperature=0,\n", + " max_tokens=64,\n", + " n=1,\n", + " stop=None,\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parameters\n", + "\n", + "The completions API accepts the following parameters:\n", + "\n", + "- `model`: The model identifier to use for completion\n", + "- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n", + "- `best_of`: Number of completions to generate server-side and return the best one\n", + "- `echo`: If true, the prompt will be included in the response\n", + "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n", + "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n", + "- `logprobs`: Include log probabilities of tokens in the response\n", + "- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n", + "- `n`: Number of completion choices to generate\n", + "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n", + "- `seed`: Random seed for deterministic results\n", + "- `stop`: Sequences where the API will stop generating further tokens\n", + "- `stream`: If true, partial completion deltas will be sent as they become available\n", + "- `stream_options`: Additional options for streaming responses\n", + "- `suffix`: Text to append to the completion\n", + "- `temperature`: Controls randomness (0-2). 
Lower values make output more focused and deterministic\n", + "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n", + "- `user`: A unique identifier representing your end-user\n", + "\n", + "Here is an example of a detailed completions request:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Space explorer, Captain Orion Blackwood, had been traveling through the galaxy for 12 years, searching for a new home for humanity. His ship, the Aurora, had been his home for so long that he barely remembered what it was like to walk on solid ground.\n", + "As he navigated through the dense asteroid field, the ship's computer, S.A.R.A. (Self-Aware Reasoning Algorithm), alerted him to a strange reading on one of the asteroids. Captain Blackwood's curiosity was piqued, and he decided to investigate further.\n", + "\"Captain, I'm detecting unusual energy signatures emanating from the asteroid,\" S.A.R.A. said. 
\"It's unlike anything I've seen before.\"\n", + "Captain Blackwood's eyes narrowed as" + ] + } + ], + "source": [ + "response = client.completions.create(\n", + " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " prompt=\"Write a short story about a space explorer.\",\n", + " temperature=0.7, # Moderate temperature for creative writing\n", + " max_tokens=150, # Longer response for a story\n", + " top_p=0.9, # Balanced diversity in word choice\n", + " stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n", + " presence_penalty=0.3, # Encourage novel elements\n", + " frequency_penalty=0.3, # Reduce repetitive phrases\n", + " n=1, # Generate one completion\n", + " seed=123, # For reproducible results\n", + " stream=True, # Stream the response\n", + ")\n", + "\n", + "for chunk in response:\n", + " print(chunk.choices[0].text or \"\", end=\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Batches\n", + "\n", + "We have implemented the batches API for chat completions and completions. 
You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n", + "\n", + "The batches APIs are:\n", + "\n", + "- `batches`\n", + "- `batches/{batch_id}/cancel`\n", + "- `batches/{batch_id}`\n", + "\n", + "Here is an example of a batch job for chat completions, completions are similar.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batch job created with ID: batch_03d7f74f-dffe-4c26-b5e7-bb9fb5cb89ff\n" + ] + } + ], + "source": [ + "import json\n", + "import time\n", + "from openai import OpenAI\n", + "\n", + "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", + "\n", + "requests = [\n", + " {\n", + " \"custom_id\": \"request-1\",\n", + " \"method\": \"POST\",\n", + " \"url\": \"/chat/completions\",\n", + " \"body\": {\n", + " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"messages\": [\n", + " {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n", + " ],\n", + " \"max_tokens\": 50,\n", + " },\n", + " },\n", + " {\n", + " \"custom_id\": \"request-2\",\n", + " \"method\": \"POST\",\n", + " \"url\": \"/chat/completions\",\n", + " \"body\": {\n", + " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n", + " \"max_tokens\": 50,\n", + " },\n", + " },\n", + "]\n", + "\n", + "input_file_path = \"batch_requests.jsonl\"\n", + "\n", + "with open(input_file_path, \"w\") as f:\n", + " for req in requests:\n", + " f.write(json.dumps(req) + \"\\n\")\n", + "\n", + "with open(input_file_path, \"rb\") as f:\n", + " file_response = client.files.create(file=f, purpose=\"batch\")\n", + "\n", + "batch_response = client.batches.create(\n", + " input_file_id=file_response.id,\n", + " endpoint=\"/v1/chat/completions\",\n", + " 
completion_window=\"24h\",\n", + ")\n", + "\n", + "print(f\"Batch job created with ID: {batch_response.id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batch job status: validating...trying again in 3 seconds...\n", + "Batch job completed successfully!\n", + "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", + "\n", + "Request request-1:\n", + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n", + "\n", + "Request request-2:\n", + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n* **Web Development**: Building web applications, web services, and web scraping.\\n* **Data Science**: Data analysis'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n", + "\n", + "Cleaning up files...\n" + ] + } + ], + "source": [ + "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", + " time.sleep(3)\n", + " print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n", + " 
batch_response = client.batches.retrieve(batch_response.id)\n", + "\n", + "if batch_response.status == \"completed\":\n", + " print(\"Batch job completed successfully!\")\n", + " print(f\"Request counts: {batch_response.request_counts}\")\n", + "\n", + " result_file_id = batch_response.output_file_id\n", + " file_response = client.files.content(result_file_id)\n", + " result_content = file_response.read().decode(\"utf-8\")\n", + "\n", + " results = [\n", + " json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n", + " ]\n", + "\n", + " for result in results:\n", + " print(f\"\\nRequest {result['custom_id']}:\")\n", + " print(f\"Response: {result['response']}\")\n", + "\n", + " print(\"\\nCleaning up files...\")\n", + " # Only delete the result file ID since file_response is just content\n", + " client.files.delete(result_file_id)\n", + "else:\n", + " print(f\"Batch job failed with status: {batch_response.status}\")\n", + " if hasattr(batch_response, \"errors\"):\n", + " print(f\"Errors: {batch_response.errors}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n", + "\n", + "1. `batches/{batch_id}`: Retrieve the batch job status.\n", + "2. `batches/{batch_id}/cancel`: Cancel the batch job.\n", + "\n", + "Here is an example to check the batch job status." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created batch job with ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n", + "Initial status: validating\n", + "Batch job details (check 1/5):\n", + "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n", + "Status: in_progress\n", + "Created at: 1730012334\n", + "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n", + "Output file ID: None\n", + "Request counts:\n", + "Total: 0\n", + "Completed: 0\n", + "Failed: 0\n", + "Batch job details (check 2/5):\n", + "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n", + "Status: in_progress\n", + "Created at: 1730012334\n", + "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n", + "Output file ID: None\n", + "Request counts:\n", + "Total: 0\n", + "Completed: 0\n", + "Failed: 0\n", + "Batch job details (check 3/5):\n", + "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n", + "Status: in_progress\n", + "Created at: 1730012334\n", + "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n", + "Output file ID: None\n", + "Request counts:\n", + "Total: 0\n", + "Completed: 0\n", + "Failed: 0\n", + "Batch job details (check 4/5):\n", + "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n", + "Status: completed\n", + "Created at: 1730012334\n", + "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n", + "Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n", + "Request counts:\n", + "Total: 100\n", + "Completed: 100\n", + "Failed: 0\n", + "Batch job details (check 5/5):\n", + "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n", + "Status: completed\n", + "Created at: 1730012334\n", + "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n", + "Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n", + "Request counts:\n", + "Total: 100\n", + 
"Completed: 100\n", + "Failed: 0\n" + ] + } + ], + "source": [ + "import json\n", + "import time\n", + "from openai import OpenAI\n", + "\n", + "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", + "\n", + "requests = []\n", + "for i in range(100):\n", + " requests.append(\n", + " {\n", + " \"custom_id\": f\"request-{i}\",\n", + " \"method\": \"POST\",\n", + " \"url\": \"/chat/completions\",\n", + " \"body\": {\n", + " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"{i}: You are a helpful AI assistant\",\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Write a detailed story about topic. Make it very long.\",\n", + " },\n", + " ],\n", + " \"max_tokens\": 500,\n", + " },\n", + " }\n", + " )\n", + "\n", + "input_file_path = \"batch_requests.jsonl\"\n", + "with open(input_file_path, \"w\") as f:\n", + " for req in requests:\n", + " f.write(json.dumps(req) + \"\\n\")\n", + "\n", + "with open(input_file_path, \"rb\") as f:\n", + " uploaded_file = client.files.create(file=f, purpose=\"batch\")\n", + "\n", + "batch_job = client.batches.create(\n", + " input_file_id=uploaded_file.id,\n", + " endpoint=\"/v1/chat/completions\",\n", + " completion_window=\"24h\",\n", + ")\n", + "\n", + "print(f\"Created batch job with ID: {batch_job.id}\")\n", + "print(f\"Initial status: {batch_job.status}\")\n", + "\n", + "time.sleep(10)\n", + "\n", + "max_checks = 5\n", + "for i in range(max_checks):\n", + " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", + " print(f\"Batch job details (check {i+1}/{max_checks}):\")\n", + " print(f\"ID: {batch_details.id}\")\n", + " print(f\"Status: {batch_details.status}\")\n", + " print(f\"Created at: {batch_details.created_at}\")\n", + " print(f\"Input file ID: {batch_details.input_file_id}\")\n", + " print(f\"Output file ID: {batch_details.output_file_id}\")\n", + "\n", + " print(\"Request 
counts:\")\n", + " print(f\"Total: {batch_details.request_counts.total}\")\n", + " print(f\"Completed: {batch_details.request_counts.completed}\")\n", + " print(f\"Failed: {batch_details.request_counts.failed}\")\n", + "\n", + " time.sleep(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is an example to cancel a batch job." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created batch job with ID: batch_3d2dd881-ad84-465a-85ee-6d5991794e5e\n", + "Initial status: validating\n", + "Cancellation initiated. Status: cancelling\n", + "Current status: cancelled\n", + "Batch job successfully cancelled\n", + "Successfully cleaned up input file\n" + ] + } + ], + "source": [ + "import json\n", + "import time\n", + "from openai import OpenAI\n", + "\n", + "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", + "\n", + "requests = []\n", + "for i in range(500):\n", + " requests.append(\n", + " {\n", + " \"custom_id\": f\"request-{i}\",\n", + " \"method\": \"POST\",\n", + " \"url\": \"/chat/completions\",\n", + " \"body\": {\n", + " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": f\"{i}: You are a helpful AI assistant\",\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Write a detailed story about topic. 
Make it very long.\",\n", + " },\n", + " ],\n", + " \"max_tokens\": 500,\n", + " },\n", + " }\n", + " )\n", + "\n", + "input_file_path = \"batch_requests.jsonl\"\n", + "with open(input_file_path, \"w\") as f:\n", + " for req in requests:\n", + " f.write(json.dumps(req) + \"\\n\")\n", + "\n", + "with open(input_file_path, \"rb\") as f:\n", + " uploaded_file = client.files.create(file=f, purpose=\"batch\")\n", + "\n", + "batch_job = client.batches.create(\n", + " input_file_id=uploaded_file.id,\n", + " endpoint=\"/v1/chat/completions\",\n", + " completion_window=\"24h\",\n", + ")\n", + "\n", + "print(f\"Created batch job with ID: {batch_job.id}\")\n", + "print(f\"Initial status: {batch_job.status}\")\n", + "\n", + "time.sleep(10)\n", + "\n", + "try:\n", + " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", + " print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n", + " assert cancelled_job.status == \"cancelling\"\n", + "\n", + " # Monitor the cancellation process\n", + " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", + " time.sleep(3)\n", + " cancelled_job = client.batches.retrieve(batch_job.id)\n", + " print(f\"Current status: {cancelled_job.status}\")\n", + "\n", + " # Verify final status\n", + " assert cancelled_job.status == \"cancelled\"\n", + " print(\"Batch job successfully cancelled\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error during cancellation: {e}\")\n", + " raise e\n", + "\n", + "finally:\n", + " try:\n", + " del_response = client.files.delete(uploaded_file.id)\n", + " if del_response.deleted:\n", + " print(\"Successfully cleaned up input file\")\n", + " except Exception as e:\n", + " print(f\"Error cleaning up: {e}\")\n", + " raise e" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AlphaMeemory", + "language": "python", + 
"name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/send_request.ipynb b/docs/send_request.ipynb index 8062ae004..c0172e459 100644 --- a/docs/send_request.ipynb +++ b/docs/send_request.ipynb @@ -4,7 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Quick Start" + "# Quick Start: Launch A Server and Send Requests\n", + "\n", + "This section provides a quick start guide to using SGLang after installation." ] }, { @@ -13,12 +15,13 @@ "source": [ "## Launch a server\n", "\n", - "This code uses `subprocess.Popen` to start an SGLang server process, equivalent to executing \n", + "This code block is equivalent to executing \n", "\n", "```bash\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0 --log-level warning\n", "```\n", + "\n", "in your command line and wait for the server to be ready." ] }, @@ -39,10 +42,12 @@ "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n", "\n", "\n", - "server_process = execute_shell_command(\"\"\"\n", + "server_process = execute_shell_command(\n", + " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0 --log-level warning\n", - "\"\"\")\n", + "\"\"\"\n", + ")\n", "\n", "wait_for_server(\"http://localhost:30000\")\n", "print(\"Server is ready. 
Proceeding with the next steps.\")" @@ -105,9 +110,7 @@ "# Always assign an api_key, even if not specified during server initialization.\n", "# Setting an API key during server initialization is strongly recommended.\n", "\n", - "client = openai.Client(\n", - " base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\"\n", - ")\n", + "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "\n", "# Chat completion example\n", "\n",