Add OpenAI compatible API (#1810)
Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
This commit is contained in:
35
.github/workflows/deploy-docs.yml
vendored
35
.github/workflows/deploy-docs.yml
vendored
@@ -10,12 +10,17 @@ on:
|
|||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
execute-notebooks:
|
execute-and-deploy:
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
if: github.repository == 'sgl-project/sglang'
|
if: github.repository == 'sgl-project/sglang'
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: docs
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
path: .
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
@@ -25,7 +30,9 @@ jobs:
|
|||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
pip install -r docs/requirements.txt
|
pip install -r requirements.txt
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y pandoc
|
||||||
|
|
||||||
- name: Setup Jupyter Kernel
|
- name: Setup Jupyter Kernel
|
||||||
run: |
|
run: |
|
||||||
@@ -33,7 +40,6 @@ jobs:
|
|||||||
|
|
||||||
- name: Execute notebooks
|
- name: Execute notebooks
|
||||||
run: |
|
run: |
|
||||||
cd docs
|
|
||||||
for nb in *.ipynb; do
|
for nb in *.ipynb; do
|
||||||
if [ -f "$nb" ]; then
|
if [ -f "$nb" ]; then
|
||||||
echo "Executing $nb"
|
echo "Executing $nb"
|
||||||
@@ -43,36 +49,15 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
build-and-deploy:
|
|
||||||
needs: execute-notebooks
|
|
||||||
if: github.repository == 'sgl-project/sglang'
|
|
||||||
runs-on: 1-gpu-runner
|
|
||||||
steps:
|
|
||||||
- name: Checkout code
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: Set up Python
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: '3.9'
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
bash scripts/ci_install_dependency.sh
|
|
||||||
pip install -r docs/requirements.txt
|
|
||||||
apt-get update
|
|
||||||
apt-get install -y pandoc
|
|
||||||
|
|
||||||
- name: Build documentation
|
- name: Build documentation
|
||||||
run: |
|
run: |
|
||||||
cd docs
|
|
||||||
make html
|
make html
|
||||||
|
|
||||||
- name: Push to sgl-project.github.io
|
- name: Push to sgl-project.github.io
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
cd docs/_build/html
|
cd _build/html
|
||||||
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io
|
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io
|
||||||
cp -r * ../sgl-project.github.io
|
cp -r * ../sgl-project.github.io
|
||||||
cd ../sgl-project.github.io
|
cd ../sgl-project.github.io
|
||||||
|
|||||||
18
.github/workflows/execute-notebook.yml
vendored
18
.github/workflows/execute-notebook.yml
vendored
@@ -1,12 +1,24 @@
|
|||||||
name: Execute Notebooks
|
name: Execute Notebooks
|
||||||
|
|
||||||
on:
|
on:
|
||||||
pull_request:
|
|
||||||
push:
|
push:
|
||||||
branches:
|
branches: [ main ]
|
||||||
- main
|
paths:
|
||||||
|
- "python/sglang/**"
|
||||||
|
- "docs/**"
|
||||||
|
pull_request:
|
||||||
|
branches: [ main ]
|
||||||
|
paths:
|
||||||
|
- "python/sglang/**"
|
||||||
|
- "docs/**"
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: execute-notebook-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
run-all-notebooks:
|
run-all-notebooks:
|
||||||
runs-on: 1-gpu-runner
|
runs-on: 1-gpu-runner
|
||||||
|
|||||||
@@ -10,6 +10,9 @@ repos:
|
|||||||
rev: 24.10.0
|
rev: 24.10.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: black
|
- id: black
|
||||||
|
additional_dependencies: ['.[jupyter]']
|
||||||
|
types: [python, jupyter]
|
||||||
|
types_or: [python, jupyter]
|
||||||
|
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
rev: v5.0.0
|
rev: v5.0.0
|
||||||
|
|||||||
@@ -4,19 +4,32 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Embedding Model"
|
"# Embedding Model\n",
|
||||||
|
"\n",
|
||||||
|
"SGLang supports embedding models in the same way as completion models. Here are some example models:\n",
|
||||||
|
"\n",
|
||||||
|
"- [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct)\n",
|
||||||
|
"- [Alibaba-NLP/gte-Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Launch A Server"
|
"## Launch A Server\n",
|
||||||
|
"\n",
|
||||||
|
"The following code is equivalent to running this in the shell:\n",
|
||||||
|
"```bash\n",
|
||||||
|
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
|
||||||
|
" --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Remember to add `--is-embedding` to the command."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -28,14 +41,14 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Equivalent to running this in the shell:\n",
|
|
||||||
"# python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
|
||||||
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embedding_process = execute_shell_command(\"\"\"\n",
|
"embedding_process = execute_shell_command(\n",
|
||||||
|
" \"\"\"\n",
|
||||||
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
|
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
|
||||||
" --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
" --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
||||||
"\"\"\")\n",
|
"\"\"\"\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"wait_for_server(\"http://localhost:30010\")\n",
|
"wait_for_server(\"http://localhost:30010\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -51,25 +64,32 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"[0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n"
|
"Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Get the first 10 elements of the embedding\n",
|
"import subprocess, json\n",
|
||||||
"\n",
|
"\n",
|
||||||
"! curl -s http://localhost:30010/v1/embeddings \\\n",
|
"text = \"Once upon a time\"\n",
|
||||||
|
"\n",
|
||||||
|
"curl_text = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n",
|
||||||
" -H \"Content-Type: application/json\" \\\n",
|
" -H \"Content-Type: application/json\" \\\n",
|
||||||
" -H \"Authorization: Bearer None\" \\\n",
|
" -H \"Authorization: Bearer None\" \\\n",
|
||||||
" -d '{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"Once upon a time\"}' \\\n",
|
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
|
||||||
" | python3 -c \"import sys, json; print(json.load(sys.stdin)['data'][0]['embedding'][:10])\""
|
"\n",
|
||||||
|
"text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n",
|
||||||
|
" \"embedding\"\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Text embedding (first 10): {text_embedding[:10]}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -81,37 +101,79 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"[0.00603485107421875, -0.0190582275390625, -0.01273345947265625, 0.01552581787109375, 0.0066680908203125, -0.0135955810546875, 0.01131439208984375, 0.0013713836669921875, -0.0089874267578125, 0.021759033203125]\n"
|
"Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import openai\n",
|
"import openai\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = openai.Client(\n",
|
"client = openai.Client(base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\")\n",
|
||||||
" base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\"\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Text embedding example\n",
|
"# Text embedding example\n",
|
||||||
"response = client.embeddings.create(\n",
|
"response = client.embeddings.create(\n",
|
||||||
" model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
|
" model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
|
||||||
" input=\"How are you today\",\n",
|
" input=text,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embedding = response.data[0].embedding[:10]\n",
|
"embedding = response.data[0].embedding[:10]\n",
|
||||||
"print(embedding)"
|
"print(f\"Text embedding (first 10): {embedding}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Using Input IDs\n",
|
||||||
|
"\n",
|
||||||
|
"SGLang also supports `input_ids` as input to get the embedding."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import os\n",
|
||||||
|
"from transformers import AutoTokenizer\n",
|
||||||
|
"\n",
|
||||||
|
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||||
|
"\n",
|
||||||
|
"tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n",
|
||||||
|
"input_ids = tokenizer.encode(text)\n",
|
||||||
|
"\n",
|
||||||
|
"curl_ids = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n",
|
||||||
|
" -H \"Content-Type: application/json\" \\\n",
|
||||||
|
" -H \"Authorization: Bearer None\" \\\n",
|
||||||
|
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
|
||||||
|
" 0\n",
|
||||||
|
"][\"embedding\"]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
|||||||
@@ -16,12 +16,14 @@ The core features include:
|
|||||||
:caption: Getting Started
|
:caption: Getting Started
|
||||||
|
|
||||||
install.md
|
install.md
|
||||||
|
send_request.ipynb
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:caption: Backend Tutorial
|
:caption: Backend Tutorial
|
||||||
|
|
||||||
|
openai_api.ipynb
|
||||||
backend.md
|
backend.md
|
||||||
|
|
||||||
|
|
||||||
@@ -43,3 +45,4 @@ The core features include:
|
|||||||
choices_methods.md
|
choices_methods.md
|
||||||
benchmark_and_profiling.md
|
benchmark_and_profiling.md
|
||||||
troubleshooting.md
|
troubleshooting.md
|
||||||
|
embedding_model.ipynb
|
||||||
|
|||||||
676
docs/openai_api.ipynb
Normal file
676
docs/openai_api.ipynb
Normal file
@@ -0,0 +1,676 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# OpenAI Compatible API\n",
|
||||||
|
"\n",
|
||||||
|
"SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n",
|
||||||
|
"\n",
|
||||||
|
"- `chat/completions`\n",
|
||||||
|
"- `completions`\n",
|
||||||
|
"- `batches`\n",
|
||||||
|
"- `embeddings` (refer to [embedding_model.ipynb](embedding_model.ipynb))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Chat Completions\n",
|
||||||
|
"\n",
|
||||||
|
"### Usage\n",
|
||||||
|
"\n",
|
||||||
|
"Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to the SGLang server using the OpenAI API format."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Server is ready. Proceeding with the next steps.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
||||||
|
"\n",
|
||||||
|
"server_process = execute_shell_command(\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||||
|
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"wait_for_server(\"http://localhost:30000\")\n",
|
||||||
|
"print(\"Server is ready. Proceeding with the next steps.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 39,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"ChatCompletion(id='e854540ec7914b2d8c712f16fd9ed2ca', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730012326, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import openai\n",
|
||||||
|
"\n",
|
||||||
|
"# Always assign an api_key, even if not specified during server initialization.\n",
|
||||||
|
"# Setting an API key during server initialization is strongly recommended.\n",
|
||||||
|
"\n",
|
||||||
|
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Chat completion example\n",
|
||||||
|
"\n",
|
||||||
|
"response = client.chat.completions.create(\n",
|
||||||
|
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" messages=[\n",
|
||||||
|
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
|
||||||
|
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||||
|
" ],\n",
|
||||||
|
" temperature=0,\n",
|
||||||
|
" max_tokens=64,\n",
|
||||||
|
")\n",
|
||||||
|
"print(response)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Parameters\n",
|
||||||
|
"\n",
|
||||||
|
"The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n",
|
||||||
|
"\n",
|
||||||
|
"- `messages`: List of messages in the conversation, each containing `role` and `content`\n",
|
||||||
|
"- `model`: The model identifier to use for completion\n",
|
||||||
|
"- `max_tokens`: Maximum number of tokens to generate in the response\n",
|
||||||
|
"- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
|
||||||
|
"- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
|
||||||
|
"- `n`: Number of chat completion choices to generate\n",
|
||||||
|
"- `stream`: If true, partial message deltas will be sent as they become available\n",
|
||||||
|
"- `stop`: Sequences where the API will stop generating further tokens\n",
|
||||||
|
"- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
|
||||||
|
"- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
|
||||||
|
"- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
|
||||||
|
"- `logprobs`: Include log probabilities of tokens in the response\n",
|
||||||
|
"- `top_logprobs`: Number of most likely tokens to return probabilities for\n",
|
||||||
|
"- `seed`: Random seed for deterministic results\n",
|
||||||
|
"- `response_format`: Specify the format of the response (e.g., JSON)\n",
|
||||||
|
"- `stream_options`: Additional options for streaming responses\n",
|
||||||
|
"- `user`: A unique identifier representing your end-user\n",
|
||||||
|
"\n",
|
||||||
|
"Here is an example of a detailed chat completion request:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 40,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Ancient Rome's major achievements include:"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"response = client.chat.completions.create(\n",
|
||||||
|
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" messages=[\n",
|
||||||
|
" {\n",
|
||||||
|
" \"role\": \"system\",\n",
|
||||||
|
" \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
|
||||||
|
" },\n",
|
||||||
|
" {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
|
||||||
|
" {\n",
|
||||||
|
" \"role\": \"assistant\",\n",
|
||||||
|
" \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
|
||||||
|
" },\n",
|
||||||
|
" {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
|
||||||
|
" ],\n",
|
||||||
|
" temperature=0.3, # Lower temperature for more focused responses\n",
|
||||||
|
" max_tokens=100, # Reasonable length for a concise response\n",
|
||||||
|
" top_p=0.95, # Slightly higher for better fluency\n",
|
||||||
|
" stop=[\"\\n\\n\"], # Simple stop sequence\n",
|
||||||
|
" presence_penalty=0.2, # Mild penalty to avoid repetition\n",
|
||||||
|
" frequency_penalty=0.2, # Mild penalty for more natural language\n",
|
||||||
|
" n=1, # Single response is usually more stable\n",
|
||||||
|
" seed=42, # Keep for reproducibility\n",
|
||||||
|
" stream=True, # Keep streaming for real-time output\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in response:\n",
|
||||||
|
" print(chunk.choices[0].delta.content or \"\", end=\"\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Completions\n",
|
||||||
|
"\n",
|
||||||
|
"### Usage\n",
|
||||||
|
"\n",
|
||||||
|
"The Completions API is similar to the Chat Completions API, but it takes a `prompt` parameter instead of `messages`. Refer to the [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 41,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Completion(id='a6e07198f4b445baa0fb08a2178ceb59', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730012328, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"response = client.completions.create(\n",
|
||||||
|
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" prompt=\"List 3 countries and their capitals.\",\n",
|
||||||
|
" temperature=0,\n",
|
||||||
|
" max_tokens=64,\n",
|
||||||
|
" n=1,\n",
|
||||||
|
" stop=None,\n",
|
||||||
|
")\n",
|
||||||
|
"print(response)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Parameters\n",
|
||||||
|
"\n",
|
||||||
|
"The completions API accepts the following parameters:\n",
|
||||||
|
"\n",
|
||||||
|
"- `model`: The model identifier to use for completion\n",
|
||||||
|
"- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n",
|
||||||
|
"- `best_of`: Number of completions to generate server-side; only the best one is returned\n",
|
||||||
|
"- `echo`: If true, the prompt will be included in the response\n",
|
||||||
|
"- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
|
||||||
|
"- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
|
||||||
|
"- `logprobs`: Include log probabilities of tokens in the response\n",
|
||||||
|
"- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n",
|
||||||
|
"- `n`: Number of completion choices to generate\n",
|
||||||
|
"- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
|
||||||
|
"- `seed`: Random seed for deterministic results\n",
|
||||||
|
"- `stop`: Sequences where the API will stop generating further tokens\n",
|
||||||
|
"- `stream`: If true, partial completion deltas will be sent as they become available\n",
|
||||||
|
"- `stream_options`: Additional options for streaming responses\n",
|
||||||
|
"- `suffix`: Text to append to the completion\n",
|
||||||
|
"- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
|
||||||
|
"- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
|
||||||
|
"- `user`: A unique identifier representing your end-user\n",
|
||||||
|
"\n",
|
||||||
|
"Here is an example of a detailed completions request:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Space explorer, Captain Orion Blackwood, had been traveling through the galaxy for 12 years, searching for a new home for humanity. His ship, the Aurora, had been his home for so long that he barely remembered what it was like to walk on solid ground.\n",
|
||||||
|
"As he navigated through the dense asteroid field, the ship's computer, S.A.R.A. (Self-Aware Reasoning Algorithm), alerted him to a strange reading on one of the asteroids. Captain Blackwood's curiosity was piqued, and he decided to investigate further.\n",
|
||||||
|
"\"Captain, I'm detecting unusual energy signatures emanating from the asteroid,\" S.A.R.A. said. \"It's unlike anything I've seen before.\"\n",
|
||||||
|
"Captain Blackwood's eyes narrowed as"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"response = client.completions.create(\n",
|
||||||
|
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" prompt=\"Write a short story about a space explorer.\",\n",
|
||||||
|
" temperature=0.7, # Moderate temperature for creative writing\n",
|
||||||
|
" max_tokens=150, # Longer response for a story\n",
|
||||||
|
" top_p=0.9, # Balanced diversity in word choice\n",
|
||||||
|
" stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n",
|
||||||
|
" presence_penalty=0.3, # Encourage novel elements\n",
|
||||||
|
" frequency_penalty=0.3, # Reduce repetitive phrases\n",
|
||||||
|
" n=1, # Generate one completion\n",
|
||||||
|
" seed=123, # For reproducible results\n",
|
||||||
|
" stream=True, # Stream the response\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in response:\n",
|
||||||
|
" print(chunk.choices[0].text or \"\", end=\"\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Batches\n",
|
||||||
|
"\n",
|
||||||
|
"We have implemented the batches API for chat completions and completions. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n",
|
||||||
|
"\n",
|
||||||
|
"The batches APIs are:\n",
|
||||||
|
"\n",
|
||||||
|
"- `batches`\n",
|
||||||
|
"- `batches/{batch_id}/cancel`\n",
|
||||||
|
"- `batches/{batch_id}`\n",
|
||||||
|
"\n",
|
||||||
|
"Here is an example of a batch job for chat completions; batch jobs for completions are similar.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Batch job created with ID: batch_03d7f74f-dffe-4c26-b5e7-bb9fb5cb89ff\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import time\n",
|
||||||
|
"from openai import OpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||||
|
"\n",
|
||||||
|
"requests = [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"custom_id\": \"request-1\",\n",
|
||||||
|
" \"method\": \"POST\",\n",
|
||||||
|
" \"url\": \"/chat/completions\",\n",
|
||||||
|
" \"body\": {\n",
|
||||||
|
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" \"messages\": [\n",
|
||||||
|
" {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"max_tokens\": 50,\n",
|
||||||
|
" },\n",
|
||||||
|
" },\n",
|
||||||
|
" {\n",
|
||||||
|
" \"custom_id\": \"request-2\",\n",
|
||||||
|
" \"method\": \"POST\",\n",
|
||||||
|
" \"url\": \"/chat/completions\",\n",
|
||||||
|
" \"body\": {\n",
|
||||||
|
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n",
|
||||||
|
" \"max_tokens\": 50,\n",
|
||||||
|
" },\n",
|
||||||
|
" },\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"input_file_path = \"batch_requests.jsonl\"\n",
|
||||||
|
"\n",
|
||||||
|
"with open(input_file_path, \"w\") as f:\n",
|
||||||
|
" for req in requests:\n",
|
||||||
|
" f.write(json.dumps(req) + \"\\n\")\n",
|
||||||
|
"\n",
|
||||||
|
"with open(input_file_path, \"rb\") as f:\n",
|
||||||
|
" file_response = client.files.create(file=f, purpose=\"batch\")\n",
|
||||||
|
"\n",
|
||||||
|
"batch_response = client.batches.create(\n",
|
||||||
|
" input_file_id=file_response.id,\n",
|
||||||
|
" endpoint=\"/v1/chat/completions\",\n",
|
||||||
|
" completion_window=\"24h\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Batch job created with ID: {batch_response.id}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 44,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Batch job status: validating...trying again in 3 seconds...\n",
|
||||||
|
"Batch job completed successfully!\n",
|
||||||
|
"Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
|
||||||
|
"\n",
|
||||||
|
"Request request-1:\n",
|
||||||
|
"Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n",
|
||||||
|
"\n",
|
||||||
|
"Request request-2:\n",
|
||||||
|
"Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n* **Web Development**: Building web applications, web services, and web scraping.\\n* **Data Science**: Data analysis'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n",
|
||||||
|
"\n",
|
||||||
|
"Cleaning up files...\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n",
|
||||||
|
" time.sleep(3)\n",
|
||||||
|
" print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n",
|
||||||
|
" batch_response = client.batches.retrieve(batch_response.id)\n",
|
||||||
|
"\n",
|
||||||
|
"if batch_response.status == \"completed\":\n",
|
||||||
|
" print(\"Batch job completed successfully!\")\n",
|
||||||
|
" print(f\"Request counts: {batch_response.request_counts}\")\n",
|
||||||
|
"\n",
|
||||||
|
" result_file_id = batch_response.output_file_id\n",
|
||||||
|
" file_response = client.files.content(result_file_id)\n",
|
||||||
|
" result_content = file_response.read().decode(\"utf-8\")\n",
|
||||||
|
"\n",
|
||||||
|
" results = [\n",
|
||||||
|
" json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n",
|
||||||
|
" ]\n",
|
||||||
|
"\n",
|
||||||
|
" for result in results:\n",
|
||||||
|
" print(f\"\\nRequest {result['custom_id']}:\")\n",
|
||||||
|
" print(f\"Response: {result['response']}\")\n",
|
||||||
|
"\n",
|
||||||
|
" print(\"\\nCleaning up files...\")\n",
|
||||||
|
" # Only delete the result file ID since file_response is just content\n",
|
||||||
|
" client.files.delete(result_file_id)\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(f\"Batch job failed with status: {batch_response.status}\")\n",
|
||||||
|
" if hasattr(batch_response, \"errors\"):\n",
|
||||||
|
" print(f\"Errors: {batch_response.errors}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n",
|
||||||
|
"\n",
|
||||||
|
"1. `batches/{batch_id}`: Retrieve the batch job status.\n",
|
||||||
|
"2. `batches/{batch_id}/cancel`: Cancel the batch job.\n",
|
||||||
|
"\n",
|
||||||
|
"Here is an example to check the batch job status."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Created batch job with ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||||
|
"Initial status: validating\n",
|
||||||
|
"Batch job details (check 1/5):\n",
|
||||||
|
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||||
|
"Status: in_progress\n",
|
||||||
|
"Created at: 1730012334\n",
|
||||||
|
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||||
|
"Output file ID: None\n",
|
||||||
|
"Request counts:\n",
|
||||||
|
"Total: 0\n",
|
||||||
|
"Completed: 0\n",
|
||||||
|
"Failed: 0\n",
|
||||||
|
"Batch job details (check 2/5):\n",
|
||||||
|
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||||
|
"Status: in_progress\n",
|
||||||
|
"Created at: 1730012334\n",
|
||||||
|
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||||
|
"Output file ID: None\n",
|
||||||
|
"Request counts:\n",
|
||||||
|
"Total: 0\n",
|
||||||
|
"Completed: 0\n",
|
||||||
|
"Failed: 0\n",
|
||||||
|
"Batch job details (check 3/5):\n",
|
||||||
|
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||||
|
"Status: in_progress\n",
|
||||||
|
"Created at: 1730012334\n",
|
||||||
|
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||||
|
"Output file ID: None\n",
|
||||||
|
"Request counts:\n",
|
||||||
|
"Total: 0\n",
|
||||||
|
"Completed: 0\n",
|
||||||
|
"Failed: 0\n",
|
||||||
|
"Batch job details (check 4/5):\n",
|
||||||
|
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||||
|
"Status: completed\n",
|
||||||
|
"Created at: 1730012334\n",
|
||||||
|
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||||
|
"Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
|
||||||
|
"Request counts:\n",
|
||||||
|
"Total: 100\n",
|
||||||
|
"Completed: 100\n",
|
||||||
|
"Failed: 0\n",
|
||||||
|
"Batch job details (check 5/5):\n",
|
||||||
|
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||||
|
"Status: completed\n",
|
||||||
|
"Created at: 1730012334\n",
|
||||||
|
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||||
|
"Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
|
||||||
|
"Request counts:\n",
|
||||||
|
"Total: 100\n",
|
||||||
|
"Completed: 100\n",
|
||||||
|
"Failed: 0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import time\n",
|
||||||
|
"from openai import OpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||||
|
"\n",
|
||||||
|
"requests = []\n",
|
||||||
|
"for i in range(100):\n",
|
||||||
|
" requests.append(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"custom_id\": f\"request-{i}\",\n",
|
||||||
|
" \"method\": \"POST\",\n",
|
||||||
|
" \"url\": \"/chat/completions\",\n",
|
||||||
|
" \"body\": {\n",
|
||||||
|
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" \"messages\": [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"role\": \"system\",\n",
|
||||||
|
" \"content\": f\"{i}: You are a helpful AI assistant\",\n",
|
||||||
|
" },\n",
|
||||||
|
" {\n",
|
||||||
|
" \"role\": \"user\",\n",
|
||||||
|
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
|
||||||
|
" },\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"max_tokens\": 500,\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"input_file_path = \"batch_requests.jsonl\"\n",
|
||||||
|
"with open(input_file_path, \"w\") as f:\n",
|
||||||
|
" for req in requests:\n",
|
||||||
|
" f.write(json.dumps(req) + \"\\n\")\n",
|
||||||
|
"\n",
|
||||||
|
"with open(input_file_path, \"rb\") as f:\n",
|
||||||
|
" uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
|
||||||
|
"\n",
|
||||||
|
"batch_job = client.batches.create(\n",
|
||||||
|
" input_file_id=uploaded_file.id,\n",
|
||||||
|
" endpoint=\"/v1/chat/completions\",\n",
|
||||||
|
" completion_window=\"24h\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Created batch job with ID: {batch_job.id}\")\n",
|
||||||
|
"print(f\"Initial status: {batch_job.status}\")\n",
|
||||||
|
"\n",
|
||||||
|
"time.sleep(10)\n",
|
||||||
|
"\n",
|
||||||
|
"max_checks = 5\n",
|
||||||
|
"for i in range(max_checks):\n",
|
||||||
|
" batch_details = client.batches.retrieve(batch_id=batch_job.id)\n",
|
||||||
|
" print(f\"Batch job details (check {i+1}/{max_checks}):\")\n",
|
||||||
|
" print(f\"ID: {batch_details.id}\")\n",
|
||||||
|
" print(f\"Status: {batch_details.status}\")\n",
|
||||||
|
" print(f\"Created at: {batch_details.created_at}\")\n",
|
||||||
|
" print(f\"Input file ID: {batch_details.input_file_id}\")\n",
|
||||||
|
" print(f\"Output file ID: {batch_details.output_file_id}\")\n",
|
||||||
|
"\n",
|
||||||
|
" print(\"Request counts:\")\n",
|
||||||
|
" print(f\"Total: {batch_details.request_counts.total}\")\n",
|
||||||
|
" print(f\"Completed: {batch_details.request_counts.completed}\")\n",
|
||||||
|
" print(f\"Failed: {batch_details.request_counts.failed}\")\n",
|
||||||
|
"\n",
|
||||||
|
" time.sleep(3)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here is an example to cancel a batch job."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 46,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Created batch job with ID: batch_3d2dd881-ad84-465a-85ee-6d5991794e5e\n",
|
||||||
|
"Initial status: validating\n",
|
||||||
|
"Cancellation initiated. Status: cancelling\n",
|
||||||
|
"Current status: cancelled\n",
|
||||||
|
"Batch job successfully cancelled\n",
|
||||||
|
"Successfully cleaned up input file\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import time\n",
|
||||||
|
"from openai import OpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||||
|
"\n",
|
||||||
|
"requests = []\n",
|
||||||
|
"for i in range(500):\n",
|
||||||
|
" requests.append(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"custom_id\": f\"request-{i}\",\n",
|
||||||
|
" \"method\": \"POST\",\n",
|
||||||
|
" \"url\": \"/chat/completions\",\n",
|
||||||
|
" \"body\": {\n",
|
||||||
|
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||||
|
" \"messages\": [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"role\": \"system\",\n",
|
||||||
|
" \"content\": f\"{i}: You are a helpful AI assistant\",\n",
|
||||||
|
" },\n",
|
||||||
|
" {\n",
|
||||||
|
" \"role\": \"user\",\n",
|
||||||
|
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
|
||||||
|
" },\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"max_tokens\": 500,\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"input_file_path = \"batch_requests.jsonl\"\n",
|
||||||
|
"with open(input_file_path, \"w\") as f:\n",
|
||||||
|
" for req in requests:\n",
|
||||||
|
" f.write(json.dumps(req) + \"\\n\")\n",
|
||||||
|
"\n",
|
||||||
|
"with open(input_file_path, \"rb\") as f:\n",
|
||||||
|
" uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
|
||||||
|
"\n",
|
||||||
|
"batch_job = client.batches.create(\n",
|
||||||
|
" input_file_id=uploaded_file.id,\n",
|
||||||
|
" endpoint=\"/v1/chat/completions\",\n",
|
||||||
|
" completion_window=\"24h\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Created batch job with ID: {batch_job.id}\")\n",
|
||||||
|
"print(f\"Initial status: {batch_job.status}\")\n",
|
||||||
|
"\n",
|
||||||
|
"time.sleep(10)\n",
|
||||||
|
"\n",
|
||||||
|
"try:\n",
|
||||||
|
" cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n",
|
||||||
|
" print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
|
||||||
|
" assert cancelled_job.status == \"cancelling\"\n",
|
||||||
|
"\n",
|
||||||
|
" # Monitor the cancellation process\n",
|
||||||
|
" while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n",
|
||||||
|
" time.sleep(3)\n",
|
||||||
|
" cancelled_job = client.batches.retrieve(batch_job.id)\n",
|
||||||
|
" print(f\"Current status: {cancelled_job.status}\")\n",
|
||||||
|
"\n",
|
||||||
|
" # Verify final status\n",
|
||||||
|
" assert cancelled_job.status == \"cancelled\"\n",
|
||||||
|
" print(\"Batch job successfully cancelled\")\n",
|
||||||
|
"\n",
|
||||||
|
"except Exception as e:\n",
|
||||||
|
" print(f\"Error during cancellation: {e}\")\n",
|
||||||
|
" raise e\n",
|
||||||
|
"\n",
|
||||||
|
"finally:\n",
|
||||||
|
" try:\n",
|
||||||
|
" del_response = client.files.delete(uploaded_file.id)\n",
|
||||||
|
" if del_response.deleted:\n",
|
||||||
|
" print(\"Successfully cleaned up input file\")\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" print(f\"Error cleaning up: {e}\")\n",
|
||||||
|
" raise e"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 47,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"terminate_process(server_process)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "AlphaMeemory",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@@ -4,7 +4,9 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Quick Start"
|
"# Quick Start: Launch A Server and Send Requests\n",
|
||||||
|
"\n",
|
||||||
|
"This section provides a quick start guide to using SGLang after installation."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -13,12 +15,13 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"## Launch a server\n",
|
"## Launch a server\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This code uses `subprocess.Popen` to start an SGLang server process, equivalent to executing \n",
|
"This code block is equivalent to executing \n",
|
||||||
"\n",
|
"\n",
|
||||||
"```bash\n",
|
"```bash\n",
|
||||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||||
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
||||||
"```\n",
|
"```\n",
|
||||||
|
"\n",
|
||||||
"in your command line and wait for the server to be ready."
|
"in your command line and wait for the server to be ready."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -39,10 +42,12 @@
|
|||||||
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"server_process = execute_shell_command(\"\"\"\n",
|
"server_process = execute_shell_command(\n",
|
||||||
|
" \"\"\"\n",
|
||||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||||
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
||||||
"\"\"\")\n",
|
"\"\"\"\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"wait_for_server(\"http://localhost:30000\")\n",
|
"wait_for_server(\"http://localhost:30000\")\n",
|
||||||
"print(\"Server is ready. Proceeding with the next steps.\")"
|
"print(\"Server is ready. Proceeding with the next steps.\")"
|
||||||
@@ -105,9 +110,7 @@
|
|||||||
"# Always assign an api_key, even if not specified during server initialization.\n",
|
"# Always assign an api_key, even if not specified during server initialization.\n",
|
||||||
"# Setting an API key during server initialization is strongly recommended.\n",
|
"# Setting an API key during server initialization is strongly recommended.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"client = openai.Client(\n",
|
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||||
" base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\"\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Chat completion example\n",
|
"# Chat completion example\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|||||||
Reference in New Issue
Block a user