Add OpenAI compatible API (#1810)
Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
This commit is contained in:
35
.github/workflows/deploy-docs.yml
vendored
35
.github/workflows/deploy-docs.yml
vendored
@@ -10,12 +10,17 @@ on:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
execute-notebooks:
|
||||
execute-and-deploy:
|
||||
runs-on: 1-gpu-runner
|
||||
if: github.repository == 'sgl-project/sglang'
|
||||
defaults:
|
||||
run:
|
||||
working-directory: docs
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
path: .
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
@@ -25,7 +30,9 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
pip install -r docs/requirements.txt
|
||||
pip install -r requirements.txt
|
||||
apt-get update
|
||||
apt-get install -y pandoc
|
||||
|
||||
- name: Setup Jupyter Kernel
|
||||
run: |
|
||||
@@ -33,7 +40,6 @@ jobs:
|
||||
|
||||
- name: Execute notebooks
|
||||
run: |
|
||||
cd docs
|
||||
for nb in *.ipynb; do
|
||||
if [ -f "$nb" ]; then
|
||||
echo "Executing $nb"
|
||||
@@ -43,36 +49,15 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
build-and-deploy:
|
||||
needs: execute-notebooks
|
||||
if: github.repository == 'sgl-project/sglang'
|
||||
runs-on: 1-gpu-runner
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.9'
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
pip install -r docs/requirements.txt
|
||||
apt-get update
|
||||
apt-get install -y pandoc
|
||||
|
||||
- name: Build documentation
|
||||
run: |
|
||||
cd docs
|
||||
make html
|
||||
|
||||
- name: Push to sgl-project.github.io
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
|
||||
run: |
|
||||
cd docs/_build/html
|
||||
cd _build/html
|
||||
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io
|
||||
cp -r * ../sgl-project.github.io
|
||||
cd ../sgl-project.github.io
|
||||
|
||||
18
.github/workflows/execute-notebook.yml
vendored
18
.github/workflows/execute-notebook.yml
vendored
@@ -1,12 +1,24 @@
|
||||
name: Execute Notebooks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- "python/sglang/**"
|
||||
- "docs/**"
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- "python/sglang/**"
|
||||
- "docs/**"
|
||||
workflow_dispatch:
|
||||
|
||||
|
||||
concurrency:
|
||||
group: execute-notebook-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
|
||||
jobs:
|
||||
run-all-notebooks:
|
||||
runs-on: 1-gpu-runner
|
||||
|
||||
@@ -10,6 +10,9 @@ repos:
|
||||
rev: 24.10.0
|
||||
hooks:
|
||||
- id: black
|
||||
additional_dependencies: ['.[jupyter]']
|
||||
types: [python, jupyter]
|
||||
types_or: [python, jupyter]
|
||||
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
|
||||
@@ -4,19 +4,32 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Embedding Model"
|
||||
"# Embedding Model\n",
|
||||
"\n",
|
||||
"SGLang supports embedding models in the same way as completion models. Here are some example models:\n",
|
||||
"\n",
|
||||
"- [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct)\n",
|
||||
"- [Alibaba-NLP/gte-Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launch A Server"
|
||||
"## Launch A Server\n",
|
||||
"\n",
|
||||
"The following code is equivalent to running this in the shell:\n",
|
||||
"```bash\n",
|
||||
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
|
||||
" --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Remember to add `--is-embedding` to the command."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -28,14 +41,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Equivalent to running this in the shell:\n",
|
||||
"# python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
||||
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
||||
"\n",
|
||||
"embedding_process = execute_shell_command(\"\"\"\n",
|
||||
"embedding_process = execute_shell_command(\n",
|
||||
" \"\"\"\n",
|
||||
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
|
||||
" --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
|
||||
"\"\"\")\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(\"http://localhost:30010\")\n",
|
||||
"\n",
|
||||
@@ -51,25 +64,32 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n"
|
||||
"Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Get the first 10 elements of the embedding\n",
|
||||
"import subprocess, json\n",
|
||||
"\n",
|
||||
"! curl -s http://localhost:30010/v1/embeddings \\\n",
|
||||
"text = \"Once upon a time\"\n",
|
||||
"\n",
|
||||
"curl_text = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n",
|
||||
" -H \"Content-Type: application/json\" \\\n",
|
||||
" -H \"Authorization: Bearer None\" \\\n",
|
||||
" -d '{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"Once upon a time\"}' \\\n",
|
||||
" | python3 -c \"import sys, json; print(json.load(sys.stdin)['data'][0]['embedding'][:10])\""
|
||||
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
|
||||
"\n",
|
||||
"text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n",
|
||||
" \"embedding\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"print(f\"Text embedding (first 10): {text_embedding[:10]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -81,37 +101,79 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[0.00603485107421875, -0.0190582275390625, -0.01273345947265625, 0.01552581787109375, 0.0066680908203125, -0.0135955810546875, 0.01131439208984375, 0.0013713836669921875, -0.0089874267578125, 0.021759033203125]\n"
|
||||
"Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"client = openai.Client(\n",
|
||||
" base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\"\n",
|
||||
")\n",
|
||||
"client = openai.Client(base_url=\"http://127.0.0.1:30010/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"# Text embedding example\n",
|
||||
"response = client.embeddings.create(\n",
|
||||
" model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
|
||||
" input=\"How are you today\",\n",
|
||||
" input=text,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"embedding = response.data[0].embedding[:10]\n",
|
||||
"print(embedding)"
|
||||
"print(f\"Text embedding (first 10): {embedding}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Input IDs\n",
|
||||
"\n",
|
||||
"SGLang also supports `input_ids` as input to get the embedding."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n",
|
||||
"input_ids = tokenizer.encode(text)\n",
|
||||
"\n",
|
||||
"curl_ids = f\"\"\"curl -s http://localhost:30010/v1/embeddings \\\n",
|
||||
" -H \"Content-Type: application/json\" \\\n",
|
||||
" -H \"Authorization: Bearer None\" \\\n",
|
||||
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
|
||||
"\n",
|
||||
"input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
|
||||
" 0\n",
|
||||
"][\"embedding\"]\n",
|
||||
"\n",
|
||||
"print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
||||
@@ -16,12 +16,14 @@ The core features include:
|
||||
:caption: Getting Started
|
||||
|
||||
install.md
|
||||
send_request.ipynb
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Backend Tutorial
|
||||
|
||||
openai_api.ipynb
|
||||
backend.md
|
||||
|
||||
|
||||
@@ -43,3 +45,4 @@ The core features include:
|
||||
choices_methods.md
|
||||
benchmark_and_profiling.md
|
||||
troubleshooting.md
|
||||
embedding_model.ipynb
|
||||
|
||||
676
docs/openai_api.ipynb
Normal file
676
docs/openai_api.ipynb
Normal file
@@ -0,0 +1,676 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# OpenAI Compatible API\n",
|
||||
"\n",
|
||||
"SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n",
|
||||
"\n",
|
||||
"- `chat/completions`\n",
|
||||
"- `completions`\n",
|
||||
"- `batches`\n",
|
||||
"- `embeddings`(refer to [embedding_model.ipynb](embedding_model.ipynb))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Chat Completions\n",
|
||||
"\n",
|
||||
"### Usage\n",
|
||||
"\n",
|
||||
"Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to SGLang server with OpenAI API format."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Server is ready. Proceeding with the next steps.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
||||
"\n",
|
||||
"server_process = execute_shell_command(\n",
|
||||
" \"\"\"\n",
|
||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(\"http://localhost:30000\")\n",
|
||||
"print(\"Server is ready. Proceeding with the next steps.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ChatCompletion(id='e854540ec7914b2d8c712f16fd9ed2ca', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730012326, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"# Always assign an api_key, even if not specified during server initialization.\n",
|
||||
"# Setting an API key during server initialization is strongly recommended.\n",
|
||||
"\n",
|
||||
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"# Chat completion example\n",
|
||||
"\n",
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Parameters\n",
|
||||
"\n",
|
||||
"The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n",
|
||||
"\n",
|
||||
"- `messages`: List of messages in the conversation, each containing `role` and `content`\n",
|
||||
"- `model`: The model identifier to use for completion\n",
|
||||
"- `max_tokens`: Maximum number of tokens to generate in the response\n",
|
||||
"- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
|
||||
"- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
|
||||
"- `n`: Number of chat completion choices to generate\n",
|
||||
"- `stream`: If true, partial message deltas will be sent as they become available\n",
|
||||
"- `stop`: Sequences where the API will stop generating further tokens\n",
|
||||
"- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
|
||||
"- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
|
||||
"- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
|
||||
"- `logprobs`: Include log probabilities of tokens in the response\n",
|
||||
"- `top_logprobs`: Number of most likely tokens to return probabilities for\n",
|
||||
"- `seed`: Random seed for deterministic results\n",
|
||||
"- `response_format`: Specify the format of the response (e.g., JSON)\n",
|
||||
"- `stream_options`: Additional options for streaming responses\n",
|
||||
"- `user`: A unique identifier representing your end-user\n",
|
||||
"\n",
|
||||
"Here is an example of a detailed chat completion request:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ancient Rome's major achievements include:"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
|
||||
" },\n",
|
||||
" {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
|
||||
" {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
|
||||
" },\n",
|
||||
" {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
|
||||
" ],\n",
|
||||
" temperature=0.3, # Lower temperature for more focused responses\n",
|
||||
" max_tokens=100, # Reasonable length for a concise response\n",
|
||||
" top_p=0.95, # Slightly higher for better fluency\n",
|
||||
" stop=[\"\\n\\n\"], # Simple stop sequence\n",
|
||||
" presence_penalty=0.2, # Mild penalty to avoid repetition\n",
|
||||
" frequency_penalty=0.2, # Mild penalty for more natural language\n",
|
||||
" n=1, # Single response is usually more stable\n",
|
||||
" seed=42, # Keep for reproducibility\n",
|
||||
" stream=True, # Keep streaming for real-time output\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for chunk in response:\n",
|
||||
" print(chunk.choices[0].delta.content or \"\", end=\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Completions\n",
|
||||
"\n",
|
||||
"### Usage\n",
|
||||
"\n",
|
||||
"Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Completion(id='a6e07198f4b445baa0fb08a2178ceb59', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730012328, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"response = client.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" prompt=\"List 3 countries and their capitals.\",\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=64,\n",
|
||||
" n=1,\n",
|
||||
" stop=None,\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Parameters\n",
|
||||
"\n",
|
||||
"The completions API accepts the following parameters:\n",
|
||||
"\n",
|
||||
"- `model`: The model identifier to use for completion\n",
|
||||
"- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n",
|
||||
"- `best_of`: Number of completions to generate server-side and return the best one\n",
|
||||
"- `echo`: If true, the prompt will be included in the response\n",
|
||||
"- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
|
||||
"- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
|
||||
"- `logprobs`: Include log probabilities of tokens in the response\n",
|
||||
"- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n",
|
||||
"- `n`: Number of completion choices to generate\n",
|
||||
"- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
|
||||
"- `seed`: Random seed for deterministic results\n",
|
||||
"- `stop`: Sequences where the API will stop generating further tokens\n",
|
||||
"- `stream`: If true, partial completion deltas will be sent as they become available\n",
|
||||
"- `stream_options`: Additional options for streaming responses\n",
|
||||
"- `suffix`: Text to append to the completion\n",
|
||||
"- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
|
||||
"- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
|
||||
"- `user`: A unique identifier representing your end-user\n",
|
||||
"\n",
|
||||
"Here is an example of a detailed completions request:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Space explorer, Captain Orion Blackwood, had been traveling through the galaxy for 12 years, searching for a new home for humanity. His ship, the Aurora, had been his home for so long that he barely remembered what it was like to walk on solid ground.\n",
|
||||
"As he navigated through the dense asteroid field, the ship's computer, S.A.R.A. (Self-Aware Reasoning Algorithm), alerted him to a strange reading on one of the asteroids. Captain Blackwood's curiosity was piqued, and he decided to investigate further.\n",
|
||||
"\"Captain, I'm detecting unusual energy signatures emanating from the asteroid,\" S.A.R.A. said. \"It's unlike anything I've seen before.\"\n",
|
||||
"Captain Blackwood's eyes narrowed as"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"response = client.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" prompt=\"Write a short story about a space explorer.\",\n",
|
||||
" temperature=0.7, # Moderate temperature for creative writing\n",
|
||||
" max_tokens=150, # Longer response for a story\n",
|
||||
" top_p=0.9, # Balanced diversity in word choice\n",
|
||||
" stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n",
|
||||
" presence_penalty=0.3, # Encourage novel elements\n",
|
||||
" frequency_penalty=0.3, # Reduce repetitive phrases\n",
|
||||
" n=1, # Generate one completion\n",
|
||||
" seed=123, # For reproducible results\n",
|
||||
" stream=True, # Stream the response\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for chunk in response:\n",
|
||||
" print(chunk.choices[0].text or \"\", end=\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Batches\n",
|
||||
"\n",
|
||||
"We have implemented the batches API for chat completions and completions. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n",
|
||||
"\n",
|
||||
"The batches APIs are:\n",
|
||||
"\n",
|
||||
"- `batches`\n",
|
||||
"- `batches/{batch_id}/cancel`\n",
|
||||
"- `batches/{batch_id}`\n",
|
||||
"\n",
|
||||
"Here is an example of a batch job for chat completions, completions are similar.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Batch job created with ID: batch_03d7f74f-dffe-4c26-b5e7-bb9fb5cb89ff\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import time\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"requests = [\n",
|
||||
" {\n",
|
||||
" \"custom_id\": \"request-1\",\n",
|
||||
" \"method\": \"POST\",\n",
|
||||
" \"url\": \"/chat/completions\",\n",
|
||||
" \"body\": {\n",
|
||||
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" \"messages\": [\n",
|
||||
" {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n",
|
||||
" ],\n",
|
||||
" \"max_tokens\": 50,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"custom_id\": \"request-2\",\n",
|
||||
" \"method\": \"POST\",\n",
|
||||
" \"url\": \"/chat/completions\",\n",
|
||||
" \"body\": {\n",
|
||||
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n",
|
||||
" \"max_tokens\": 50,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"input_file_path = \"batch_requests.jsonl\"\n",
|
||||
"\n",
|
||||
"with open(input_file_path, \"w\") as f:\n",
|
||||
" for req in requests:\n",
|
||||
" f.write(json.dumps(req) + \"\\n\")\n",
|
||||
"\n",
|
||||
"with open(input_file_path, \"rb\") as f:\n",
|
||||
" file_response = client.files.create(file=f, purpose=\"batch\")\n",
|
||||
"\n",
|
||||
"batch_response = client.batches.create(\n",
|
||||
" input_file_id=file_response.id,\n",
|
||||
" endpoint=\"/v1/chat/completions\",\n",
|
||||
" completion_window=\"24h\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Batch job created with ID: {batch_response.id}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Batch job status: validating...trying again in 3 seconds...\n",
|
||||
"Batch job completed successfully!\n",
|
||||
"Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
|
||||
"\n",
|
||||
"Request request-1:\n",
|
||||
"Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n",
|
||||
"\n",
|
||||
"Request request-2:\n",
|
||||
"Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n* **Web Development**: Building web applications, web services, and web scraping.\\n* **Data Science**: Data analysis'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n",
|
||||
"\n",
|
||||
"Cleaning up files...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n",
|
||||
" time.sleep(3)\n",
|
||||
" print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n",
|
||||
" batch_response = client.batches.retrieve(batch_response.id)\n",
|
||||
"\n",
|
||||
"if batch_response.status == \"completed\":\n",
|
||||
" print(\"Batch job completed successfully!\")\n",
|
||||
" print(f\"Request counts: {batch_response.request_counts}\")\n",
|
||||
"\n",
|
||||
" result_file_id = batch_response.output_file_id\n",
|
||||
" file_response = client.files.content(result_file_id)\n",
|
||||
" result_content = file_response.read().decode(\"utf-8\")\n",
|
||||
"\n",
|
||||
" results = [\n",
|
||||
" json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" for result in results:\n",
|
||||
" print(f\"\\nRequest {result['custom_id']}:\")\n",
|
||||
" print(f\"Response: {result['response']}\")\n",
|
||||
"\n",
|
||||
" print(\"\\nCleaning up files...\")\n",
|
||||
" # Only delete the result file ID since file_response is just content\n",
|
||||
" client.files.delete(result_file_id)\n",
|
||||
"else:\n",
|
||||
" print(f\"Batch job failed with status: {batch_response.status}\")\n",
|
||||
" if hasattr(batch_response, \"errors\"):\n",
|
||||
" print(f\"Errors: {batch_response.errors}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n",
|
||||
"\n",
|
||||
"1. `batches/{batch_id}`: Retrieve the batch job status.\n",
|
||||
"2. `batches/{batch_id}/cancel`: Cancel the batch job.\n",
|
||||
"\n",
|
||||
"Here is an example to check the batch job status."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Created batch job with ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||
"Initial status: validating\n",
|
||||
"Batch job details (check 1/5):\n",
|
||||
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||
"Status: in_progress\n",
|
||||
"Created at: 1730012334\n",
|
||||
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||
"Output file ID: None\n",
|
||||
"Request counts:\n",
|
||||
"Total: 0\n",
|
||||
"Completed: 0\n",
|
||||
"Failed: 0\n",
|
||||
"Batch job details (check 2/5):\n",
|
||||
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||
"Status: in_progress\n",
|
||||
"Created at: 1730012334\n",
|
||||
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||
"Output file ID: None\n",
|
||||
"Request counts:\n",
|
||||
"Total: 0\n",
|
||||
"Completed: 0\n",
|
||||
"Failed: 0\n",
|
||||
"Batch job details (check 3/5):\n",
|
||||
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||
"Status: in_progress\n",
|
||||
"Created at: 1730012334\n",
|
||||
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||
"Output file ID: None\n",
|
||||
"Request counts:\n",
|
||||
"Total: 0\n",
|
||||
"Completed: 0\n",
|
||||
"Failed: 0\n",
|
||||
"Batch job details (check 4/5):\n",
|
||||
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||
"Status: completed\n",
|
||||
"Created at: 1730012334\n",
|
||||
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||
"Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
|
||||
"Request counts:\n",
|
||||
"Total: 100\n",
|
||||
"Completed: 100\n",
|
||||
"Failed: 0\n",
|
||||
"Batch job details (check 5/5):\n",
|
||||
"ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
|
||||
"Status: completed\n",
|
||||
"Created at: 1730012334\n",
|
||||
"Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
|
||||
"Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
|
||||
"Request counts:\n",
|
||||
"Total: 100\n",
|
||||
"Completed: 100\n",
|
||||
"Failed: 0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import time\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"requests = []\n",
|
||||
"for i in range(100):\n",
|
||||
" requests.append(\n",
|
||||
" {\n",
|
||||
" \"custom_id\": f\"request-{i}\",\n",
|
||||
" \"method\": \"POST\",\n",
|
||||
" \"url\": \"/chat/completions\",\n",
|
||||
" \"body\": {\n",
|
||||
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" \"messages\": [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": f\"{i}: You are a helpful AI assistant\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"max_tokens\": 500,\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"input_file_path = \"batch_requests.jsonl\"\n",
|
||||
"with open(input_file_path, \"w\") as f:\n",
|
||||
" for req in requests:\n",
|
||||
" f.write(json.dumps(req) + \"\\n\")\n",
|
||||
"\n",
|
||||
"with open(input_file_path, \"rb\") as f:\n",
|
||||
" uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
|
||||
"\n",
|
||||
"batch_job = client.batches.create(\n",
|
||||
" input_file_id=uploaded_file.id,\n",
|
||||
" endpoint=\"/v1/chat/completions\",\n",
|
||||
" completion_window=\"24h\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Created batch job with ID: {batch_job.id}\")\n",
|
||||
"print(f\"Initial status: {batch_job.status}\")\n",
|
||||
"\n",
|
||||
"time.sleep(10)\n",
|
||||
"\n",
|
||||
"max_checks = 5\n",
|
||||
"for i in range(max_checks):\n",
|
||||
" batch_details = client.batches.retrieve(batch_id=batch_job.id)\n",
|
||||
" print(f\"Batch job details (check {i+1}/{max_checks}):\")\n",
|
||||
" print(f\"ID: {batch_details.id}\")\n",
|
||||
" print(f\"Status: {batch_details.status}\")\n",
|
||||
" print(f\"Created at: {batch_details.created_at}\")\n",
|
||||
" print(f\"Input file ID: {batch_details.input_file_id}\")\n",
|
||||
" print(f\"Output file ID: {batch_details.output_file_id}\")\n",
|
||||
"\n",
|
||||
" print(\"Request counts:\")\n",
|
||||
" print(f\"Total: {batch_details.request_counts.total}\")\n",
|
||||
" print(f\"Completed: {batch_details.request_counts.completed}\")\n",
|
||||
" print(f\"Failed: {batch_details.request_counts.failed}\")\n",
|
||||
"\n",
|
||||
" time.sleep(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Here is an example of how to cancel a batch job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Created batch job with ID: batch_3d2dd881-ad84-465a-85ee-6d5991794e5e\n",
|
||||
"Initial status: validating\n",
|
||||
"Cancellation initiated. Status: cancelling\n",
|
||||
"Current status: cancelled\n",
|
||||
"Batch job successfully cancelled\n",
|
||||
"Successfully cleaned up input file\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import time\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"requests = []\n",
|
||||
"for i in range(500):\n",
|
||||
" requests.append(\n",
|
||||
" {\n",
|
||||
" \"custom_id\": f\"request-{i}\",\n",
|
||||
" \"method\": \"POST\",\n",
|
||||
" \"url\": \"/chat/completions\",\n",
|
||||
" \"body\": {\n",
|
||||
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" \"messages\": [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": f\"{i}: You are a helpful AI assistant\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"max_tokens\": 500,\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"input_file_path = \"batch_requests.jsonl\"\n",
|
||||
"with open(input_file_path, \"w\") as f:\n",
|
||||
" for req in requests:\n",
|
||||
" f.write(json.dumps(req) + \"\\n\")\n",
|
||||
"\n",
|
||||
"with open(input_file_path, \"rb\") as f:\n",
|
||||
" uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
|
||||
"\n",
|
||||
"batch_job = client.batches.create(\n",
|
||||
" input_file_id=uploaded_file.id,\n",
|
||||
" endpoint=\"/v1/chat/completions\",\n",
|
||||
" completion_window=\"24h\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Created batch job with ID: {batch_job.id}\")\n",
|
||||
"print(f\"Initial status: {batch_job.status}\")\n",
|
||||
"\n",
|
||||
"time.sleep(10)\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n",
|
||||
" print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
|
||||
" assert cancelled_job.status == \"cancelling\"\n",
|
||||
"\n",
|
||||
" # Monitor the cancellation process\n",
|
||||
" while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n",
|
||||
" time.sleep(3)\n",
|
||||
" cancelled_job = client.batches.retrieve(batch_job.id)\n",
|
||||
" print(f\"Current status: {cancelled_job.status}\")\n",
|
||||
"\n",
|
||||
" # Verify final status\n",
|
||||
" assert cancelled_job.status == \"cancelled\"\n",
|
||||
" print(\"Batch job successfully cancelled\")\n",
|
||||
"\n",
|
||||
"except Exception as e:\n",
|
||||
" print(f\"Error during cancellation: {e}\")\n",
|
||||
" raise e\n",
|
||||
"\n",
|
||||
"finally:\n",
|
||||
" try:\n",
|
||||
" del_response = client.files.delete(uploaded_file.id)\n",
|
||||
" if del_response.deleted:\n",
|
||||
" print(\"Successfully cleaned up input file\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error cleaning up: {e}\")\n",
|
||||
" raise e"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(server_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "AlphaMeemory",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -4,7 +4,9 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Quick Start"
|
||||
"# Quick Start: Launch A Server and Send Requests\n",
|
||||
"\n",
|
||||
"This section provides a quick start guide to using SGLang after installation."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -13,12 +15,13 @@
|
||||
"source": [
|
||||
"## Launch a server\n",
|
||||
"\n",
|
||||
"This code uses `subprocess.Popen` to start an SGLang server process, equivalent to executing \n",
|
||||
"This code block is equivalent to executing \n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"in your command line and wait for the server to be ready."
|
||||
]
|
||||
},
|
||||
@@ -39,10 +42,12 @@
|
||||
"from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"server_process = execute_shell_command(\"\"\"\n",
|
||||
"server_process = execute_shell_command(\n",
|
||||
" \"\"\"\n",
|
||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
"--port 30000 --host 0.0.0.0 --log-level warning\n",
|
||||
"\"\"\")\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(\"http://localhost:30000\")\n",
|
||||
"print(\"Server is ready. Proceeding with the next steps.\")"
|
||||
@@ -105,9 +110,7 @@
|
||||
"# Always assign an api_key, even if not specified during server initialization.\n",
|
||||
"# Setting an API key during server initialization is strongly recommended.\n",
|
||||
"\n",
|
||||
"client = openai.Client(\n",
|
||||
" base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\"\n",
|
||||
")\n",
|
||||
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
|
||||
"\n",
|
||||
"# Chat completion example\n",
|
||||
"\n",
|
||||
|
||||
Reference in New Issue
Block a user