[CI] Improve Docs CI Efficiency (#3587)

Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
This commit is contained in:
Shi Shuai
2025-02-15 03:57:00 +00:00
committed by GitHub
parent 862dd76c76
commit 7443197a63
19 changed files with 366 additions and 231 deletions

View File

@@ -1,34 +1,42 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the terminal, and also
# from the environment for the first two.
# Minimal Makefile for Sphinx documentation
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# New target to compile Markdown and Jupyter Notebook files
# Compile Notebook files and record execution time
compile:
find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print | while read nb; do \
if [ -f "$$nb" ]; then \
echo "Executing $$nb"; \
jupyter nbconvert --to notebook --execute --inplace "$$nb" \
--ExecutePreprocessor.timeout=600 \
--ExecutePreprocessor.kernel_name=python3 || exit 1; \
fi; \
done
@set -e; \
echo "Starting Notebook compilation..."; \
mkdir -p logs; \
echo "Notebook execution timings:" > logs/timing.log; \
START_TOTAL=$$(date +%s); \
find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \
parallel -0 -j3 --halt soon,fail=1 ' \
NB_NAME=$$(basename {}); \
START_TIME=$$(date +%s); \
jupyter nbconvert --to notebook --execute --inplace "{}" \
--ExecutePreprocessor.timeout=600 \
--ExecutePreprocessor.kernel_name=python3; \
RET_CODE=$$?; \
END_TIME=$$(date +%s); \
ELAPSED_TIME=$$((END_TIME - START_TIME)); \
echo "$${NB_NAME}: $${ELAPSED_TIME}s" >> logs/timing.log; \
exit $$RET_CODE' || exit 1; \
END_TOTAL=$$(date +%s); \
TOTAL_ELAPSED=$$((END_TOTAL - START_TOTAL)); \
echo "---------------------------------" >> logs/timing.log; \
echo "Total execution time: $${TOTAL_ELAPSED}s" >> logs/timing.log; \
echo "All Notebook execution timings:" && cat logs/timing.log
.PHONY: help Makefile compile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
.PHONY: help Makefile compile clean
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
clean:
rm -rf $(BUILDDIR)/*
rm -rf $(BUILDDIR)/* logs/timing.log

0
docs/backend/__init__.py Normal file
View File

View File

@@ -31,17 +31,19 @@
"source": [
"from openai import OpenAI\n",
"import json\n",
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"server_process = execute_shell_command(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --tool-call-parser llama3 --port 30333 --host 0.0.0.0\" # llama3\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --tool-call-parser llama3 --host 0.0.0.0\" # llama3\n",
")\n",
"wait_for_server(\"http://localhost:30333\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -141,7 +143,7 @@
"outputs": [],
"source": [
"# Initialize OpenAI-like client\n",
"client = OpenAI(api_key=\"None\", base_url=\"http://0.0.0.0:30333/v1\")\n",
"client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
"model_name = client.models.list().data[0].id"
]
},
@@ -377,13 +379,13 @@
" tools=tools,\n",
")\n",
"\n",
"gen_url = \"http://localhost:30333/generate\"\n",
"gen_url = f\"http://localhost:{port}/generate\"\n",
"gen_data = {\"text\": input, \"sampling_params\": {\"skip_special_tokens\": False}}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print(gen_response)\n",
"\n",
"# parse the response\n",
"parse_url = \"http://localhost:30333/function_call\"\n",
"parse_url = f\"http://localhost:{port}/function_call\"\n",
"\n",
"function_call_input = {\n",
" \"text\": gen_response,\n",
@@ -403,7 +405,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
"terminate_process(server_process, port)"
]
},
{

View File

@@ -34,22 +34,22 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"\n",
"import requests\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"server_process = execute_shell_command(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n",
"\"\"\"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30010\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -66,7 +66,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/generate\"\n",
"url = f\"http://localhost:{port}/generate\"\n",
"data = {\"text\": \"What is the capital of France?\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/get_model_info\"\n",
"url = f\"http://localhost:{port}/get_model_info\"\n",
"\n",
"response = requests.get(url)\n",
"response_json = response.json()\n",
@@ -123,7 +123,7 @@
"source": [
"# get_server_info\n",
"\n",
"url = \"http://localhost:30010/get_server_info\"\n",
"url = f\"http://localhost:{port}/get_server_info\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
@@ -144,7 +144,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/health_generate\"\n",
"url = f\"http://localhost:{port}/health_generate\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
@@ -156,7 +156,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/health\"\n",
"url = f\"http://localhost:{port}/health\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
@@ -179,7 +179,7 @@
"source": [
"# flush cache\n",
"\n",
"url = \"http://localhost:30010/flush_cache\"\n",
"url = f\"http://localhost:{port}/flush_cache\"\n",
"\n",
"response = requests.post(url)\n",
"print_highlight(response.text)"
@@ -204,7 +204,7 @@
"source": [
"# successful update with same architecture and size\n",
"\n",
"url = \"http://localhost:30010/update_weights_from_disk\"\n",
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -222,7 +222,7 @@
"source": [
"# failed update with different parameter size or wrong name\n",
"\n",
"url = \"http://localhost:30010/update_weights_from_disk\"\n",
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B-wrong\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -252,16 +252,16 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)\n",
"terminate_process(server_process, port)\n",
"\n",
"embedding_process = execute_shell_command(\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
" --port 30020 --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -272,7 +272,7 @@
"source": [
"# successful encode for embedding model\n",
"\n",
"url = \"http://localhost:30020/encode\"\n",
"url = f\"http://localhost:{port}/encode\"\n",
"data = {\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"text\": \"Once upon a time\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -280,6 +280,15 @@
"print_highlight(f\"Text embedding (first 10): {response_json['embedding'][:10]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process, port)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -295,18 +304,18 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)\n",
"terminate_process(embedding_process, port)\n",
"\n",
"# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
"# This will be updated in the future.\n",
"\n",
"reward_process = execute_shell_command(\n",
"reward_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --port 30030 --host 0.0.0.0 --is-embedding\n",
"python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30030\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -332,7 +341,7 @@
"tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
"prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
"\n",
"url = \"http://localhost:30030/classify\"\n",
"url = f\"http://localhost:{port}/classify\"\n",
"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
"\n",
"responses = requests.post(url, json=data).json()\n",
@@ -346,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(reward_process)"
"terminate_process(reward_process, port)"
]
},
{
@@ -364,13 +373,13 @@
"metadata": {},
"outputs": [],
"source": [
"tokenizer_free_server_process = execute_shell_command(\n",
"tokenizer_free_server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010 --skip-tokenizer-init\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --skip-tokenizer-init\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30010\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -390,7 +399,7 @@
"print_highlight(f\"Tokenized Input: {input_tokens}\")\n",
"\n",
"response = requests.post(\n",
" \"http://localhost:30010/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"input_ids\": input_tokens,\n",
" \"sampling_params\": {\n",
@@ -416,7 +425,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(tokenizer_free_server_process)"
"terminate_process(tokenizer_free_server_process, port)"
]
}
],

View File

@@ -40,6 +40,11 @@
"from sglang.utils import stream_and_merge, async_stream_and_merge\n",
"import sglang as sgl\n",
"import asyncio\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"if is_in_ci():\n",
" import patch\n",
"\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")"
]
@@ -201,8 +206,6 @@
"metadata": {},
"outputs": [],
"source": [
"import sglang as sgl\n",
"\n",
"llm = sgl.Engine(\n",
" model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", return_hidden_states=True\n",
")"

View File

@@ -33,18 +33,22 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"server_process = execute_shell_command(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30020 --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
"wait_for_server(f\"http://localhost:{port}\")\n",
"print(f\"Server started on http://localhost:{port}\")"
]
},
{
@@ -68,7 +72,7 @@
"source": [
"import openai\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -245,7 +249,7 @@
"import time\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
"client = OpenAI(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"requests = [\n",
" {\n",
@@ -348,10 +352,10 @@
"import time\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
"client = OpenAI(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"requests = []\n",
"for i in range(100):\n",
"for i in range(20):\n",
" requests.append(\n",
" {\n",
" \"custom_id\": f\"request-{i}\",\n",
@@ -369,7 +373,7 @@
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
" },\n",
" ],\n",
" \"max_tokens\": 500,\n",
" \"max_tokens\": 64,\n",
" },\n",
" }\n",
" )\n",
@@ -425,10 +429,10 @@
"from openai import OpenAI\n",
"import os\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
"client = OpenAI(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"requests = []\n",
"for i in range(500):\n",
"for i in range(5000):\n",
" requests.append(\n",
" {\n",
" \"custom_id\": f\"request-{i}\",\n",
@@ -446,7 +450,7 @@
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
" },\n",
" ],\n",
" \"max_tokens\": 500,\n",
" \"max_tokens\": 128,\n",
" },\n",
" }\n",
" )\n",
@@ -508,7 +512,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
"terminate_process(server_process, port)"
]
}
],

View File

@@ -29,21 +29,23 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"embedding_process = execute_shell_command(\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
" --port 30000 --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -63,7 +65,7 @@
"\n",
"text = \"Once upon a time\"\n",
"\n",
"curl_text = f\"\"\"curl -s http://localhost:30000/v1/embeddings \\\n",
"curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
"\n",
"text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n",
@@ -91,7 +93,7 @@
"text = \"Once upon a time\"\n",
"\n",
"response = requests.post(\n",
" \"http://localhost:30000/v1/embeddings\",\n",
" f\"http://localhost:{port}/v1/embeddings\",\n",
" json={\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": text},\n",
")\n",
"\n",
@@ -115,7 +117,7 @@
"source": [
"import openai\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"# Text embedding example\n",
"response = client.embeddings.create(\n",
@@ -151,7 +153,7 @@
"tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n",
"input_ids = tokenizer.encode(text)\n",
"\n",
"curl_ids = f\"\"\"curl -s http://localhost:30000/v1/embeddings \\\n",
"curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
"\n",
"input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
@@ -167,7 +169,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)"
"terminate_process(embedding_process, port)"
]
}
],

View File

@@ -34,21 +34,23 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"embedding_process = execute_shell_command(\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n",
" --port=30000 --chat-template=llama_3_vision\n",
" --chat-template=llama_3_vision\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -68,32 +70,36 @@
"source": [
"import subprocess\n",
"\n",
"curl_command = \"\"\"\n",
"curl -s http://localhost:30000/v1/chat/completions \\\n",
" -d '{\n",
"curl_command = f\"\"\"\n",
"curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
" -d '{{\n",
" \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
" \"messages\": [\n",
" {\n",
" {{\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" {{\n",
" \"type\": \"text\",\n",
" \"text\": \"Whats in this image?\"\n",
" },\n",
" {\n",
" }},\n",
" {{\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"image_url\": {{\n",
" \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
" }\n",
" }\n",
" }}\n",
" }}\n",
" ]\n",
" }\n",
" }}\n",
" ],\n",
" \"max_tokens\": 300\n",
" }'\n",
" }}'\n",
"\"\"\"\n",
"\n",
"response = subprocess.check_output(curl_command, shell=True).decode()\n",
"print_highlight(response)\n",
"\n",
"\n",
"response = subprocess.check_output(curl_command, shell=True).decode()\n",
"print_highlight(response)"
]
},
@@ -112,7 +118,7 @@
"source": [
"import requests\n",
"\n",
"url = \"http://localhost:30000/v1/chat/completions\"\n",
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
"\n",
"data = {\n",
" \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
@@ -152,7 +158,7 @@
"source": [
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://localhost:30000/v1\", api_key=\"None\")\n",
"client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
@@ -196,7 +202,7 @@
"source": [
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://localhost:30000/v1\", api_key=\"None\")\n",
"client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
@@ -236,7 +242,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)"
"terminate_process(embedding_process, port)"
]
},
{

35
docs/backend/patch.py Normal file
View File

@@ -0,0 +1,35 @@
# CI patch helpers for the docs notebooks: cap server resource usage and
# reserve ports dynamically so multiple notebooks can run side by side in CI.
import os
from sglang.utils import execute_shell_command, reserve_port

# Resource caps applied to every server launched from the docs in CI.
DEFAULT_MAX_RUNNING_REQUESTS = 200  # max requests allowed to run concurrently
DEFAULT_MAX_TOTAL_TOKENS = 20480  # cap on total tokens held by the server

import sglang.srt.server_args as server_args_mod

# Keep a handle on the stock initializer so the patched version installed
# below can delegate to it before applying the CI overrides.
_original_post_init = server_args_mod.ServerArgs.__post_init__
def patched_post_init(self):
    """Drop-in replacement for ``ServerArgs.__post_init__`` used in CI.

    Runs the original initializer first, then fills in CI resource caps
    for any limits the caller left unset, and force-disables CUDA graphs.
    """
    _original_post_init(self)
    # Apply caps only where the user did not set an explicit value.
    for attr, ci_cap in (
        ("max_running_requests", DEFAULT_MAX_RUNNING_REQUESTS),
        ("max_total_tokens", DEFAULT_MAX_TOTAL_TOKENS),
    ):
        if getattr(self, attr) is None:
            setattr(self, attr, ci_cap)
    # Unconditional in CI, regardless of what the command line requested.
    self.disable_cuda_graph = True
server_args_mod.ServerArgs.__post_init__ = patched_post_init
def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """Launch a docs server command with CI resource caps appended.

    Args:
        command: Base ``sglang.launch_server`` shell command (without ``--port``).
        host: Accepted for interface compatibility; not used by the body.
            NOTE(review): presumably callers pass it for symmetry with the
            non-CI helper — confirm before relying on it.
        port: Port to bind; a free port is reserved when ``None``.

    Returns:
        Tuple of ``(process, port)`` for the launched server.
    """
    if port is None:
        port = reserve_port()
    # Same caps that patched_post_init enforces, passed explicitly on the
    # command line as well.
    ci_flags = " ".join(
        [
            f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS}",
            f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS}",
            "--disable-cuda-graph",
        ]
    )
    proc = execute_shell_command(f"{command} --port {port} {ci_flags}")
    return proc, port

View File

@@ -22,7 +22,7 @@
"\n",
"```bash\n",
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
"--port 30000 --host 0.0.0.0\n",
" --host 0.0.0.0\n",
"```\n",
"\n",
"in your terminal and wait for the server to be ready. Once the server is running, you can send test requests using curl or requests. The server implements the [OpenAI-compatible APIs](https://platform.openai.com/docs/api-reference/chat)."
@@ -34,21 +34,23 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.test.test_utils import is_in_ci\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process = execute_shell_command(\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
"--port 30000 --host 0.0.0.0\n",
" --host 0.0.0.0\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -66,9 +68,10 @@
"source": [
"import subprocess, json\n",
"\n",
"curl_command = \"\"\"\n",
"curl -s http://localhost:30000/v1/chat/completions \\\n",
" -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}]}'\n",
"curl_command = f\"\"\"\n",
"curl -s http://localhost:{port}/v1/chat/completions \\\n",
" -H \"Content-Type: application/json\" \\\n",
" -d '{{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
"\"\"\"\n",
"\n",
"response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
@@ -90,7 +93,7 @@
"source": [
"import requests\n",
"\n",
"url = \"http://localhost:30000/v1/chat/completions\"\n",
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
"\n",
"data = {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -116,7 +119,7 @@
"source": [
"import openai\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -144,7 +147,7 @@
"source": [
"import openai\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"# Use stream=True for streaming responses\n",
"response = client.chat.completions.create(\n",
@@ -181,7 +184,7 @@
"import requests\n",
"\n",
"response = requests.post(\n",
" \"http://localhost:30000/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"The capital of France is\",\n",
" \"sampling_params\": {\n",
@@ -210,7 +213,7 @@
"import requests, json\n",
"\n",
"response = requests.post(\n",
" \"http://localhost:30000/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"The capital of France is\",\n",
" \"sampling_params\": {\n",
@@ -240,8 +243,15 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
"terminate_process(server_process, port)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -35,23 +35,24 @@
"metadata": {},
"outputs": [],
"source": [
"# EAGLE decoding\n",
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"server_process = execute_shell_command(\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
" --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 --port=30020 --cuda-graph-max-bs 32\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -62,7 +63,7 @@
"source": [
"import openai\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -100,25 +101,16 @@
"metadata": {},
"outputs": [],
"source": [
"server_process = execute_shell_command(\n",
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
" --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 \\\n",
" --enable-torch-compile --cuda-graph-max-bs 2 --port=30020\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
" --enable-torch-compile --cuda-graph-max-bs 2\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark Script\n",
"\n",
"The following code example shows how to measure the decoding speed when generating tokens:\n"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -127,27 +119,20 @@
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import requests\n",
"import openai\n",
"\n",
"tic = time.time()\n",
"response = requests.post(\n",
" \"http://localhost:30020/generate\",\n",
" json={\n",
" \"text\": \"[INST] Give me a simple FastAPI server. Show the python code. [/INST]\",\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 256,\n",
" },\n",
" },\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
" ],\n",
" temperature=0,\n",
" max_tokens=64,\n",
")\n",
"latency = time.time() - tic\n",
"ret = response.json()\n",
"completion_text = ret[\"text\"]\n",
"speed = ret[\"meta_info\"][\"completion_tokens\"] / latency\n",
"\n",
"print_highlight(completion_text)\n",
"print_highlight(f\"speed: {speed:.2f} token/s\")"
"print_highlight(f\"Response: {response}\")"
]
},
{

View File

@@ -38,24 +38,26 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"import openai\n",
"import os\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
"\n",
"\n",
"server_process = execute_shell_command(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0 --grammar-backend xgrammar\"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --grammar-backend xgrammar\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")"
"wait_for_server(f\"http://localhost:{port}\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
]
},
{
@@ -264,7 +266,7 @@
"\n",
"# Make API request\n",
"response = requests.post(\n",
" \"http://localhost:30000/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Here is the information of the capital of France in the JSON format.\\n\",\n",
" \"sampling_params\": {\n",
@@ -309,7 +311,7 @@
"\n",
"# JSON\n",
"response = requests.post(\n",
" \"http://localhost:30000/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Here is the information of the capital of France in the JSON format.\\n\",\n",
" \"sampling_params\": {\n",
@@ -339,7 +341,7 @@
"import requests\n",
"\n",
"response = requests.post(\n",
" \"http://localhost:30000/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Give me the information of the capital of France.\",\n",
" \"sampling_params\": {\n",
@@ -376,7 +378,7 @@
"outputs": [],
"source": [
"response = requests.post(\n",
" \"http://localhost:30000/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"text\": \"Paris is the capital of\",\n",
" \"sampling_params\": {\n",
@@ -395,7 +397,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
"terminate_process(server_process, port)"
]
},
{

View File

@@ -16,13 +16,12 @@ The core features include:
:caption: Getting Started
start/install.md
start/send_request.ipynb
.. toctree::
:maxdepth: 1
:caption: Backend Tutorial
backend/send_request.ipynb
backend/openai_api_completions.ipynb
backend/openai_api_vision.ipynb
backend/openai_api_embeddings.ipynb
@@ -33,7 +32,6 @@ The core features include:
backend/function_calling.ipynb
backend/server_arguments.md
.. toctree::
:maxdepth: 1
:caption: Frontend Tutorial