[CI] Improve Docs CI Efficiency (#3587)

Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
Shi Shuai (committed by GitHub), 2025-02-15 03:57:00 +00:00
parent 862dd76c76
commit 7443197a63
19 changed files with 366 additions and 231 deletions


@@ -35,23 +35,24 @@
"metadata": {},
"outputs": [],
"source": [
"# EAGLE decoding\n",
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"server_process = execute_shell_command(\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
" --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 --port=30020 --cuda-graph-max-bs 32\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -62,7 +63,7 @@
"source": [
"import openai\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -100,25 +101,16 @@
"metadata": {},
"outputs": [],
"source": [
"server_process = execute_shell_command(\n",
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
" --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 \\\n",
" --enable-torch-compile --cuda-graph-max-bs 2 --port=30020\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
" --enable-torch-compile --cuda-graph-max-bs 2\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark Script\n",
"\n",
"The following code example shows how to measure the decoding speed when generating tokens:\n"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -127,27 +119,20 @@
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import requests\n",
"import openai\n",
"\n",
"tic = time.time()\n",
"response = requests.post(\n",
" \"http://localhost:30020/generate\",\n",
" json={\n",
" \"text\": \"[INST] Give me a simple FastAPI server. Show the python code. [/INST]\",\n",
" \"sampling_params\": {\n",
" \"temperature\": 0,\n",
" \"max_new_tokens\": 256,\n",
" },\n",
" },\n",
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
" ],\n",
" temperature=0,\n",
" max_tokens=64,\n",
")\n",
"latency = time.time() - tic\n",
"ret = response.json()\n",
"completion_text = ret[\"text\"]\n",
"speed = ret[\"meta_info\"][\"completion_tokens\"] / latency\n",
"\n",
"print_highlight(completion_text)\n",
"print_highlight(f\"speed: {speed:.2f} token/s\")"
"print_highlight(f\"Response: {response}\")"
]
},
{
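The notebooks now gate the launcher on `is_in_ci()`: in CI they import `launch_server_cmd` from a docs-local `patch` module instead of `sglang.utils`, and the launcher returns `(server_process, port)` so the hard-coded `--port=30020` URLs can be dropped. The patch module itself is not shown in these hunks; the sketch below is only an illustration of how such a shim could wrap the stock launcher to keep CI runs lightweight. The flag names and values in `_CI_FLAGS` are assumptions for illustration, not the settings used by this PR.

# patch.py -- illustrative sketch only; the real patch module in this PR is not shown here.
# Assumption: CI wants every docs notebook to launch servers with conservative
# resource limits without editing each launch command by hand.
from sglang.utils import launch_server_cmd as _launch_server_cmd

# Assumed extra flags for CI runs (values are illustrative, not the PR's settings).
_CI_FLAGS = "--mem-fraction-static 0.6 --cuda-graph-max-bs 4"

def launch_server_cmd(command: str, *args, **kwargs):
    """Drop-in replacement for sglang.utils.launch_server_cmd when is_in_ci() is true.

    Appends resource-limiting flags to the launch command, then delegates to the
    stock launcher, which (as the diff shows) returns (server_process, port).
    """
    return _launch_server_cmd(f"{command} {_CI_FLAGS}", *args, **kwargs)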