[CI] Improve Docs CI Efficiency (#3587)
Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
@@ -35,23 +35,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "# EAGLE decoding\n",
-    "from sglang.utils import (\n",
-    "    execute_shell_command,\n",
-    "    wait_for_server,\n",
-    "    terminate_process,\n",
-    "    print_highlight,\n",
-    ")\n",
+    "from sglang.test.test_utils import is_in_ci\n",
     "\n",
-    "server_process = execute_shell_command(\n",
+    "if is_in_ci():\n",
+    "    from patch import launch_server_cmd\n",
+    "else:\n",
+    "    from sglang.utils import launch_server_cmd\n",
+    "\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
     "    \"\"\"\n",
     "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
     "    --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
-    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 --port=30020 --cuda-graph-max-bs 32\n",
+    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64\n",
     "\"\"\"\n",
     ")\n",
     "\n",
-    "wait_for_server(\"http://localhost:30020\")"
+    "wait_for_server(f\"http://localhost:{port}\")"
    ]
   },
   {
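The hunk above replaces the old execute_shell_command + hard-coded port pattern with launch_server_cmd, likely so that docs CI runs do not collide on a fixed port. The diff only shows the helper's contract (start the server command, return the process together with the port it chose), so the following is a minimal illustrative sketch of such a helper under that assumption, not the actual sglang.utils or patch implementation:

import socket
import subprocess


def launch_server_cmd_sketch(command: str):
    # Hypothetical stand-in for launch_server_cmd: pick a free port,
    # start the server, and return (process, port).
    # Ask the OS for a currently free port instead of hard-coding 30020.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        port = s.getsockname()[1]
    # Launch the command with the chosen port appended.
    process = subprocess.Popen(f"{command.strip()} --port {port}", shell=True)
    return process, port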
@@ -62,7 +63,7 @@
    "source": [
     "import openai\n",
     "\n",
-    "client = openai.Client(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
     "\n",
     "response = client.chat.completions.create(\n",
     "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
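Since the client cell now derives base_url from the returned port, a cheap sanity check that the right server is being addressed is to list the models it serves. This assumes the OpenAI-compatible /v1/models route that SGLang exposes; `port` and the running server come from the launch cell above:

import openai

# `port` is returned by launch_server_cmd in the launch cell above.
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")

# Listing models confirms the dynamic port points at the expected server.
for model in client.models.list().data:
    print(model.id)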
@@ -100,25 +101,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "server_process = execute_shell_command(\n",
+    "server_process, port = launch_server_cmd(\n",
     "    \"\"\"\n",
     "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
     "    --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
-    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 \\\n",
-    "    --enable-torch-compile --cuda-graph-max-bs 2 --port=30020\n",
+    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
+    "    --enable-torch-compile --cuda-graph-max-bs 2\n",
     "\"\"\"\n",
     ")\n",
     "\n",
-    "wait_for_server(\"http://localhost:30020\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Benchmark Script\n",
-    "\n",
-    "The following code example shows how to measure the decoding speed when generating tokens:\n"
+    "wait_for_server(f\"http://localhost:{port}\")"
    ]
   },
   {
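Because this cell starts a second server, the first instance has to be shut down beforehand or the two would contend for GPU memory (the same concern behind lowering --mem-fraction from 0.7 to 0.6). The notebook presumably does this between cells; a plausible transition, using the helpers imported in the first cell, looks like:

# Free the GPU before relaunching with torch.compile enabled.
terminate_process(server_process)

server_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \
    --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \
    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \
    --enable-torch-compile --cuda-graph-max-bs 2
"""
)

wait_for_server(f"http://localhost:{port}")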
@@ -127,27 +119,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import time\n",
-    "import requests\n",
+    "import openai\n",
     "\n",
-    "tic = time.time()\n",
-    "response = requests.post(\n",
-    "    \"http://localhost:30020/generate\",\n",
-    "    json={\n",
-    "        \"text\": \"[INST] Give me a simple FastAPI server. Show the python code. [/INST]\",\n",
-    "        \"sampling_params\": {\n",
-    "            \"temperature\": 0,\n",
-    "            \"max_new_tokens\": 256,\n",
-    "        },\n",
-    "    },\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
     ")\n",
-    "latency = time.time() - tic\n",
-    "ret = response.json()\n",
-    "completion_text = ret[\"text\"]\n",
-    "speed = ret[\"meta_info\"][\"completion_tokens\"] / latency\n",
     "\n",
-    "print_highlight(completion_text)\n",
-    "print_highlight(f\"speed: {speed:.2f} token/s\")"
+    "print_highlight(f\"Response: {response}\")"
    ]
   },
   {
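The removed "Benchmark Script" cell measured raw decoding speed through SGLang's native /generate endpoint. Anyone who still wants that measurement can adapt the old code to the dynamic port; below is the same benchmark with only the hard-coded localhost:30020 replaced by the port returned from launch_server_cmd (`port` and print_highlight come from the earlier cells):

import time
import requests

tic = time.time()
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "[INST] Give me a simple FastAPI server. Show the python code. [/INST]",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 256,
        },
    },
)
latency = time.time() - tic

# The /generate response reports completion token counts in meta_info,
# so tokens per second falls out directly.
ret = response.json()
speed = ret["meta_info"]["completion_tokens"] / latency
print_highlight(ret["text"])
print_highlight(f"speed: {speed:.2f} token/s")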