[CI] Improve Docs CI Efficiency (#3587)
Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
@@ -35,23 +35,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "# EAGLE decoding\n",
-    "from sglang.utils import (\n",
-    "    execute_shell_command,\n",
-    "    wait_for_server,\n",
-    "    terminate_process,\n",
-    "    print_highlight,\n",
-    ")\n",
+    "from sglang.test.test_utils import is_in_ci\n",
     "\n",
-    "server_process = execute_shell_command(\n",
+    "if is_in_ci():\n",
+    "    from patch import launch_server_cmd\n",
+    "else:\n",
+    "    from sglang.utils import launch_server_cmd\n",
+    "\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
     "    \"\"\"\n",
     "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
     "    --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
-    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 --port=30020 --cuda-graph-max-bs 32\n",
+    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64\n",
     "\"\"\"\n",
     ")\n",
     "\n",
-    "wait_for_server(\"http://localhost:30020\")"
+    "wait_for_server(f\"http://localhost:{port}\")"
    ]
   },
   {
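The hunk above replaces the old execute_shell_command + hard-coded port pattern with launch_server_cmd, likely so that docs CI runs do not collide on a fixed port. The diff only shows the helper's contract (start the server command, return the process together with the port it chose), so the following is a minimal illustrative sketch of such a helper under that assumption, not the actual sglang.utils or patch implementation:

import socket
import subprocess


def launch_server_cmd_sketch(command: str):
    # Hypothetical stand-in for launch_server_cmd: pick a free port,
    # start the server, and return (process, port).
    # Ask the OS for a currently free port instead of hard-coding 30020.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        port = s.getsockname()[1]
    # Launch the command with the chosen port appended.
    process = subprocess.Popen(f"{command.strip()} --port {port}", shell=True)
    return process, port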
@@ -62,7 +63,7 @@
    "source": [
     "import openai\n",
     "\n",
-    "client = openai.Client(base_url=\"http://127.0.0.1:30020/v1\", api_key=\"None\")\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
     "\n",
     "response = client.chat.completions.create(\n",
     "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
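Since the client cell now derives base_url from the returned port, a cheap sanity check that the right server is being addressed is to list the models it serves. This assumes the OpenAI-compatible /v1/models route that SGLang exposes; `port` and the running server come from the launch cell above:

import openai

# `port` is returned by launch_server_cmd in the launch cell above.
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")

# Listing models confirms the dynamic port points at the expected server.
for model in client.models.list().data:
    print(model.id)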
@@ -100,25 +101,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "server_process = execute_shell_command(\n",
+    "server_process, port = launch_server_cmd(\n",
     "    \"\"\"\n",
     "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \\\n",
     "    --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
-    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.7 \\\n",
-    "    --enable-torch-compile --cuda-graph-max-bs 2 --port=30020\n",
+    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
+    "    --enable-torch-compile --cuda-graph-max-bs 2\n",
     "\"\"\"\n",
     ")\n",
     "\n",
-    "wait_for_server(\"http://localhost:30020\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Benchmark Script\n",
-    "\n",
-    "The following code example shows how to measure the decoding speed when generating tokens:\n"
+    "wait_for_server(f\"http://localhost:{port}\")"
    ]
   },
   {
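Because this cell starts a second server, the first instance has to be shut down beforehand or the two would contend for GPU memory (the same concern behind lowering --mem-fraction from 0.7 to 0.6). The notebook presumably does this between cells; a plausible transition, using the helpers imported in the first cell, looks like:

# Free the GPU before relaunching with torch.compile enabled.
terminate_process(server_process)

server_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algo EAGLE \
    --speculative-draft lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \
    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \
    --enable-torch-compile --cuda-graph-max-bs 2
"""
)

wait_for_server(f"http://localhost:{port}")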
@@ -127,27 +119,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import time\n",
-    "import requests\n",
+    "import openai\n",
     "\n",
-    "tic = time.time()\n",
-    "response = requests.post(\n",
-    "    \"http://localhost:30020/generate\",\n",
-    "    json={\n",
-    "        \"text\": \"[INST] Give me a simple FastAPI server. Show the python code. [/INST]\",\n",
-    "        \"sampling_params\": {\n",
-    "            \"temperature\": 0,\n",
-    "            \"max_new_tokens\": 256,\n",
-    "        },\n",
-    "    },\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
     ")\n",
-    "latency = time.time() - tic\n",
-    "ret = response.json()\n",
-    "completion_text = ret[\"text\"]\n",
-    "speed = ret[\"meta_info\"][\"completion_tokens\"] / latency\n",
     "\n",
-    "print_highlight(completion_text)\n",
-    "print_highlight(f\"speed: {speed:.2f} token/s\")"
+    "print_highlight(f\"Response: {response}\")"
    ]
   },
   {
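The removed "Benchmark Script" cell measured raw decoding speed through SGLang's native /generate endpoint. Anyone who still wants that measurement can adapt the old code to the dynamic port; below is the same benchmark with only the hard-coded localhost:30020 replaced by the port returned from launch_server_cmd (`port` and print_highlight come from the earlier cells):

import time
import requests

tic = time.time()
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "[INST] Give me a simple FastAPI server. Show the python code. [/INST]",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 256,
        },
    },
)
latency = time.time() - tic

# The /generate response reports completion token counts in meta_info,
# so tokens per second falls out directly.
ret = response.json()
speed = ret["meta_info"]["completion_tokens"] / latency
print_highlight(ret["text"])
print_highlight(f"speed: {speed:.2f} token/s")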