Add endpoints to dump selected expert ids (#4435)

Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
2025-03-24 21:34:19 -07:00
parent 6b7038babd
commit 199bb01d00
10 changed files with 328 additions and 2 deletions
--- a/docs/backend/native_api.ipynb
+++ b/docs/backend/native_api.ipynb
@@ -17,6 +17,9 @@
    "- `/update_weights`\n",
    "- `/encode`(embedding model)\n",
    "- `/classify`(reward model)\n",
+    "- `/start_expert_distribution_record`\n",
+    "- `/stop_expert_distribution_record`\n",
+    "- `/dump_expert_distribution_record`\n",
    "\n",
    "We mainly use `requests` to test these APIs in the following examples. You can also use `curl`."
   ]
@@ -362,6 +365,67 @@
    "terminate_process(reward_process)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Capture expert selection distribution in MoE models\n",
+    "\n",
+    "SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expert_record_server_process, port = launch_server_cmd(\n",
+    "    \"python -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(f\"http://localhost:{port}/start_expert_distribution_record\")\n",
+    "print_highlight(response)\n",
+    "\n",
+    "url = f\"http://localhost:{port}/generate\"\n",
+    "data = {\"text\": \"What is the capital of France?\"}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.json())\n",
+    "\n",
+    "response = requests.post(f\"http://localhost:{port}/stop_expert_distribution_record\")\n",
+    "print_highlight(response)\n",
+    "\n",
+    "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
+    "print_highlight(response)\n",
+    "\n",
+    "import glob\n",
+    "\n",
+    "output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
+    "with open(output_file, \"r\") as f:\n",
+    "    print_highlight(\"Content of dumped record:\")\n",
+    "    for line in f:\n",
+    "        print_highlight(line.strip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(expert_record_server_process)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},