Add endpoints to dump selected expert ids (#4435)
Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
This commit is contained in:
@@ -17,6 +17,9 @@
|
||||
"- `/update_weights`\n",
|
||||
"- `/encode`(embedding model)\n",
|
||||
"- `/classify`(reward model)\n",
|
||||
"- `/start_expert_distribution_record`\n",
|
||||
"- `/stop_expert_distribution_record`\n",
|
||||
"- `/dump_expert_distribution_record`\n",
|
||||
"\n",
|
||||
"We mainly use `requests` to test these APIs in the following examples. You can also use `curl`."
|
||||
]
|
||||
@@ -362,6 +365,67 @@
|
||||
"terminate_process(reward_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Capture expert selection distribution in MoE models\n",
|
||||
"\n",
|
||||
"SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"expert_record_server_process, port = launch_server_cmd(\n",
|
||||
" \"python -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"wait_for_server(f\"http://localhost:{port}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = requests.post(f\"http://localhost:{port}/start_expert_distribution_record\")\n",
|
||||
"print_highlight(response)\n",
|
||||
"\n",
|
||||
"url = f\"http://localhost:{port}/generate\"\n",
|
||||
"data = {\"text\": \"What is the capital of France?\"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
"print_highlight(response.json())\n",
|
||||
"\n",
|
||||
"response = requests.post(f\"http://localhost:{port}/stop_expert_distribution_record\")\n",
|
||||
"print_highlight(response)\n",
|
||||
"\n",
|
||||
"response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
|
||||
"print_highlight(response)\n",
|
||||
"\n",
|
||||
"import glob\n",
|
||||
"\n",
|
||||
"output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
|
||||
"with open(output_file, \"r\") as f:\n",
|
||||
" print_highlight(\"Content of dumped record:\")\n",
|
||||
" for line in f:\n",
|
||||
" print_highlight(line.strip())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"terminate_process(expert_record_server_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
Reference in New Issue
Block a user