Support XiaomiMiMo inference with mtp (#6059)

2025-05-23 05:14:49 +08:00
parent 0b07c4a99f
commit a6ae3af15e
6 changed files with 344 additions and 6 deletions
--- a/docs/backend/speculative_decoding.ipynb
+++ b/docs/backend/speculative_decoding.ipynb
@@ -283,6 +283,60 @@
    "terminate_process(server_process)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi Token Prediction\n",
+    "\n",
+    "We support [MTP(Multi-Token Prediction)](https://arxiv.org/pdf/2404.19737) in SGLang by using speculative decoding. We use Xiaomi/MiMo-7B-RL model as example here (deepseek mtp usage refer to [deepseek doc](../references/deepseek.md#multi-token-prediction))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "    python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n",
+    "    --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n",
+    "    --mem-fraction 0.5\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
+    "\n",
+    "data = {\n",
+    "    \"model\": \"XiaomiMiMo/MiMo-7B-RL\",\n",
+    "    \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
+    "}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},