[Feature] Add /tokenize and /detokenize OpenAI compatible endpoints (#9545)

2025-10-08 10:08:48 +05:30
parent edd86b8853
commit 7c3f07dbcb
5 changed files with 434 additions and 1 deletions
--- a/docs/basic_usage/native_api.ipynb
+++ b/docs/basic_usage/native_api.ipynb
@@ -21,6 +21,8 @@
    "- `/start_expert_distribution_record`\n",
    "- `/stop_expert_distribution_record`\n",
    "- `/dump_expert_distribution_record`\n",
+    "- `/tokenize`\n",
+    "- `/detokenize`\n",
    "- A full list of these APIs can be found at [http_server.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py)\n",
    "\n",
    "We mainly use `requests` to test these APIs in the following examples. You can also use `curl`.\n"
@@ -477,6 +479,104 @@
   "source": [
    "terminate_process(expert_record_server_process)"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tokenize/Detokenize Example (Round Trip)\n",
+    "\n",
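+    "This example demonstrates how to use the `/tokenize` and `/detokenize` endpoints together. We first tokenize a string, then detokenize the resulting token IDs to reconstruct the original text. The tokenize request disables `add_special_tokens` and the detokenize request enables `skip_special_tokens`, which keeps special tokens out of the round trip so the reconstructed text can be compared directly with the input. This workflow is useful when you need to handle tokenization externally but still leverage the server for detokenization."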
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer_free_server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from sglang.utils import print_highlight\n",
+    "\n",
+    "base_url = f\"http://localhost:{port}\"\n",
+    "tokenize_url = f\"{base_url}/tokenize\"\n",
+    "detokenize_url = f\"{base_url}/detokenize\"\n",
+    "\n",
+    "model_name = \"qwen/qwen2.5-0.5b-instruct\"\n",
+    "input_text = \"SGLang provides efficient tokenization endpoints.\"\n",
+    "print_highlight(f\"Original Input Text:\\n'{input_text}'\")\n",
+    "\n",
+    "# --- tokenize the input text ---\n",
+    "tokenize_payload = {\n",
+    "    \"model\": model_name,\n",
+    "    \"prompt\": input_text,\n",
+    "    \"add_special_tokens\": False,\n",
+    "}\n",
+    "try:\n",
+    "    tokenize_response = requests.post(tokenize_url, json=tokenize_payload)\n",
+    "    tokenize_response.raise_for_status()\n",
+    "    tokenization_result = tokenize_response.json()\n",
+    "    token_ids = tokenization_result.get(\"tokens\")\n",
+    "\n",
+    "    if not token_ids:\n",
+    "        raise ValueError(\"Tokenization returned empty tokens.\")\n",
+    "\n",
+    "    print_highlight(f\"\\nTokenized Output (IDs):\\n{token_ids}\")\n",
+    "    print_highlight(f\"Token Count: {tokenization_result.get('count')}\")\n",
+    "    print_highlight(f\"Max Model Length: {tokenization_result.get('max_model_len')}\")\n",
+    "\n",
+    "    # --- detokenize the obtained token IDs ---\n",
+    "    detokenize_payload = {\n",
+    "        \"model\": model_name,\n",
+    "        \"tokens\": token_ids,\n",
+    "        \"skip_special_tokens\": True,\n",
+    "    }\n",
+    "\n",
+    "    detokenize_response = requests.post(detokenize_url, json=detokenize_payload)\n",
+    "    detokenize_response.raise_for_status()\n",
+    "    detokenization_result = detokenize_response.json()\n",
+    "    reconstructed_text = detokenization_result.get(\"text\")\n",
+    "\n",
+    "    print_highlight(f\"\\nDetokenized Output (Text):\\n'{reconstructed_text}'\")\n",
+    "\n",
+    "    if input_text == reconstructed_text:\n",
+    "        print_highlight(\n",
+    "            \"\\nRound Trip Successful: Original and reconstructed text match.\"\n",
+    "        )\n",
+    "    else:\n",
+    "        print_highlight(\n",
+    "            \"\\nRound Trip Mismatch: Original and reconstructed text differ.\"\n",
+    "        )\n",
+    "\n",
+    "except requests.exceptions.RequestException as e:\n",
+    "    print_highlight(f\"\\nHTTP Request Error: {e}\")\n",
+    "except Exception as e:\n",
+    "    print_highlight(f\"\\nAn error occurred: {e}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(tokenizer_free_server_process)"
+   ]
  }
 ],
 "metadata": {
@@ -493,5 +593,5 @@
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }