Improve: Token-In Token-Out Usage for RLHF (#2843)

2025-01-11 23:14:26 +00:00
parent 197cbf9bab
commit c4f9707e16
12 changed files with 166 additions and 128 deletions
--- a/docs/backend/native_api.ipynb
+++ b/docs/backend/native_api.ipynb
@@ -348,6 +348,76 @@
   "source": [
    "terminate_process(reward_process)"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Skip Tokenizer and Detokenizer\n",
+    "\n",
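+    "SGLang Runtime can also skip tokenization and detokenization: launch the server with `--skip-tokenizer-init`, send token IDs as `input_ids`, and read the generated token IDs back from the response. This is useful when integrating with RLHF workflows."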
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer_free_server_process = execute_shell_command(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010 --skip-tokenizer-init\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(\"http://localhost:30010\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# The server was launched with --skip-tokenizer-init, so tokenization and\n",
+    "# detokenization are done here on the client side.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-3.2-1B-Instruct\")\n",
+    "\n",
+    "input_text = \"What is the capital of France?\"\n",
+    "\n",
+    "input_tokens = tokenizer.encode(input_text)\n",
+    "print_highlight(f\"Input Text: {input_text}\")\n",
+    "print_highlight(f\"Tokenized Input: {input_tokens}\")\n",
+    "\n",
+    "# Send token IDs (not text) to the /generate endpoint.\n",
+    "response = requests.post(\n",
+    "    \"http://localhost:30010/generate\",\n",
+    "    json={\n",
+    "        \"input_ids\": input_tokens,\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 256,\n",
+    "            \"stop_token_ids\": [tokenizer.eos_token_id],\n",
+    "        },\n",
+    "        \"stream\": False,\n",
+    "    },\n",
+    ")\n",
+    "output = response.json()\n",
+    "output_tokens = output[\"token_ids\"]\n",
+    "\n",
+    "# Decode the returned token IDs locally; special tokens are kept visible.\n",
+    "output_text = tokenizer.decode(output_tokens, skip_special_tokens=False)\n",
+    "print_highlight(f\"Tokenized Output: {output_tokens}\")\n",
+    "print_highlight(f\"Decoded Output: {output_text}\")\n",
+    "print_highlight(f\"Finish Reason: {output['meta_info']['finish_reason']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(tokenizer_free_server_process)"
+   ]
  }
 ],
 "metadata": {
--- a/docs/backend/structured_outputs.ipynb
+++ b/docs/backend/structured_outputs.ipynb
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Structured Outputs (JSON, Regex, EBNF)"
+    "# Structured Outputs"
   ]
  },
  {
@@ -43,6 +43,10 @@
    "    print_highlight,\n",
    ")\n",
    "import openai\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "    \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0 --grammar-backend xgrammar\"\n",