[docs] Refactor, remove compiled results and add gpt-oss (#9613)

Co-authored-by: zhaochenyang20 <zhaochenyang20@gmail.com>
2025-08-25 15:27:06 -07:00
parent a0a77d937b
commit 9b08d975a0
5 changed files with 176 additions and 621 deletions
--- a/docs/advanced_features/function_calling.ipynb
+++ b/docs/advanced_features/function_calling.ipynb
@@ -51,7 +51,8 @@
    "- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n",
    "Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n",
    "- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n",
-    "- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n"
+    "- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n",
+    "- gpt-oss: GPT-OSS (e.g., openai/gpt-oss-120b, openai/gpt-oss-20b, lmsys/gpt-oss-120b-bf16, lmsys/gpt-oss-20b-bf16). Note: The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as role=\"tool\" messages, which enables the model to generate the final content."
   ]
  },
  {
@@ -354,6 +355,155 @@
    "print(final_response.choices[0].message.content)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Native API and SGLang Runtime (SRT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "import requests\n",
+    "\n",
+    "# generate an answer\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct\")\n",
+    "\n",
+    "messages = get_messages()\n",
+    "\n",
+    "input = tokenizer.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True,\n",
+    "    tools=tools,\n",
+    ")\n",
+    "\n",
+    "gen_url = f\"http://localhost:{port}/generate\"\n",
+    "gen_data = {\n",
+    "    \"text\": input,\n",
+    "    \"sampling_params\": {\n",
+    "        \"skip_special_tokens\": False,\n",
+    "        \"max_new_tokens\": 1024,\n",
+    "        \"temperature\": 0,\n",
+    "        \"top_p\": 0.95,\n",
+    "    },\n",
+    "}\n",
+    "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
+    "print_highlight(\"==== Response ====\")\n",
+    "print(gen_response)\n",
+    "\n",
+    "# parse the response\n",
+    "parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
+    "\n",
+    "function_call_input = {\n",
+    "    \"text\": gen_response,\n",
+    "    \"tool_call_parser\": \"qwen25\",\n",
+    "    \"tools\": tools,\n",
+    "}\n",
+    "\n",
+    "function_call_response = requests.post(parse_url, json=function_call_input)\n",
+    "function_call_response_json = function_call_response.json()\n",
+    "\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print(function_call_response_json[\"normal_text\"])\n",
+    "print_highlight(\"==== Calls ====\")\n",
+    "print(\"function name: \", function_call_response_json[\"calls\"][0][\"name\"])\n",
+    "print(\"function arguments: \", function_call_response_json[\"calls\"][0][\"parameters\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sglang as sgl\n",
+    "from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
+    "from sglang.srt.managers.io_struct import Tool, Function\n",
+    "\n",
+    "llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
+    "tokenizer = llm.tokenizer_manager.tokenizer\n",
+    "input_ids = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=True, add_generation_prompt=True, tools=tools\n",
+    ")\n",
+    "\n",
+    "# Note: when using the gpt-oss tool parser, add \"no_stop_trim\": True to the\n",
+    "# sampling params below so that the tool call token <call> is not trimmed.\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"max_new_tokens\": 1024,\n",
+    "    \"temperature\": 0,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"skip_special_tokens\": False,\n",
+    "}\n",
+    "\n",
+    "# 1) Offline generation\n",
+    "result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
+    "generated_text = result[\"text\"]  # Assume there is only one prompt\n",
+    "\n",
+    "print(\"=== Offline Engine Output Text ===\")\n",
+    "print(generated_text)\n",
+    "\n",
+    "\n",
+    "# 2) Parse using FunctionCallParser\n",
+    "def convert_dict_to_tool(tool_dict: dict) -> Tool:\n",
+    "    function_dict = tool_dict.get(\"function\", {})\n",
+    "    return Tool(\n",
+    "        type=tool_dict.get(\"type\", \"function\"),\n",
+    "        function=Function(\n",
+    "            name=function_dict.get(\"name\"),\n",
+    "            description=function_dict.get(\"description\"),\n",
+    "            parameters=function_dict.get(\"parameters\"),\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "tools = [convert_dict_to_tool(raw_tool) for raw_tool in tools]\n",
+    "\n",
+    "parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
+    "normal_text, calls = parser.parse_non_stream(generated_text)\n",
+    "\n",
+    "print(\"=== Parsing Result ===\")\n",
+    "print(\"Normal text portion:\", normal_text)\n",
+    "print(\"Function call portion:\")\n",
+    "for call in calls:\n",
+    "    # call: ToolCallItem\n",
+    "    print(f\"  - tool name: {call.name}\")\n",
+    "    print(f\"    parameters: {call.parameters}\")\n",
+    "\n",
+    "# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.shutdown()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -490,152 +640,6 @@
    "terminate_process(server_process_tool_choice)"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Native API and SGLang Runtime (SRT)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from transformers import AutoTokenizer\n",
-    "import requests\n",
-    "\n",
-    "# generate an answer\n",
-    "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct\")\n",
-    "\n",
-    "messages = get_messages()\n",
-    "\n",
-    "input = tokenizer.apply_chat_template(\n",
-    "    messages,\n",
-    "    tokenize=False,\n",
-    "    add_generation_prompt=True,\n",
-    "    tools=tools,\n",
-    ")\n",
-    "\n",
-    "gen_url = f\"http://localhost:{port}/generate\"\n",
-    "gen_data = {\n",
-    "    \"text\": input,\n",
-    "    \"sampling_params\": {\n",
-    "        \"skip_special_tokens\": False,\n",
-    "        \"max_new_tokens\": 1024,\n",
-    "        \"temperature\": 0,\n",
-    "        \"top_p\": 0.95,\n",
-    "    },\n",
-    "}\n",
-    "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
-    "print_highlight(\"==== Response ====\")\n",
-    "print(gen_response)\n",
-    "\n",
-    "# parse the response\n",
-    "parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
-    "\n",
-    "function_call_input = {\n",
-    "    \"text\": gen_response,\n",
-    "    \"tool_call_parser\": \"qwen25\",\n",
-    "    \"tools\": tools,\n",
-    "}\n",
-    "\n",
-    "function_call_response = requests.post(parse_url, json=function_call_input)\n",
-    "function_call_response_json = function_call_response.json()\n",
-    "\n",
-    "print_highlight(\"==== Text ====\")\n",
-    "print(function_call_response_json[\"normal_text\"])\n",
-    "print_highlight(\"==== Calls ====\")\n",
-    "print(\"function name: \", function_call_response_json[\"calls\"][0][\"name\"])\n",
-    "print(\"function arguments: \", function_call_response_json[\"calls\"][0][\"parameters\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "terminate_process(server_process)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Offline Engine API"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sglang as sgl\n",
-    "from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
-    "from sglang.srt.managers.io_struct import Tool, Function\n",
-    "\n",
-    "llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
-    "tokenizer = llm.tokenizer_manager.tokenizer\n",
-    "input_ids = tokenizer.apply_chat_template(\n",
-    "    messages, tokenize=True, add_generation_prompt=True, tools=tools\n",
-    ")\n",
-    "\n",
-    "sampling_params = {\n",
-    "    \"max_new_tokens\": 1024,\n",
-    "    \"temperature\": 0,\n",
-    "    \"top_p\": 0.95,\n",
-    "    \"skip_special_tokens\": False,\n",
-    "}\n",
-    "\n",
-    "# 1) Offline generation\n",
-    "result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
-    "generated_text = result[\"text\"]  # Assume there is only one prompt\n",
-    "\n",
-    "print(\"=== Offline Engine Output Text ===\")\n",
-    "print(generated_text)\n",
-    "\n",
-    "\n",
-    "# 2) Parse using FunctionCallParser\n",
-    "def convert_dict_to_tool(tool_dict: dict) -> Tool:\n",
-    "    function_dict = tool_dict.get(\"function\", {})\n",
-    "    return Tool(\n",
-    "        type=tool_dict.get(\"type\", \"function\"),\n",
-    "        function=Function(\n",
-    "            name=function_dict.get(\"name\"),\n",
-    "            description=function_dict.get(\"description\"),\n",
-    "            parameters=function_dict.get(\"parameters\"),\n",
-    "        ),\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "tools = [convert_dict_to_tool(raw_tool) for raw_tool in tools]\n",
-    "\n",
-    "parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
-    "normal_text, calls = parser.parse_non_stream(generated_text)\n",
-    "\n",
-    "print(\"=== Parsing Result ===\")\n",
-    "print(\"Normal text portion:\", normal_text)\n",
-    "print(\"Function call portion:\")\n",
-    "for call in calls:\n",
-    "    # call: ToolCallItem\n",
-    "    print(f\"  - tool name: {call.name}\")\n",
-    "    print(f\"    parameters: {call.parameters}\")\n",
-    "\n",
-    "# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm.shutdown()"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -657,6 +661,8 @@
    "\n",
    "For more information, refer to Meta’s documentation on  [Zero shot function calling](https://github.com/meta-llama/llama-models/blob/main/models/llama4/prompt_format.md#zero-shot-function-calling---system-message).\n",
    "\n",
+    "Note that this feature is still under development on NVIDIA Blackwell GPUs.\n",
+    "\n",
    "### How to enable\n",
    "- Launch the server with `--tool-call-parser pythonic`\n",
    "- You may also specify --chat-template with the improved template for the model (e.g., `--chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja`).\n",