[Doc] Fix SGLang tool parser doc (#9886)

This commit is contained in:
Huapeng Zhou
2025-09-04 09:52:53 -04:00
committed by GitHub
parent ec15c8360e
commit 75ee00112d
14 changed files with 67 additions and 53 deletions
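The change applied across these notebooks is the same in every file: append `--log-level warning` to the `launch_server_cmd` invocation so that only warnings and errors from the server reach the rendered docs. A minimal sketch of the resulting pattern, assuming the `launch_server_cmd` / `wait_for_server` / `terminate_process` helpers from `sglang.utils` that the notebooks already import (the model path is illustrative):

```python
from sglang.utils import launch_server_cmd, terminate_process, wait_for_server

# Quiet the server: only warnings and errors are printed (the default level is info).
server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct "
    "--host 0.0.0.0 --log-level warning"
)
wait_for_server(f"http://localhost:{port}")

# ... issue requests against http://localhost:{port} ...

terminate_process(server_process)
```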

View File

@@ -80,6 +80,7 @@
" --enable-lora \\\n",
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
" --max-loras-per-batch 1 --lora-backend triton \\\n",
" --log-level warning \\\n",
"\"\"\"\n",
")\n",
"\n",
@@ -139,6 +140,7 @@
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
" lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n",
" --max-loras-per-batch 2 --lora-backend triton \\\n",
" --log-level warning \\\n",
"\"\"\"\n",
")\n",
"\n",
@@ -215,6 +217,7 @@
" --max-loras-per-batch 2 --lora-backend triton \\\n",
" --max-lora-rank 256\n",
" --lora-target-modules all\n",
" --log-level warning\n",
" \"\"\"\n",
")\n",
"\n",
@@ -417,6 +420,7 @@
" {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n",
" {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n",
" lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora\n",
" --log-level warning\n",
" \"\"\"\n",
")\n",
"\n",

View File

@@ -67,7 +67,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n",
" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"

View File

@@ -70,7 +70,7 @@
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n",
" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8\n",
" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -126,7 +126,7 @@
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
" --enable-torch-compile --torch-compile-max-bs 2\n",
" --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -186,7 +186,7 @@
"python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 \n",
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -242,7 +242,7 @@
"python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algorithm EAGLE3 \\\n",
" --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n",
" --cuda-graph-max-bs 2 --dtype float16\n",
" --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -297,7 +297,7 @@
" \"\"\"\n",
" python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n",
" --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n",
" --mem-fraction 0.5\n",
" --mem-fraction 0.5 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",

View File

@@ -51,7 +51,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",

View File

@@ -47,7 +47,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n",
" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",

View File

@@ -4,11 +4,29 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tool and Function Calling\n",
"# Tool Parser\n",
"\n",
"This guide demonstrates how to use SGLangs [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Currently supported parsers:\n",
"\n",
"| Parser | Supported Models | Notes |\n",
"|---|---|---|\n",
"| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
"| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
"| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
"| `qwen25` | Qwen 2.5 (e.g. `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`) and QwQ (i.e. `Qwen/QwQ-32B`) | For QwQ, reasoning parser can be enabled together with tool call parser. See [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html). |\n",
"| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | |\n",
"| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
"| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
"| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -35,7 +53,7 @@
"from openai import OpenAI\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\" # qwen25\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n",
")\n",
"wait_for_server(f\"http://localhost:{port}\")"
]
@@ -44,16 +62,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `--tool-call-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
"\n",
"- llama3: Llama 3.1 / 3.2 / 3.3 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.3-70B-Instruct).\n",
"- llama4: Llama 4 (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct).\n",
"- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n",
"Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n",
"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n",
"- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n",
"- gpt-oss: GPT-OSS (e.g., openai/gpt-oss-120b, openai/gpt-oss-20b, lmsys/gpt-oss-120b-bf16, lmsys/gpt-oss-20b-bf16). Note: The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as role=\"tool\" messages, which enables the model to generate the final content.\n",
"- kimi_k2: moonshotai/Kimi-K2-Instruct"
"Note that `--tool-call-parser` defines the parser used to interpret responses."
]
},
{
@@ -169,11 +178,11 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
"print(response_non_stream)\n",
"print_highlight(response_non_stream)\n",
"print_highlight(\"==== content ====\")\n",
"print(response_non_stream.choices[0].message.content)\n",
"print_highlight(response_non_stream.choices[0].message.content)\n",
"print_highlight(\"==== tool_calls ====\")\n",
"print(response_non_stream.choices[0].message.tool_calls)"
"print_highlight(response_non_stream.choices[0].message.tool_calls)"
]
},
{
@@ -234,11 +243,11 @@
" if chunk.choices[0].delta.tool_calls:\n",
" tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
"print_highlight(\"==== Text ====\")\n",
"print(texts)\n",
"print_highlight(texts)\n",
"\n",
"print_highlight(\"==== Tool Call ====\")\n",
"for tool_call in tool_calls:\n",
" print(tool_call)"
" print_highlight(tool_call)"
]
},
{
@@ -350,10 +359,10 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
"print(final_response)\n",
"print_highlight(final_response)\n",
"\n",
"print_highlight(\"==== Text ====\")\n",
"print(final_response.choices[0].message.content)"
"print_highlight(final_response.choices[0].message.content)"
]
},
{
@@ -396,7 +405,7 @@
"}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print_highlight(\"==== Response ====\")\n",
"print(gen_response)\n",
"print_highlight(gen_response)\n",
"\n",
"# parse the response\n",
"parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
@@ -463,8 +472,8 @@
"result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
"generated_text = result[\"text\"] # Assume there is only one prompt\n",
"\n",
"print(\"=== Offline Engine Output Text ===\")\n",
"print(generated_text)\n",
"print_highlight(\"=== Offline Engine Output Text ===\")\n",
"print_highlight(generated_text)\n",
"\n",
"\n",
"# 2) Parse using FunctionCallParser\n",
@@ -485,13 +494,13 @@
"parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
"normal_text, calls = parser.parse_non_stream(generated_text)\n",
"\n",
"print(\"=== Parsing Result ===\")\n",
"print_highlight(\"=== Parsing Result ===\")\n",
"print(\"Normal text portion:\", normal_text)\n",
"print(\"Function call portion:\")\n",
"print_highlight(\"Function call portion:\")\n",
"for call in calls:\n",
" # call: ToolCallItem\n",
" print(f\" - tool name: {call.name}\")\n",
" print(f\" parameters: {call.parameters}\")\n",
" print_highlight(f\" - tool name: {call.name}\")\n",
" print_highlight(f\" parameters: {call.parameters}\")\n",
"\n",
"# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
]
@@ -537,7 +546,7 @@
"\n",
"# Start a new server session for tool choice examples\n",
"server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n",
")\n",
"wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
"\n",
@@ -628,8 +637,8 @@
"\n",
"if response_specific.choices[0].message.tool_calls:\n",
" tool_call = response_specific.choices[0].message.tool_calls[0]\n",
" print(f\"Called function: {tool_call.function.name}\")\n",
" print(f\"Arguments: {tool_call.function.arguments}\")"
" print_highlight(f\"Called function: {tool_call.function.name}\")\n",
" print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
]
},
{
@@ -682,7 +691,7 @@
"import openai\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1\" # llama-3.2-1b-instruct\n",
" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n",
")\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"\n",
@@ -762,7 +771,7 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
"print(response_non_stream)\n",
"print_highlight(response_non_stream)\n",
"\n",
"response_stream = client.chat.completions.create(\n",
" model=model_name,\n",
@@ -785,11 +794,11 @@
"\n",
"print_highlight(\"Streaming Response:\")\n",
"print_highlight(\"==== Text ====\")\n",
"print(texts)\n",
"print_highlight(texts)\n",
"\n",
"print_highlight(\"==== Tool Call ====\")\n",
"for tool_call in tool_calls:\n",
" print(tool_call)\n",
" print_highlight(tool_call)\n",
"\n",
"terminate_process(server_process)"
]
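Beyond the cosmetic `print` → `print_highlight` changes, the renamed notebook's flow stays the same: choose a parser from the table, pass it via `--tool-call-parser` at launch, then read structured `tool_calls` off the response. A compressed sketch of that round trip against the qwen25 server started above (the weather tool schema is illustrative):

```python
from openai import OpenAI

from sglang.utils import print_highlight

client = OpenAI(base_url=f"http://localhost:{port}/v1", api_key="None")
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather for a given city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]
response = client.chat.completions.create(
    model="Qwen/Qwen2.5-7B-Instruct",
    messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
    tools=tools,
)
# The qwen25 tool parser converts the model's tool-call markup into tool_calls.
print_highlight(response.choices[0].message.tool_calls)
```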

View File

@@ -43,7 +43,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
@@ -267,7 +267,7 @@
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
" --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -316,7 +316,7 @@
"reranker_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding\n",
" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -376,7 +376,7 @@
"\n",
"reward_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -441,7 +441,7 @@
"outputs": [],
"source": [
"expert_record_server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat\"\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"

View File

@@ -36,7 +36,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",

View File

@@ -33,7 +33,7 @@
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
" --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",

View File

@@ -35,7 +35,7 @@
"\n",
"vision_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct\n",
"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",

View File

@@ -34,7 +34,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
" --host 0.0.0.0\n",
" --host 0.0.0.0 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",

View File

@@ -38,7 +38,7 @@ The core features include:
advanced_features/speculative_decoding.ipynb
advanced_features/structured_outputs.ipynb
advanced_features/structured_outputs_for_reasoning_models.ipynb
advanced_features/function_calling.ipynb
advanced_features/tool_parser.ipynb
advanced_features/separate_reasoning.ipynb
advanced_features/quantization.md
advanced_features/lora.ipynb

View File

@@ -39,7 +39,7 @@
"from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
@@ -395,7 +395,7 @@
"outputs": [],
"source": [
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",

View File

@@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
To reduce the log length, we set the log level to warning for the server; the default log level is info.
We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
"""
)