diff --git a/docs/advanced_features/lora.ipynb b/docs/advanced_features/lora.ipynb index cccf9d749..1925baffc 100644 --- a/docs/advanced_features/lora.ipynb +++ b/docs/advanced_features/lora.ipynb @@ -80,6 +80,7 @@ " --enable-lora \\\n", " --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n", " --max-loras-per-batch 1 --lora-backend triton \\\n", + " --log-level warning \\\n", "\"\"\"\n", ")\n", "\n", @@ -139,6 +140,7 @@ " --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n", " lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n", " --max-loras-per-batch 2 --lora-backend triton \\\n", + " --log-level warning \\\n", "\"\"\"\n", ")\n", "\n", @@ -215,6 +217,7 @@ " --max-loras-per-batch 2 --lora-backend triton \\\n", " --max-lora-rank 256\n", " --lora-target-modules all\n", + " --log-level warning\n", " \"\"\"\n", ")\n", "\n", @@ -417,6 +420,7 @@ " {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n", " {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n", " lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora\n", + " --log-level warning\n", " \"\"\"\n", ")\n", "\n", diff --git a/docs/advanced_features/separate_reasoning.ipynb b/docs/advanced_features/separate_reasoning.ipynb index 8850863a4..0c20c5a08 100644 --- a/docs/advanced_features/separate_reasoning.ipynb +++ b/docs/advanced_features/separate_reasoning.ipynb @@ -67,7 +67,7 @@ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n", + " \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n", ")\n", "\n", 
"wait_for_server(f\"http://localhost:{port}\")" diff --git a/docs/advanced_features/speculative_decoding.ipynb b/docs/advanced_features/speculative_decoding.ipynb index 2f2f0b87f..aa62b897a 100644 --- a/docs/advanced_features/speculative_decoding.ipynb +++ b/docs/advanced_features/speculative_decoding.ipynb @@ -70,7 +70,7 @@ " \"\"\"\n", "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n", " --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n", - " --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8\n", + " --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -126,7 +126,7 @@ "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n", " --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n", " --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n", - " --enable-torch-compile --torch-compile-max-bs 2\n", + " --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -186,7 +186,7 @@ "python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n", " --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n", " --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n", - " --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 \n", + " --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -242,7 +242,7 @@ "python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algorithm EAGLE3 \\\n", " --speculative-draft-model-path 
jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n", " --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n", - " --cuda-graph-max-bs 2 --dtype float16\n", + " --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -297,7 +297,7 @@ " \"\"\"\n", " python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n", " --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n", - " --mem-fraction 0.5\n", + " --mem-fraction 0.5 --log-level warning\n", "\"\"\"\n", ")\n", "\n", diff --git a/docs/advanced_features/structured_outputs.ipynb b/docs/advanced_features/structured_outputs.ipynb index cd7e42e9d..1382f1e0e 100644 --- a/docs/advanced_features/structured_outputs.ipynb +++ b/docs/advanced_features/structured_outputs.ipynb @@ -51,7 +51,7 @@ "\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n", + " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")\n", diff --git a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb index 1adb715be..c8f51a98a 100644 --- a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb +++ b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb @@ -47,7 +47,7 @@ "\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n", + " \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 
--log-level warning\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")\n", diff --git a/docs/advanced_features/function_calling.ipynb b/docs/advanced_features/tool_parser.ipynb similarity index 90% rename from docs/advanced_features/function_calling.ipynb rename to docs/advanced_features/tool_parser.ipynb index 1a2403df6..fd88b6799 100644 --- a/docs/advanced_features/function_calling.ipynb +++ b/docs/advanced_features/tool_parser.ipynb @@ -4,11 +4,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Tool and Function Calling\n", + "# Tool Parser\n", "\n", "This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Currently supported parsers:\n", + "\n", + "| Parser | Supported Models | Notes |\n", + "|---|---|---|\n", + "| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n", + "| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n", + "| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n", + "| `qwen25` | Qwen 2.5 (e.g. `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`) and QwQ (i.e. `Qwen/QwQ-32B`) | For QwQ, the reasoning parser can be enabled together with the tool call parser. See [reasoning parser](https://docs.sglang.ai/advanced_features/separate_reasoning.html). |\n", + "| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | |\n", + "| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel.
To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n", + "| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n", + "| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic`; using it with a specific chat template is recommended. |\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -35,7 +53,7 @@ "from openai import OpenAI\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\" # qwen25\n", + " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n", ")\n", "wait_for_server(f\"http://localhost:{port}\")" ] @@ -44,16 +62,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that `--tool-call-parser` defines the parser used to interpret responses. Currently supported parsers include:\n", - "\n", - "- llama3: Llama 3.1 / 3.2 / 3.3 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.3-70B-Instruct).\n", - "- llama4: Llama 4 (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct).\n", - "- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n", - "Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n", - "- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B).
Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n", - "- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n", - "- gpt-oss: GPT-OSS (e.g., openai/gpt-oss-120b, openai/gpt-oss-20b, lmsys/gpt-oss-120b-bf16, lmsys/gpt-oss-20b-bf16). Note: The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as role=\"tool\" messages, which enables the model to generate the final content.\n", - "- kimi_k2: moonshotai/Kimi-K2-Instruct" + "Note that `--tool-call-parser` defines the parser used to interpret responses." ] }, { @@ -169,11 +178,11 @@ " tools=tools,\n", ")\n", "print_highlight(\"Non-stream response:\")\n", - "print(response_non_stream)\n", + "print_highlight(response_non_stream)\n", "print_highlight(\"==== content ====\")\n", - "print(response_non_stream.choices[0].message.content)\n", + "print_highlight(response_non_stream.choices[0].message.content)\n", "print_highlight(\"==== tool_calls ====\")\n", - "print(response_non_stream.choices[0].message.tool_calls)" + "print_highlight(response_non_stream.choices[0].message.tool_calls)" ] }, { @@ -234,11 +243,11 @@ " if chunk.choices[0].delta.tool_calls:\n", " tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n", "print_highlight(\"==== Text ====\")\n", - "print(texts)\n", + "print_highlight(texts)\n", "\n", "print_highlight(\"==== Tool Call ====\")\n", "for tool_call in tool_calls:\n", - " print(tool_call)" + " print_highlight(tool_call)" ] }, { @@ -350,10 +359,10 @@ " tools=tools,\n", ")\n", "print_highlight(\"Non-stream response:\")\n", - "print(final_response)\n", + "print_highlight(final_response)\n", "\n", "print_highlight(\"==== Text ====\")\n", - 
"print(final_response.choices[0].message.content)" + "print_highlight(final_response.choices[0].message.content)" ] }, { @@ -396,7 +405,7 @@ "}\n", "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n", "print_highlight(\"==== Response ====\")\n", - "print(gen_response)\n", + "print_highlight(gen_response)\n", "\n", "# parse the response\n", "parse_url = f\"http://localhost:{port}/parse_function_call\"\n", @@ -463,8 +472,8 @@ "result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n", "generated_text = result[\"text\"] # Assume there is only one prompt\n", "\n", - "print(\"=== Offline Engine Output Text ===\")\n", - "print(generated_text)\n", + "print_highlight(\"=== Offline Engine Output Text ===\")\n", + "print_highlight(generated_text)\n", "\n", "\n", "# 2) Parse using FunctionCallParser\n", @@ -485,13 +494,13 @@ "parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n", "normal_text, calls = parser.parse_non_stream(generated_text)\n", "\n", - "print(\"=== Parsing Result ===\")\n", + "print_highlight(\"=== Parsing Result ===\")\n", "print(\"Normal text portion:\", normal_text)\n", - "print(\"Function call portion:\")\n", + "print_highlight(\"Function call portion:\")\n", "for call in calls:\n", " # call: ToolCallItem\n", - " print(f\" - tool name: {call.name}\")\n", - " print(f\" parameters: {call.parameters}\")\n", + " print_highlight(f\" - tool name: {call.name}\")\n", + " print_highlight(f\" parameters: {call.parameters}\")\n", "\n", "# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc." 
] @@ -537,7 +546,7 @@ "\n", "# Start a new server session for tool choice examples\n", "server_process_tool_choice, port_tool_choice = launch_server_cmd(\n", - " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"\n", + " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n", ")\n", "wait_for_server(f\"http://localhost:{port_tool_choice}\")\n", "\n", @@ -628,8 +637,8 @@ "\n", "if response_specific.choices[0].message.tool_calls:\n", " tool_call = response_specific.choices[0].message.tool_calls[0]\n", - " print(f\"Called function: {tool_call.function.name}\")\n", - " print(f\"Arguments: {tool_call.function.arguments}\")" + " print_highlight(f\"Called function: {tool_call.function.name}\")\n", + " print_highlight(f\"Arguments: {tool_call.function.arguments}\")" ] }, { @@ -682,7 +691,7 @@ "import openai\n", "\n", "server_process, port = launch_server_cmd(\n", - " \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1\" # llama-3.2-1b-instruct\n", + " \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n", ")\n", "wait_for_server(f\"http://localhost:{port}\")\n", "\n", @@ -762,7 +771,7 @@ " tools=tools,\n", ")\n", "print_highlight(\"Non-stream response:\")\n", - "print(response_non_stream)\n", + "print_highlight(response_non_stream)\n", "\n", "response_stream = client.chat.completions.create(\n", " model=model_name,\n", @@ -785,11 +794,11 @@ "\n", "print_highlight(\"Streaming Response:\")\n", "print_highlight(\"==== Text ====\")\n", - "print(texts)\n", + "print_highlight(texts)\n", "\n", "print_highlight(\"==== Tool Call ====\")\n", "for tool_call in tool_calls:\n", - " print(tool_call)\n", + " print_highlight(tool_call)\n", "\n", 
"terminate_process(server_process)" ] diff --git a/docs/basic_usage/native_api.ipynb b/docs/basic_usage/native_api.ipynb index 33dffea74..5e4ca19a1 100644 --- a/docs/basic_usage/native_api.ipynb +++ b/docs/basic_usage/native_api.ipynb @@ -43,7 +43,7 @@ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n", + " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")" @@ -267,7 +267,7 @@ "embedding_process, port = launch_server_cmd(\n", " \"\"\"\n", "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n", - " --host 0.0.0.0 --is-embedding\n", + " --host 0.0.0.0 --is-embedding --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -316,7 +316,7 @@ "reranker_process, port = launch_server_cmd(\n", " \"\"\"\n", "python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n", - " --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding\n", + " --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -376,7 +376,7 @@ "\n", "reward_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n", + "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n", "\"\"\"\n", ")\n", "\n", @@ -441,7 +441,7 @@ "outputs": [], "source": [ "expert_record_server_process, port = launch_server_cmd(\n", - " \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode 
stat\"\n", + " \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")" diff --git a/docs/basic_usage/openai_api_completions.ipynb b/docs/basic_usage/openai_api_completions.ipynb index eb9ff7875..6b967709f 100644 --- a/docs/basic_usage/openai_api_completions.ipynb +++ b/docs/basic_usage/openai_api_completions.ipynb @@ -36,7 +36,7 @@ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n", + " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")\n", diff --git a/docs/basic_usage/openai_api_embeddings.ipynb b/docs/basic_usage/openai_api_embeddings.ipynb index 9c7c99c0f..26e95a4e7 100644 --- a/docs/basic_usage/openai_api_embeddings.ipynb +++ b/docs/basic_usage/openai_api_embeddings.ipynb @@ -33,7 +33,7 @@ "embedding_process, port = launch_server_cmd(\n", " \"\"\"\n", "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n", - " --host 0.0.0.0 --is-embedding\n", + " --host 0.0.0.0 --is-embedding --log-level warning\n", "\"\"\"\n", ")\n", "\n", diff --git a/docs/basic_usage/openai_api_vision.ipynb b/docs/basic_usage/openai_api_vision.ipynb index 3669f5ca6..88d1ef7dd 100644 --- a/docs/basic_usage/openai_api_vision.ipynb +++ b/docs/basic_usage/openai_api_vision.ipynb @@ -35,7 +35,7 @@ "\n", "vision_process, port = launch_server_cmd(\n", " \"\"\"\n", - "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct\n", + "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n", "\"\"\"\n", ")\n", "\n", diff --git 
a/docs/basic_usage/send_request.ipynb b/docs/basic_usage/send_request.ipynb index b53bd3560..6e457a02b 100644 --- a/docs/basic_usage/send_request.ipynb +++ b/docs/basic_usage/send_request.ipynb @@ -34,7 +34,7 @@ "server_process, port = launch_server_cmd(\n", " \"\"\"\n", "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n", - " --host 0.0.0.0\n", + " --host 0.0.0.0 --log-level warning\n", "\"\"\"\n", ")\n", "\n", diff --git a/docs/index.rst b/docs/index.rst index 040aa53f3..1d9c790dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -38,7 +38,7 @@ The core features include: advanced_features/speculative_decoding.ipynb advanced_features/structured_outputs.ipynb advanced_features/structured_outputs_for_reasoning_models.ipynb - advanced_features/function_calling.ipynb + advanced_features/tool_parser.ipynb advanced_features/separate_reasoning.ipynb advanced_features/quantization.md advanced_features/lora.ipynb diff --git a/docs/references/frontend/frontend_tutorial.ipynb b/docs/references/frontend/frontend_tutorial.ipynb index 68fb916a1..836cab627 100644 --- a/docs/references/frontend/frontend_tutorial.ipynb +++ b/docs/references/frontend/frontend_tutorial.ipynb @@ -39,7 +39,7 @@ "from sglang.utils import print_highlight, terminate_process, wait_for_server\n", "\n", "server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0\"\n", + " \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", "wait_for_server(f\"http://localhost:{port}\")\n", @@ -395,7 +395,7 @@ "outputs": [], "source": [ "server_process, port = launch_server_cmd(\n", - " \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0\"\n", + " \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n", ")\n", "\n", 
"wait_for_server(f\"http://localhost:{port}\")\n", diff --git a/python/sglang/utils.py b/python/sglang/utils.py index 651a25155..c84842e94 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: NOTE: Typically, the server runs in a separate terminal. In this notebook, we run the server and notebook code together, so their outputs are combined. To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue. + To reduce the log length, we set the log level to warning for the server, the default log level is info. We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance. """ )