From 0b14159fc4e07c57bed6f4603056bb3dd0303e8d Mon Sep 17 00:00:00 2001
From: Vincent Zhong <207368749+vincentzed@users.noreply.github.com>
Date: Sun, 14 Sep 2025 23:27:40 -0400
Subject: [PATCH] Add reasoning examples for GPT-OSS in Markdown examples
 (#9626)

---
 docs/basic_usage/gpt_oss.md                      | 12 +++++++++++-
 python/sglang/srt/entrypoints/openai/protocol.py |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/docs/basic_usage/gpt_oss.md b/docs/basic_usage/gpt_oss.md
index 240463ec4..798fd678d 100644
--- a/docs/basic_usage/gpt_oss.md
+++ b/docs/basic_usage/gpt_oss.md
@@ -6,7 +6,7 @@ Please refer to [https://github.com/sgl-project/sglang/issues/8833](https://gith
 
 ### Responses API
 
-GPT‑OSS is compatible with the OpenAI Responses API. Use `client.responses.create(...)` with `model`, `instructions`, `input`, and optional `tools` to enable built‑in tool use.
+GPT‑OSS is compatible with the OpenAI Responses API. Use `client.responses.create(...)` with `model`, `instructions`, `input`, and optional `tools` to enable built‑in tool use. You can set the reasoning level to `low` (fast), `medium` (balanced), or `high` (deep), for example by including "Reasoning: high" in `instructions`.
 
 ### Built-in Tools
 
@@ -69,6 +69,16 @@ tools = [
     {"type": "web_search_preview"},
 ]
 
+# Reasoning level example
+response = client.responses.create(
+    model="openai/gpt-oss-120b",
+    instructions="You are a helpful assistant.",
+    reasoning={"effort": "high"},  # Supports high, medium, or low
+    input="In one sentence, explain the transformer architecture.",
+)
+print("====== reasoning: high ======")
+print(response.output_text)
+
 # Test python tool
 response = client.responses.create(
     model="openai/gpt-oss-120b",
diff --git a/python/sglang/srt/entrypoints/openai/protocol.py b/python/sglang/srt/entrypoints/openai/protocol.py
index 7fed16703..8111f1939 100644
--- a/python/sglang/srt/entrypoints/openai/protocol.py
+++ b/python/sglang/srt/entrypoints/openai/protocol.py
@@ -450,7 +450,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e., GPT-OSS models.",
     )
 
     @model_validator(mode="before")