[fix] Fix mxfp4 weight loading bug with TP sharding in GPT-OSS (#9433)
Signed-off-by: Hao Lu <14827759+hlu1@users.noreply.github.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
@@ -737,8 +737,8 @@ class ResponsesRequest(BaseModel):
         else:
             max_tokens = default_max_tokens
 
-        # Avoid exceed the context length by minus 1 token
-        max_tokens -= 1
+        # Avoid exceed the context length by minus 2 token
+        max_tokens -= 2
 
         # Get parameters with defaults
         temperature = self.temperature