From 0aba644633a6c85c0638fe963e938fd4dc1511d4 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 16 Sep 2025 09:27:50 +0800 Subject: [PATCH] Update max_tokens and prompt in qwen3 online doc (#2945) ### What this PR does / why we need it? Update max_tokens and prompt in qwen3 online doc Before: ``` "'max_tokens' or 'max_completion_tokens' is too large: 4096. This model's maximum context length is 4096 tokens and your request has 18 input tokens (4096 > 4096 - 18). None" ``` After: ``` curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "/root/.cache/modelscope/hub/models/Qwen-SGlang/Qwen3-Next-80B-A3B-Instruct", "messages": [ {"role": "user", "content": "Who are you?"} ], "temperature": 0.6, "top_p": 0.95, "top_k": 20, "max_tokens": 32 }' .{"id":"chatcmpl-8ddbd65c9ddc405397219a6792feb9a0","object":"chat.completion","created":1757985049,"model":"/root/.cache/modelscope/hub/models/Qwen-SGlang/Qwen3-Next-80B-A3B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! I am Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I am designed to assist you in generating various","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":12,"total_tokens":44,"completion_tokens":32,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null} ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? 
- Manually tested on my local env - CI passed Signed-off-by: Yikun Jiang --- docs/source/tutorials/multi_npu_qwen3_next.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/multi_npu_qwen3_next.md b/docs/source/tutorials/multi_npu_qwen3_next.md index e24f28a..4fa5861 100644 --- a/docs/source/tutorials/multi_npu_qwen3_next.md +++ b/docs/source/tutorials/multi_npu_qwen3_next.md @@ -95,12 +95,12 @@ Once your server is started, you can query the model with input prompts curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "Qwen/Qwen3-Next-80B-A3B-Instruct", "messages": [ - {"role": "user", "content": "Give me a short introduction to large language models."} + {"role": "user", "content": "Who are you?"} ], "temperature": 0.6, "top_p": 0.95, "top_k": 20, - "max_tokens": 4096 + "max_tokens": 32 }'