support qwen3-next full_decode_only mode. (#3949)

### What this PR does / why we need it?

Support qwen3-next full_decode_only mode.

bs=1, max_token=1024:

| branch | tps | e2e time |
| --- | --- | --- |
| piecewise | 3.06 | 8.15 |
| fulldecodeonly | 7.2 | 3.47 |

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-11-05 08:46:05 +08:00
parent 5f08e07208
commit 738bf2b720
4 changed files with 66 additions and 9 deletions
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -36,3 +36,21 @@ def test_models_distributed_Qwen3_NEXT_TP4():
                    distributed_executor_backend="mp",
                    enforce_eager=True) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
+    example_prompts = [
+        "Hello, my name is",
+    ] * 4
+    max_tokens = 5
+    with VllmRunner("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                    tensor_parallel_size=4,
+                    max_model_len=4096,
+                    gpu_memory_utilization=0.8,
+                    distributed_executor_backend="mp",
+                    enforce_eager=False,
+                    compilation_config={
+                        "cudagraph_mode": "FULL_DECODE_ONLY",
+                        "cudagraph_capture_sizes": [1, 8, 24, 48, 60]
+                    }) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)