support qwen3-next full_decode_only mode. (#3949)

### What this PR does / why we need it?
support qwen3-next full_decode_only mode. 
bs=1, max_token=1024
| branch| tps| e2e time|
| --- | --- | --- |
|piecewise  |3.06  | 8.15 |
|fulldecodeonly | 7.2 | 3.47 |

- vLLM version: v0.11.0
- vLLM main:
83f478bb19

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
This commit is contained in:
XiaoxinWang
2025-11-05 08:46:05 +08:00
committed by GitHub
parent 5f08e07208
commit 738bf2b720
4 changed files with 66 additions and 9 deletions

View File

@@ -36,3 +36,21 @@ def test_models_distributed_Qwen3_NEXT_TP4():
distributed_executor_backend="mp",
enforce_eager=True) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
example_prompts = [
"Hello, my name is",
] * 4
max_tokens = 5
with VllmRunner("Qwen/Qwen3-Next-80B-A3B-Instruct",
tensor_parallel_size=4,
max_model_len=4096,
gpu_memory_utilization=0.8,
distributed_executor_backend="mp",
enforce_eager=False,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 8, 24, 48, 60]
}) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)