support qwen3-next full_decode_only mode. (#3949)
### What this PR does / why we need it?
Support the full_decode_only mode for qwen3-next.
Benchmark results with bs=1, max_token=1024:
| branch| tps| e2e time|
| --- | --- | --- |
|piecewise |3.06 | 8.15 |
|fulldecodeonly | 7.2 | 3.47 |
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
This commit is contained in:
@@ -36,3 +36,21 @@ def test_models_distributed_Qwen3_NEXT_TP4():
|
||||
distributed_executor_backend="mp",
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
    """Run Qwen3-Next with TP=4 and the FULL_DECODE_ONLY cudagraph mode.

    Greedily generates up to 5 tokens for four copies of one prompt.
    The generated text is not checked; the test passes as long as the
    runner starts up and generation completes without raising.
    """
    # Graph-capture settings handed to the runner unchanged.
    graph_config = {
        "cudagraph_mode": "FULL_DECODE_ONLY",
        "cudagraph_capture_sizes": [1, 8, 24, 48, 60],
    }
    prompts = ["Hello, my name is"] * 4
    token_budget = 5

    # NOTE(review): enforce_eager=False is presumably required for the
    # compilation_config above to take effect -- confirm against the
    # runner's documentation.
    with VllmRunner(
            "Qwen/Qwen3-Next-80B-A3B-Instruct",
            tensor_parallel_size=4,
            max_model_len=4096,
            gpu_memory_utilization=0.8,
            distributed_executor_backend="mp",
            enforce_eager=False,
            compilation_config=graph_config,
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, token_budget)
|
||||
|
||||
Reference in New Issue
Block a user