[main][BugFix] Fixed an accuracy bug in Qwen3-next-MTP during batched inference (#4932)
### What this PR does / why we need it?
Fixes an accuracy bug in Qwen3-next-MTP during batched inference. The issue is described in
https://github.com/vllm-project/vllm-ascend/issues/4930.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: drslark <slarksblood@qq.com>
@@ -61,9 +61,14 @@ def test_qwen3_next_distributed_mp_full_decode_only_tp4():
     del vllm_model
 
 
+# TODO: Fix the accuracy of batch chunked prefill
 def test_qwen3_next_distributed_mp_eager_mtp_similarity_tp4():
-    example_prompts = ["Hello, my name is"]
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
     max_tokens = 20
 
     with VllmRunner(
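
For context, a plausible shape of the full updated similarity test is sketched below. It is assembled from the visible diff lines plus assumptions: the `VllmRunner` import path, the model id, the `enforce_eager` flag, the `speculative_config` keys (including the `qwen3_next_mtp` method name), and the `generate_greedy` helper are conventions borrowed from typical vLLM / vllm-ascend e2e tests, not taken verbatim from this commit.

```python
# Hedged sketch only: the import path, model id, and config keys below are
# assumptions in the style of vllm-ascend e2e tests, not this exact commit.
from tests.e2e.conftest import VllmRunner  # assumed conftest location

MODEL = "Qwen/Qwen3-Next-80B-A3B-Instruct"  # assumed model id


def test_qwen3_next_distributed_mp_eager_mtp_similarity_tp4():
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    max_tokens = 20

    # Run with multi-token prediction (MTP) speculative decoding. Before
    # this fix, decoding several requests in one batch could change the
    # accepted tokens, which is why the test now uses four prompts
    # instead of one.
    with VllmRunner(
            MODEL,
            tensor_parallel_size=4,
            enforce_eager=True,
            speculative_config={
                "method": "qwen3_next_mtp",  # assumed method name
                "num_speculative_tokens": 1,
            },
    ) as vllm_model:
        mtp_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    # Reference run without speculative decoding.
    with VllmRunner(MODEL, tensor_parallel_size=4,
                    enforce_eager=True) as vllm_model:
        ref_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    # Greedy MTP decoding should reproduce the reference tokens, so a
    # per-prompt comparison is enough to catch the batching bug.
    for (_, mtp_text), (_, ref_text) in zip(mtp_outputs, ref_outputs):
        assert mtp_text == ref_text
```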