[BugFix][main] Adapted Qwen3-Next-MTP to chunked prefill (#4770)
### What this PR does / why we need it?
The pad `-1` modification is adopted from https://github.com/vllm-project/vllm/pull/25743. Batched chunked prefill still has accuracy bugs, so the MTP similarity test below is limited to a single prompt for now (a sketch of the padding idea follows).
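For background, here is a minimal sketch of the pad `-1` idea, assuming (as in the upstream PR) that requests whose chunked prefill has not finished must not contribute draft tokens, so their slots are filled with a `-1` placeholder and masked out later. The helper name, shapes, and sentinel handling are illustrative, not vLLM's actual code:

```python
import torch

# Illustrative sentinel, mirroring the pad `-1` value the PR adopts.
PLACEHOLDER_TOKEN_ID = -1

def pad_draft_tokens(draft_tokens: list[list[int]],
                     num_speculative_tokens: int) -> torch.Tensor:
    """Right-pad each request's proposed draft tokens to a fixed width.

    Requests still mid-way through chunked prefill propose nothing, so
    their rows stay all -1 and can be masked out downstream.
    """
    padded = torch.full((len(draft_tokens), num_speculative_tokens),
                        PLACEHOLDER_TOKEN_ID, dtype=torch.long)
    for i, tokens in enumerate(draft_tokens):
        if tokens:
            padded[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long)
    return padded

# One decoding request with two draft tokens, one request still in prefill.
print(pad_draft_tokens([[11, 12], []], num_speculative_tokens=2))
# tensor([[11, 12],
#         [-1, -1]])
```

Downstream, a mask such as `padded != PLACEHOLDER_TOKEN_ID` keeps the verification step from scoring the placeholder positions.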
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: drslark <slarksblood@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Diff for `tests/e2e/multicard/test_qwen3_next.py`:

```diff
@@ -24,7 +24,6 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
 import os
 from unittest.mock import patch
 
 import pytest
 from modelscope import snapshot_download  # type: ignore
 
 from tests.e2e.conftest import VllmRunner
@@ -64,14 +63,9 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
     del vllm_model
 
 
-@pytest.mark.skip
+# TODO: Fix the accuracy of batch chunked prefill
 def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
+    example_prompts = ["Hello, my name is"]
     max_tokens = 20
 
     with VllmRunner(
@@ -115,7 +109,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
 
 
 # TODO: will conduct accuracy verification after the subsequent version becomes stable
 @pytest.mark.skip
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
 def test_models_distributed_Qwen3_NEXT_W8A8DYNAMIC_WITH_EP():
     example_prompts = [
```
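The second hunk un-skips the MTP similarity test but pins it to a single prompt, matching the note that batched chunked prefill is still inaccurate; the suite runs via `pytest tests/e2e/multicard/test_qwen3_next.py`. Outside the test harness, a rough way to surface the remaining batching bug is to compare solo and batched greedy outputs. A hedged sketch using the plain vLLM `LLM` API; the model id and `speculative_config` keys are assumptions, not taken from this diff:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
]
params = SamplingParams(temperature=0.0, max_tokens=20)  # greedy decoding

# Assumed engine configuration; the real test drives this through VllmRunner.
llm = LLM(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",
    tensor_parallel_size=4,
    enable_chunked_prefill=True,
    speculative_config={"method": "qwen3_next_mtp",
                        "num_speculative_tokens": 1},
)

# With greedy sampling, each prompt should yield the same text whether it
# is generated alone or inside a batch; a mismatch points at the batched
# chunked-prefill bug the TODO refers to.
solo = [llm.generate([p], params)[0].outputs[0].text for p in prompts]
batched = [out.outputs[0].text for out in llm.generate(prompts, params)]
for prompt, s, b in zip(prompts, solo, batched):
    print(f"{prompt!r}: {'MATCH' if s == b else 'MISMATCH'}")
```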