[Feat] support basic pcp&dcp for qwen3next (#6091)
### What this PR does / why we need it?
This PR implements Context Parallelism (CP) support for the Qwen3-Next
model, including PCP (Prefill Context Parallelism) and DCP
(Decode Context Parallelism).
- vLLM version: v0.15.0
- vLLM main:
f176443446
---------
Signed-off-by: SunnyLee219 <3294305115@qq.com>
Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
Signed-off-by: 白永斌 <baiyongbin3@h-partners.com>
Signed-off-by: Bai Yongbin <845473182@qq.com>
Co-authored-by: SunnyLee219 <3294305115@qq.com>
Co-authored-by: Jingchun Gao <gaojingchun1@huawei.com>
Co-authored-by: 白永斌 <baiyongbin3@h-partners.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -44,16 +44,15 @@ def test_models_pcp_dcp_basic():
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
|
||||
with VllmRunner(
|
||||
model,
|
||||
enforce_eager=True,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
decode_context_parallel_size=1,
|
||||
enable_expert_parallel=True,
|
||||
block_size=128,
|
||||
quantization="ascend",
|
||||
with VllmRunner(model,
|
||||
enforce_eager=True,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
decode_context_parallel_size=1,
|
||||
enable_expert_parallel=True,
|
||||
block_size=128,
|
||||
quantization="ascend",
|
||||
) as runner:
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
@@ -71,6 +70,19 @@ def test_models_pcp_dcp_basic():
|
||||
) as runner:
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
model = "Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
with VllmRunner(model,
|
||||
enforce_eager=True,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
prefill_context_parallel_size=2,
|
||||
decode_context_parallel_size=1,
|
||||
max_num_batched_tokens=1024,
|
||||
enable_expert_parallel=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
block_size=128) as runner:
|
||||
runner.model.generate(prompts, sampling_params)
|
||||
|
||||
|
||||
def test_models_pcp_dcp_full_graph():
|
||||
prompts = [
|
||||
|
||||
Reference in New Issue
Block a user