[BugFix] Fix num_pcp_pads Assignment Issues (#5273)
### What this PR does / why we need it?
The variable `self.num_pcp_pads` was incorrectly truncated during
assignment, causing errors in certain scenarios such as PD
disaggregation. This PR resolves the issue.
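For context, this is the usual failure mode for padded context-parallel layouts: each request's sequence is padded so it splits evenly across the PCP ranks, and truncating the per-request pad counts during assignment breaks any consumer that expects the full batch. A minimal sketch of the idea, where all names (`compute_num_pcp_pads`, `seq_lens`, `pcp_size`) are hypothetical and not taken from the vllm-ascend source:

```python
# Hypothetical illustration; not the actual vllm-ascend code.
def compute_num_pcp_pads(seq_lens: list[int], pcp_size: int) -> list[int]:
    """Per-request pad counts so each sequence length becomes a
    multiple of pcp_size and shards evenly across the PCP ranks."""
    return [(-n) % pcp_size for n in seq_lens]

# With pcp_size=2, lengths [5, 8, 3] need [1, 0, 1] pad tokens.
# Truncating such a list during assignment (e.g. slicing it to a
# stale request count) silently drops the padding for later
# requests, which is the class of bug described for
# `self.num_pcp_pads`.
assert compute_num_pcp_pads([5, 8, 3], 2) == [1, 0, 1]
```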
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
New e2e tests covering PCP and DCP configurations (`test_pcp_*` and `test_dcp_*`, see the diff below) are added in this patch.

Co-authored-by: QiuChunshuo <qiuchunshuo@huawei.com>
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: daishixun <dsxsteven@sina.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
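The patch also extends the e2e suite with six new cases exercising prefill context parallelism (PCP) and decode context parallelism (DCP) under eager execution, full decode-only graph capture, and piece-wise compilation: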
```diff
@@ -135,3 +135,137 @@ def test_models_pcp_dcp_piece_wise():
                     block_size=128,
                     quantization="ascend") as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_pcp_basic():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=True,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
                     decode_context_parallel_size=1,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_pcp_full_graph():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
                     decode_context_parallel_size=1,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                     }) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_pcp_piece_wise():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
                     decode_context_parallel_size=1,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_dcp_basic():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=True,
                     max_model_len=1024,
                     tensor_parallel_size=4,
                     prefill_context_parallel_size=1,
                     decode_context_parallel_size=2,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_dcp_full_graph():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=4,
                     prefill_context_parallel_size=1,
                     decode_context_parallel_size=2,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                     }) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_dcp_piece_wise():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=4,
                     prefill_context_parallel_size=1,
                     decode_context_parallel_size=2,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)
```
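Note the pairing in the matrix: the PCP cases run `tensor_parallel_size=2` with `prefill_context_parallel_size=2`, while the DCP cases run `tensor_parallel_size=4` with `decode_context_parallel_size=2`, and each pairing is exercised in eager mode, with a `FULL_DECODE_ONLY` graph, and with the default piece-wise compilation, which covers the padding fix across all compilation paths.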