[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and pcp only (#5565)

### What this PR does / why we need it? [Bugfix] Fix the dcp_only bug and add e2e accuracy tests for DCP-only and PCP-only. This PR fixes the accuracy-test bug that occurs when decode_context_parallel_size>1 and prefill_context_parallel_size=1. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: 7157596103 --------- Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
2026-01-06 22:48:21 +08:00
parent 77a029979e
commit ad9b711f89
3 changed files with 128 additions and 10 deletions
--- a/tests/e2e/multicard/long_sequence/test_accuracy.py
+++ b/tests/e2e/multicard/long_sequence/test_accuracy.py
@@ -96,3 +96,117 @@ def test_models_long_sequence_output_between_tp_and_cp(
        name_0="vllm_eager_outputs",
        name_1="vllm_context_parallel_outputs",
    )
+
+
+# Model identifier passed to every VllmRunner in the DCP-only / PCP-only
+# accuracy tests that follow in this file.
+model = "vllm-ascend/DeepSeek-V2-Lite-W8A8"
+
+
+# Accuracy check for decode-context-parallel only (DCP=2, PCP=1) with graph
+# capture configured (cudagraph_mode "FULL_DECODE_ONLY", no enforce_eager).
+# Greedy outputs for two short prompts are generated with that configuration
+# and with a TP=4 eager baseline, then handed to check_outputs_equal.
+@pytest.mark.parametrize("max_tokens", [10])
+def test_accuracy_dcp_only_graph(max_tokens: int, ) -> None:
+    prompts = [
+        "The president of the United States is", "The capital of France is"
+    ]
+    # Configuration under test: decode context parallelism of 2 while prefill
+    # context parallelism stays at 1.
+    # NOTE(review): the capture sizes are presumably the batch sizes graphs
+    # are captured for -- confirm against vLLM's compilation config docs.
+    cp_kwargs = {
+        "tensor_parallel_size": 2,
+        "decode_context_parallel_size": 2,
+        "prefill_context_parallel_size": 1,
+        "enable_expert_parallel": True,
+        "compilation_config": {
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+            "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
+        },
+        "quantization": "ascend",
+        "max_model_len": 1024,
+    }
+    # Reference configuration: tensor parallelism only, eager execution.
+    tp_kwargs = {
+        "tensor_parallel_size": 4,
+        "enable_expert_parallel": True,
+        "enforce_eager": True,
+        "quantization": "ascend",
+        "max_model_len": 1024,
+    }
+    # The two runners are used one after the other; the first with-block is
+    # exited before the baseline runner is created.
+    with VllmRunner(model, **cp_kwargs) as runner:  # type: ignore
+        vllm_context_parallel_outputs = runner.generate_greedy(
+            prompts, max_tokens)
+
+    with VllmRunner(model, **tp_kwargs) as runner:  # type: ignore
+        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
+
+    # Baseline is passed as outputs_0, the DCP run as outputs_1; the names are
+    # the labels given to check_outputs_equal for each side.
+    check_outputs_equal(
+        outputs_0_lst=vllm_eager_outputs,
+        outputs_1_lst=vllm_context_parallel_outputs,
+        name_0="vllm_eager_outputs",
+        name_1="vllm_dcp_only_graph_outputs",
+    )
+
+
+# Accuracy check for decode-context-parallel only (DCP=2, PCP=1) with
+# enforce_eager enabled. Greedy outputs for two short prompts are generated
+# with that configuration and with a TP=4 eager baseline, then handed to
+# check_outputs_equal.
+@pytest.mark.parametrize("max_tokens", [10])
+def test_accuracy_dcp_only_eager(max_tokens: int, ) -> None:
+    prompts = [
+        "The president of the United States is", "The capital of France is"
+    ]
+    # Configuration under test: decode context parallelism of 2 while prefill
+    # context parallelism stays at 1, run eagerly.
+    cp_kwargs = {
+        "tensor_parallel_size": 2,
+        "decode_context_parallel_size": 2,
+        "prefill_context_parallel_size": 1,
+        "enable_expert_parallel": True,
+        "enforce_eager": True,
+        "quantization": "ascend",
+        "max_model_len": 1024,
+    }
+    # Reference configuration: tensor parallelism only, eager execution.
+    tp_kwargs = {
+        "tensor_parallel_size": 4,
+        "enable_expert_parallel": True,
+        "enforce_eager": True,
+        "quantization": "ascend",
+        "max_model_len": 1024,
+    }
+    # The two runners are used one after the other; the first with-block is
+    # exited before the baseline runner is created.
+    with VllmRunner(model, **cp_kwargs) as runner:  # type: ignore
+        vllm_context_parallel_outputs = runner.generate_greedy(
+            prompts, max_tokens)
+
+    with VllmRunner(model, **tp_kwargs) as runner:  # type: ignore
+        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
+
+    # Baseline is passed as outputs_0, the DCP run as outputs_1; the names are
+    # the labels given to check_outputs_equal for each side.
+    check_outputs_equal(
+        outputs_0_lst=vllm_eager_outputs,
+        outputs_1_lst=vllm_context_parallel_outputs,
+        name_0="vllm_eager_outputs",
+        name_1="vllm_dcp_only_eager_outputs",
+    )
+
+
+# Accuracy check for prefill-context-parallel only (PCP=2, DCP=1) with
+# enforce_eager enabled. Greedy outputs for two short prompts are generated
+# with that configuration and with a TP=4 eager baseline, then handed to
+# check_outputs_equal.
+@pytest.mark.parametrize("max_tokens", [10])
+def test_accuracy_pcp_only(max_tokens: int, ) -> None:
+    prompts = [
+        "The president of the United States is", "The capital of France is"
+    ]
+    # Configuration under test: prefill context parallelism of 2 while decode
+    # context parallelism stays at 1, run eagerly.
+    cp_kwargs = {
+        "tensor_parallel_size": 2,
+        "decode_context_parallel_size": 1,
+        "prefill_context_parallel_size": 2,
+        "enable_expert_parallel": True,
+        "enforce_eager": True,
+        "quantization": "ascend",
+        "max_model_len": 1024,
+    }
+    # Reference configuration: tensor parallelism only, eager execution.
+    tp_kwargs = {
+        "tensor_parallel_size": 4,
+        "enable_expert_parallel": True,
+        "enforce_eager": True,
+        "quantization": "ascend",
+        "max_model_len": 1024,
+    }
+    # The two runners are used one after the other; the first with-block is
+    # exited before the baseline runner is created.
+    with VllmRunner(model, **cp_kwargs) as runner:  # type: ignore
+        vllm_context_parallel_outputs = runner.generate_greedy(
+            prompts, max_tokens)
+
+    with VllmRunner(model, **tp_kwargs) as runner:  # type: ignore
+        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
+
+    # Baseline is passed as outputs_0, the PCP run as outputs_1; the names are
+    # the labels given to check_outputs_equal for each side.
+    check_outputs_equal(
+        outputs_0_lst=vllm_eager_outputs,
+        outputs_1_lst=vllm_context_parallel_outputs,
+        name_0="vllm_eager_outputs",
+        name_1="vllm_pcp_only_outputs",
+    )