xc-llm-ascend/tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without context parallel.
Run `pytest tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py`.
"""
import pytest

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

# Qwen3-8B covers the dense-model CP path; the W8A8 DeepSeek-V2-Lite
# checkpoint covers the quantized MoE path with expert parallelism.
MODELS = [
    "Qwen/Qwen3-8B",
    "vllm-ascend/DeepSeek-V2-Lite-W8A8",
]

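
# Every test below follows the same pattern: generate greedily once with a
# context-parallel (CP) configuration and once with a tensor-parallel (TP)
# baseline, then require token-for-token identical outputs. Greedy decoding
# makes both runs deterministic, so any divergence points at a CP correctness
# issue rather than sampling noise.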
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [10])
def test_models_long_sequence_output_between_tp_and_cp(
    model: str,
    max_tokens: int,
) -> None:
    prompts = [
        "The president of the United States is", "The capital of France is"
    ]
    common_kwargs = {
        "max_model_len": 1024,
    }
    # The quantized DeepSeek MoE model needs expert parallelism and Ascend
    # quantization; the dense Qwen model instead exercises PCP together with
    # FULL_DECODE_ONLY graph capture.
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        cp_kwargs = {
            "tensor_parallel_size": 2,
            "decode_context_parallel_size": 2,
            "prefill_context_parallel_size": 2,
            "enable_expert_parallel": True,
            "enforce_eager": True,
            "quantization": "ascend",
        }
        tp_kwargs = {
            "tensor_parallel_size": 4,
            "enable_expert_parallel": True,
            "enforce_eager": True,
            "quantization": "ascend",
        }
    else:
        cp_kwargs = {
            "tensor_parallel_size": 1,
            "decode_context_parallel_size": 1,
            "prefill_context_parallel_size": 2,
            "compilation_config": {
                "cudagraph_mode": "FULL_DECODE_ONLY",
                "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
            },
        }
        tp_kwargs = {
            "tensor_parallel_size": 2,
            "enforce_eager": True,
        }
    cp_full_kwargs = {**common_kwargs, **cp_kwargs}
    tp_full_kwargs = {**common_kwargs, **tp_kwargs}
    with VllmRunner(model, **cp_full_kwargs) as runner:  # type: ignore
        vllm_context_parallel_outputs = runner.generate_greedy(
            prompts, max_tokens)
    with VllmRunner(model, **tp_full_kwargs) as runner:  # type: ignore
        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs,
        outputs_1_lst=vllm_context_parallel_outputs,
        name_0="vllm_eager_outputs",
        name_1="vllm_context_parallel_outputs",
    )
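
# The tests in this module repeat one skeleton: run greedy decoding under a
# CP configuration, rerun under a TP baseline, compare. The helpers below are
# a minimal deduplication sketch only; the names `_run_greedy` and
# `_assert_cp_matches_tp` are illustrative and not part of the tests.e2e
# utilities. They assume the VllmRunner and check_outputs_equal signatures
# already used above.
def _run_greedy(model_name: str, runner_kwargs: dict, prompts: list,
                max_tokens: int):
    # One engine lifecycle per configuration: build, generate, tear down.
    with VllmRunner(model_name, **runner_kwargs) as runner:  # type: ignore
        return runner.generate_greedy(prompts, max_tokens)


def _assert_cp_matches_tp(model_name: str, cp_kwargs: dict, tp_kwargs: dict,
                          prompts: list, max_tokens: int,
                          cp_name: str = "vllm_context_parallel_outputs"):
    # Greedy decoding is deterministic, so the two configurations must agree
    # token for token.
    cp_outputs = _run_greedy(model_name, cp_kwargs, prompts, max_tokens)
    tp_outputs = _run_greedy(model_name, tp_kwargs, prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=tp_outputs,
        outputs_1_lst=cp_outputs,
        name_0="vllm_eager_outputs",
        name_1=cp_name,
    )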

# All three DCP/PCP-only tests below run the quantized DeepSeek model; a
# module-level constant avoids shadowing the parametrized `model` argument
# used by the other tests.
DEEPSEEK_MODEL = "vllm-ascend/DeepSeek-V2-Lite-W8A8"


@pytest.mark.parametrize("max_tokens", [10])
def test_accuracy_dcp_only_graph(max_tokens: int) -> None:
    # DCP enabled, PCP disabled; the CP side runs with FULL_DECODE_ONLY
    # graph capture while the TP baseline stays eager.
    prompts = [
        "The president of the United States is", "The capital of France is"
    ]
    cp_kwargs = {
        "tensor_parallel_size": 2,
        "decode_context_parallel_size": 2,
        "prefill_context_parallel_size": 1,
        "enable_expert_parallel": True,
        "compilation_config": {
            "cudagraph_mode": "FULL_DECODE_ONLY",
            "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
        },
        "quantization": "ascend",
        "max_model_len": 1024,
    }
    tp_kwargs = {
        "tensor_parallel_size": 4,
        "enable_expert_parallel": True,
        "enforce_eager": True,
        "quantization": "ascend",
        "max_model_len": 1024,
    }
    with VllmRunner(DEEPSEEK_MODEL, **cp_kwargs) as runner:  # type: ignore
        vllm_context_parallel_outputs = runner.generate_greedy(
            prompts, max_tokens)
    with VllmRunner(DEEPSEEK_MODEL, **tp_kwargs) as runner:  # type: ignore
        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs,
        outputs_1_lst=vllm_context_parallel_outputs,
        name_0="vllm_eager_outputs",
        name_1="vllm_dcp_only_graph_outputs",
    )


@pytest.mark.parametrize("max_tokens", [10])
def test_accuracy_dcp_only_eager(max_tokens: int) -> None:
    # Same DCP-only layout as above, but fully eager on the CP side as well.
    prompts = [
        "The president of the United States is", "The capital of France is"
    ]
    cp_kwargs = {
        "tensor_parallel_size": 2,
        "decode_context_parallel_size": 2,
        "prefill_context_parallel_size": 1,
        "enable_expert_parallel": True,
        "enforce_eager": True,
        "quantization": "ascend",
        "max_model_len": 1024,
    }
    tp_kwargs = {
        "tensor_parallel_size": 4,
        "enable_expert_parallel": True,
        "enforce_eager": True,
        "quantization": "ascend",
        "max_model_len": 1024,
    }
    with VllmRunner(DEEPSEEK_MODEL, **cp_kwargs) as runner:  # type: ignore
        vllm_context_parallel_outputs = runner.generate_greedy(
            prompts, max_tokens)
    with VllmRunner(DEEPSEEK_MODEL, **tp_kwargs) as runner:  # type: ignore
        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs,
        outputs_1_lst=vllm_context_parallel_outputs,
        name_0="vllm_eager_outputs",
        name_1="vllm_dcp_only_eager_outputs",
    )


@pytest.mark.parametrize("max_tokens", [10])
def test_accuracy_pcp_only(max_tokens: int) -> None:
    # PCP enabled, DCP disabled; both configurations run eagerly.
    prompts = [
        "The president of the United States is", "The capital of France is"
    ]
    cp_kwargs = {
        "tensor_parallel_size": 2,
        "decode_context_parallel_size": 1,
        "prefill_context_parallel_size": 2,
        "enable_expert_parallel": True,
        "enforce_eager": True,
        "quantization": "ascend",
        "max_model_len": 1024,
    }
    tp_kwargs = {
        "tensor_parallel_size": 4,
        "enable_expert_parallel": True,
        "enforce_eager": True,
        "quantization": "ascend",
        "max_model_len": 1024,
    }
    with VllmRunner(DEEPSEEK_MODEL, **cp_kwargs) as runner:  # type: ignore
        vllm_context_parallel_outputs = runner.generate_greedy(
            prompts, max_tokens)
    with VllmRunner(DEEPSEEK_MODEL, **tp_kwargs) as runner:  # type: ignore
        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs,
        outputs_1_lst=vllm_context_parallel_outputs,
        name_0="vllm_eager_outputs",
        name_1="vllm_pcp_only_outputs",
    )
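
# `cp_kv_cache_interleave_size` sets the granularity at which the KV cache is
# striped across CP ranks: each rank stores that many consecutive tokens
# before storage moves round-robin to the next rank. A hedged illustration of
# the test below (semantics as documented for this option in vLLM's parallel
# config): with interleave size 128 and 2 CP ranks, tokens [0, 128) land on
# rank 0, [128, 256) on rank 1, [256, 384) on rank 0 again, and so on, so a
# value of 128 exercises a coarser, non-default block layout.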
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [10])
def test_models_long_sequence_cp_kv_interleave_size_output_between_tp_and_cp(
    model: str,
    max_tokens: int,
) -> None:
    prompts = [
        "The president of the United States is", "The capital of France is"
    ]
    common_kwargs = {
        "max_model_len": 1024,
    }
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        cp_kwargs = {
            "tensor_parallel_size": 2,
            "decode_context_parallel_size": 2,
            "prefill_context_parallel_size": 2,
            "enable_expert_parallel": True,
            "cp_kv_cache_interleave_size": 128,
            "enforce_eager": True,
            "quantization": "ascend",
        }
        tp_kwargs = {
            "tensor_parallel_size": 4,
            "enable_expert_parallel": True,
            "enforce_eager": True,
            "quantization": "ascend",
        }
    else:
        cp_kwargs = {
            "tensor_parallel_size": 1,
            "decode_context_parallel_size": 1,
            "prefill_context_parallel_size": 2,
            "cp_kv_cache_interleave_size": 128,
            "compilation_config": {
                "cudagraph_mode": "FULL_DECODE_ONLY",
                "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
            },
        }
        tp_kwargs = {
            "tensor_parallel_size": 2,
            "enforce_eager": True,
        }
    cp_full_kwargs = {**common_kwargs, **cp_kwargs}
    tp_full_kwargs = {**common_kwargs, **tp_kwargs}
    with VllmRunner(model, **cp_full_kwargs) as runner:  # type: ignore
        vllm_context_parallel_outputs = runner.generate_greedy(
            prompts, max_tokens)
    with VllmRunner(model, **tp_full_kwargs) as runner:  # type: ignore
        vllm_eager_outputs = runner.generate_greedy(prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs,
        outputs_1_lst=vllm_context_parallel_outputs,
        name_0="vllm_eager_outputs",
        name_1="vllm_context_parallel_outputs",
    )