# xc-llm-ascend/tests/e2e/multicard/test_prefix_caching.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the outputs with and without prefix caching on the V1 scheduler or AscendScheduler."""

import pytest

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
from vllm_ascend.ascend_config import clear_ascend_config

# Models every test in this module is parametrized over; one entry per
# attention variant named in the inline comments.
MODELS = [
    "Qwen/Qwen3-8B-Base",  # MHA (multi-head attention)
    "deepseek-ai/DeepSeek-V2-Lite-Chat",  # MLA (multi-head latent attention)
]

# Shared prompt prefix: a one-sentence instruction followed by a large markdown
# table (the table was randomly generated by GPT-4). Every entry of
# INPUT_PROMPTS starts with this exact text, so the literal below must not be
# reformatted -- whitespace inside the table is part of the prompt.
LONG_PROMPT = (
    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
    """
| ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
| 2   | Jane Smith    | 34  | Doctor        | Canada        | jane.smith@example.com | 555-5678       | 456 Oak St, Toronto, ON      |
| 3   | Alice Johnson | 27  | Teacher       | UK            | alice.j@example.com    | 555-8765       | 789 Pine St, London, UK      |
| 4   | Bob Brown     | 45  | Artist        | Australia     | bob.b@example.com      | 555-4321       | 321 Maple St, Sydney, NSW    |
| 5   | Carol White   | 31  | Scientist     | New Zealand   | carol.w@example.com    | 555-6789       | 654 Birch St, Wellington, NZ |
| 6   | Dave Green    | 28  | Lawyer        | Ireland       | dave.g@example.com     | 555-3456       | 987 Cedar St, Dublin, IE     |
| 7   | Emma Black    | 40  | Musician      | USA           | emma.b@example.com     | 555-1111       | 246 Ash St, New York, NY     |
| 8   | Frank Blue    | 37  | Chef          | Canada        | frank.b@example.com    | 555-2222       | 135 Spruce St, Vancouver, BC |
| 9   | Grace Yellow  | 50  | Engineer      | UK            | grace.y@example.com    | 555-3333       | 864 Fir St, Manchester, UK   |
| 10  | Henry Violet  | 32  | Artist        | Australia     | henry.v@example.com    | 555-4444       | 753 Willow St, Melbourne, VIC|
| 11  | Irene Orange  | 26  | Scientist     | New Zealand   | irene.o@example.com    | 555-5555       | 912 Poplar St, Auckland, NZ  |
| 12  | Jack Indigo   | 38  | Teacher       | Ireland       | jack.i@example.com     | 555-6666       | 159 Elm St, Cork, IE         |
| 13  | Karen Red     | 41  | Lawyer        | USA           | karen.r@example.com    | 555-7777       | 357 Cedar St, Boston, MA     |
| 14  | Leo Brown     | 30  | Chef          | Canada        | leo.b@example.com      | 555-8888       | 246 Oak St, Calgary, AB      |
| 15  | Mia Green     | 33  | Musician      | UK            | mia.g@example.com      | 555-9999       | 975 Pine St, Edinburgh, UK   |
| 16  | Noah Yellow   | 29  | Doctor        | Australia     | noah.y@example.com     | 555-0000       | 864 Birch St, Brisbane, QLD  |
| 17  | Olivia Blue   | 35  | Engineer      | New Zealand   | olivia.b@example.com   | 555-1212       | 753 Maple St, Hamilton, NZ   |
| 18  | Peter Black   | 42  | Artist        | Ireland       | peter.b@example.com    | 555-3434       | 912 Fir St, Limerick, IE     |
| 19  | Quinn White   | 28  | Scientist     | USA           | quinn.w@example.com    | 555-5656       | 159 Willow St, Seattle, WA   |
| 20  | Rachel Red    | 31  | Teacher       | Canada        | rachel.r@example.com   | 555-7878       | 357 Poplar St, Ottawa, ON    |
| 21  | Steve Green   | 44  | Lawyer        | UK            | steve.g@example.com    | 555-9090       | 753 Elm St, Birmingham, UK   |
| 22  | Tina Blue     | 36  | Musician      | Australia     | tina.b@example.com     | 555-1213       | 864 Cedar St, Perth, WA      |
| 23  | Umar Black    | 39  | Chef          | New Zealand   | umar.b@example.com     | 555-3435       | 975 Spruce St, Christchurch, NZ|
| 24  | Victor Yellow | 43  | Engineer      | Ireland       | victor.y@example.com   | 555-5657       | 246 Willow St, Galway, IE    |
| 25  | Wendy Orange  | 27  | Artist        | USA           | wendy.o@example.com    | 555-7879       | 135 Elm St, Denver, CO       |
| 26  | Xavier Green  | 34  | Scientist     | Canada        | xavier.g@example.com   | 555-9091       | 357 Oak St, Montreal, QC     |
| 27  | Yara Red      | 41  | Teacher       | UK            | yara.r@example.com     | 555-1214       | 975 Pine St, Leeds, UK       |
| 28  | Zack Blue     | 30  | Lawyer        | Australia     | zack.b@example.com     | 555-3436       | 135 Birch St, Adelaide, SA   |
| 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
| 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
"""
)

# Two prompts that share LONG_PROMPT as a common prefix and differ only in the
# trailing question, one per person looked up in the table.
INPUT_PROMPTS = [
    LONG_PROMPT +
    f"Question: what is the age of {person}? Your answer: The age of {person} is "
    for person in ("John Doe", "Zack Blue")
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
    """Check that prefix caching does not change greedy outputs.

    The same prompts are generated twice with identical engine settings,
    once with prefix caching enabled and once with it disabled, and the two
    sets of outputs are compared with ``check_outputs_equal``.

    Args:
        model: Name of the model to load, taken from ``MODELS``.
        max_tokens: Maximum number of tokens to generate per prompt.
    """
    # Engine settings shared by both runs, so that prefix caching is the only
    # difference between them.
    common_kwargs = dict(
        enforce_eager=True,
        max_model_len=2048,
        tensor_parallel_size=2,
        gpu_memory_utilization=0.7,
    )

    # Request prefix caching explicitly rather than relying on the engine
    # default; otherwise a default of "off" would make this test compare two
    # identical configurations.
    with VllmRunner(model, enable_prefix_caching=True,
                    **common_kwargs) as vllm_model:
        prefix_cache_output = vllm_model.generate_greedy(
            INPUT_PROMPTS, max_tokens)

    with VllmRunner(model, enable_prefix_caching=False,
                    **common_kwargs) as vllm_model:
        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=prefix_cache_output,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )


def _generate_with_ascend_scheduler(model: str, max_tokens: int,
                                    scheduler_config: dict):
    """Generate greedy outputs for ``INPUT_PROMPTS`` under one scheduler config.

    Args:
        model: Name of the model to load.
        max_tokens: Maximum number of tokens to generate per prompt.
        scheduler_config: Value passed as ``ascend_scheduler_config`` inside
            the runner's ``additional_config``.

    Returns:
        Whatever ``VllmRunner.generate_greedy`` returns for ``INPUT_PROMPTS``.
    """
    try:
        with VllmRunner(model,
                        additional_config={
                            'ascend_scheduler_config': scheduler_config,
                        },
                        enforce_eager=True,
                        max_model_len=2048,
                        tensor_parallel_size=2,
                        gpu_memory_utilization=0.7) as vllm_model:
            return vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
    finally:
        # Runs after the runner has exited, on success and on failure alike,
        # so a failing run cannot skip the cleanup.
        # NOTE(review): presumably this resets process-wide Ascend config
        # state -- confirm in vllm_ascend.ascend_config.
        clear_ascend_config()


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
                                            max_tokens: int) -> None:
    """Check that AscendScheduler outputs are unaffected by prefix caching.

    Three runs are made with the Ascend scheduler enabled: a baseline, one
    with prefix caching, and one with prefix caching plus chunked prefill.
    The prefix-caching output is then compared against each of the other two
    with ``check_outputs_equal``.

    Args:
        model: Name of the model to load, taken from ``MODELS``.
        max_tokens: Maximum number of tokens to generate per prompt.
    """
    vllm_output = _generate_with_ascend_scheduler(
        model,
        max_tokens,
        {
            'enabled': True,
        },
    )

    prefix_cache_output = _generate_with_ascend_scheduler(
        model,
        max_tokens,
        {
            'enabled': True,
            'enable_prefix_caching': True,
        },
    )

    chunk_prefill_prefix_cache_output = _generate_with_ascend_scheduler(
        model,
        max_tokens,
        {
            'enabled': True,
            'enable_prefix_caching': True,
            "enable_chunked_prefill": True,
        },
    )

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=prefix_cache_output,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )

    check_outputs_equal(
        outputs_0_lst=chunk_prefill_prefix_cache_output,
        outputs_1_lst=prefix_cache_output,
        name_0="chunk_prefill_prefix_cache_output",
        name_1="prefix_cache_output",
    )