xc-llm-ascend/tests/e2e/singlecard/test_ascend_scheduler.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

MODEL = "Qwen/Qwen3-0.6B"


def test_concurrent_partial_prefill():
    """Generate for three identical prompts with the Ascend scheduler enabled.

    The engine is created with ``max_num_seqs=3`` and fed three copies of the
    same prompt. The test passes when three results come back and each one
    carries exactly one completion.
    """
    scheduler_options = {
        'ascend_scheduler_config': {
            'enabled': True,
        },
    }
    engine_kwargs = dict(
        additional_config=scheduler_options,
        max_num_seqs=3,
        max_num_batched_tokens=2048,
        enforce_eager=True,
        max_model_len=2048,
        gpu_memory_utilization=0.7,
    )
    prompt_batch = ["Hello my name is Robert and I"] * 3

    with VllmRunner(MODEL, **engine_kwargs) as vllm_model:
        outputs = vllm_model.model.generate(prompt_batch)
        assert len(outputs) == 3
        # Every request must yield exactly one completion.
        assert all(len(result.outputs) == 1 for result in outputs)


def test_prefix_cache_stats_is_recorded():
    """Check that a repeated prompt reports its cached prefix tokens.

    The same 129-token prompt is generated twice on one engine with the
    Ascend scheduler enabled. The first call is only a warm-up; the second
    result must report 128 cached tokens.
    """
    # NOTE(review): 128 presumably equals the KV-cache block size in effect
    # for this engine, so that exactly one full block is reusable -- confirm.
    expected_cached_tokens = 128
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=2048,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        # 129 tokens: one more than the 128-token prefix that is expected to
        # be served from the cache on the second request.
        input_tokens = {
            "prompt_token_ids": [101] * (expected_cached_tokens + 1)
        }
        # First pass only warms the cache; its output is irrelevant here.
        _ = vllm_model.model.generate([input_tokens])
        outputs = vllm_model.model.generate([input_tokens])
        assert outputs[0].num_cached_tokens == expected_cached_tokens


# NOTE: results cannot be aligned when max_tokens > 4
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Compare greedy output with and without chunked prefill.

    One engine is started with ``enable_chunked_prefill`` turned on and both
    ``max_num_seqs`` and ``max_num_batched_tokens`` set to
    ``chunked_prefill_token_size``. A second engine uses the Ascend scheduler
    without those settings. Both generate greedily for the same prompt and
    the results are compared with ``check_outputs_equal``.

    Args:
        max_tokens: Number of tokens to generate per prompt.
        chunked_prefill_token_size: Value used for both ``max_num_seqs`` and
            ``max_num_batched_tokens`` of the chunked-prefill engine.
    """
    prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    # Settings shared by both engines.
    shared_kwargs = dict(max_model_len=2048, gpu_memory_utilization=0.7)

    chunked_config = {
        'ascend_scheduler_config': {
            'enabled': True,
            'enable_chunked_prefill': True,
        },
    }
    with VllmRunner(MODEL,
                    additional_config=chunked_config,
                    max_num_seqs=chunked_prefill_token_size,
                    max_num_batched_tokens=chunked_prefill_token_size,
                    **shared_kwargs) as chunked_runner:
        chunked_prefill_output = chunked_runner.generate_greedy(
            prompts, max_tokens)

    baseline_config = {
        'ascend_scheduler_config': {
            'enabled': True,
        },
    }
    with VllmRunner(MODEL, additional_config=baseline_config,
                    **shared_kwargs) as baseline_runner:
        vllm_output = baseline_runner.generate_greedy(prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )


def test_async_scheduling() -> None:
    """Smoke-test generation with ``async_scheduling=True``.

    Forty prompts (four distinct prompts repeated ten times) are sent to an
    engine created with async scheduling enabled. Nothing is asserted about
    the generated text; the test passes when generation finishes without
    raising.
    """
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompts = base_prompts * 10
    sampling_params = SamplingParams(
        temperature=0.2,
        max_tokens=10,
        stop_token_ids=None,
    )
    engine_kwargs = dict(
        max_model_len=4096,
        max_num_seqs=50,
        dtype="bfloat16",
        gpu_memory_utilization=0.9,
        async_scheduling=True,
    )

    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                    **engine_kwargs) as vllm_model:
        vllm_model.generate(prompts, sampling_params=sampling_params)
[Scheduler][MTP] Add support for speculative decoding in AsecendScheduler. (#943) This PR adds support for speculative decoding in AsecendScheduler. Also inculde part of support for disaggregated prefill, full support will be merged in follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com> 2025-06-11 20:55:44 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`import pytest`
[Perf][V1] Fully overlap model execution (#2783) This PR is based on top of [#23569](https://github.com/vllm-project/vllm/pull/23569) and [#24219](https://github.com/vllm-project/vllm/pull/24219). ### What this PR does / why we need it? This PR allows the model runner to function asynchronously when using async scheduling. This allows full overlap of the cpu operations (including prepare_inputs) and the model forward pass. This diff is functional and does not support speculative decoding, PP, or guided decoding. Expected speedup is 5-10% over the current async scheduling. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? server ``` python -m vllm.entrypoints.openai.api_server --model=Qwen3-32B\ --trust-remote-code --enforce-eager \ --distributed-executor-backend=mp \ -tp=4 \ --port 8006 \ --max-model-len 32000 \ --block-size 128 \ --gpu-memory-utilization 0.99 ``` client ``` python $TEST_PY --backend vllm --trust-remote-code --model Qwen3-32B \ --dataset-name random --random-input-len 2048 --random-output-len 2048 \ --ignore-eos\ --num-prompts 48 --max-concurrency 48 --request-rate inf --temperature 0 \ --metric-percentiles 90 --base-url http://localhost:8006 --save-result \ --result-dir $PROFILER_DIR ``` benchmark test based on Qwen3-32B TPOT result: \|\|forward async\| scheduler async \|sync\| \|-\|-\|-\|-\| \|avg\|41.73\|41.86\|44.20\| \|improve0\|0.3%\|0\|0\| \|improve1\|5.58%\|0\|0\| benchmark test based on Qwen2___5-VL-7B-Instruct TPOT result: \|\|forward async\|sync\| \|-\|-\|-\| \|avg\|23.22\|29.16\| \|improve\|20.3%\|0\| - vLLM version: main - vLLM main: https://github.com/vllm-project/vllm/commit/e93f4cc9e37484009f74e15d3111a1f335c532a5 Signed-off-by: jiangpeng36 <jiangpeng36@huawei.com> Signed-off-by: Ronald1995 <ronaldautomobile@163.com> Co-authored-by: jiangpeng36 <jiangpeng36@huawei.com> Co-authored-by: Ronald1995 <ronaldautomobile@163.com> 2025-09-11 16:35:36 +08:00			`from vllm import SamplingParams`
[Scheduler][MTP] Add support for speculative decoding in AscendScheduler. (#943) This PR adds support for speculative decoding in AscendScheduler. It also includes part of the support for disaggregated prefill; full support will be merged in a follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com> 2025-06-11 20:55:44 +08:00
[Test] Clean up duplicate test for ascend scheduler (#1819) There are some duplicate tests for ascend scheduler. This PR remove them to make the test clear. After this PR. the singlecard e2e cost time is reduced from 47min to 46min. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/1eb2b9c10205b68658dede9dac73390706ef2e05 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-16 17:57:48 +08:00			`from tests.e2e.conftest import VllmRunner`
			`from tests.e2e.model_utils import check_outputs_equal`
[Scheduler][MTP] Add support for speculative decoding in AscendScheduler. (#943) This PR adds support for speculative decoding in AscendScheduler. It also includes part of the support for disaggregated prefill; full support will be merged in a follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com> 2025-06-11 20:55:44 +08:00
[Test] Clean up duplicate test for ascend scheduler (#1819) There are some duplicate tests for ascend scheduler. This PR remove them to make the test clear. After this PR. the singlecard e2e cost time is reduced from 47min to 46min. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/1eb2b9c10205b68658dede9dac73390706ef2e05 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-16 17:57:48 +08:00			`MODEL = "Qwen/Qwen3-0.6B"`
[Scheduler][MTP] Add support for speculative decoding in AscendScheduler. (#943) This PR adds support for speculative decoding in AscendScheduler. It also includes part of the support for disaggregated prefill; full support will be merged in a follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com> 2025-06-11 20:55:44 +08:00

[Test] Clean up duplicate test for ascend scheduler (#1819) There are some duplicate tests for ascend scheduler. This PR remove them to make the test clear. After this PR. the singlecard e2e cost time is reduced from 47min to 46min. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/1eb2b9c10205b68658dede9dac73390706ef2e05 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-16 17:57:48 +08:00			`def test_concurrent_partial_prefill():`
			`with VllmRunner(MODEL,`
			`additional_config={`
			`'ascend_scheduler_config': {`
			`'enabled': True,`
			`},`
			`},`
			`max_num_seqs=3,`
[Scheduler] validate max_num_batched_tokens and max_model_len in AscendSchedulerConfig (#2434) ### What this PR does / why we need it? Add configuration check logic for ascend scheduler: if chunked_prefill is disabled, max_num_batched_tokens couldn't be less than max_model_len, following vLLM; ### Does this PR introduce _any_ user-facing change? users cannot set max_num_batched_tokens smaller than max_model_len with ascend scheduler ### How was this patch tested? CI and vllm serving passed - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/f77a0802b758a32c5b9f7bc04e9498d77e8d99e0 Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-23 19:39:44 +08:00			`max_num_batched_tokens=2048,`
[Test] Clean up duplicate test for ascend scheduler (#1819) There are some duplicate tests for ascend scheduler. This PR remove them to make the test clear. After this PR. the singlecard e2e cost time is reduced from 47min to 46min. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/1eb2b9c10205b68658dede9dac73390706ef2e05 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-16 17:57:48 +08:00			`enforce_eager=True,`
			`max_model_len=2048,`
			`gpu_memory_utilization=0.7) as vllm_model:`
			`outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *`
			`3)`
			`assert len(outputs) == 3`
			`for output in outputs:`
			`assert len(output.outputs) == 1`


			`def test_prefix_cache_stats_is_recorded():`
			`with VllmRunner(MODEL,`
			`additional_config={`
			`'ascend_scheduler_config': {`
			`'enabled': True,`
			`},`
			`},`
			`max_num_seqs=3,`
[Scheduler] validate max_num_batched_tokens and max_model_len in AscendSchedulerConfig (#2434) ### What this PR does / why we need it? Add configuration check logic for ascend scheduler: if chunked_prefill is disabled, max_num_batched_tokens couldn't be less than max_model_len, following vLLM; ### Does this PR introduce _any_ user-facing change? users cannot set max_num_batched_tokens smaller than max_model_len with ascend scheduler ### How was this patch tested? CI and vllm serving passed - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/f77a0802b758a32c5b9f7bc04e9498d77e8d99e0 Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-23 19:39:44 +08:00			`max_num_batched_tokens=2048,`
[Test] Clean up duplicate test for ascend scheduler (#1819) There are some duplicate tests for ascend scheduler. This PR remove them to make the test clear. After this PR. the singlecard e2e cost time is reduced from 47min to 46min. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/1eb2b9c10205b68658dede9dac73390706ef2e05 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-16 17:57:48 +08:00			`enforce_eager=True,`
			`max_model_len=2048,`
			`gpu_memory_utilization=0.7) as vllm_model:`
			`# 17 tokens will make sure first 16 tokens are cached in a block`
			`input_tokens = {"prompt_token_ids": [101] * 129}`
			`_ = vllm_model.model.generate([input_tokens])`
			`outputs = vllm_model.model.generate([input_tokens])`
			`assert outputs[0].num_cached_tokens == 128`


			`@pytest.mark.parametrize("max_tokens",`
			`[4]) # cannot align results when max_tokens > 4`
			`@pytest.mark.parametrize("chunked_prefill_token_size", [16])`
			`def test_chunked_prefill_with_ascend_scheduler(`
Refactor e2e CI (#2276) Refactor E2E CI to make it clear and faster 1. remove some uesless e2e test 2. remove some uesless function 3. Make sure all test runs with VLLMRunner to avoid oom error 4. Make sure all ops test end with torch.empty_cache to avoid oom error 5. run the test one by one to avoid resource limit error - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/a344a5aa0a58cc1758d9721e848ce1f5ca4b6c7f Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-02 09:02:22 +08:00			`max_tokens: int, chunked_prefill_token_size: int) -> None:`
			`example_prompts = [`
			`"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."`
			`]`
[Test] Clean up duplicate test for ascend scheduler (#1819) There are some duplicate tests for ascend scheduler. This PR remove them to make the test clear. After this PR. the singlecard e2e cost time is reduced from 47min to 46min. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/1eb2b9c10205b68658dede9dac73390706ef2e05 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-16 17:57:48 +08:00			`max_num_seqs = chunked_prefill_token_size`
			`max_num_batched_tokens = chunked_prefill_token_size`
			`with VllmRunner(MODEL,`
			`additional_config={`
			`'ascend_scheduler_config': {`
			`'enabled': True,`
			`'enable_chunked_prefill': True,`
			`},`
			`},`
			`max_num_seqs=max_num_seqs,`
			`max_num_batched_tokens=max_num_batched_tokens,`
			`max_model_len=2048,`
			`gpu_memory_utilization=0.7) as vllm_model:`
			`chunked_prefill_output = vllm_model.generate_greedy(`
			`example_prompts, max_tokens)`

			`with VllmRunner(MODEL,`
			`additional_config={`
			`'ascend_scheduler_config': {`
			`'enabled': True,`
			`},`
			`},`
			`max_model_len=2048,`
			`gpu_memory_utilization=0.7) as vllm_model:`
			`vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)`

			`check_outputs_equal(`
			`outputs_0_lst=vllm_output,`
			`outputs_1_lst=chunked_prefill_output,`
			`name_0="vllm_output",`
			`name_1="chunked_prefill_output",`
			`)`
[Perf][V1] Fully overlap model execution (#2783) This PR is based on top of [#23569](https://github.com/vllm-project/vllm/pull/23569) and [#24219](https://github.com/vllm-project/vllm/pull/24219). ### What this PR does / why we need it? This PR allows the model runner to function asynchronously when using async scheduling. This allows full overlap of the cpu operations (including prepare_inputs) and the model forward pass. This diff is functional and does not support speculative decoding, PP, or guided decoding. Expected speedup is 5-10% over the current async scheduling. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? server ``` python -m vllm.entrypoints.openai.api_server --model=Qwen3-32B\ --trust-remote-code --enforce-eager \ --distributed-executor-backend=mp \ -tp=4 \ --port 8006 \ --max-model-len 32000 \ --block-size 128 \ --gpu-memory-utilization 0.99 ``` client ``` python $TEST_PY --backend vllm --trust-remote-code --model Qwen3-32B \ --dataset-name random --random-input-len 2048 --random-output-len 2048 \ --ignore-eos\ --num-prompts 48 --max-concurrency 48 --request-rate inf --temperature 0 \ --metric-percentiles 90 --base-url http://localhost:8006 --save-result \ --result-dir $PROFILER_DIR ``` benchmark test based on Qwen3-32B TPOT result: \|\|forward async\| scheduler async \|sync\| \|-\|-\|-\|-\| \|avg\|41.73\|41.86\|44.20\| \|improve0\|0.3%\|0\|0\| \|improve1\|5.58%\|0\|0\| benchmark test based on Qwen2___5-VL-7B-Instruct TPOT result: \|\|forward async\|sync\| \|-\|-\|-\| \|avg\|23.22\|29.16\| \|improve\|20.3%\|0\| - vLLM version: main - vLLM main: https://github.com/vllm-project/vllm/commit/e93f4cc9e37484009f74e15d3111a1f335c532a5 Signed-off-by: jiangpeng36 <jiangpeng36@huawei.com> Signed-off-by: Ronald1995 <ronaldautomobile@163.com> Co-authored-by: jiangpeng36 <jiangpeng36@huawei.com> Co-authored-by: Ronald1995 <ronaldautomobile@163.com> 2025-09-11 16:35:36 +08:00

			`def test_async_scheduling() -> None:`
			`prompts = [`
			`"Hello, my name is",`
			`"The president of the United States is",`
			`"The capital of France is",`
			`"The future of AI is",`
			`] * 10`
			`sampling_params = SamplingParams(temperature=0.2,`
			`max_tokens=10,`
			`stop_token_ids=None)`

			`with VllmRunner(`
			`"Qwen/Qwen2.5-0.5B-Instruct",`
			`max_model_len=4096,`
			`max_num_seqs=50,`
			`dtype="bfloat16",`
			`gpu_memory_utilization=0.9,`
			`async_scheduling=True,`
			`) as vllm_model:`
			`vllm_model.generate(prompts, sampling_params=sampling_params)`