[CI] Drop ascend scheduler from test (#4613)

Drop ascend scheduler from test - vLLM version: v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-02 13:18:17 +08:00
parent 6360eb1dea
commit 400af665e6
7 changed files with 1 additions and 165 deletions
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -1,118 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from vllm import SamplingParams
-
-from tests.e2e.conftest import VllmRunner
-from tests.e2e.model_utils import check_outputs_equal
-
-MODEL = "Qwen/Qwen3-0.6B"
-
-
-@pytest.mark.parametrize("enforce_eager", [True, False])
-def test_concurrent_partial_prefill(enforce_eager):
-    with VllmRunner(MODEL,
-                    max_num_seqs=3,
-                    max_num_batched_tokens=8192,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
-                                            3)
-        assert len(outputs) == 3
-        for output in outputs:
-            assert len(output.outputs) == 1
-
-
-@pytest.mark.parametrize("enforce_eager", [True, False])
-def test_prefix_cache_stats_is_recorded(enforce_eager):
-    with VllmRunner(MODEL,
-                    max_num_seqs=3,
-                    max_num_batched_tokens=8192,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        # 17 tokens will make sure first 16 tokens are cached in a block
-        input_tokens = {"prompt_token_ids": [101] * 129}
-        _ = vllm_model.model.generate([input_tokens])
-        outputs = vllm_model.model.generate([input_tokens])
-        assert outputs[0].num_cached_tokens == 128
-
-
-@pytest.mark.parametrize("max_tokens",
-                         [4])  # cannot align results when max_tokens > 4
-@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
-def test_chunked_prefill_with_scheduler_dynamic_batch(
-        max_tokens: int, chunked_prefill_token_size: int) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
-    ]
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'SLO_limits_for_dynamic_batch': 0,
-                    },
-                    max_num_seqs=max_num_seqs,
-                    max_num_batched_tokens=max_num_batched_tokens,
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        dynamic_batch_output = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'SLO_limits_for_dynamic_batch': -1,
-                    },
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=dynamic_batch_output,
-        name_0="vllm_output",
-        name_1="chunked_prefill_output",
-    )
-
-
-def test_async_scheduling_eager() -> None:
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ] * 10
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=10,
-                                     stop_token_ids=None)
-
-    with VllmRunner(
-            "Qwen/Qwen2.5-0.5B-Instruct",
-            max_model_len=4096,
-            max_num_seqs=50,
-            dtype="bfloat16",
-            gpu_memory_utilization=0.9,
-            async_scheduling=True,
-    ) as vllm_model:
-        vllm_model.generate(prompts, sampling_params=sampling_params)
-
-
-def test_async_scheduling_with_full_graph() -> None:
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ] * 10
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=10,
-                                     stop_token_ids=None)
-
-    with VllmRunner("Qwen/Qwen3-8B",
-                    max_model_len=4096,
-                    max_num_seqs=50,
-                    dtype="bfloat16",
-                    gpu_memory_utilization=0.9,
-                    async_scheduling=True,
-                    compilation_config={"cudagraph_mode":
-                                        "FULL"}) as vllm_model:
-        vllm_model.generate(prompts, sampling_params=sampling_params)