[CI] drop ascend scheduler test (#4582)
Let's drop the ascend scheduler test first to ensure all functions work without it.

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
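The change itself is mechanical: the `additional_config={"ascend_scheduler_config": ...}` entries are stripped from the e2e tests, and tests that exist only to exercise the Ascend scheduler are deleted outright, so every runner falls back to the default vLLM scheduler. Below is a minimal sketch of the post-change pattern, assuming the `VllmRunner` helper from `tests.e2e.conftest` and the `Qwen/Qwen3-0.6B` model already used by these tests; the prompt and token count are illustrative placeholders, not part of this patch.

    from tests.e2e.conftest import VllmRunner

    # No additional_config / ascend_scheduler_config entry: the default vLLM
    # scheduler is exercised. Model and prompt are illustrative only.
    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=True) as vllm_model:
        outputs = vllm_model.generate_greedy(
            ["vLLM is a high-throughput inference engine."], 4)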
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
    if graph_mode == CUDAGraphMode.FULL:
        graph_mode_str = "FULL_DECODE_ONLY"

    with VllmRunner(
            model_name,
            tensor_parallel_size=1,
            max_num_seqs=256,
            gpu_memory_utilization=0.7,
            distributed_executor_backend="mp",
            enable_expert_parallel=True,
            speculative_config={
                "method": "deepseek_mtp",
                "num_speculative_tokens": num_speculative_tokens,
                "disable_padded_drafter_batch": disable_padded_drafter_batch,
            },
            enforce_eager=enforce_eager,
            max_model_len=2000,
            compilation_config=CompilationConfig(
                cudagraph_mode=graph_mode_str,
                cudagraph_capture_sizes=[12],
            ),
            additional_config={"ascend_scheduler_config": {
                "enabled": False
            }}) as spec_llm:
    with VllmRunner(model_name,
                    tensor_parallel_size=1,
                    max_num_seqs=256,
                    gpu_memory_utilization=0.7,
                    distributed_executor_backend="mp",
                    enable_expert_parallel=True,
                    speculative_config={
                        "method":
                        "deepseek_mtp",
                        "num_speculative_tokens":
                        num_speculative_tokens,
                        "disable_padded_drafter_batch":
                        disable_padded_drafter_batch,
                    },
                    enforce_eager=enforce_eager,
                    max_model_len=2000,
                    compilation_config=CompilationConfig(
                        cudagraph_mode=graph_mode_str,
                        cudagraph_capture_sizes=[12],
                    )) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

    matches = 0
@@ -12,11 +12,6 @@ MODEL = "Qwen/Qwen3-0.6B"
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
@@ -31,11 +26,6 @@ def test_concurrent_partial_prefill(enforce_eager):
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_prefix_cache_stats_is_recorded(enforce_eager):
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
@@ -47,48 +37,6 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
    assert outputs[0].num_cached_tokens == 128


@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_ascend_scheduler(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )


@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
@@ -1,82 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.

Run `pytest tests/compile/test_aclgraph.py`.
"""
import gc

import pytest
import torch
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
    model: str,
    max_tokens: int,
) -> None:
    prompts = ["The president of the United States is"]

    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=0.0,
    )

    with VllmRunner(model,
                    long_prefill_token_threshold=20,
                    enforce_eager=False) as vllm_model:
        output1 = vllm_model.generate(prompts, sampling_params)

    with VllmRunner(model,
                    enforce_eager=False,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True
                        },
                    }) as vllm_model:
        output2 = vllm_model.generate(prompts, sampling_params)

    # Extract the generated token IDs for comparison
    token_ids1 = output1[0][0][0]
    token_ids2 = output2[0][0][0]

    print(f"Token IDs 1: {token_ids1}")
    print(f"Token IDs 2: {token_ids2}")

    # Convert token IDs to tensors and calculate cosine similarity
    # Take the length of a shorter sequence to ensure consistent dimensions
    min_len = min(len(token_ids1), len(token_ids2))

    tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
    tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)

    # Calculate similarity using torch.cosine_similarity
    similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
    print(f"Token IDs cosine similarity: {similarity.item()}")

    assert similarity > 0.95

    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()
@@ -20,7 +20,6 @@

Run `pytest tests/test_offline_inference.py`.
"""
import pytest
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
        assert output_str, "Generated output should not be empty."


@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
                  "Add this back after fixing the issue.")
def test_multimodal_ascend_scheduler(prompt_template):
    image = ImageAsset("cherry_blossom") \
        .pil_image.convert("RGB")
    img_questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
    images = [image] * len(img_questions)
    prompts = prompt_template(img_questions)
    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
                    max_model_len=4096,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    mm_processor_kwargs={
                        "min_pixels": 28 * 28,
                        "max_pixels": 1280 * 28 * 28,
                        "fps": 1,
                    },
                    enforce_eager=True) as vllm_model:
        outputs = vllm_model.generate_greedy(prompts=prompts,
                                             images=images,
                                             max_tokens=64)
    assert len(outputs) == len(prompts)
    for _, output_str in outputs:
        assert output_str, "Generated output should not be empty."


def test_multimodal_audio():
    audio_prompt = "".join([
        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"