Clean up v0.9.1 code (#1672)

vLLM has released 0.9.2. This PR drops 0.9.1 support. - vLLM version: v0.9.1 - vLLM main: b942c094e3 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-09 08:52:24 +08:00
parent 0d4bc03946
commit 830332ebfc
23 changed files with 205 additions and 846 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -73,28 +73,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
        vllm_model.generate_greedy(example_prompts, max_tokens)


-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
-def test_models_distributed_topk() -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
 def test_models_distributed_DeepSeek_dbo():
    example_prompts = ["The president of the United States is"] * 41
--- a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler.py
@@ -16,7 +16,6 @@ from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager

 from vllm_ascend.core.scheduler import AscendScheduler
-from vllm_ascend.utils import vllm_version_is

 EOS_TOKEN_ID = 50256

@@ -140,9 +139,7 @@ def create_requests(num_requests: int,
            multi_modal_placeholders=mm_position,
            multi_modal_hashes=None,
            eos_token_id=EOS_TOKEN_ID,
-            **({
-                "pooling_params": None
-            } if not vllm_version_is("0.9.1") else {}),
+            pooling_params=None,
        )
        requests.append(request)
    return requests
@@ -201,10 +198,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
    # Test initial scheduling
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == len(requests)
-    if vllm_version_is("0.9.1"):
-        assert len(output.scheduled_cached_reqs) == 0
-    else:
-        assert output.scheduled_cached_reqs.num_reqs == 0
+    assert output.scheduled_cached_reqs.num_reqs == 0
    assert len(output.finished_req_ids) == 0
    # Verify all requests are scheduled.
    for req_id, num_tokens in output.num_scheduled_tokens.items():
@@ -241,10 +235,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 3
-    if vllm_version_is("0.9.1"):
-        assert len(output.scheduled_cached_reqs) == 0
-    else:
-        assert output.scheduled_cached_reqs.num_reqs == 0
+    assert output.scheduled_cached_reqs.num_reqs == 0
    assert len(output.finished_req_ids) == 0

    # The first request is scheduled partially - 400.
@@ -264,9 +255,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])
    scheduler.update_from_output(output, model_runner_output)

    # Schedule the next step. All three requests are running.
@@ -274,10 +263,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
    output1 = scheduler.schedule()
    assert len(scheduler.running) == 3
    assert len(output1.scheduled_new_reqs) == 0
-    if vllm_version_is("0.9.1"):
-        assert len(output1.scheduled_cached_reqs) == 3
-    else:
-        assert output1.scheduled_cached_reqs.num_reqs == 3
+    assert output1.scheduled_cached_reqs.num_reqs == 3
    assert len(output1.finished_req_ids) == 0
    assert output1.num_scheduled_tokens[requests[0].request_id] == 400
    assert output1.num_scheduled_tokens[requests[1].request_id] == 400
@@ -293,18 +279,13 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(output1, model_runner_output)
    output2 = scheduler.schedule()
    assert len(scheduler.running) == 3
    assert len(output2.scheduled_new_reqs) == 0
-    if vllm_version_is("0.9.1"):
-        assert len(output2.scheduled_cached_reqs) == 3
-    else:
-        assert output2.scheduled_cached_reqs.num_reqs == 3
+    assert output2.scheduled_cached_reqs.num_reqs == 3
    assert len(output2.finished_req_ids) == 0
    assert output2.num_scheduled_tokens[requests[0].request_id] == 1
    assert output2.num_scheduled_tokens[requests[1].request_id] == 1
@@ -351,9 +332,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -402,9 +381,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -452,9 +429,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -497,9 +472,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -549,9 +522,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output0, model_runner_output)

@@ -569,9 +540,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output1, model_runner_output)

@@ -622,9 +591,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
        spec_token_ids=spec_tokens,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    engine_core_outputs = scheduler.update_from_output(output,
                                                       model_runner_output)
@@ -657,16 +624,13 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
        else:
            assert req_id not in output.scheduled_spec_decode_tokens

-    model_runner_output = ModelRunnerOutput(
-        req_ids=req_ids,
-        req_id_to_index=req_to_index,
-        sampled_token_ids=output_tokens,
-        spec_token_ids=None,
-        logprobs=None,
-        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+    model_runner_output = ModelRunnerOutput(req_ids=req_ids,
+                                            req_id_to_index=req_to_index,
+                                            sampled_token_ids=output_tokens,
+                                            spec_token_ids=None,
+                                            logprobs=None,
+                                            prompt_logprobs_dict={},
+                                            pooler_output=[])

    engine_core_outputs = scheduler.update_from_output(output,
                                                       model_runner_output)
@@ -695,9 +659,7 @@ def make_output(scheduler: AscendScheduler):
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])


 def assert_scheduler_empty(scheduler: AscendScheduler):
--- a/tests/e2e/singlecard/sample/test_rejection_sampler.py
+++ b/tests/e2e/singlecard/sample/test_rejection_sampler.py
@@ -4,12 +4,12 @@ from typing import Any, Optional
 import pytest
 import torch
 import torch.nn.functional as F
+from vllm.v1.sample.logits_processor import LogitsProcessorManager
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata

 from vllm_ascend.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID,
                                                  AscendRejectionSampler)
-from vllm_ascend.utils import vllm_version_is

 DEVICE = "npu"

@@ -50,46 +50,23 @@ def create_sampling_metadata(
        temperature = None
    else:
        assert temperature is not None
-    if vllm_version_is("0.9.1"):
-        return SamplingMetadata(
-            temperature=temperature,
-            all_greedy=all_greedy,
-            all_random=not all_greedy,
-            top_p=top_p,
-            top_k=top_k,
-            min_p=torch.empty(1, ),
-            generators=generators,
-            max_num_logprobs=0,
-            no_penalties=False,
-            prompt_token_ids=None,
-            frequency_penalties=torch.tensor([]),
-            presence_penalties=torch.tensor([]),
-            repetition_penalties=torch.tensor([]),
-            output_token_ids=[],
-            min_tokens={},
-            logit_bias=[None],
-            allowed_token_ids_mask=None,
-            bad_words_token_ids={},
-        )
-    else:
-        from vllm.v1.sample.logits_processor import LogitsProcessorManager

-        return SamplingMetadata(temperature=temperature,
-                                all_greedy=all_greedy,
-                                all_random=not all_greedy,
-                                top_p=top_p,
-                                top_k=top_k,
-                                generators=generators,
-                                max_num_logprobs=0,
-                                no_penalties=False,
-                                prompt_token_ids=None,
-                                frequency_penalties=torch.tensor([]),
-                                presence_penalties=torch.tensor([]),
-                                repetition_penalties=torch.tensor([]),
-                                output_token_ids=[],
-                                allowed_token_ids_mask=None,
-                                bad_words_token_ids={},
-                                logitsprocs=LogitsProcessorManager())
+    return SamplingMetadata(temperature=temperature,
+                            all_greedy=all_greedy,
+                            all_random=not all_greedy,
+                            top_p=top_p,
+                            top_k=top_k,
+                            generators=generators,
+                            max_num_logprobs=0,
+                            no_penalties=False,
+                            prompt_token_ids=None,
+                            frequency_penalties=torch.tensor([]),
+                            presence_penalties=torch.tensor([]),
+                            repetition_penalties=torch.tensor([]),
+                            output_token_ids=[],
+                            allowed_token_ids_mask=None,
+                            bad_words_token_ids={},
+                            logitsprocs=LogitsProcessorManager())


 ########################### Tests for Greedy Sampling ###################
--- a/tests/e2e/singlecard/test_embedding.py
+++ b/tests/e2e/singlecard/test_embedding.py
@@ -19,12 +19,10 @@
 from collections.abc import Sequence
 from typing import Optional

-import pytest
 from modelscope import snapshot_download  # type: ignore[import-untyped]

 from tests.conftest import HfRunner
 from tests.utils import check_embeddings_close, matryoshka_fy
-from vllm_ascend.utils import vllm_version_is


 def run_embedding_correctness_test(
@@ -51,8 +49,6 @@ def test_dummy():
    assert True


-@pytest.mark.skipif(vllm_version_is("0.9.1"),
-                    reason="vLLM 0.9.1 does not support embed task for v1")
 def test_embed_models_correctness(hf_runner, vllm_runner):
    queries = ['What is the capital of China?', 'Explain gravity']

--- a/tests/e2e/singlecard/test_offline_inference.py
+++ b/tests/e2e/singlecard/test_offline_inference.py
@@ -21,12 +21,9 @@
 Run `pytest tests/test_offline_inference.py`.
 """
 import os
-from unittest.mock import patch

 import pytest
-import vllm  # noqa: F401
 from modelscope import snapshot_download  # type: ignore[import-untyped]
-from vllm import SamplingParams
 from vllm.assets.image import ImageAsset

 import vllm_ascend  # noqa: F401
@@ -106,24 +103,3 @@ def test_multimodal(model, prompt_template, vllm_runner):
        vllm_model.generate_greedy(prompts=prompts,
                                   images=images,
                                   max_tokens=64)
-
-
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
-def test_models_topk() -> None:
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-
-    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
-                    max_model_len=8192,
-                    dtype="float16",
-                    enforce_eager=True,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
--- a/tests/e2e/singlecard/test_sampler.py
+++ b/tests/e2e/singlecard/test_sampler.py
@@ -1,152 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Optional
-
-import pytest
-import torch
-from vllm.v1.sample.sampler import Sampler  # noqa: F401
-
-from vllm_ascend.utils import vllm_version_is
-
-# Set tolerance to 1 for quant ops
-DEFAULT_ATOL = 1e-3
-DEFAULT_RTOL = 1e-3
-
-
-def apply_min_p_new(
-    logits: torch.Tensor,
-    min_p: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Filters logits using adaptive probability thresholding.
-    """
-    if min_p == 0:
-        return logits
-    # Convert logits to probability distribution
-    probability_values = torch.nn.functional.softmax(logits, dim=-1)
-    # Calculate maximum probabilities per sequence
-    max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
-    # Reshape min_p for broadcasting
-    adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
-    # Identify valid tokens using threshold comparison
-    # Apply mask using boolean indexing
-    logits = logits.masked_fill(probability_values < adjusted_min_p,
-                                -float('inf'))
-    return logits
-
-
-def apply_top_k_top_p(
-    logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
-) -> torch.Tensor:
-    """Apply top-k and top-p masks to the logits.
-
-    If a top-p is used, this function will sort the logits tensor,
-    which can be slow for large batches.
-
-    The logits tensor may be updated in-place.
-    """
-    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
-
-    if k is not None:
-        # Apply top-k.
-        top_k_mask = logits_sort.size(1) - k.to(torch.long)  # shape: B
-        # Get all the top_k values.
-        top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1))
-        top_k_mask = logits_sort < top_k_mask
-        logits_sort.masked_fill_(top_k_mask, -float("inf"))
-
-    if p is not None:
-        # Apply top-p.
-        probs_sort = logits_sort.softmax(dim=-1)
-        probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort)
-        top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
-        # at least one
-        top_p_mask[:, -1] = False
-        logits_sort.masked_fill_(top_p_mask, -float("inf"))
-
-    # Re-sort the probabilities.
-    logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
-    return logits
-
-
-def apply_top_k_top_p_new(
-    logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
-) -> torch.Tensor:
-    batch_size, vocab_size = logits.shape
-    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
-
-    # Apply top-k.
-    boundary = logits_sort.gather(1, (vocab_size - k).unsqueeze(dim=1))
-    top_k_mask = logits_sort < boundary
-    logits_sort.masked_fill_(top_k_mask, -float("inf"))
-
-    if p is not None:
-        # Apply top-p.
-        cutoff = top_k_mask.sum(dim=-1).min()
-        probs_sort = logits_sort.softmax(dim=-1)[:, cutoff:]
-        probs_sum = probs_sort.cumsum(dim=-1)
-        top_p_mask = probs_sum > 1 - p.unsqueeze(dim=1)
-        top_p_mask[:, -1] = True
-        strides = torch.arange(0,
-                               batch_size * vocab_size,
-                               vocab_size,
-                               device=logits.device)
-        flatten_idx = logits_idx[:, cutoff:] + strides.unsqueeze(dim=1)
-        valid_idx = torch.masked_select(flatten_idx, top_p_mask)
-        logits_flatten = logits.flatten()
-        valid_logits = torch.index_select(logits_flatten, 0, valid_idx)
-        logits = torch.empty_like(logits_flatten).fill_(-float("inf"))
-        logits[valid_idx] = valid_logits
-    return logits.reshape(batch_size, vocab_size)
-
-
-# test with leading dimension and merge seqlen and batch_size as num_tokens
-@pytest.mark.skipif(not vllm_version_is("0.9.1"),
-                    reason="apply_min_p has been removed after vllm 0.9.1")
-@torch.inference_mode()
-def test_apply_min_p() -> None:
-    logits = torch.randn((128, 7168)).npu()
-    min_p = torch.Tensor([0.01]).npu()
-    logits_new = apply_min_p_new(logits, min_p)
-    sampler = Sampler()
-    logits_old = sampler.apply_min_p(logits, min_p)
-    # Compare the results.
-    torch.testing.assert_close(logits_new,
-                               logits_old,
-                               atol=DEFAULT_ATOL,
-                               rtol=DEFAULT_RTOL)
-
-
-# test with leading dimension and merge seqlen and batch_size as num_tokens
-@torch.inference_mode()
-def test_apply_top_k_top_p() -> None:
-    logits = torch.randn((128, 7168)).npu()
-    k = torch.Tensor([-1]).int().npu()
-    p = torch.Tensor([1]).int().npu()
-    logits_new = apply_top_k_top_p_new(logits, k, p)
-    logits_old = apply_top_k_top_p(logits, k, p)
-    # Compare the results.
-    torch.testing.assert_close(logits_new,
-                               logits_old,
-                               atol=DEFAULT_ATOL,
-                               rtol=DEFAULT_RTOL)
--- a/tests/e2e/singlecard/test_scheduler.py
+++ b/tests/e2e/singlecard/test_scheduler.py
@@ -31,7 +31,6 @@ from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager

 from vllm_ascend.core.scheduler import AscendScheduler
-from vllm_ascend.utils import vllm_version_is

 EOS_TOKEN_ID = 50256

@@ -131,9 +130,7 @@ def create_requests(num_requests: int,
            multi_modal_placeholders=mm_position,
            multi_modal_hashes=None,
            eos_token_id=EOS_TOKEN_ID,
-            **({
-                "pooling_params": None
-            } if not vllm_version_is("0.9.1") else {}),
+            pooling_params=None,
        )
        requests.append(request)
    return requests
@@ -192,10 +189,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
    # Test initial scheduling
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == len(requests)
-    if vllm_version_is("0.9.1"):
-        assert len(output.scheduled_cached_reqs) == 0
-    else:
-        assert output.scheduled_cached_reqs.num_reqs == 0
+    assert output.scheduled_cached_reqs.num_reqs == 0
    assert len(output.finished_req_ids) == 0
    # Verify all requests are scheduled.
    for req_id, num_tokens in output.num_scheduled_tokens.items():
@@ -245,9 +239,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -294,9 +286,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -342,9 +332,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)

@@ -386,9 +374,7 @@ def test_stop_via_update_from_output():
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
-        **({
-            "pooler_output": []
-        } if not vllm_version_is("0.9.1") else {}))
+        pooler_output=[])

    scheduler.update_from_output(scheduler_output, model_output)