Clean up v0.9.1 code (#1672)

vLLM has released 0.9.2. This PR drops 0.9.1 support.

- vLLM version: v0.9.1
- vLLM main: b942c094e3

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-07-09 08:52:24 +08:00, committed by GitHub
parent 0d4bc03946
commit 830332ebfc
23 changed files with 205 additions and 846 deletions
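
Most of the diff below is mechanical: test call sites that previously branched on `vllm_version_is("0.9.1")` now use the 0.9.2+ API unconditionally (`pooling_params`/`pooler_output` passed directly, `scheduled_cached_reqs.num_reqs` asserted directly, `logitsprocs` always set). A minimal, self-contained sketch of the pattern being removed, using stub names rather than the real vLLM classes:

```python
def vllm_version_is(version: str) -> bool:
    # Stub for the helper in vllm_ascend.utils; pretend we run on vLLM 0.9.2+.
    return False

def make_output(**kwargs):
    # Stand-in for ModelRunnerOutput in the tests; just echoes its kwargs.
    return kwargs

# Before: fields that only exist on vLLM >= 0.9.2 were spread in conditionally
# at every call site.
old_style = make_output(
    logprobs=None,
    prompt_logprobs_dict={},
    **({"pooler_output": []} if not vllm_version_is("0.9.1") else {}),
)

# After: with 0.9.1 support dropped, the 0.9.2+ field is passed directly.
new_style = make_output(
    logprobs=None,
    prompt_logprobs_dict={},
    pooler_output=[],
)

# On 0.9.2+ the two forms are equivalent, which is why the branches can go.
assert old_style == new_style
```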


@@ -16,7 +16,6 @@ from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is
EOS_TOKEN_ID = 50256
@@ -140,9 +139,7 @@ def create_requests(num_requests: int,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
**({
"pooling_params": None
} if not vllm_version_is("0.9.1") else {}),
pooling_params=None,
)
requests.append(request)
return requests
@@ -201,10 +198,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
# Test initial scheduling
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
if vllm_version_is("0.9.1"):
assert len(output.scheduled_cached_reqs) == 0
else:
assert output.scheduled_cached_reqs.num_reqs == 0
assert output.scheduled_cached_reqs.num_reqs == 0
assert len(output.finished_req_ids) == 0
# Verify all requests are scheduled.
for req_id, num_tokens in output.num_scheduled_tokens.items():
@@ -241,10 +235,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == 3
if vllm_version_is("0.9.1"):
assert len(output.scheduled_cached_reqs) == 0
else:
assert output.scheduled_cached_reqs.num_reqs == 0
assert output.scheduled_cached_reqs.num_reqs == 0
assert len(output.finished_req_ids) == 0
# The first request is scheduled partially - 400.
@@ -264,9 +255,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(output, model_runner_output)
# Schedule the next step. All three requests are running.
@@ -274,10 +263,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
output1 = scheduler.schedule()
assert len(scheduler.running) == 3
assert len(output1.scheduled_new_reqs) == 0
if vllm_version_is("0.9.1"):
assert len(output1.scheduled_cached_reqs) == 3
else:
assert output1.scheduled_cached_reqs.num_reqs == 3
assert output1.scheduled_cached_reqs.num_reqs == 3
assert len(output1.finished_req_ids) == 0
assert output1.num_scheduled_tokens[requests[0].request_id] == 400
assert output1.num_scheduled_tokens[requests[1].request_id] == 400
@@ -293,18 +279,13 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(output1, model_runner_output)
output2 = scheduler.schedule()
assert len(scheduler.running) == 3
assert len(output2.scheduled_new_reqs) == 0
if vllm_version_is("0.9.1"):
assert len(output2.scheduled_cached_reqs) == 3
else:
assert output2.scheduled_cached_reqs.num_reqs == 3
assert output2.scheduled_cached_reqs.num_reqs == 3
assert len(output2.finished_req_ids) == 0
assert output2.num_scheduled_tokens[requests[0].request_id] == 1
assert output2.num_scheduled_tokens[requests[1].request_id] == 1
@@ -351,9 +332,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -402,9 +381,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -452,9 +429,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -497,9 +472,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -549,9 +522,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output0, model_runner_output)
@@ -569,9 +540,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output1, model_runner_output)
@@ -622,9 +591,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
spec_token_ids=spec_tokens,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
engine_core_outputs = scheduler.update_from_output(output,
model_runner_output)
@@ -657,16 +624,13 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
else:
assert req_id not in output.scheduled_spec_decode_tokens
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
model_runner_output = ModelRunnerOutput(req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
engine_core_outputs = scheduler.update_from_output(output,
model_runner_output)
@@ -695,9 +659,7 @@ def make_output(scheduler: AscendScheduler):
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
def assert_scheduler_empty(scheduler: AscendScheduler):


@@ -4,12 +4,12 @@ from typing import Any, Optional
import pytest
import torch
import torch.nn.functional as F
from vllm.v1.sample.logits_processor import LogitsProcessorManager
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm_ascend.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID,
AscendRejectionSampler)
from vllm_ascend.utils import vllm_version_is
DEVICE = "npu"
@@ -50,46 +50,23 @@ def create_sampling_metadata(
temperature = None
else:
assert temperature is not None
if vllm_version_is("0.9.1"):
return SamplingMetadata(
temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=top_p,
top_k=top_k,
min_p=torch.empty(1, ),
generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
frequency_penalties=torch.tensor([]),
presence_penalties=torch.tensor([]),
repetition_penalties=torch.tensor([]),
output_token_ids=[],
min_tokens={},
logit_bias=[None],
allowed_token_ids_mask=None,
bad_words_token_ids={},
)
else:
from vllm.v1.sample.logits_processor import LogitsProcessorManager
return SamplingMetadata(temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
frequency_penalties=torch.tensor([]),
presence_penalties=torch.tensor([]),
repetition_penalties=torch.tensor([]),
output_token_ids=[],
allowed_token_ids_mask=None,
bad_words_token_ids={},
logitsprocs=LogitsProcessorManager())
return SamplingMetadata(temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
frequency_penalties=torch.tensor([]),
presence_penalties=torch.tensor([]),
repetition_penalties=torch.tensor([]),
output_token_ids=[],
allowed_token_ids_mask=None,
bad_words_token_ids={},
logitsprocs=LogitsProcessorManager())
########################### Tests for Greedy Sampling ###################


@@ -19,12 +19,10 @@
from collections.abc import Sequence
from typing import Optional
import pytest
from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.conftest import HfRunner
from tests.utils import check_embeddings_close, matryoshka_fy
from vllm_ascend.utils import vllm_version_is
def run_embedding_correctness_test(
@@ -51,8 +49,6 @@ def test_dummy():
assert True
@pytest.mark.skipif(vllm_version_is("0.9.1"),
reason="vLLM 0.9.1 does not support embed task for v1")
def test_embed_models_correctness(hf_runner, vllm_runner):
queries = ['What is the capital of China?', 'Explain gravity']


@@ -21,12 +21,9 @@
Run `pytest tests/test_offline_inference.py`.
"""
import os
from unittest.mock import patch
import pytest
import vllm # noqa: F401
from modelscope import snapshot_download # type: ignore[import-untyped]
from vllm import SamplingParams
from vllm.assets.image import ImageAsset
import vllm_ascend # noqa: F401
@@ -106,24 +103,3 @@ def test_multimodal(model, prompt_template, vllm_runner):
vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
def test_models_topk() -> None:
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
max_model_len=8192,
dtype="float16",
enforce_eager=True,
gpu_memory_utilization=0.7) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


@@ -1,152 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
import pytest
import torch
from vllm.v1.sample.sampler import Sampler # noqa: F401
from vllm_ascend.utils import vllm_version_is
# Set tolerance to 1 for quant ops
DEFAULT_ATOL = 1e-3
DEFAULT_RTOL = 1e-3
def apply_min_p_new(
logits: torch.Tensor,
min_p: torch.Tensor,
) -> torch.Tensor:
"""
Filters logits using adaptive probability thresholding.
"""
if min_p == 0:
return logits
# Convert logits to probability distribution
probability_values = torch.nn.functional.softmax(logits, dim=-1)
# Calculate maximum probabilities per sequence
max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
# Reshape min_p for broadcasting
adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
# Identify valid tokens using threshold comparison
# Apply mask using boolean indexing
logits = logits.masked_fill(probability_values < adjusted_min_p,
-float('inf'))
return logits
def apply_top_k_top_p(
logits: torch.Tensor,
k: Optional[torch.Tensor],
p: Optional[torch.Tensor],
) -> torch.Tensor:
"""Apply top-k and top-p masks to the logits.
If a top-p is used, this function will sort the logits tensor,
which can be slow for large batches.
The logits tensor may be updated in-place.
"""
logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
if k is not None:
# Apply top-k.
top_k_mask = logits_sort.size(1) - k.to(torch.long) # shape: B
# Get all the top_k values.
top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1))
top_k_mask = logits_sort < top_k_mask
logits_sort.masked_fill_(top_k_mask, -float("inf"))
if p is not None:
# Apply top-p.
probs_sort = logits_sort.softmax(dim=-1)
probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort)
top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
# at least one
top_p_mask[:, -1] = False
logits_sort.masked_fill_(top_p_mask, -float("inf"))
# Re-sort the probabilities.
logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
return logits
def apply_top_k_top_p_new(
logits: torch.Tensor,
k: Optional[torch.Tensor],
p: Optional[torch.Tensor],
) -> torch.Tensor:
batch_size, vocab_size = logits.shape
logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
# Apply top-k.
boundary = logits_sort.gather(1, (vocab_size - k).unsqueeze(dim=1))
top_k_mask = logits_sort < boundary
logits_sort.masked_fill_(top_k_mask, -float("inf"))
if p is not None:
# Apply top-p.
cutoff = top_k_mask.sum(dim=-1).min()
probs_sort = logits_sort.softmax(dim=-1)[:, cutoff:]
probs_sum = probs_sort.cumsum(dim=-1)
top_p_mask = probs_sum > 1 - p.unsqueeze(dim=1)
top_p_mask[:, -1] = True
strides = torch.arange(0,
batch_size * vocab_size,
vocab_size,
device=logits.device)
flatten_idx = logits_idx[:, cutoff:] + strides.unsqueeze(dim=1)
valid_idx = torch.masked_select(flatten_idx, top_p_mask)
logits_flatten = logits.flatten()
valid_logits = torch.index_select(logits_flatten, 0, valid_idx)
logits = torch.empty_like(logits_flatten).fill_(-float("inf"))
logits[valid_idx] = valid_logits
return logits.reshape(batch_size, vocab_size)
# test with leading dimension and merge seqlen and batch_size as num_tokens
@pytest.mark.skipif(not vllm_version_is("0.9.1"),
reason="apply_min_p has been removed after vllm 0.9.1")
@torch.inference_mode()
def test_apply_min_p() -> None:
logits = torch.randn((128, 7168)).npu()
min_p = torch.Tensor([0.01]).npu()
logits_new = apply_min_p_new(logits, min_p)
sampler = Sampler()
logits_old = sampler.apply_min_p(logits, min_p)
# Compare the results.
torch.testing.assert_close(logits_new,
logits_old,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)
# test with leading dimension and merge seqlen and batch_size as num_tokens
@torch.inference_mode()
def test_apply_top_k_top_p() -> None:
logits = torch.randn((128, 7168)).npu()
k = torch.Tensor([-1]).int().npu()
p = torch.Tensor([1]).int().npu()
logits_new = apply_top_k_top_p_new(logits, k, p)
logits_old = apply_top_k_top_p(logits, k, p)
# Compare the results.
torch.testing.assert_close(logits_new,
logits_old,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)


@@ -31,7 +31,6 @@ from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is
EOS_TOKEN_ID = 50256
@@ -131,9 +130,7 @@ def create_requests(num_requests: int,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
**({
"pooling_params": None
} if not vllm_version_is("0.9.1") else {}),
pooling_params=None,
)
requests.append(request)
return requests
@@ -192,10 +189,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
# Test initial scheduling
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
if vllm_version_is("0.9.1"):
assert len(output.scheduled_cached_reqs) == 0
else:
assert output.scheduled_cached_reqs.num_reqs == 0
assert output.scheduled_cached_reqs.num_reqs == 0
assert len(output.finished_req_ids) == 0
# Verify all requests are scheduled.
for req_id, num_tokens in output.num_scheduled_tokens.items():
@@ -245,9 +239,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -294,9 +286,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -342,9 +332,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -386,9 +374,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)