Upgrade vLLM to v0.10.0 (#1927)

### What this PR does / why we need it?
- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add the patch
`vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py`
as a workaround for
f3a683b7c9
on v0.10.0, and add the e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as a workaround for
https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test locally:
`VLLM_USE_MODELSCOPE=true pytest -sv
tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main:
7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
Yikun Jiang
2025-07-26 15:43:29 +08:00
committed by GitHub
parent 2f50304c19
commit 17a430f7b8
29 changed files with 198 additions and 251 deletions

View File

@@ -17,7 +17,7 @@
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type
import torch
import torch_npu
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm_ascend.ops.attention import vanilla_chunked_prefill
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
nd_to_nz_2d, nd_to_nz_spec)
class AscendAttentionBackend(AttentionBackend):
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):
@staticmethod
def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
if vllm_version_is("0.9.2"):
return AscendAttentionBackendImpl092
return AscendAttentionBackendImpl
@staticmethod
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
return output.view(num_tokens, self.hidden_size)
class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: Optional[List[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
) -> None:
super().__init__(
num_heads=num_heads,
head_size=head_size,
scale=scale,
num_kv_heads=num_kv_heads,
alibi_slopes=alibi_slopes,
sliding_window=sliding_window,
kv_cache_dtype=kv_cache_dtype,
logits_soft_cap=logits_soft_cap,
attn_type=attn_type,
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
use_irope=use_irope,
)
def unified_ascend_attention_with_output(
query: torch.Tensor,
key: torch.Tensor,

View File

@@ -16,7 +16,7 @@
#
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type
import numpy as np
import torch
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
nd_to_nz_2d, vllm_version_is)
nd_to_nz_2d)
class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,8 +41,6 @@ class AscendAttentionTorchairBackend(AttentionBackend):
@staticmethod
def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
if vllm_version_is("0.9.2"):
return AscendAttentionTorchairBackendImpl092
return AscendAttentionTorchairBackendImpl
@staticmethod
@@ -489,36 +487,3 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
"to use ascend scheduler.")
return output.view(num_tokens, self.hidden_size)
class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
):
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: Optional[List[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
) -> None:
super().__init__(
num_heads=num_heads,
head_size=head_size,
scale=scale,
num_kv_heads=num_kv_heads,
alibi_slopes=alibi_slopes,
sliding_window=sliding_window,
kv_cache_dtype=kv_cache_dtype,
logits_soft_cap=logits_soft_cap,
attn_type=attn_type,
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
use_irope=use_irope,
)

View File

@@ -1,12 +1,11 @@
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
TypeVar)
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
import numpy as np
import torch
import torch_npu
from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
AttentionMetadata, AttentionType,
AttentionMetadata,
MLAAttentionImpl)
from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm.config import get_current_vllm_config
@@ -22,7 +21,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context
from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import npu_prefetch, vllm_version_is
from vllm_ascend.utils import npu_prefetch
from vllm_ascend.worker.npu_input_batch import InputBatch
if TYPE_CHECKING:
@@ -54,8 +53,6 @@ class AscendMLABackend(AttentionBackend):
@staticmethod
def get_impl_cls() -> Type["MLAAttentionImpl"]:
if vllm_version_is("0.9.2"):
return AscendMLAImpl092
return AscendMLAImpl
@@ -1212,34 +1209,3 @@ class AscendMLAImpl(MLAAttentionImpl):
output[:num_decode_tokens] = output_decode
return output_padded
class AscendMLAImpl092(AscendMLAImpl):
def __init__(self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: Optional[List[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
**kwargs) -> None:
super().__init__(
num_heads=num_heads,
head_size=head_size,
scale=scale,
num_kv_heads=num_kv_heads,
alibi_slopes=alibi_slopes,
sliding_window=sliding_window,
kv_cache_dtype=kv_cache_dtype,
logits_soft_cap=logits_soft_cap,
attn_type=attn_type,
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
use_irope=use_irope,
**kwargs)

View File

@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
@@ -283,23 +281,12 @@ class AscendScheduler(Scheduler):
# allow the lower-priority requests to be scheduled.
req_index += 1
continue
if vllm_version_is("0.9.2"):
num_draft_tokens = max(
num_new_tokens + request.num_computed_tokens -
request.num_tokens, 0)
while True:
if vllm_version_is("0.9.2"):
new_blocks = self.kv_cache_manager.allocate_slots(
request,
num_new_tokens,
num_draft_tokens=num_draft_tokens,
num_lookahead_tokens=self.num_lookahead_tokens)
else:
new_blocks = self.kv_cache_manager.allocate_slots(
request,
num_new_tokens,
num_lookahead_tokens=self.num_lookahead_tokens)
new_blocks = self.kv_cache_manager.allocate_slots(
request,
num_new_tokens,
num_lookahead_tokens=self.num_lookahead_tokens)
if new_blocks is None:
# The request cannot be scheduled.
# Preempt the lowest-priority request.

View File

@@ -24,9 +24,9 @@
# each worker's `__init__` function.
#
# Then in each kind of patch, there are three folders:
# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2.
# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
# - patch_main: contains the patches applied when vllm version is main branch.
# - patch_common: contains the patches applied in both 0.9.2 and main branch.
# - patch_common: contains the patches applied in both 0.10.0 and main branch.
#
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ----------------------------------------------------------------------------------
@@ -101,3 +101,16 @@
# - https://github.com/vllm-project/vllm-ascend/pull/1732
# Future Plan:
# Revert it when the ascend scatter performance improves.
#
# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
# Why:
# We need to patch gather_logprobs so that it calls a batched_count_greater_than
# compiled with backend=current_platform.simple_compile_backend.
# How:
# Patch gather_logprobs to call the new batched_count_greater_than.
# Related PR (if no, explain why):
# - https://github.com/vllm-project/vllm/pull/21591
# Future Plan:
# Revert it when vLLM merges #21591 and releases a new version.

View File

@@ -17,8 +17,8 @@
from vllm_ascend.utils import vllm_version_is
# Import specific patches for different versions
if vllm_version_is("0.9.2"):
from vllm_ascend.patch.platform import patch_0_9_2 # noqa: F401
if vllm_version_is("0.10.0"):
from vllm_ascend.patch.platform import patch_0_10_0 # noqa: F401
from vllm_ascend.patch.platform import patch_common # noqa: F401
else:
from vllm_ascend.patch.platform import patch_common # noqa: F401

View File

@@ -18,8 +18,8 @@
from vllm_ascend.utils import vllm_version_is
# Import specific patches for different versions
if vllm_version_is("0.9.2"):
from vllm_ascend.patch.worker import patch_0_9_2 # noqa: F401
if vllm_version_is("0.10.0"):
from vllm_ascend.patch.worker import patch_0_10_0 # noqa: F401
from vllm_ascend.patch.worker import patch_common # noqa: F401
else:
from vllm_ascend.patch.worker import patch_common # noqa: F401

View File

@@ -14,3 +14,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import vllm_ascend.patch.worker.patch_0_10_0.patch_sampler_gather_logprobs # noqa

View File

@@ -0,0 +1,87 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from vllm.platforms import current_platform
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.sample.sampler import Sampler
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def batched_count_greater_than(x: torch.Tensor,
                               values: torch.Tensor) -> torch.Tensor:
    """Count, per row, the entries of ``x`` that are >= that row's value.

    Note that the comparison is inclusive (``>=``) even though the function
    is named "greater than": an entry equal to the reference value is
    counted as well.

    The function is wrapped in ``torch.compile`` with the current platform's
    ``simple_compile_backend``.
    NOTE(review): the original comment states that compiling avoids extra
    copies of the input tensors (and the resulting memory pressure); that
    cannot be confirmed from this file alone.

    Args:
        x: 2D tensor of shape (batch_size, n_elements).
        values: 2D tensor of shape (batch_size, 1).

    Returns:
        1D tensor of shape (batch_size,) holding the per-row counts.
    """
    at_least = x >= values
    return at_least.sum(dim=-1)
def gather_logprobs(
    self,
    logprobs: torch.Tensor,
    num_logprobs: int,
    token_ids: torch.Tensor,
) -> LogprobsTensors:
    """Collect the top-k logprobs plus the logprob of the given token.

    Args:
        logprobs: (num tokens) x (vocab) tensor.
        num_logprobs: number of top logprobs taken per token (passed to
            top-k as ``k``).
        token_ids: 1D tensor with (num tokens) elements holding the prompt
            tokens (for prompt logprobs) or the sampled tokens (for sample
            logprobs). Must be int64.

    Returns:
        A LogprobsTensors built from, in order:
          - int32 token indices, (num tokens) x (num_logprobs + 1), with
            ``token_ids`` in column 0 followed by the top-k indices
          - the matching logprobs, (num tokens) x (num_logprobs + 1)
          - the rank of each given token, (num tokens)
    """
    assert token_ids.dtype == torch.int64

    # Top-k values and their vocabulary positions along the last dimension.
    best_logprobs, best_ids = logprobs.topk(num_logprobs, dim=-1)

    # Turn the ids into a column so they can index the last dimension.
    chosen_ids = token_ids.unsqueeze(-1)
    chosen_logprobs = logprobs.gather(-1, chosen_ids)

    # Rank = number of entries in the row that are >= the token's logprob.
    ranks = batched_count_greater_than(logprobs, chosen_logprobs)

    # The given token goes first, then the top-k; indices are narrowed to
    # int32 to reduce the tensor size.
    merged_ids = torch.cat([chosen_ids, best_ids], dim=1).to(torch.int32)
    merged_logprobs = torch.cat([chosen_logprobs, best_logprobs], dim=1)

    return LogprobsTensors(merged_ids, merged_logprobs, ranks)
# Monkey-patch: when this module is imported, rebind Sampler.gather_logprobs
# to the gather_logprobs function defined in this module.
Sampler.gather_logprobs = gather_logprobs

View File

@@ -45,8 +45,9 @@ from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
is_pooling_model)
from vllm.model_executor.models.interfaces import supports_transcription
from vllm.model_executor.models.interfaces_base import (
VllmModelForPooling, is_pooling_model, is_text_generation_model)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -66,7 +67,7 @@ from vllm.v1.sample.sampler import Sampler
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
from vllm.v1.worker.utils import (gather_mm_placeholders,
from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders,
sanity_check_mm_encoder_outputs,
scatter_mm_placeholders)
@@ -88,15 +89,8 @@ from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if vllm_version_is("0.9.2"):
from vllm.model_executor.models.interfaces import has_step_pooler
from vllm.v1.utils import bind_kv_cache
else:
from vllm.model_executor.models.interfaces import supports_transcription
from vllm.model_executor.models.interfaces_base import \
is_text_generation_model
if not vllm_version_is("0.10.0"):
from vllm.tasks import GenerationTask, SupportedTask
from vllm.v1.worker.utils import bind_kv_cache
if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped]
@@ -409,7 +403,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else:
generator = None
if not vllm_version_is("0.9.2") and pooling_params:
if pooling_params:
assert (task := pooling_params.task) is not None, (
"You did not set `task` in the API")
model = cast(VllmModelForPooling, self.model)
@@ -585,10 +579,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# OPTIMIZATION: Start copying the block table first.
# This way, we can overlap the copy with the following CPU operations.
if vllm_version_is("0.9.2"):
self.input_batch.block_table.commit(num_reqs)
else:
self.input_batch.block_table.commit_block_table(num_reqs)
self.input_batch.block_table.commit_block_table(num_reqs)
# Get the number of scheduled tokens for each request.
req_ids = self.input_batch.req_ids
@@ -939,10 +930,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# OPTIMIZATION: Start copying the block table first.
# This way, we can overlap the copy with the following CPU operations.
if vllm_version_is("0.9.2"):
self.input_batch.block_table.commit(num_reqs)
else:
self.input_batch.block_table.commit_block_table(num_reqs)
self.input_batch.block_table.commit_block_table(num_reqs)
# Get the number of scheduled tokens for each request.
# TODO: The Python loop can be slow. Optimize.
@@ -1771,57 +1759,33 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_num_tokens = num_tokens // num_reqs
if vllm_version_is("0.9.2"):
dummy_metadata = PoolingMetadata(
prompt_lens=torch.tensor(
[h.shape[0] for h in hidden_states_list],
device=self.device),
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
dtype=torch.int32,
device=self.device),
pooling_params=[PoolingParams()] * num_reqs)
try:
pooler_output = self.model.pooler(
hidden_states=hidden_states_list,
pooling_metadata=dummy_metadata)
except RuntimeError as e:
if 'out of memory' in str(e):
raise RuntimeError(
"NPU out of memory occurred when warming up pooler with "
f"{num_reqs} dummy requests. Please try lowering "
"`max_num_seqs` or `gpu_memory_utilization` when "
"initializing the engine.") from e
else:
raise e
else:
model = cast(VllmModelForPooling, self.model)
dummy_task = self.get_supported_pooling_tasks()[0]
dummy_pooling_params = PoolingParams(task=dummy_task)
model = cast(VllmModelForPooling, self.model)
dummy_task = self.get_supported_pooling_tasks()[0]
dummy_pooling_params = PoolingParams(task=dummy_task)
to_update = model.pooler.get_pooling_updates(dummy_task)
to_update.apply(dummy_pooling_params)
to_update = model.pooler.get_pooling_updates(dummy_task)
to_update.apply(dummy_pooling_params)
dummy_metadata = PoolingMetadata(
prompt_lens=torch.tensor(
[h.shape[0] for h in hidden_states_list],
device=self.device),
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
dtype=torch.int32,
device=self.device),
pooling_params=[dummy_pooling_params] * num_reqs)
dummy_metadata = PoolingMetadata(
prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
device=self.device),
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
dtype=torch.int32,
device=self.device),
pooling_params=[dummy_pooling_params] * num_reqs)
try:
pooler_output = model.pooler(hidden_states=hidden_states_list,
pooling_metadata=dummy_metadata)
except RuntimeError as e:
if 'out of memory' in str(e):
raise RuntimeError(
"NPU out of memory occurred when warming up pooler with "
f"{num_reqs} dummy requests. Please try lowering "
"`max_num_seqs` or `gpu_memory_utilization` when "
"initializing the engine.") from e
else:
raise e
try:
pooler_output = model.pooler(hidden_states=hidden_states_list,
pooling_metadata=dummy_metadata)
except RuntimeError as e:
if 'out of memory' in str(e):
raise RuntimeError(
"NPU out of memory occurred when warming up pooler with "
f"{num_reqs} dummy requests. Please try lowering "
"`max_num_seqs` or `gpu_memory_utilization` when "
"initializing the engine.") from e
else:
raise e
return pooler_output
@@ -1841,9 +1805,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
QKVParallelLinear, RowParallelLinear)):
module.weight.data = torch_npu.npu_format_cast(
module.weight.data, ACL_FORMAT_FRACTAL_NZ)
if vllm_version_is("0.9.2") and has_step_pooler(self.model):
self.input_batch.logits_processing_needs_token_ids_bool = True
if self.drafter:
logger.info("Loading drafter model...")
if isinstance(self.drafter, EagleProposer):

View File

@@ -35,8 +35,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
from vllm.v1.utils import copy_slice
from vllm.v1.worker.block_table import MultiGroupBlockTable
from vllm_ascend.utils import vllm_version_is
_SAMPLING_EPS = 1e-5
@@ -246,11 +244,8 @@ class InputBatch:
# req_index -> bad_words_token_ids
self.bad_words_token_ids: dict[int, list[list[int]]] = {}
if vllm_version_is("0.9.2"):
self.logits_processing_needs_token_ids_bool = False
else:
self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
dtype=bool)
self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
dtype=bool)
self.req_output_token_ids: list[Optional[list[int]]] = []
@@ -387,9 +382,6 @@ class InputBatch:
if sampling_params.bad_words_token_ids:
self.bad_words_token_ids[
req_index] = sampling_params.bad_words_token_ids
elif vllm_version_is("0.9.2"):
assert request.pooling_params is not None
self.pooling_params[req_id] = request.pooling_params
elif pooling_params := request.pooling_params:
self.pooling_params[req_id] = pooling_params
self.logits_processing_needs_token_ids[req_index] = (
@@ -624,15 +616,10 @@ class InputBatch:
self.presence_penalties, num_reqs)
copy_slice(self.repetition_penalties_cpu_tensor,
self.repetition_penalties, num_reqs)
if vllm_version_is("0.9.2"):
needs_prompt_token_ids = (
not self.no_penalties
or (self.num_reqs > 0
and self.logits_processing_needs_token_ids_bool))
else:
needs_prompt_token_ids = (
not self.no_penalties
or self.logits_processing_needs_token_ids[:num_reqs].any())
needs_prompt_token_ids = (
not self.no_penalties
or self.logits_processing_needs_token_ids[:num_reqs].any())
if needs_prompt_token_ids:
# The prompt tokens are used only for applying penalties or
# step pooling during the sampling/pooling process.

View File

@@ -45,7 +45,7 @@ from vllm_ascend.utils import (sleep_mode_enabled, try_register_lib,
vllm_version_is)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
if not vllm_version_is("0.9.2"):
if not vllm_version_is("0.10.0"):
from vllm.tasks import SupportedTask