Upgrade vLLM to v0.10.0 (#1927)
### What this PR does / why we need it?
- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add patch for `vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py` as workaround of f3a683b7c9 for v0.10.0 and also add e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as workaround of https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main: 7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
|
||||
nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
|
||||
nd_to_nz_2d, nd_to_nz_spec)
|
||||
|
||||
|
||||
class AscendAttentionBackend(AttentionBackend):
|
||||
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendAttentionBackendImpl092
|
||||
return AscendAttentionBackendImpl
|
||||
|
||||
@staticmethod
|
||||
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
return output.view(num_tokens, self.hidden_size)
|
||||
|
||||
|
||||
class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
)
|
||||
|
||||
|
||||
def unified_ascend_attention_with_output(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
|
||||
nd_to_nz_2d, vllm_version_is)
|
||||
nd_to_nz_2d)
|
||||
|
||||
|
||||
class AscendAttentionTorchairBackend(AttentionBackend):
|
||||
@@ -41,8 +41,6 @@ class AscendAttentionTorchairBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendAttentionTorchairBackendImpl092
|
||||
return AscendAttentionTorchairBackendImpl
|
||||
|
||||
@staticmethod
|
||||
@@ -489,36 +487,3 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
|
||||
"to use ascend scheduler.")
|
||||
|
||||
return output.view(num_tokens, self.hidden_size)
|
||||
|
||||
|
||||
class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
|
||||
):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
)
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
|
||||
TypeVar)
|
||||
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
|
||||
AttentionMetadata, AttentionType,
|
||||
AttentionMetadata,
|
||||
MLAAttentionImpl)
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.config import get_current_vllm_config
|
||||
@@ -22,7 +21,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context
|
||||
from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
|
||||
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
|
||||
from vllm_ascend.utils import npu_prefetch, vllm_version_is
|
||||
from vllm_ascend.utils import npu_prefetch
|
||||
from vllm_ascend.worker.npu_input_batch import InputBatch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -54,8 +53,6 @@ class AscendMLABackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["MLAAttentionImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendMLAImpl092
|
||||
return AscendMLAImpl
|
||||
|
||||
|
||||
@@ -1212,34 +1209,3 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
output[:num_decode_tokens] = output_decode
|
||||
|
||||
return output_padded
|
||||
|
||||
|
||||
class AscendMLAImpl092(AscendMLAImpl):
|
||||
|
||||
def __init__(self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
**kwargs) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
**kwargs)
|
||||
|
||||
@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendScheduler(Scheduler):
|
||||
"""This Scheduler extends vllm's original v1 scheduler
|
||||
@@ -283,23 +281,12 @@ class AscendScheduler(Scheduler):
|
||||
# allow the lower-priority requests to be scheduled.
|
||||
req_index += 1
|
||||
continue
|
||||
if vllm_version_is("0.9.2"):
|
||||
num_draft_tokens = max(
|
||||
num_new_tokens + request.num_computed_tokens -
|
||||
request.num_tokens, 0)
|
||||
|
||||
while True:
|
||||
if vllm_version_is("0.9.2"):
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens,
|
||||
num_draft_tokens=num_draft_tokens,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens)
|
||||
else:
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens)
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens)
|
||||
if new_blocks is None:
|
||||
# The request cannot be scheduled.
|
||||
# Preempt the lowest-priority request.
|
||||
|
||||
@@ -24,9 +24,9 @@
|
||||
# each worker's `__init__` function.
|
||||
#
|
||||
# Then in each kind of patch, there are three folders:
|
||||
# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2.
|
||||
# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
|
||||
# - patch_main: contains the patches applied when vllm version is main branch.
|
||||
# - patch_common: contains the patches applied in both 0.9.2 and main branch.
|
||||
# - patch_common: contains the patches applied in both 0.10.0 and main branch.
|
||||
#
|
||||
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
|
||||
# ----------------------------------------------------------------------------------
|
||||
@@ -101,3 +101,16 @@
|
||||
# - https://github.com/vllm-project/vllm-ascend/pull/1732
|
||||
# Future Plan:
|
||||
# Revert it when the ascend scatter performance improves.
|
||||
#
|
||||
# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py **
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
|
||||
# Why:
|
||||
# We need to patch gather_logprobs to make sure it calls batched_count_greater_than
|
||||
# with backend=current_platform.simple_compile_backend
|
||||
# How:
|
||||
# Patch gather_logprobs to call the new batched_count_greater_than
|
||||
# Related PR (if no, explain why):
|
||||
# - https://github.com/vllm-project/vllm/pull/21591
|
||||
# Future Plan:
|
||||
# Revert it when vLLM merges #21591 and releases a new version
|
||||
|
||||
@@ -17,8 +17,8 @@
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# Import specific patches for different versions
|
||||
if vllm_version_is("0.9.2"):
|
||||
from vllm_ascend.patch.platform import patch_0_9_2 # noqa: F401
|
||||
if vllm_version_is("0.10.0"):
|
||||
from vllm_ascend.patch.platform import patch_0_10_0 # noqa: F401
|
||||
from vllm_ascend.patch.platform import patch_common # noqa: F401
|
||||
else:
|
||||
from vllm_ascend.patch.platform import patch_common # noqa: F401
|
||||
|
||||
@@ -18,8 +18,8 @@
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# Import specific patches for different versions
|
||||
if vllm_version_is("0.9.2"):
|
||||
from vllm_ascend.patch.worker import patch_0_9_2 # noqa: F401
|
||||
if vllm_version_is("0.10.0"):
|
||||
from vllm_ascend.patch.worker import patch_0_10_0 # noqa: F401
|
||||
from vllm_ascend.patch.worker import patch_common # noqa: F401
|
||||
else:
|
||||
from vllm_ascend.patch.worker import patch_common # noqa: F401
|
||||
|
||||
@@ -14,3 +14,5 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import vllm_ascend.patch.worker.patch_0_10_0.patch_sampler_gather_logprobs # noqa
|
||||
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import torch
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.outputs import LogprobsTensors
|
||||
from vllm.v1.sample.sampler import Sampler
|
||||
|
||||
|
||||
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def batched_count_greater_than(x: torch.Tensor,
                               values: torch.Tensor) -> torch.Tensor:
    """
    Count, for every row of ``x``, the entries that are at least as large as
    that row's entry in ``values``.

    Note that, despite the function name, the comparison is inclusive
    (``>=``), so an element equal to the threshold is counted as well.

    The function is wrapped in ``torch.compile`` so that an optimized kernel
    is generated; per the original author's note, the uncompiled form creates
    additional copies of the input tensors and causes memory issues.

    Args:
        x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements).
        values (torch.Tensor): A 2D tensor of shape (batch_size, 1).

    Returns:
        torch.Tensor: A 1D tensor of shape (batch_size,) with the counts.
    """
    # Boolean mask of entries at or above the per-row threshold; summing over
    # the last dimension turns the mask into a per-row count.
    at_or_above = x >= values
    return at_or_above.sum(-1)
|
||||
|
||||
|
||||
def gather_logprobs(
    self,
    logprobs: torch.Tensor,
    num_logprobs: int,
    token_ids: torch.Tensor,
) -> LogprobsTensors:
    """
    Collect the top-k logprobs together with the logprob and rank of the
    sampled/prompt token for every position.

    Args:
      logprobs: (num tokens) x (vocab) tensor
      num_logprobs: number of top entries to keep per token (the ``k``
                    passed to ``torch.topk``)
      token_ids: prompt tokens (if prompt logprobs) or sampled tokens
                 (if sampled logprobs); 1D token ID tensor with
                 (num tokens) elements. Must be int64.

    Returns:
      A ``LogprobsTensors`` built from, in order:
        int32 indices tensor, (num tokens) x (num_logprobs + 1), with the
          given token's id in column 0 followed by the top-k ids
        logprobs tensor, (num tokens) x (num_logprobs + 1), laid out the
          same way
        token rank tensor, (num tokens)
    """
    assert token_ids.dtype == torch.int64

    # Best `num_logprobs` entries along the vocab dimension.
    best_logprobs, best_indices = torch.topk(logprobs, num_logprobs, dim=-1)

    # Look up the logprob of the actual (prompt or sampled) token; gather
    # needs the ids as a column vector.
    token_column = token_ids.unsqueeze(-1)
    chosen_logprobs = logprobs.gather(-1, token_column)

    # Rank of the actual token = number of vocab entries whose logprob is
    # at least the token's own logprob.
    ranks = batched_count_greater_than(logprobs, chosen_logprobs)

    # Put the actual token in front of the top-k results. Indices are
    # stored as int32 to reduce the tensor size.
    merged_indices = torch.cat((token_column, best_indices),
                               dim=1).to(torch.int32)
    merged_logprobs = torch.cat((chosen_logprobs, best_logprobs), dim=1)

    return LogprobsTensors(merged_indices, merged_logprobs, ranks)
|
||||
|
||||
|
||||
# Monkey-patch: rebind Sampler.gather_logprobs to the gather_logprobs function
# defined in this module, so the rank computation goes through this module's
# batched_count_greater_than.
Sampler.gather_logprobs = gather_logprobs
|
||||
@@ -45,8 +45,9 @@ from vllm.logger import logger
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
|
||||
is_pooling_model)
|
||||
from vllm.model_executor.models.interfaces import supports_transcription
|
||||
from vllm.model_executor.models.interfaces_base import (
|
||||
VllmModelForPooling, is_pooling_model, is_text_generation_model)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.multimodal.utils import group_mm_inputs_by_modality
|
||||
@@ -66,7 +67,7 @@ from vllm.v1.sample.sampler import Sampler
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
|
||||
from vllm.v1.worker.utils import (gather_mm_placeholders,
|
||||
from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders,
|
||||
sanity_check_mm_encoder_outputs,
|
||||
scatter_mm_placeholders)
|
||||
|
||||
@@ -88,15 +89,8 @@ from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
||||
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
if vllm_version_is("0.9.2"):
|
||||
from vllm.model_executor.models.interfaces import has_step_pooler
|
||||
from vllm.v1.utils import bind_kv_cache
|
||||
else:
|
||||
from vllm.model_executor.models.interfaces import supports_transcription
|
||||
from vllm.model_executor.models.interfaces_base import \
|
||||
is_text_generation_model
|
||||
if not vllm_version_is("0.10.0"):
|
||||
from vllm.tasks import GenerationTask, SupportedTask
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
@@ -409,7 +403,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
generator = None
|
||||
|
||||
if not vllm_version_is("0.9.2") and pooling_params:
|
||||
if pooling_params:
|
||||
assert (task := pooling_params.task) is not None, (
|
||||
"You did not set `task` in the API")
|
||||
model = cast(VllmModelForPooling, self.model)
|
||||
@@ -585,10 +579,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# OPTIMIZATION: Start copying the block table first.
|
||||
# This way, we can overlap the copy with the following CPU operations.
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.input_batch.block_table.commit(num_reqs)
|
||||
else:
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
|
||||
# Get the number of scheduled tokens for each request.
|
||||
req_ids = self.input_batch.req_ids
|
||||
@@ -939,10 +930,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# OPTIMIZATION: Start copying the block table first.
|
||||
# This way, we can overlap the copy with the following CPU operations.
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.input_batch.block_table.commit(num_reqs)
|
||||
else:
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
|
||||
# Get the number of scheduled tokens for each request.
|
||||
# TODO: The Python loop can be slow. Optimize.
|
||||
@@ -1771,57 +1759,33 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
req_num_tokens = num_tokens // num_reqs
|
||||
|
||||
if vllm_version_is("0.9.2"):
|
||||
dummy_metadata = PoolingMetadata(
|
||||
prompt_lens=torch.tensor(
|
||||
[h.shape[0] for h in hidden_states_list],
|
||||
device=self.device),
|
||||
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.device),
|
||||
pooling_params=[PoolingParams()] * num_reqs)
|
||||
try:
|
||||
pooler_output = self.model.pooler(
|
||||
hidden_states=hidden_states_list,
|
||||
pooling_metadata=dummy_metadata)
|
||||
except RuntimeError as e:
|
||||
if 'out of memory' in str(e):
|
||||
raise RuntimeError(
|
||||
"NPU out of memory occurred when warming up pooler with "
|
||||
f"{num_reqs} dummy requests. Please try lowering "
|
||||
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||
"initializing the engine.") from e
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
model = cast(VllmModelForPooling, self.model)
|
||||
dummy_task = self.get_supported_pooling_tasks()[0]
|
||||
dummy_pooling_params = PoolingParams(task=dummy_task)
|
||||
model = cast(VllmModelForPooling, self.model)
|
||||
dummy_task = self.get_supported_pooling_tasks()[0]
|
||||
dummy_pooling_params = PoolingParams(task=dummy_task)
|
||||
|
||||
to_update = model.pooler.get_pooling_updates(dummy_task)
|
||||
to_update.apply(dummy_pooling_params)
|
||||
to_update = model.pooler.get_pooling_updates(dummy_task)
|
||||
to_update.apply(dummy_pooling_params)
|
||||
|
||||
dummy_metadata = PoolingMetadata(
|
||||
prompt_lens=torch.tensor(
|
||||
[h.shape[0] for h in hidden_states_list],
|
||||
device=self.device),
|
||||
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.device),
|
||||
pooling_params=[dummy_pooling_params] * num_reqs)
|
||||
dummy_metadata = PoolingMetadata(
|
||||
prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
|
||||
device=self.device),
|
||||
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.device),
|
||||
pooling_params=[dummy_pooling_params] * num_reqs)
|
||||
|
||||
try:
|
||||
pooler_output = model.pooler(hidden_states=hidden_states_list,
|
||||
pooling_metadata=dummy_metadata)
|
||||
except RuntimeError as e:
|
||||
if 'out of memory' in str(e):
|
||||
raise RuntimeError(
|
||||
"NPU out of memory occurred when warming up pooler with "
|
||||
f"{num_reqs} dummy requests. Please try lowering "
|
||||
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||
"initializing the engine.") from e
|
||||
else:
|
||||
raise e
|
||||
try:
|
||||
pooler_output = model.pooler(hidden_states=hidden_states_list,
|
||||
pooling_metadata=dummy_metadata)
|
||||
except RuntimeError as e:
|
||||
if 'out of memory' in str(e):
|
||||
raise RuntimeError(
|
||||
"NPU out of memory occurred when warming up pooler with "
|
||||
f"{num_reqs} dummy requests. Please try lowering "
|
||||
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||
"initializing the engine.") from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
return pooler_output
|
||||
|
||||
@@ -1841,9 +1805,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
QKVParallelLinear, RowParallelLinear)):
|
||||
module.weight.data = torch_npu.npu_format_cast(
|
||||
module.weight.data, ACL_FORMAT_FRACTAL_NZ)
|
||||
|
||||
if vllm_version_is("0.9.2") and has_step_pooler(self.model):
|
||||
self.input_batch.logits_processing_needs_token_ids_bool = True
|
||||
if self.drafter:
|
||||
logger.info("Loading drafter model...")
|
||||
if isinstance(self.drafter, EagleProposer):
|
||||
|
||||
@@ -35,8 +35,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
|
||||
from vllm.v1.utils import copy_slice
|
||||
from vllm.v1.worker.block_table import MultiGroupBlockTable
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
_SAMPLING_EPS = 1e-5
|
||||
|
||||
|
||||
@@ -246,11 +244,8 @@ class InputBatch:
|
||||
|
||||
# req_index -> bad_words_token_ids
|
||||
self.bad_words_token_ids: dict[int, list[list[int]]] = {}
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.logits_processing_needs_token_ids_bool = False
|
||||
else:
|
||||
self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
|
||||
dtype=bool)
|
||||
self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
|
||||
dtype=bool)
|
||||
|
||||
self.req_output_token_ids: list[Optional[list[int]]] = []
|
||||
|
||||
@@ -387,9 +382,6 @@ class InputBatch:
|
||||
if sampling_params.bad_words_token_ids:
|
||||
self.bad_words_token_ids[
|
||||
req_index] = sampling_params.bad_words_token_ids
|
||||
elif vllm_version_is("0.9.2"):
|
||||
assert request.pooling_params is not None
|
||||
self.pooling_params[req_id] = request.pooling_params
|
||||
elif pooling_params := request.pooling_params:
|
||||
self.pooling_params[req_id] = pooling_params
|
||||
self.logits_processing_needs_token_ids[req_index] = (
|
||||
@@ -624,15 +616,10 @@ class InputBatch:
|
||||
self.presence_penalties, num_reqs)
|
||||
copy_slice(self.repetition_penalties_cpu_tensor,
|
||||
self.repetition_penalties, num_reqs)
|
||||
if vllm_version_is("0.9.2"):
|
||||
needs_prompt_token_ids = (
|
||||
not self.no_penalties
|
||||
or (self.num_reqs > 0
|
||||
and self.logits_processing_needs_token_ids_bool))
|
||||
else:
|
||||
needs_prompt_token_ids = (
|
||||
not self.no_penalties
|
||||
or self.logits_processing_needs_token_ids[:num_reqs].any())
|
||||
|
||||
needs_prompt_token_ids = (
|
||||
not self.no_penalties
|
||||
or self.logits_processing_needs_token_ids[:num_reqs].any())
|
||||
if needs_prompt_token_ids:
|
||||
# The prompt tokens are used only for applying penalties or
|
||||
# step pooling during the sampling/pooling process.
|
||||
|
||||
@@ -45,7 +45,7 @@ from vllm_ascend.utils import (sleep_mode_enabled, try_register_lib,
|
||||
vllm_version_is)
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
if not vllm_version_is("0.9.2"):
|
||||
if not vllm_version_is("0.10.0"):
|
||||
from vllm.tasks import SupportedTask
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user