diff --git a/vllm_ascend/pool/__init__.py b/vllm_ascend/pool/__init__.py deleted file mode 100644 index 146a786..0000000 --- a/vllm_ascend/pool/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# diff --git a/vllm_ascend/pool/metadata.py b/vllm_ascend/pool/metadata.py deleted file mode 100644 index 6dca038..0000000 --- a/vllm_ascend/pool/metadata.py +++ /dev/null @@ -1,32 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/vllm/v1/pool/metadata.py -# -from dataclasses import dataclass -from typing import Optional - -import torch -from vllm.pooling_params import PoolingParams - - -@dataclass -class PoolingMetadata: - """Tensors for pooling.""" - - prompt_lens: torch.Tensor - prompt_token_ids: Optional[torch.Tensor] - pooling_params: list[PoolingParams] diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d8ee57a..4be7f47 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -59,6 +59,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) +from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler from vllm.v1.spec_decode.metadata import SpecDecodeMetadata @@ -76,7 +77,6 @@ from vllm_ascend.attention.attention_v1_torchair import AscendTorchairMetadata from vllm_ascend.attention.mla_v1 import (AscendMLAMetadata, CommonAttentionMetadata) from vllm_ascend.platform import NPUPlatform -from vllm_ascend.pool.metadata import PoolingMetadata from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, ProfileExecuteDuration, @@ -571,7 +571,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. - self.input_batch.block_table.commit(num_reqs) + if vllm_version_is("0.9.2"): + self.input_batch.block_table.commit(num_reqs) + else: + self.input_batch.block_table.commit_block_table(num_reqs) # Get the number of scheduled tokens for each request. req_ids = self.input_batch.req_ids @@ -902,7 +905,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. - self.input_batch.block_table.commit(num_reqs) + if vllm_version_is("0.9.2"): + self.input_batch.block_table.commit(num_reqs) + else: + self.input_batch.block_table.commit_block_table(num_reqs) # Get the number of scheduled tokens for each request. # TODO: The Python loop can be slow. Optimize. diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index cb5b264..40c1043 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -28,14 +28,13 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors +from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import init_builtin_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.utils import copy_slice from vllm.v1.worker.block_table import MultiGroupBlockTable -from vllm_ascend.pool.metadata import PoolingMetadata - _SAMPLING_EPS = 1e-5