Drop vLLM v0.12.0 support (#5146)

We have decided to release v0.13.0 soon, so there is no longer any need to
support vLLM v0.12.0. Let's drop it.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-12-20 09:38:53 +08:00
committed by GitHub
parent 243ab7d720
commit 758d81dcb1
21 changed files with 63 additions and 149 deletions

View File

@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version: vllm_version:
required: false required: false
default: "v0.12.0" default: "v0.13.0"
type: string type: string
description: vllm version to use description: vllm version to use
vllm_ascend_remote_url: vllm_ascend_remote_url:

View File

@@ -60,7 +60,7 @@ jobs:
tests: tests/e2e/nightly/ops tests: tests/e2e/nightly/ops
uses: ./.github/workflows/_e2e_nightly_single_node.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with: with:
vllm: v0.12.0 vllm: v0.13.0
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }} name: ${{ matrix.test_config.name }}
@@ -128,7 +128,7 @@ jobs:
- Qwen3-VL-30B-A3B-Instruct - Qwen3-VL-30B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with: with:
vllm: v0.12.0 vllm: v0.13.0
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }} model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'

View File

@@ -136,7 +136,7 @@ jobs:
# tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py # tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with: with:
vllm: v0.12.0 vllm: v0.13.0
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}
@@ -156,7 +156,7 @@ jobs:
uses: ./.github/workflows/_e2e_nightly_single_node.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with: with:
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
vllm: v0.12.0 vllm: v0.13.0
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }} name: ${{ matrix.test_config.name }}

View File

@@ -74,7 +74,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [releases/v0.13.0, v0.12.0] vllm_version: [v0.13.0]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -42,7 +42,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/_pre_commit.yml uses: ./.github/workflows/_pre_commit.yml
with: with:
vllm: releases/v0.13.0 vllm: v0.13.0
changes: changes:
runs-on: linux-aarch64-a2-0 runs-on: linux-aarch64-a2-0
outputs: outputs:
@@ -90,7 +90,7 @@ jobs:
SOC_VERSION: ascend910b1 SOC_VERSION: ascend910b1
strategy: strategy:
matrix: matrix:
vllm_version: [releases/v0.13.0, v0.12.0] vllm_version: [v0.13.0]
steps: steps:
- name: Free up disk space - name: Free up disk space
@@ -154,7 +154,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [releases/v0.13.0, v0.12.0] vllm_version: [v0.13.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -51,7 +51,7 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- vllm_branch: v0.12.0 - vllm_branch: v0.13.0
vllm_ascend_branch: main vllm_ascend_branch: main
max-parallel: 1 max-parallel: 1
container: container:

View File

@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.12.0 ARG VLLM_TAG=v0.13.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.12.0 ARG VLLM_TAG=v0.13.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.12.0 ARG VLLM_TAG=v0.13.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -47,7 +47,7 @@ RUN apt-get update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.12.0 ARG VLLM_TAG=v0.13.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.12.0 ARG VLLM_TAG=v0.13.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.12.0 ARG VLLM_TAG=v0.13.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly. For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------| |-------------|--------------|------------------|-------------|--------------------|
| main | releases/v0.13.0, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | main | v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
## Release cadence ## Release cadence

View File

@@ -77,7 +77,7 @@ myst_substitutions = {
# CANN image tag # CANN image tag
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11", 'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
# vllm version in ci # vllm version in ci
'ci_vllm_version': 'v0.12.0', 'ci_vllm_version': 'release/v0.13.0',
} }
# For cross-file header anchors # For cross-file header anchors

View File

@@ -3,6 +3,7 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
import torch import torch
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config.compilation import CompilationMode, CUDAGraphMode from vllm.config.compilation import CompilationMode, CUDAGraphMode
from vllm.platforms import PlatformEnum from vllm.platforms import PlatformEnum
@@ -484,28 +485,30 @@ class TestNPUPlatform(TestBase):
self.assertEqual(vllm_config.compilation_config.custom_ops, []) self.assertEqual(vllm_config.compilation_config.custom_ops, [])
def test_get_attn_backend_cls_use_v1_and_mla(self): def test_get_attn_backend_cls_use_v1_and_mla(self):
result = self.platform.get_attn_backend_cls( attn_selector_config = AttentionSelectorConfig(
selected_backend="ascend", dtype=torch.float16,
head_size=64, head_size=0,
dtype="float16", kv_cache_dtype=None,
kv_cache_dtype="float16", block_size=128,
block_size=64,
use_sparse=False,
use_mla=True, use_mla=True,
use_sparse=False,
) )
result = self.platform.get_attn_backend_cls("ascend",
attn_selector_config)
self.assertEqual(result, self.assertEqual(result,
"vllm_ascend.attention.mla_v1.AscendMLABackend") "vllm_ascend.attention.mla_v1.AscendMLABackend")
def test_get_attn_backend_cls_use_v1_only(self): def test_get_attn_backend_cls_use_v1_only(self):
result = self.platform.get_attn_backend_cls( attn_selector_config = AttentionSelectorConfig(
selected_backend="ascend", dtype=torch.float16,
head_size=64, head_size=0,
dtype="float16", kv_cache_dtype=None,
kv_cache_dtype="float16", block_size=128,
block_size=64,
use_sparse=False,
use_mla=False, use_mla=False,
use_sparse=False,
) )
result = self.platform.get_attn_backend_cls("ascend",
attn_selector_config)
self.assertEqual( self.assertEqual(
result, result,
"vllm_ascend.attention.attention_v1.AscendAttentionBackend") "vllm_ascend.attention.attention_v1.AscendAttentionBackend")

View File

@@ -274,15 +274,6 @@ class AscendFusedMoE(FusedMoE):
def update_expert_map(self, new_expert_map): def update_expert_map(self, new_expert_map):
self._expert_map = new_expert_map self._expert_map = new_expert_map
@property
def expert_map(self) -> torch.Tensor | None:
return self._expert_map
@expert_map.setter
def expert_map(self, new_expert_map):
# TODO(Potabk): Remove this once we drop vllm v0.12.0(This makes backward compatibility with vllm v0.12.0)
self._expert_map = new_expert_map
def get_log2phy_map(self): def get_log2phy_map(self):
return self.log2phy return self.log2phy

View File

@@ -17,15 +17,10 @@
import os import os
import vllm_ascend.patch.platform.patch_distributed # noqa import vllm_ascend.patch.platform.patch_distributed # noqa
import vllm_ascend.patch.platform.patch_ec_connector # noqa
import vllm_ascend.patch.platform.patch_mamba_config # noqa import vllm_ascend.patch.platform.patch_mamba_config # noqa
import vllm_ascend.patch.platform.patch_sched_yield # noqa import vllm_ascend.patch.platform.patch_sched_yield # noqa
from vllm_ascend.utils import vllm_version_is
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv( if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
"EXPERT_MAP_RECORD", "false") == "true": "EXPERT_MAP_RECORD", "false") == "true":
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
if vllm_version_is("0.12.0"):
import vllm_ascend.patch.platform.patch_ec_connector012 # noqa
else:
import vllm_ascend.patch.platform.patch_ec_connector # noqa

View File

@@ -1,33 +0,0 @@
import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector # type: ignore[import-not-found] # noqa
from safetensors.torch import load_file
from vllm.distributed.ec_transfer.ec_connector.base import \
ECConnectorMetadata # type: ignore[import-not-found] # noqa
from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import ( # type: ignore[import-not-found] # noqa
ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
from vllm.logger import logger
class AscendECSharedStorageConnector(ECSharedStorageConnector):
def start_load_caches(self, encoder_cache, **kwargs) -> None:
metadata: ECConnectorMetadata = self._get_connector_metadata()
assert isinstance(metadata, ECSharedStorageConnectorMetadata)
assert encoder_cache is not None
if metadata is None:
logger.warning((
"In connector.start_load_caches, ",
"but the connector metadata is None",
))
return
# Load the EC for each mm data
for mm_data in metadata.mm_datas:
if mm_data.mm_hash in encoder_cache:
continue
filename = self._generate_filename_debug(mm_data.mm_hash)
ec_cache = load_file(filename)["ec_cache"].npu()
encoder_cache[mm_data.mm_hash] = ec_cache
logger.debug("Success load encoder cache for hash %s",
mm_data.mm_hash)
vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector

View File

@@ -351,22 +351,16 @@ class NPUPlatform(Platform):
CUSTOM_OP_REGISTERED = True CUSTOM_OP_REGISTERED = True
@classmethod @classmethod
def get_attn_backend_cls(cls, selected_backend, *args, **kwargs): def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
if "attn_selector_config" in kwargs:
use_mla = kwargs["attn_selector_config"].use_mla
use_sparse = kwargs["attn_selector_config"].use_sparse
else:
use_mla = kwargs.get("use_mla",
args[4] if len(args) >= 5 else None)
use_sparse = kwargs.get("use_sparse",
args[6] if len(args) >= 7 else None)
backend_map = { backend_map = {
(True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend", (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
(False, False): (False, False):
"vllm_ascend.attention.attention_v1.AscendAttentionBackend", "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
(True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend", (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
} }
return backend_map[(use_mla, use_sparse)]
return backend_map[(attn_selector_config.use_mla,
attn_selector_config.use_sparse)]
@classmethod @classmethod
def get_punica_wrapper(cls) -> str: def get_punica_wrapper(cls) -> str:

View File

@@ -116,8 +116,7 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration, from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
enable_sp, get_ascend_device_type, is_moe_model, enable_sp, get_ascend_device_type, is_moe_model,
lmhead_tp_enable, maybe_trans_nz, lmhead_tp_enable, maybe_trans_nz)
vllm_version_is)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.npu_input_batch import NPUInputBatch
from vllm_ascend.ascend_forward_context import ( # isort: skip from vllm_ascend.ascend_forward_context import ( # isort: skip
@@ -243,24 +242,15 @@ class NPUModelRunner(GPUModelRunner):
# Set up Attention # Set up Attention
self.use_sparse = hasattr(self.vllm_config.model_config.hf_config, self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
"index_topk") "index_topk")
if vllm_version_is('0.12.0'): self.attn_backend = get_attn_backend(
self.attn_backend = get_attn_backend( 0,
0, self.dtype,
self.dtype, None,
None, self.block_size,
self.block_size, use_mla=self.model_config.use_mla,
use_mla=self.model_config.use_mla, use_sparse=self.use_sparse,
use_sparse=self.use_sparse) use_mm_prefix=self.model_config is not None
else: and self.model_config.is_mm_prefix_lm)
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
use_sparse=self.use_sparse,
use_mm_prefix=self.model_config is not None
and self.model_config.is_mm_prefix_lm)
self.attn_mask_builder = AttentionMaskBuilder(self.device) self.attn_mask_builder = AttentionMaskBuilder(self.device)
self._set_up_drafter() self._set_up_drafter()
@@ -1877,36 +1867,19 @@ class NPUModelRunner(GPUModelRunner):
self.speculative_config.method == "mtp": self.speculative_config.method == "mtp":
attn_state = AscendAttentionState.SpecDecoding attn_state = AscendAttentionState.SpecDecoding
if vllm_version_is("0.12.0"): common_metadata = CommonAttentionMetadata(
common_metadata = CommonAttentionMetadata( query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
query_start_loc=self.query_start_loc.gpu[:num_reqs + query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
1], 1],
query_start_loc_cpu=self.query_start_loc. _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
cpu[:num_reqs + 1], seq_lens=self.seq_lens.cpu[:num_reqs],
seq_lens_cpu=self.seq_lens.cpu[:num_reqs], num_reqs=num_reqs,
seq_lens=self.seq_lens.cpu[:num_reqs], num_actual_tokens=num_tokens,
num_reqs=num_reqs, block_table_tensor=block_table_tensor[:num_reqs],
num_actual_tokens=num_tokens, slot_mapping=slot_mapping.gpu,
block_table_tensor=block_table_tensor[:num_reqs], _num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping.gpu, max_query_len=max_query_len,
num_computed_tokens_cpu=num_computed_tokens_cpu, max_seq_len=seq_lens)
max_query_len=max_query_len,
max_seq_len=seq_lens)
else:
common_metadata = CommonAttentionMetadata(
query_start_loc=self.query_start_loc.gpu[:num_reqs +
1],
query_start_loc_cpu=self.query_start_loc.
cpu[:num_reqs + 1],
_seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
seq_lens=self.seq_lens.cpu[:num_reqs],
num_reqs=num_reqs,
num_actual_tokens=num_tokens,
block_table_tensor=block_table_tensor[:num_reqs],
slot_mapping=slot_mapping.gpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
max_query_len=max_query_len,
max_seq_len=seq_lens)
for attn_group in self.attn_groups[kv_cache_group_id]: for attn_group in self.attn_groups[kv_cache_group_id]:
builder = attn_group.get_metadata_builder() builder = attn_group.get_metadata_builder()

View File

@@ -22,6 +22,7 @@ import torch
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.v1.outputs import LogprobsTensors from vllm.v1.outputs import LogprobsTensors
from vllm.v1.pool.metadata import PoolingStates
from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
LogitsProcessors) LogitsProcessors)
from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -29,16 +30,6 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm_ascend.worker.block_table import MultiGroupBlockTable from vllm_ascend.worker.block_table import MultiGroupBlockTable
class PoolingStates:
# NOTE: This should be removed after we drop support of vLLM v0.12.0
def __init__(self):
# for chunked prefill with ALL pooling
self.hidden_states_cache: list[torch.Tensor] = []
def clean(self):
self.hidden_states_cache.clear()
class NPUInputBatch(InputBatch): class NPUInputBatch(InputBatch):
def __init__( def __init__(