diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index cb6608fa..04f2aa72 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
         description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
       vllm_version:
         required: false
-        default: "v0.12.0"
+        default: "v0.13.0"
         type: string
         description: vllm version to use
       vllm_ascend_remote_url:
diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml
index 8fca3bad..57003ef9 100644
--- a/.github/workflows/nightly_test_a2.yaml
+++ b/.github/workflows/nightly_test_a2.yaml
@@ -60,7 +60,7 @@ jobs:
             tests: tests/e2e/nightly/ops
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: v0.12.0
+      vllm: v0.13.0
       runner: ${{ matrix.test_config.os }}
       tests: ${{ matrix.test_config.tests }}
       name: ${{ matrix.test_config.name }}
@@ -128,7 +128,7 @@ jobs:
             - Qwen3-VL-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: v0.12.0
+      vllm: v0.13.0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
diff --git a/.github/workflows/nightly_test_a3.yaml b/.github/workflows/nightly_test_a3.yaml
index 916a8b36..3038b541 100644
--- a/.github/workflows/nightly_test_a3.yaml
+++ b/.github/workflows/nightly_test_a3.yaml
@@ -136,7 +136,7 @@ jobs:
       #       tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: v0.12.0
+      vllm: v0.13.0
       runner: ${{ matrix.test_config.os }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
       tests: ${{ matrix.test_config.tests }}
@@ -156,7 +156,7 @@ jobs:
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
       runner: ${{ matrix.test_config.os }}
-      vllm: v0.12.0
+      vllm: v0.13.0
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
       tests: ${{ matrix.test_config.tests }}
       name: ${{ matrix.test_config.name }}
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 3430699a..e747b5ac 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -74,7 +74,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [releases/v0.13.0, v0.12.0]
+        vllm_version: [v0.13.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 50214570..e1aeed2e 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: releases/v0.13.0
+      vllm: v0.13.0
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [releases/v0.13.0, v0.12.0]
+        vllm_version: [v0.13.0]

     steps:
       - name: Free up disk space
@@ -154,7 +154,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [releases/v0.13.0, v0.12.0]
+        vllm_version: [v0.13.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml
index 1ba071da..60690ebe 100644
--- a/.github/workflows/schedule_test_benchmarks.yaml
+++ b/.github/workflows/schedule_test_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.12.0
+          - vllm_branch: v0.13.0
             vllm_ascend_branch: main
       max-parallel: 1
     container:
diff --git a/Dockerfile b/Dockerfile
index 11f38018..b2c0db4d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.310p b/Dockerfile.310p
index 9f9072bd..9ca36ad1 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index f48a3e87..b7758b8c 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index 73187eea..68c0c6b4 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -47,7 +47,7 @@ RUN apt-get update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index 6ec647cb..4edc89a5 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 8a0534dd..f5acbcf4 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 554b598f..02eb1d2d 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -50,7 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | releases/v0.13.0, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 63e1986d..0c636fe4 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,7 +77,7 @@ myst_substitutions = {
     # CANN image tag
     'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
     # vllm version in ci
-    'ci_vllm_version': 'v0.12.0',
+    'ci_vllm_version': 'release/v0.13.0',
 }

 # For cross-file header anchors
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 1ad608b9..d7a5ecba 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -3,6 +3,7 @@ from unittest.mock import MagicMock, patch

 import pytest
 import torch
+from vllm.attention.selector import AttentionSelectorConfig
 from vllm.config.compilation import CompilationMode, CUDAGraphMode
 from vllm.platforms import PlatformEnum

@@ -484,28 +485,30 @@ class TestNPUPlatform(TestBase):
         self.assertEqual(vllm_config.compilation_config.custom_ops, [])

     def test_get_attn_backend_cls_use_v1_and_mla(self):
-        result = self.platform.get_attn_backend_cls(
-            selected_backend="ascend",
-            head_size=64,
-            dtype="float16",
-            kv_cache_dtype="float16",
-            block_size=64,
-            use_sparse=False,
+        attn_selector_config = AttentionSelectorConfig(
+            dtype=torch.float16,
+            head_size=0,
+            kv_cache_dtype=None,
+            block_size=128,
             use_mla=True,
+            use_sparse=False,
         )
+        result = self.platform.get_attn_backend_cls("ascend",
+                                                     attn_selector_config)
         self.assertEqual(result,
                          "vllm_ascend.attention.mla_v1.AscendMLABackend")

     def test_get_attn_backend_cls_use_v1_only(self):
-        result = self.platform.get_attn_backend_cls(
-            selected_backend="ascend",
-            head_size=64,
-            dtype="float16",
-            kv_cache_dtype="float16",
-            block_size=64,
-            use_sparse=False,
+        attn_selector_config = AttentionSelectorConfig(
+            dtype=torch.float16,
+            head_size=0,
+            kv_cache_dtype=None,
+            block_size=128,
             use_mla=False,
+            use_sparse=False,
         )
+        result = self.platform.get_attn_backend_cls("ascend",
+                                                     attn_selector_config)
         self.assertEqual(
             result,
             "vllm_ascend.attention.attention_v1.AscendAttentionBackend")
diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 2a331ed8..9d913f63 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -274,15 +274,6 @@ class AscendFusedMoE(FusedMoE):
     def update_expert_map(self, new_expert_map):
         self._expert_map = new_expert_map

-    @property
-    def expert_map(self) -> torch.Tensor | None:
-        return self._expert_map
-
-    @expert_map.setter
-    def expert_map(self, new_expert_map):
-        # TODO(Potabk): Remove this once we drop vllm v0.12.0(This makes backward compatibility with vllm v0.12.0)
-        self._expert_map = new_expert_map
-
     def get_log2phy_map(self):
         return self.log2phy

diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index 0dff139f..26c4dc86 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,15 +17,10 @@
 import os

 import vllm_ascend.patch.platform.patch_distributed  # noqa
+import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
-from vllm_ascend.utils import vllm_version_is

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
         "EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
-
-if vllm_version_is("0.12.0"):
-    import vllm_ascend.patch.platform.patch_ec_connector012  # noqa
-else:
-    import vllm_ascend.patch.platform.patch_ec_connector  # noqa
diff --git a/vllm_ascend/patch/platform/patch_ec_connector012.py b/vllm_ascend/patch/platform/patch_ec_connector012.py
deleted file mode 100644
index f0015738..00000000
--- a/vllm_ascend/patch/platform/patch_ec_connector012.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector  # type: ignore[import-not-found] # noqa
-from safetensors.torch import load_file
-from vllm.distributed.ec_transfer.ec_connector.base import \
-    ECConnectorMetadata  # type: ignore[import-not-found] # noqa
-from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (  # type: ignore[import-not-found] # noqa
-    ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
-from vllm.logger import logger
-
-
-class AscendECSharedStorageConnector(ECSharedStorageConnector):
-
-    def start_load_caches(self, encoder_cache, **kwargs) -> None:
-        metadata: ECConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
-        assert encoder_cache is not None
-        if metadata is None:
-            logger.warning((
-                "In connector.start_load_caches, ",
-                "but the connector metadata is None",
-            ))
-            return
-        # Load the EC for each mm data
-        for mm_data in metadata.mm_datas:
-            if mm_data.mm_hash in encoder_cache:
-                continue
-            filename = self._generate_filename_debug(mm_data.mm_hash)
-            ec_cache = load_file(filename)["ec_cache"].npu()
-            encoder_cache[mm_data.mm_hash] = ec_cache
-            logger.debug("Success load encoder cache for hash %s",
-                         mm_data.mm_hash)
-
-
-vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 12545460..2a70932d 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -351,22 +351,16 @@ class NPUPlatform(Platform):
             CUSTOM_OP_REGISTERED = True

     @classmethod
-    def get_attn_backend_cls(cls, selected_backend, *args, **kwargs):
-        if "attn_selector_config" in kwargs:
-            use_mla = kwargs["attn_selector_config"].use_mla
-            use_sparse = kwargs["attn_selector_config"].use_sparse
-        else:
-            use_mla = kwargs.get("use_mla",
-                                 args[4] if len(args) >= 5 else None)
-            use_sparse = kwargs.get("use_sparse",
-                                    args[6] if len(args) >= 7 else None)
+    def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
         backend_map = {
             (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
             (False, False):
             "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
             (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
         }
-        return backend_map[(use_mla, use_sparse)]
+
+        return backend_map[(attn_selector_config.use_mla,
+                            attn_selector_config.use_sparse)]

     @classmethod
     def get_punica_wrapper(cls) -> str:
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 38cfcd0c..19e8a310 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -116,8 +116,7 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
 from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_moe_model,
-                               lmhead_tp_enable, maybe_trans_nz,
-                               vllm_version_is)
+                               lmhead_tp_enable, maybe_trans_nz)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch

 from vllm_ascend.ascend_forward_context import (  # isort: skip
@@ -243,24 +242,15 @@ class NPUModelRunner(GPUModelRunner):
         # Set up Attention
         self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
                                   "index_topk")
-        if vllm_version_is('0.12.0'):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None
-                and self.model_config.is_mm_prefix_lm)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sparse=self.use_sparse,
+            use_mm_prefix=self.model_config is not None
+            and self.model_config.is_mm_prefix_lm)
         self.attn_mask_builder = AttentionMaskBuilder(self.device)

         self._set_up_drafter()
@@ -1877,36 +1867,19 @@ class NPUModelRunner(GPUModelRunner):
                     self.speculative_config.method == "mtp":
                 attn_state = AscendAttentionState.SpecDecoding

-            if vllm_version_is("0.12.0"):
-                common_metadata = CommonAttentionMetadata(
-                    query_start_loc=self.query_start_loc.gpu[:num_reqs +
+            common_metadata = CommonAttentionMetadata(
+                query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
+                query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
                                                              1],
-                    query_start_loc_cpu=self.query_start_loc.
-                    cpu[:num_reqs + 1],
-                    seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    seq_lens=self.seq_lens.cpu[:num_reqs],
-                    num_reqs=num_reqs,
-                    num_actual_tokens=num_tokens,
-                    block_table_tensor=block_table_tensor[:num_reqs],
-                    slot_mapping=slot_mapping.gpu,
-                    num_computed_tokens_cpu=num_computed_tokens_cpu,
-                    max_query_len=max_query_len,
-                    max_seq_len=seq_lens)
-            else:
-                common_metadata = CommonAttentionMetadata(
-                    query_start_loc=self.query_start_loc.gpu[:num_reqs +
-                                                             1],
-                    query_start_loc_cpu=self.query_start_loc.
-                    cpu[:num_reqs + 1],
-                    _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    seq_lens=self.seq_lens.cpu[:num_reqs],
-                    num_reqs=num_reqs,
-                    num_actual_tokens=num_tokens,
-                    block_table_tensor=block_table_tensor[:num_reqs],
-                    slot_mapping=slot_mapping.gpu,
-                    _num_computed_tokens_cpu=num_computed_tokens_cpu,
-                    max_query_len=max_query_len,
-                    max_seq_len=seq_lens)
+                _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
+                seq_lens=self.seq_lens.cpu[:num_reqs],
+                num_reqs=num_reqs,
+                num_actual_tokens=num_tokens,
+                block_table_tensor=block_table_tensor[:num_reqs],
+                slot_mapping=slot_mapping.gpu,
+                _num_computed_tokens_cpu=num_computed_tokens_cpu,
+                max_query_len=max_query_len,
+                max_seq_len=seq_lens)

             for attn_group in self.attn_groups[kv_cache_group_id]:
                 builder = attn_group.get_metadata_builder()
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index 846c0d83..2777ea9f 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -22,6 +22,7 @@ import torch
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
 from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.pool.metadata import PoolingStates
 from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
                                              LogitsProcessors)
 from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -29,16 +30,6 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.worker.block_table import MultiGroupBlockTable


-class PoolingStates:
-    # NOTE: This should be removed after we drop support of vLLM v0.12.0
-    def __init__(self):
-        # for chunked prefill with ALL pooling
-        self.hidden_states_cache: list[torch.Tensor] = []
-
-    def clean(self):
-        self.hidden_states_cache.clear()
-
-
 class NPUInputBatch(InputBatch):

     def __init__(
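
For reviewers, a minimal usage sketch (not part of the patch) of the attention-backend selection path this diff migrates to, mirroring the rewritten tests in tests/ut/test_platform.py. It assumes vLLM v0.13.0 exposes AttentionSelectorConfig from vllm.attention.selector with the keyword fields used in the diff (dtype, head_size, kv_cache_dtype, block_size, use_mla, use_sparse); treat that constructor signature as an assumption taken from the test code rather than a documented API.

# Hypothetical sketch: resolve the Ascend backend via the new
# AttentionSelectorConfig-based signature introduced by this change.
import torch
from vllm.attention.selector import AttentionSelectorConfig

from vllm_ascend.platform import NPUPlatform

selector_config = AttentionSelectorConfig(
    dtype=torch.float16,
    head_size=0,
    kv_cache_dtype=None,
    block_size=128,
    use_mla=True,      # (use_mla, use_sparse) indexes backend_map in platform.py:
    use_sparse=False,  # (True, False) -> AscendMLABackend
)

# New two-argument call: (selected_backend, attn_selector_config).
backend_cls = NPUPlatform.get_attn_backend_cls("ascend", selector_config)
print(backend_cls)  # "vllm_ascend.attention.mla_v1.AscendMLABackend"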