Drop 0.12.0 support (#5146)
We have decided to release v0.13.0 soon, so there is no longer any need to support v0.12.0.
Let's drop it.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -32,7 +32,7 @@ on:
|
|||||||
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
||||||
vllm_version:
|
vllm_version:
|
||||||
required: false
|
required: false
|
||||||
default: "v0.12.0"
|
default: "v0.13.0"
|
||||||
type: string
|
type: string
|
||||||
description: vllm version to use
|
description: vllm version to use
|
||||||
vllm_ascend_remote_url:
|
vllm_ascend_remote_url:
|
||||||
|
|||||||
4
.github/workflows/nightly_test_a2.yaml
vendored
4
.github/workflows/nightly_test_a2.yaml
vendored
@@ -60,7 +60,7 @@ jobs:
|
|||||||
tests: tests/e2e/nightly/ops
|
tests: tests/e2e/nightly/ops
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||||
with:
|
with:
|
||||||
vllm: v0.12.0
|
vllm: v0.13.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
name: ${{ matrix.test_config.name }}
|
name: ${{ matrix.test_config.name }}
|
||||||
@@ -128,7 +128,7 @@ jobs:
|
|||||||
- Qwen3-VL-30B-A3B-Instruct
|
- Qwen3-VL-30B-A3B-Instruct
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||||
with:
|
with:
|
||||||
vllm: v0.12.0
|
vllm: v0.13.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
||||||
|
|||||||
4
.github/workflows/nightly_test_a3.yaml
vendored
4
.github/workflows/nightly_test_a3.yaml
vendored
@@ -136,7 +136,7 @@ jobs:
|
|||||||
# tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
|
# tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||||
with:
|
with:
|
||||||
vllm: v0.12.0
|
vllm: v0.13.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
@@ -156,7 +156,7 @@ jobs:
|
|||||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||||
with:
|
with:
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
vllm: v0.12.0
|
vllm: v0.13.0
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
name: ${{ matrix.test_config.name }}
|
name: ${{ matrix.test_config.name }}
|
||||||
|
|||||||
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -74,7 +74,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [releases/v0.13.0, v0.12.0]
|
vllm_version: [v0.13.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
6
.github/workflows/pr_test_light.yaml
vendored
6
.github/workflows/pr_test_light.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/_pre_commit.yml
|
uses: ./.github/workflows/_pre_commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: releases/v0.13.0
|
vllm: v0.13.0
|
||||||
changes:
|
changes:
|
||||||
runs-on: linux-aarch64-a2-0
|
runs-on: linux-aarch64-a2-0
|
||||||
outputs:
|
outputs:
|
||||||
@@ -90,7 +90,7 @@ jobs:
|
|||||||
SOC_VERSION: ascend910b1
|
SOC_VERSION: ascend910b1
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [releases/v0.13.0, v0.12.0]
|
vllm_version: [v0.13.0]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Free up disk space
|
- name: Free up disk space
|
||||||
@@ -154,7 +154,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [releases/v0.13.0, v0.12.0]
|
vllm_version: [v0.13.0]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- vllm_branch: v0.12.0
|
- vllm_branch: v0.13.0
|
||||||
vllm_ascend_branch: main
|
vllm_ascend_branch: main
|
||||||
max-parallel: 1
|
max-parallel: 1
|
||||||
container:
|
container:
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.12.0
|
ARG VLLM_TAG=v0.13.0
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.12.0
|
ARG VLLM_TAG=v0.13.0
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.12.0
|
ARG VLLM_TAG=v0.13.0
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ RUN apt-get update -y && \
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.12.0
|
ARG VLLM_TAG=v0.13.0
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ RUN yum update -y && \
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.12.0
|
ARG VLLM_TAG=v0.13.0
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ RUN yum update -y && \
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.12.0
|
ARG VLLM_TAG=v0.13.0
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
|
|||||||
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is updated frequently, so check it regularly.
|
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is updated frequently, so check it regularly.
|
||||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||||
|-------------|--------------|------------------|-------------|--------------------|
|
|-------------|--------------|------------------|-------------|--------------------|
|
||||||
| main | releases/v0.13.0, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
| main | v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
||||||
|
|
||||||
## Release cadence
|
## Release cadence
|
||||||
|
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ myst_substitutions = {
|
|||||||
# CANN image tag
|
# CANN image tag
|
||||||
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
|
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
|
||||||
# vllm version in ci
|
# vllm version in ci
|
||||||
'ci_vllm_version': 'v0.12.0',
|
'ci_vllm_version': 'release/v0.13.0',
|
||||||
}
|
}
|
||||||
|
|
||||||
# For cross-file header anchors
|
# For cross-file header anchors
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ from unittest.mock import MagicMock, patch
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
from vllm.attention.selector import AttentionSelectorConfig
|
||||||
from vllm.config.compilation import CompilationMode, CUDAGraphMode
|
from vllm.config.compilation import CompilationMode, CUDAGraphMode
|
||||||
from vllm.platforms import PlatformEnum
|
from vllm.platforms import PlatformEnum
|
||||||
|
|
||||||
@@ -484,28 +485,30 @@ class TestNPUPlatform(TestBase):
|
|||||||
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
|
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
|
||||||
|
|
||||||
def test_get_attn_backend_cls_use_v1_and_mla(self):
|
def test_get_attn_backend_cls_use_v1_and_mla(self):
|
||||||
result = self.platform.get_attn_backend_cls(
|
attn_selector_config = AttentionSelectorConfig(
|
||||||
selected_backend="ascend",
|
dtype=torch.float16,
|
||||||
head_size=64,
|
head_size=0,
|
||||||
dtype="float16",
|
kv_cache_dtype=None,
|
||||||
kv_cache_dtype="float16",
|
block_size=128,
|
||||||
block_size=64,
|
|
||||||
use_sparse=False,
|
|
||||||
use_mla=True,
|
use_mla=True,
|
||||||
|
use_sparse=False,
|
||||||
)
|
)
|
||||||
|
result = self.platform.get_attn_backend_cls("ascend",
|
||||||
|
attn_selector_config)
|
||||||
self.assertEqual(result,
|
self.assertEqual(result,
|
||||||
"vllm_ascend.attention.mla_v1.AscendMLABackend")
|
"vllm_ascend.attention.mla_v1.AscendMLABackend")
|
||||||
|
|
||||||
def test_get_attn_backend_cls_use_v1_only(self):
|
def test_get_attn_backend_cls_use_v1_only(self):
|
||||||
result = self.platform.get_attn_backend_cls(
|
attn_selector_config = AttentionSelectorConfig(
|
||||||
selected_backend="ascend",
|
dtype=torch.float16,
|
||||||
head_size=64,
|
head_size=0,
|
||||||
dtype="float16",
|
kv_cache_dtype=None,
|
||||||
kv_cache_dtype="float16",
|
block_size=128,
|
||||||
block_size=64,
|
|
||||||
use_sparse=False,
|
|
||||||
use_mla=False,
|
use_mla=False,
|
||||||
|
use_sparse=False,
|
||||||
)
|
)
|
||||||
|
result = self.platform.get_attn_backend_cls("ascend",
|
||||||
|
attn_selector_config)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
result,
|
result,
|
||||||
"vllm_ascend.attention.attention_v1.AscendAttentionBackend")
|
"vllm_ascend.attention.attention_v1.AscendAttentionBackend")
|
||||||
|
|||||||
@@ -274,15 +274,6 @@ class AscendFusedMoE(FusedMoE):
|
|||||||
def update_expert_map(self, new_expert_map):
|
def update_expert_map(self, new_expert_map):
|
||||||
self._expert_map = new_expert_map
|
self._expert_map = new_expert_map
|
||||||
|
|
||||||
@property
|
|
||||||
def expert_map(self) -> torch.Tensor | None:
|
|
||||||
return self._expert_map
|
|
||||||
|
|
||||||
@expert_map.setter
|
|
||||||
def expert_map(self, new_expert_map):
|
|
||||||
# TODO(Potabk): Remove this once we drop vllm v0.12.0(This makes backward compatibility with vllm v0.12.0)
|
|
||||||
self._expert_map = new_expert_map
|
|
||||||
|
|
||||||
def get_log2phy_map(self):
|
def get_log2phy_map(self):
|
||||||
return self.log2phy
|
return self.log2phy
|
||||||
|
|
||||||
|
|||||||
@@ -17,15 +17,10 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
import vllm_ascend.patch.platform.patch_distributed # noqa
|
import vllm_ascend.patch.platform.patch_distributed # noqa
|
||||||
|
import vllm_ascend.patch.platform.patch_ec_connector # noqa
|
||||||
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
||||||
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
|
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
|
||||||
"EXPERT_MAP_RECORD", "false") == "true":
|
"EXPERT_MAP_RECORD", "false") == "true":
|
||||||
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
||||||
|
|
||||||
if vllm_version_is("0.12.0"):
|
|
||||||
import vllm_ascend.patch.platform.patch_ec_connector012 # noqa
|
|
||||||
else:
|
|
||||||
import vllm_ascend.patch.platform.patch_ec_connector # noqa
|
|
||||||
|
|||||||
@@ -1,33 +0,0 @@
|
|||||||
import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector # type: ignore[import-not-found] # noqa
|
|
||||||
from safetensors.torch import load_file
|
|
||||||
from vllm.distributed.ec_transfer.ec_connector.base import \
|
|
||||||
ECConnectorMetadata # type: ignore[import-not-found] # noqa
|
|
||||||
from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import ( # type: ignore[import-not-found] # noqa
|
|
||||||
ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
|
|
||||||
from vllm.logger import logger
|
|
||||||
|
|
||||||
|
|
||||||
class AscendECSharedStorageConnector(ECSharedStorageConnector):
|
|
||||||
|
|
||||||
def start_load_caches(self, encoder_cache, **kwargs) -> None:
|
|
||||||
metadata: ECConnectorMetadata = self._get_connector_metadata()
|
|
||||||
assert isinstance(metadata, ECSharedStorageConnectorMetadata)
|
|
||||||
assert encoder_cache is not None
|
|
||||||
if metadata is None:
|
|
||||||
logger.warning((
|
|
||||||
"In connector.start_load_caches, ",
|
|
||||||
"but the connector metadata is None",
|
|
||||||
))
|
|
||||||
return
|
|
||||||
# Load the EC for each mm data
|
|
||||||
for mm_data in metadata.mm_datas:
|
|
||||||
if mm_data.mm_hash in encoder_cache:
|
|
||||||
continue
|
|
||||||
filename = self._generate_filename_debug(mm_data.mm_hash)
|
|
||||||
ec_cache = load_file(filename)["ec_cache"].npu()
|
|
||||||
encoder_cache[mm_data.mm_hash] = ec_cache
|
|
||||||
logger.debug("Success load encoder cache for hash %s",
|
|
||||||
mm_data.mm_hash)
|
|
||||||
|
|
||||||
|
|
||||||
vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
|
|
||||||
@@ -351,22 +351,16 @@ class NPUPlatform(Platform):
|
|||||||
CUSTOM_OP_REGISTERED = True
|
CUSTOM_OP_REGISTERED = True
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_attn_backend_cls(cls, selected_backend, *args, **kwargs):
|
def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
|
||||||
if "attn_selector_config" in kwargs:
|
|
||||||
use_mla = kwargs["attn_selector_config"].use_mla
|
|
||||||
use_sparse = kwargs["attn_selector_config"].use_sparse
|
|
||||||
else:
|
|
||||||
use_mla = kwargs.get("use_mla",
|
|
||||||
args[4] if len(args) >= 5 else None)
|
|
||||||
use_sparse = kwargs.get("use_sparse",
|
|
||||||
args[6] if len(args) >= 7 else None)
|
|
||||||
backend_map = {
|
backend_map = {
|
||||||
(True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
|
(True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
|
||||||
(False, False):
|
(False, False):
|
||||||
"vllm_ascend.attention.attention_v1.AscendAttentionBackend",
|
"vllm_ascend.attention.attention_v1.AscendAttentionBackend",
|
||||||
(True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
|
(True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
|
||||||
}
|
}
|
||||||
return backend_map[(use_mla, use_sparse)]
|
|
||||||
|
return backend_map[(attn_selector_config.use_mla,
|
||||||
|
attn_selector_config.use_sparse)]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_punica_wrapper(cls) -> str:
|
def get_punica_wrapper(cls) -> str:
|
||||||
|
|||||||
@@ -116,8 +116,7 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
|
|||||||
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
|
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
|
||||||
from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
|
from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
|
||||||
enable_sp, get_ascend_device_type, is_moe_model,
|
enable_sp, get_ascend_device_type, is_moe_model,
|
||||||
lmhead_tp_enable, maybe_trans_nz,
|
lmhead_tp_enable, maybe_trans_nz)
|
||||||
vllm_version_is)
|
|
||||||
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
||||||
|
|
||||||
from vllm_ascend.ascend_forward_context import ( # isort: skip
|
from vllm_ascend.ascend_forward_context import ( # isort: skip
|
||||||
@@ -243,24 +242,15 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# Set up Attention
|
# Set up Attention
|
||||||
self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
|
self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
|
||||||
"index_topk")
|
"index_topk")
|
||||||
if vllm_version_is('0.12.0'):
|
self.attn_backend = get_attn_backend(
|
||||||
self.attn_backend = get_attn_backend(
|
0,
|
||||||
0,
|
self.dtype,
|
||||||
self.dtype,
|
None,
|
||||||
None,
|
self.block_size,
|
||||||
self.block_size,
|
use_mla=self.model_config.use_mla,
|
||||||
use_mla=self.model_config.use_mla,
|
use_sparse=self.use_sparse,
|
||||||
use_sparse=self.use_sparse)
|
use_mm_prefix=self.model_config is not None
|
||||||
else:
|
and self.model_config.is_mm_prefix_lm)
|
||||||
self.attn_backend = get_attn_backend(
|
|
||||||
0,
|
|
||||||
self.dtype,
|
|
||||||
None,
|
|
||||||
self.block_size,
|
|
||||||
use_mla=self.model_config.use_mla,
|
|
||||||
use_sparse=self.use_sparse,
|
|
||||||
use_mm_prefix=self.model_config is not None
|
|
||||||
and self.model_config.is_mm_prefix_lm)
|
|
||||||
self.attn_mask_builder = AttentionMaskBuilder(self.device)
|
self.attn_mask_builder = AttentionMaskBuilder(self.device)
|
||||||
|
|
||||||
self._set_up_drafter()
|
self._set_up_drafter()
|
||||||
@@ -1877,36 +1867,19 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.speculative_config.method == "mtp":
|
self.speculative_config.method == "mtp":
|
||||||
attn_state = AscendAttentionState.SpecDecoding
|
attn_state = AscendAttentionState.SpecDecoding
|
||||||
|
|
||||||
if vllm_version_is("0.12.0"):
|
common_metadata = CommonAttentionMetadata(
|
||||||
common_metadata = CommonAttentionMetadata(
|
query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
|
||||||
query_start_loc=self.query_start_loc.gpu[:num_reqs +
|
query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
|
||||||
1],
|
1],
|
||||||
query_start_loc_cpu=self.query_start_loc.
|
_seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
|
||||||
cpu[:num_reqs + 1],
|
seq_lens=self.seq_lens.cpu[:num_reqs],
|
||||||
seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
|
num_reqs=num_reqs,
|
||||||
seq_lens=self.seq_lens.cpu[:num_reqs],
|
num_actual_tokens=num_tokens,
|
||||||
num_reqs=num_reqs,
|
block_table_tensor=block_table_tensor[:num_reqs],
|
||||||
num_actual_tokens=num_tokens,
|
slot_mapping=slot_mapping.gpu,
|
||||||
block_table_tensor=block_table_tensor[:num_reqs],
|
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||||
slot_mapping=slot_mapping.gpu,
|
max_query_len=max_query_len,
|
||||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
max_seq_len=seq_lens)
|
||||||
max_query_len=max_query_len,
|
|
||||||
max_seq_len=seq_lens)
|
|
||||||
else:
|
|
||||||
common_metadata = CommonAttentionMetadata(
|
|
||||||
query_start_loc=self.query_start_loc.gpu[:num_reqs +
|
|
||||||
1],
|
|
||||||
query_start_loc_cpu=self.query_start_loc.
|
|
||||||
cpu[:num_reqs + 1],
|
|
||||||
_seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
|
|
||||||
seq_lens=self.seq_lens.cpu[:num_reqs],
|
|
||||||
num_reqs=num_reqs,
|
|
||||||
num_actual_tokens=num_tokens,
|
|
||||||
block_table_tensor=block_table_tensor[:num_reqs],
|
|
||||||
slot_mapping=slot_mapping.gpu,
|
|
||||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
|
||||||
max_query_len=max_query_len,
|
|
||||||
max_seq_len=seq_lens)
|
|
||||||
|
|
||||||
for attn_group in self.attn_groups[kv_cache_group_id]:
|
for attn_group in self.attn_groups[kv_cache_group_id]:
|
||||||
builder = attn_group.get_metadata_builder()
|
builder = attn_group.get_metadata_builder()
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import torch
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.v1.outputs import LogprobsTensors
|
from vllm.v1.outputs import LogprobsTensors
|
||||||
|
from vllm.v1.pool.metadata import PoolingStates
|
||||||
from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
|
from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
|
||||||
LogitsProcessors)
|
LogitsProcessors)
|
||||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||||
@@ -29,16 +30,6 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
|
|||||||
from vllm_ascend.worker.block_table import MultiGroupBlockTable
|
from vllm_ascend.worker.block_table import MultiGroupBlockTable
|
||||||
|
|
||||||
|
|
||||||
class PoolingStates:
|
|
||||||
# NOTE: This should be removed after we drop support of vLLM v0.12.0
|
|
||||||
def __init__(self):
|
|
||||||
# for chunked prefill with ALL pooling
|
|
||||||
self.hidden_states_cache: list[torch.Tensor] = []
|
|
||||||
|
|
||||||
def clean(self):
|
|
||||||
self.hidden_states_cache.clear()
|
|
||||||
|
|
||||||
|
|
||||||
class NPUInputBatch(InputBatch):
|
class NPUInputBatch(InputBatch):
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
Reference in New Issue
Block a user