Upgrade vllm commit hash to 1216 (#5053)
### What this PR does / why we need it?
Upstream vLLM PR #30212 (https://github.com/vllm-project/vllm/pull/30212)
refactored the attention backend selection interface. This PR adapts
vllm-ascend's get_attn_backend_cls to align with the new upstream
standard, ensuring compatibility and reducing maintenance overhead.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Co-author: leo-pony ([nengjunma@outlook.com](mailto:nengjunma@outlook.com))
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: zxwang <1476209578@qq.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -74,7 +74,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0]
|
vllm_version: [releases/v0.13.0, v0.12.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
6
.github/workflows/pr_test_light.yaml
vendored
6
.github/workflows/pr_test_light.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/_pre_commit.yml
|
uses: ./.github/workflows/_pre_commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: 4429d934de3c5cc327b0d7aec8e473aeba38db90
|
vllm: releases/v0.13.0
|
||||||
changes:
|
changes:
|
||||||
runs-on: linux-aarch64-a2-0
|
runs-on: linux-aarch64-a2-0
|
||||||
outputs:
|
outputs:
|
||||||
@@ -90,7 +90,7 @@ jobs:
|
|||||||
SOC_VERSION: ascend910b1
|
SOC_VERSION: ascend910b1
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0]
|
vllm_version: [releases/v0.13.0, v0.12.0]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Free up disk space
|
- name: Free up disk space
|
||||||
@@ -154,7 +154,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0]
|
vllm_version: [releases/v0.13.0, v0.12.0]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
|
|||||||
For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
|
For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
|
||||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||||
|-------------|--------------|------------------|-------------|--------------------|
|
|-------------|--------------|------------------|-------------|--------------------|
|
||||||
| main | 4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
| main | releases/v0.13.0, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
||||||
|
|
||||||
## Release cadence
|
## Release cadence
|
||||||
|
|
||||||
|
|||||||
@@ -113,11 +113,9 @@ def test_sp_for_qwen3_moe() -> None:
|
|||||||
dtype="auto",
|
dtype="auto",
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
distributed_executor_backend="mp",
|
distributed_executor_backend="mp",
|
||||||
compilation_config={
|
compilation_config={"pass_config": {
|
||||||
"pass_config": {
|
"enable_sp": True
|
||||||
"enable_sequence_parallelism": True
|
}},
|
||||||
}
|
|
||||||
},
|
|
||||||
enable_expert_parallel=True,
|
enable_expert_parallel=True,
|
||||||
enforce_eager=True) as vllm_model:
|
enforce_eager=True) as vllm_model:
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|||||||
@@ -355,23 +355,15 @@ class NPUPlatform(Platform):
|
|||||||
CUSTOM_OP_REGISTERED = True
|
CUSTOM_OP_REGISTERED = True
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_attn_backend_cls(
|
def get_attn_backend_cls(cls, selected_backend, *args, **kwargs):
|
||||||
cls,
|
if "attn_selector_config" in kwargs:
|
||||||
selected_backend,
|
use_mla = kwargs["attn_selector_config"].use_mla
|
||||||
head_size,
|
use_sparse = kwargs["attn_selector_config"].use_sparse
|
||||||
dtype,
|
else:
|
||||||
kv_cache_dtype,
|
use_mla = kwargs.get("use_mla",
|
||||||
block_size,
|
args[4] if len(args) >= 5 else None)
|
||||||
use_mla,
|
use_sparse = kwargs.get("use_sparse",
|
||||||
has_sink=False,
|
args[6] if len(args) >= 7 else None)
|
||||||
use_sparse=False,
|
|
||||||
# NOTE: Please pay special attention to the order of these parameters.
|
|
||||||
# Although we are only using some of them so far
|
|
||||||
# vllm passes them in sequence when using this interface.
|
|
||||||
use_mm_prefix: bool = False,
|
|
||||||
attn_type: str | None = None,
|
|
||||||
):
|
|
||||||
# choose attention backend based on use_mla
|
|
||||||
backend_map = {
|
backend_map = {
|
||||||
(True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
|
(True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
|
||||||
(False, False):
|
(False, False):
|
||||||
|
|||||||
Reference in New Issue
Block a user