Upgrade vllm commit hash to 1216 (#5053)
### What this PR does / why we need it?
Upstream vLLM PR #30212 https://github.com/vllm-project/vllm/pull/30212
refactored the attention backend selection interface. This PR adapts
vllm-ascend's get_attn_backend_cls to align with the new upstream
standard, ensuring compatibility and reducing maintenance overhead.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Co-author: [leo-pony](mailto:nengjunma@outlook.com)
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zxwang <1476209578@qq.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -74,7 +74,7 @@ jobs:
|
||||
name: e2e-full
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0]
|
||||
vllm_version: [releases/v0.13.0, v0.12.0]
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||
uses: ./.github/workflows/_e2e_test.yaml
|
||||
|
||||
6
.github/workflows/pr_test_light.yaml
vendored
6
.github/workflows/pr_test_light.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
lint:
|
||||
uses: ./.github/workflows/_pre_commit.yml
|
||||
with:
|
||||
vllm: 4429d934de3c5cc327b0d7aec8e473aeba38db90
|
||||
vllm: releases/v0.13.0
|
||||
changes:
|
||||
runs-on: linux-aarch64-a2-0
|
||||
outputs:
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
SOC_VERSION: ascend910b1
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0]
|
||||
vllm_version: [releases/v0.13.0, v0.12.0]
|
||||
|
||||
steps:
|
||||
- name: Free up disk space
|
||||
@@ -154,7 +154,7 @@ jobs:
|
||||
name: e2e-light
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0]
|
||||
vllm_version: [releases/v0.13.0, v0.12.0]
|
||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||
needs: [lint, changes]
|
||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||
|
||||
@@ -50,7 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
|
||||
For the main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Note that this table is updated frequently, so please check it regularly.
|
||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||
|-------------|--------------|------------------|-------------|--------------------|
|
||||
| main | 4429d934de3c5cc327b0d7aec8e473aeba38db90, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
||||
| main | releases/v0.13.0, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
||||
|
||||
## Release cadence
|
||||
|
||||
|
||||
@@ -113,11 +113,9 @@ def test_sp_for_qwen3_moe() -> None:
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="mp",
|
||||
compilation_config={
|
||||
"pass_config": {
|
||||
"enable_sequence_parallelism": True
|
||||
}
|
||||
},
|
||||
compilation_config={"pass_config": {
|
||||
"enable_sp": True
|
||||
}},
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
@@ -355,23 +355,15 @@ class NPUPlatform(Platform):
|
||||
CUSTOM_OP_REGISTERED = True
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(
|
||||
cls,
|
||||
selected_backend,
|
||||
head_size,
|
||||
dtype,
|
||||
kv_cache_dtype,
|
||||
block_size,
|
||||
use_mla,
|
||||
has_sink=False,
|
||||
use_sparse=False,
|
||||
# NOTE: Please pay special attention to the order of these parameters.
|
||||
# Although we are only using some of them so far
|
||||
# vllm passes them in sequence when using this interface.
|
||||
use_mm_prefix: bool = False,
|
||||
attn_type: str | None = None,
|
||||
):
|
||||
# choose attention backend based on use_mla
|
||||
def get_attn_backend_cls(cls, selected_backend, *args, **kwargs):
|
||||
if "attn_selector_config" in kwargs:
|
||||
use_mla = kwargs["attn_selector_config"].use_mla
|
||||
use_sparse = kwargs["attn_selector_config"].use_sparse
|
||||
else:
|
||||
use_mla = kwargs.get("use_mla",
|
||||
args[4] if len(args) >= 5 else None)
|
||||
use_sparse = kwargs.get("use_sparse",
|
||||
args[6] if len(args) >= 7 else None)
|
||||
backend_map = {
|
||||
(True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
|
||||
(False, False):
|
||||
|
||||
Reference in New Issue
Block a user