From 378e92a2a2a2d3cc632e5f96693667ec3ef671dc Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 17 Nov 2025 10:56:23 +0800 Subject: [PATCH] [Cherry-pick][0.11.0] Adapted to torch_npu.npu_fused_infer_attention_score (#4202) ### What this PR does / why we need it? Fixes a compatibility bug with torch_npu.npu_fused_infer_attention_score which is described in https://github.com/vllm-project/vllm-ascend/issues/4020. @momo609 suggested this solution. cherry-pick: https://github.com/vllm-project/vllm-ascend/pull/4025 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with newly added/existing tests. Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/attention/attention_v1.py | 2 +- vllm_ascend/patch/platform/patch_mamba_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index e003ca6..7c5e247 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -115,7 +115,7 @@ class AscendAttentionBackend(AttentionBackend): @staticmethod def get_supported_block_size() -> list[int]: - return [64] + return [128] class AscendAttentionState(Enum): diff --git a/vllm_ascend/patch/platform/patch_mamba_config.py b/vllm_ascend/patch/platform/patch_mamba_config.py index 1afb9e1..1420fac 100644 --- a/vllm_ascend/patch/platform/patch_mamba_config.py +++ b/vllm_ascend/patch/platform/patch_mamba_config.py @@ -51,7 +51,7 @@ def verify_and_update_config(cls, vllm_config) -> None: block_size=model_config.max_model_len, ).page_size_bytes - block_alignment_bytes = 64 + block_alignment_bytes = 128 # some attention backends (e.g. FA) only support setting # block size to multiple of 16, so let's suggest a value