From 1804b60ec8d93a8b610b7cb543ee1f493cbd8b19 Mon Sep 17 00:00:00 2001
From: drslark <96540755+drslark@users.noreply.github.com>
Date: Thu, 6 Nov 2025 22:00:24 +0800
Subject: [PATCH] [BugFix][main] Adapted to
 torch_npu.npu_fused_infer_attention_score (#4025)

### What this PR does / why we need it?

Fixes a compatibility bug with `torch_npu.npu_fused_infer_attention_score`, which is described in https://github.com/vllm-project/vllm-ascend/issues/4020. @momo609 suggested this solution.

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

The environment is the same as in this issue: https://github.com/vllm-project/vllm-ascend/issues/4020. We modified the code according to https://github.com/vllm-project/vllm-ascend/pull/3918 and ran the code below:

```python
# run with Qwen3-next-mtp
from vllm import LLM, SamplingParams

prompts = [
    "Who are you?",
]

sampling_params = SamplingParams(temperature=0.0,
                                 top_p=0.95,
                                 top_k=40,
                                 max_tokens=128)

llm = LLM(model="/home/model/Qwen3-Next-80B-A3B-Instruct",
          tensor_parallel_size=4,
          enforce_eager=True,
          distributed_executor_backend="mp",
          gpu_memory_utilization=0.7,
          speculative_config={
              "method": "qwen3_next_mtp",
              "num_speculative_tokens": 1,
          },
          max_model_len=4096)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

Outputs:

```text
Prompt: 'Who are you?', Generated text: ' I am Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I am designed to answer questions, create text such as stories, official documents, emails, scripts, and more, as well as perform logical reasoning, programming, and other tasks. If you have any questions or need assistance, feel free to let me know anytime!'
```

Now, `torch_npu.npu_fused_infer_attention_score` is compatible with Qwen3-Next.

- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac

Signed-off-by: drslark
---
 vllm_ascend/attention/attention_v1.py            | 2 +-
 vllm_ascend/patch/platform/patch_mamba_config.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 6181631d..7a1f79a4 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -127,7 +127,7 @@ class AscendAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_supported_block_size() -> list[int]:
-        return [64]
+        return [128]
 
 
 class AscendAttentionState(Enum):
diff --git a/vllm_ascend/patch/platform/patch_mamba_config.py b/vllm_ascend/patch/platform/patch_mamba_config.py
index ad083f51..1b077b41 100644
--- a/vllm_ascend/patch/platform/patch_mamba_config.py
+++ b/vllm_ascend/patch/platform/patch_mamba_config.py
@@ -58,7 +58,7 @@ def verify_and_update_config(cls, vllm_config) -> None:
         block_size=model_config.max_model_len,
     ).page_size_bytes
 
-    block_alignment_bytes = 64
+    block_alignment_bytes = 128
 
     # some attention backends (e.g. FA) only support setting
     # block size to multiple of 16, so let's suggest a value