From cc0110abb4e4469faf04e90555a68a247dd2d6e9 Mon Sep 17 00:00:00 2001
From: yeyifan
Date: Tue, 6 Jan 2026 17:24:43 +0800
Subject: [PATCH] [Bugfix] Remove swa parameter of fia (#5602)

### What this PR does / why we need it?

The swa parameter of fia does not currently support a headDim of 256, so gemma3, whose headDim equals 256, fails with an error. This PR therefore rolls the code back; the swa parameter will be reincorporated once CANN supports it.

### Does this PR introduce _any_ user-facing change?

Removes the swa parameter of fia.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731

---------

Signed-off-by: nsdie
Co-authored-by: Mengqing Cao
---
 vllm_ascend/attention/attention_v1.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 8eb89b97..91051768 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -574,11 +574,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
             query=query,
             key=key,
             value=value,
-            pre_tokens=self.sliding_window
-            if self.sliding_window else SWA_INT_MAX,
-            next_tokens=0 if self.sliding_window else SWA_INT_MAX,
-            atten_mask=attn_metadata.swa_mask
-            if self.sliding_window else attn_metadata.attn_mask,
+            atten_mask=attn_metadata.attn_mask,
             block_table=block_table,
             input_layout="TND",
             block_size=block_size,
@@ -587,7 +583,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
             num_key_value_heads=self.num_kv_heads,
             num_heads=self.num_heads,
             scale=self.scale,
-            sparse_mode=4 if self.sliding_window else 3,
+            sparse_mode=3,
         )

         attn_output = attn_output.view(num_tokens, self.num_heads,
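For reference, a minimal sketch of how the rolled-back swa arguments could later be reintroduced behind a capability guard, once CANN supports sliding window with a headDim of 256. The keyword names (`pre_tokens`, `next_tokens`, `atten_mask`, `sparse_mode`) mirror the diff above; the `swa_kwargs` helper and `SWA_SUPPORTED_HEAD_DIMS` are hypothetical illustrations, not part of this patch.

```python
# Hypothetical sketch (not in this patch): gate the swa arguments on the
# head dim, so unsupported models (e.g. gemma3 with headDim 256) keep
# taking the plain-mask path that this patch restores.
SWA_SUPPORTED_HEAD_DIMS = {64, 128}  # assumption: dims CANN accepts today


def swa_kwargs(sliding_window, head_size, attn_metadata):
    """Build the swa-related kwargs for the fia call."""
    if sliding_window and head_size in SWA_SUPPORTED_HEAD_DIMS:
        # Sliding-window path, as in the code removed by this patch:
        # attend to at most `sliding_window` past tokens and 0 future ones,
        # with the dedicated swa mask and sparse_mode=4.
        return dict(
            pre_tokens=sliding_window,
            next_tokens=0,
            atten_mask=attn_metadata.swa_mask,
            sparse_mode=4,
        )
    # Fallback kept by this patch: full attention with the regular mask,
    # and no window bounds passed at all.
    return dict(
        atten_mask=attn_metadata.attn_mask,
        sparse_mode=3,
    )
```

The call site would then splat the result into the fia call (presumably `torch_npu.npu_fused_infer_attention_score`, given the argument names in the diff) alongside the unchanged `query`/`key`/`value` and block-table arguments.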