From 2a6d95c389b91774f9fe846f44449c85478f8b4e Mon Sep 17 00:00:00 2001
From: LICO67373 <110013619+LICO1314@users.noreply.github.com>
Date: Wed, 14 Jan 2026 16:52:51 +0800
Subject: [PATCH] [Cleanup] Remove dead code make_attention_mask function
 (#5818)

### What this PR does / why we need it?

This PR removes the unused `make_attention_mask` function from `vllm_ascend/worker/v2/attn_utils.py`.

**Why it's dead code:**

- After PR #4870 (attention mask unification refactor), attention mask generation has been centralized in the `AttentionMaskBuilder` singleton class
- The mask is now generated directly by metadata builders when needed (e.g., `AscendAttentionMetadataBuilder`, `AscendMLAMetadataBuilder`)
- The `make_attention_mask` function is no longer called anywhere in the codebase
- The function's parameters (including `attn_mask` and `spec_attn_mask`) were also removed from `build_attn_metadata` in the same refactor

**Changes:**

- Remove `make_attention_mask` function (24 lines) from `vllm_ascend/worker/v2/attn_utils.py`

### Does this PR introduce _any_ user-facing change?

No. This is a code cleanup that removes dead code. No user-facing behavior changes.

### How was this patch tested?

- Verified that `make_attention_mask` is not called anywhere in the codebase (via `grep`)
- CI tests pass to ensure no regressions
- The function has been unused since PR #4870 was merged

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

Signed-off-by: lico67373 <918688502@qq.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
---
 vllm_ascend/worker/v2/attn_utils.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index aef319e2..e8ed5a28 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -23,7 +23,6 @@ from typing import Any, Tuple
 import numpy as np
 import torch
 from vllm.config import VllmConfig
-from vllm.config.model import ModelDType
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec, KVCacheConfig

@@ -145,26 +144,3 @@ def build_attn_state(
     else:
         attn_state = AscendAttentionState.PrefillCacheHit
     return attn_state
-
-
-def make_attention_mask(
-    vllm_config: VllmConfig,
-    attn_state: AscendAttentionState,
-    dtype: ModelDType | torch.dtype,
-    device: torch.device,
-) -> torch.Tensor:
-    """make attention mask for npu's attention backend."""
-    attn_mask_builder = get_attn_mask_builder(device)
-    # pcp situation.
-    if attn_mask_builder is None:
-        raise ValueError("Attn mask builder is None")
-    # Pooling situation.
-    if vllm_config.model_config.runner_type == "pooling":
-        return attn_mask_builder.get_attn_mask(2048, torch.bool)
-
-    # TODO(Ronald1995) cosidering pcp.
-    if vllm_config.model_config.use_mla:
-        # mla prefill
-        if attn_state != AscendAttentionState.DecodeOnly:
-            return attn_mask_builder.get_mla_mask(dtype)
-    return attn_mask_builder.get_splitfuse_attn_mask()
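
For readers unfamiliar with the pattern the "Why it's dead code" list refers to, the sketch below illustrates how a singleton mask builder of this kind can cache one causal mask and hand out slices to whichever metadata builder needs them. This is a minimal illustration only: `_MaskBuilderSketch`, its `get()` accessor, and the 2048-token default are hypothetical stand-ins, not the actual `AttentionMaskBuilder` API in vllm_ascend (the real builder is obtained via `get_attn_mask_builder`, as the removed code shows).

```python
import torch


class _MaskBuilderSketch:
    """Hypothetical stand-in for a singleton attention-mask builder.

    Not the real AttentionMaskBuilder from vllm_ascend; this only
    illustrates the "build once, slice on demand" pattern described
    in the PR description above.
    """

    _instance = None

    def __init__(self, max_seq_len: int, device: torch.device):
        # Build one boolean mask once; True marks positions to exclude
        # (strictly upper triangle, i.e. future tokens in causal attention).
        self._mask = torch.triu(
            torch.ones(max_seq_len, max_seq_len, dtype=torch.bool, device=device),
            diagonal=1,
        )

    @classmethod
    def get(cls, max_seq_len: int = 2048,
            device: torch.device = torch.device("cpu")) -> "_MaskBuilderSketch":
        # Singleton access, analogous in spirit to get_attn_mask_builder()
        # in attn_utils.py (hypothetical signature here).
        if cls._instance is None:
            cls._instance = cls(max_seq_len, device)
        return cls._instance

    def get_attn_mask(self, seq_len: int, dtype: torch.dtype) -> torch.Tensor:
        # Hand out a seq_len x seq_len slice of the cached mask in the
        # requested dtype; callers (metadata builders) never rebuild it.
        return self._mask[:seq_len, :seq_len].to(dtype)


if __name__ == "__main__":
    builder = _MaskBuilderSketch.get()
    print(builder.get_attn_mask(4, torch.bool))
```

Building the mask once and slicing views from it avoids per-request mask construction, which is presumably why mask creation was centralized behind one builder rather than left in a free-standing helper like the removed `make_attention_mask`.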