From 84fc7402c3176bcf39654ccec05af14ff6fd07cd Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 24 Jul 2025 19:31:36 +0800 Subject: [PATCH] [Misc] Refactor AscendMetaData Comments to Make It Clearer (#1967) ### What this PR does / why we need it? Refactor the comments of `AscendMetaData` to make it clearer. - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/f3137cdd81cae3a48282c22130fbcadcfc64ea95 --------- Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/attention/attention_v1.py | 50 +++++++++++++++++---------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index fa2a528..1551565 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -121,28 +121,40 @@ class AscendAttentionState(Enum): @dataclass class AscendMetadata: - num_actual_tokens: int # Number of tokens excluding padding. - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - block_tables: torch.Tensor - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - query_start_loc: torch.Tensor - query_lens: torch.Tensor - seq_lens: torch.Tensor - # max value of number of tokens across dp group - max_num_tokens_across_dp: int = 0 - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor = None + # **************************** Basic Properties **************************** + attn_mask: Optional[torch.Tensor] = None # Current state of this attention run. attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill - attn_mask: Optional[torch.Tensor] = None + + # Number of tokens excluding padding. + num_actual_tokens: int = 0 + + # The sequence length per sequence. Sequence length means the computed + # tokens + new tokens (is None if it is a decoding). + # (batch_size,) + seq_lens: torch.Tensor = None + + query_start_loc: torch.Tensor = None + query_lens: torch.Tensor = None + # Maximum query length in the batch (None for decoding). + max_query_len: Optional[int] = None + + # ********************** KV Cache Related Properties *********************** + # Block addresses per sequence (Seq id -> list of physical block). + # (batch_size, max_blocks_per_seq) + block_tables: torch.Tensor = None + + # The indices of the token slots that input tokens will be stored into. + # E.g., if `slot_mapping` is [35, 2, 17] and the block size is 16, the + # three tokens are stored in the 3rd slot in block 2, 2nd slot in block 0, + # and 1st slot in block 1, respectively. + # (num_tokens,) + slot_mapping: torch.Tensor = None + + # ************************* DP Related Properties ************************** with_prefill_across_dp: bool = False + # Maximum number of tokens across dp group + max_num_tokens_across_dp: int = 0 class AscendAttentionMetadataBuilder: