[Misc] Clean up useless code in attention (#1933)

Before refactoring the attention module, we can do some code cleanup to make the next step easier. What this PR does: 1. Remove the useless `common_prefix_len` from the attention builder. 2. Remove the useless `is_only_prefill` and `num_input_tokens` from the attention metadata. 3. Remove `CommonAttentionMetadata` and use `query_start_loc` instead; `CommonAttentionMetadata` is over-designed and useless. 4. Update the attention backend input parameters to keep them the same as vLLM. 5. Rename the attention names to the same style, with the `ASCEND` prefix. - vLLM version: v0.9.2 - vLLM main: 107111a859 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-24 10:23:34 +08:00
parent b5ad70e1a6
commit 846555cdb5
7 changed files with 41 additions and 93 deletions
--- a/vllm_ascend/worker/eagle_proposer_v1.py
+++ b/vllm_ascend/worker/eagle_proposer_v1.py
@@ -132,7 +132,6 @@ class EagleProposer:
            num_reqs=batch_size,
            num_actual_tokens=num_tokens,
            max_query_len=max_query_len,
-            common_prefix_len=0,
        )
        if self.use_cuda_graph and \
            num_tokens <= self.cudagraph_batch_sizes[-1]:
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -75,8 +75,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
                                                AscendMetadata)
 from vllm_ascend.attention.attention_v1_torchair import AscendTorchairMetadata
-from vllm_ascend.attention.mla_v1 import (AscendMLAMetadata,
-                                          CommonAttentionMetadata)
+from vllm_ascend.attention.mla_v1 import AscendMLAMetadata
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
@@ -694,15 +693,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        # in the same group share the same metadata.
        for kv_cache_group_id, kv_cache_group_spec in enumerate(
                self.kv_cache_config.kv_cache_groups):
-
-            # Prepare for cascade attention if enabled & beneficial.
-            common_prefix_len = 0
-
            attn_metadata_i = self.attn_metadata_builder.build(
                num_reqs=num_reqs,
                num_actual_tokens=total_num_scheduled_tokens,
                max_query_len=max_num_scheduled_tokens,
-                common_prefix_len=common_prefix_len,
            )
            for layer_name in kv_cache_group_spec.layer_names:
                attn_metadata[layer_name] = attn_metadata_i
@@ -1049,27 +1043,22 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            extra_builder_kwargs['graph_pad_size'] = graph_pad_size

        if self.vllm_config.model_config.use_mla:
-            query_start_loc = self.query_start_loc[:num_reqs + 1]
-            seq_lens = self.seq_lens[:num_reqs]
-            common_attn_metadata = CommonAttentionMetadata(
-                query_start_loc=query_start_loc, seq_lens=seq_lens)
+            extra_builder_kwargs[
+                "query_start_loc"] = self.query_start_loc[:num_reqs + 1]
            attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                num_reqs=num_reqs,
                num_actual_tokens=total_num_scheduled_tokens,
                max_query_len=max_num_scheduled_tokens,
-                common_attn_metadata=common_attn_metadata,
-                common_prefix_len=None,
                **extra_builder_kwargs,
            )
+            attn_metadata.num_input_tokens = num_input_tokens
        else:
            attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                num_reqs=num_reqs,
                num_actual_tokens=total_num_scheduled_tokens,
                max_query_len=max_num_scheduled_tokens,
-                common_prefix_len=None,
                **extra_builder_kwargs,
            )
-        attn_metadata.num_input_tokens = num_input_tokens

        # Prepare input_ids
        token_indices = (positions_np +
--- a/vllm_ascend/worker/mtp_proposer_v1.py
+++ b/vllm_ascend/worker/mtp_proposer_v1.py
@@ -8,7 +8,6 @@ from vllm.model_executor.model_loader.utils import (
    process_weights_after_loading, set_default_torch_dtype)
 from vllm.v1.sample.metadata import SamplingMetadata

-from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP


@@ -100,11 +99,6 @@ class MtpProposer:
        query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
        max_query_len = query_lens.max().item()

-        seq_lens = (target_positions[last_token_indices] + 1)
-
-        common_attn_metadata = CommonAttentionMetadata(
-            query_start_loc=cu_num_tokens, seq_lens=seq_lens)
-
        # FIXME: reorder_batch() needs to be called before build()
        # because fields of attn_metadata_builder needs to be updated.
        # However, currently reorder_batch() takes input_batch and
@@ -120,8 +114,7 @@ class MtpProposer:
            num_reqs=batch_size,
            num_actual_tokens=num_tokens,
            max_query_len=max_query_len,
-            common_prefix_len=0,
-            common_attn_metadata=common_attn_metadata,
+            query_start_loc=cu_num_tokens,
        )

        with set_forward_context(attn_metadata, self.vllm_config):