[Misc] Clean up useless code in attention (#1933)

Before refactoring the attention module, we can do some code cleanup to make
the next step easier.

What this PR does:

1. Remove the useless `common_prefix_len` parameter from the attention
metadata builder.
2. Remove the useless `is_only_prefill` and `num_input_tokens` fields from the
attention metadata.
3. Remove `CommonAttentionMetadata` and use `query_start_loc` instead;
`CommonAttentionMetadata` is over-designed and unnecessary.
4. Update the attention backend input parameters to match vLLM's (see the
sketch after this list).
5. Rename the attention backends to a consistent style with an `ASCEND` prefix.
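A minimal usage sketch of the resulting interfaces, based only on the test
changes in this diff. The variable values, the `head_size`, the `builder`
object, and the `AttentionType` import path are illustrative assumptions, not
part of this PR:

```python
# Sketch only: mirrors the calls exercised by the updated unit tests below.
from vllm.attention.backends.abstract import AttentionType  # assumed import path
from vllm_ascend.attention.attention_v1 import AscendAttentionBackendImpl

num_reqs = 2            # example values copied from the test case
num_actual_tokens = 10
max_query_len = 5

# The metadata builder no longer takes common_prefix_len; per-request query
# boundaries are read from the runner's query_start_loc rather than from a
# separate CommonAttentionMetadata object. `builder` is assumed to be an
# already-constructed AscendAttentionMetadataBuilder bound to the model runner.
attn_metadata = builder.build(num_reqs, num_actual_tokens, max_query_len)

# The backend impl constructor now carries the same arguments as vLLM's
# AttentionImpl, including logits_soft_cap and kv_sharing_target_layer_name.
impl = AscendAttentionBackendImpl(
    num_heads=8,
    head_size=64,          # assumed value for illustration
    scale=1.0,
    num_kv_heads=8,
    alibi_slopes=None,
    sliding_window=None,
    kv_cache_dtype="float16",
    logits_soft_cap=None,
    attn_type=AttentionType.DECODER,
    kv_sharing_target_layer_name=None)
```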

- vLLM version: v0.9.2
- vLLM main: 107111a859

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Committed by GitHub on 2025-07-24 10:23:34 +08:00
Commit: 846555cdb5 (parent: b5ad70e1a6)
7 changed files with 41 additions and 93 deletions

@@ -96,7 +96,6 @@ class TestAscendAttentionMetadataBuilder(TestBase):
         num_reqs = 2
         num_actual_tokens = 10
         max_query_len = 5
-        common_prefix_len = 1

         self.mock_runner.input_batch.block_table = [MagicMock()]
         self.mock_runner.input_batch.block_table[
@@ -114,8 +113,11 @@ class TestAscendAttentionMetadataBuilder(TestBase):
         mock_nd_to_nz_2d.return_value = mock_nz_tensor
         mock_npu_format_cast.return_value = mock_nz_tensor

-        self.builder.build(num_reqs, num_actual_tokens, max_query_len,
-                           common_prefix_len)
+        self.builder.build(
+            num_reqs,
+            num_actual_tokens,
+            max_query_len,
+        )

     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('torch_npu.npu_format_cast')
@@ -148,7 +150,7 @@ class TestAscendAttentionMetadataBuilder(TestBase):
         mock_nd_to_nz_spec.return_value = mock_nz_tensor
         mock_npu_format_cast.return_value = mock_nz_tensor

-        self.builder.build(num_reqs, num_actual_tokens, max_query_len, 0)
+        self.builder.build(num_reqs, num_actual_tokens, max_query_len)

     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
@@ -169,7 +171,7 @@ class TestAscendAttentionMetadataBuilder(TestBase):
         self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
         self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])

-        self.builder.build(num_reqs, num_actual_tokens, max_query_len, 0)
+        self.builder.build(num_reqs, num_actual_tokens, max_query_len)


 class TestAscendAttentionBackendImpl(TestBase):
@@ -201,7 +203,9 @@ class TestAscendAttentionBackendImpl(TestBase):
             alibi_slopes=None,
             sliding_window=None,
             kv_cache_dtype="float16",
-            attn_type=self.attention_type.DECODER)
+            logits_soft_cap=None,
+            attn_type=self.attention_type.DECODER,
+            kv_sharing_target_layer_name=None)

         self.impl_192 = AscendAttentionBackendImpl(
             num_heads=8,
@@ -211,16 +215,21 @@ class TestAscendAttentionBackendImpl(TestBase):
             alibi_slopes=None,
             sliding_window=None,
             kv_cache_dtype="float16",
-            attn_type=self.attention_type.DECODER)
+            logits_soft_cap=None,
+            attn_type=self.attention_type.DECODER,
+            kv_sharing_target_layer_name=None)

-        self.impl_error = AscendAttentionBackendImpl(num_heads=8,
-                                                     head_size=192,
-                                                     scale=1.0,
-                                                     num_kv_heads=8,
-                                                     alibi_slopes=None,
-                                                     sliding_window=None,
-                                                     kv_cache_dtype="float16",
-                                                     attn_type=None)
+        self.impl_error = AscendAttentionBackendImpl(
+            num_heads=8,
+            head_size=192,
+            scale=1.0,
+            num_kv_heads=8,
+            alibi_slopes=None,
+            sliding_window=None,
+            kv_cache_dtype="float16",
+            logits_soft_cap=None,
+            attn_type=None,
+            kv_sharing_target_layer_name=None)

     @patch('torch.ops.vllm.unified_ascend_attention_with_output')
     def test_forward_trace_flag_true(self, mock_unified_attention):