[Bugfix] Fix the compatibility issue of may_reinitialize_input_batch (#6290)
### What this PR does / why we need it?
Added a check in the may_reinitialize_input_batch method to verify
whether the backend implements the get_supported_block_size method.
### Does this PR introduce _any_ user-facing change?
no user-facing change
### How was this patch tested?
Only a few lines of code within the methods were modified, and the
format checks have passed.
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: Debuuuuger <huangzr@cmbchina.com>
Signed-off-by: debuger <102402761+huangazazaz@users.noreply.github.com>
Signed-off-by: Debuuuuger <12110718@mail.sustech.edu.cn>
Co-authored-by: Debuuuuger <huangzr@cmbchina.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -134,8 +134,8 @@ class NPUModelRunner310(NPUModelRunner):
|
||||
num_blocks = sum_page_size_bytes // kv_cache_spec.page_size_bytes
|
||||
assert num_blocks >= kv_cache_config.num_blocks
|
||||
|
||||
if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks:
|
||||
block_size = attn_backend.get_supported_block_size()[0]
|
||||
if hasattr(attn_backend, "get_supported_kernel_block_sizes") and self.use_hybrid_blocks:
|
||||
block_size = attn_backend.get_supported_kernel_block_sizes()[0]
|
||||
|
||||
block_size_chunk = kv_cache_spec.block_size // block_size
|
||||
kv_cache_shape = attn_backend.get_kv_cache_shape(
|
||||
|
||||
@@ -127,7 +127,7 @@ class AscendAttentionBackend(AttentionBackend):
|
||||
value_caches[dst_indices] = value_caches[src_indices]
|
||||
|
||||
@staticmethod
|
||||
def get_supported_block_size() -> list[int]:
|
||||
def get_supported_kernel_block_sizes() -> list[int]:
|
||||
return [128]
|
||||
|
||||
|
||||
@@ -227,7 +227,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
|
||||
self.compilation_config = vllm_config.compilation_config
|
||||
self.device = device
|
||||
self.max_num_blocks_per_req = cdiv(
|
||||
self.model_config.max_model_len, AscendAttentionBackend.get_supported_block_size()[0]
|
||||
self.model_config.max_model_len, AscendAttentionBackend.get_supported_kernel_block_sizes()[0]
|
||||
)
|
||||
|
||||
self.speculative_config = vllm_config.speculative_config
|
||||
|
||||
@@ -89,6 +89,10 @@ class AscendMLABackend(AttentionBackend):
|
||||
return AscendMlaCPImpl
|
||||
return AscendMLAImpl
|
||||
|
||||
@staticmethod
|
||||
def get_supported_kernel_block_sizes() -> list[int]:
|
||||
return [128]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkedContextMetadata:
|
||||
|
||||
@@ -78,6 +78,10 @@ class AscendSFABackend(AttentionBackend):
|
||||
def get_impl_cls() -> type["AscendSFAImpl"]:
|
||||
return AscendSFAImpl
|
||||
|
||||
@staticmethod
|
||||
def get_supported_kernel_block_sizes() -> list[int]:
|
||||
return [128]
|
||||
|
||||
|
||||
@dataclass
|
||||
class DSACPContext:
|
||||
|
||||
@@ -2485,8 +2485,8 @@ class NPUModelRunner(GPUModelRunner):
|
||||
# the min of all `num_blocks`. Verify it here.
|
||||
assert num_blocks >= kv_cache_config.num_blocks
|
||||
|
||||
if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks:
|
||||
block_size = attn_backend.get_supported_block_size()[0]
|
||||
if hasattr(attn_backend, "get_supported_kernel_block_sizes") and self.use_hybrid_blocks:
|
||||
block_size = attn_backend.get_supported_kernel_block_sizes()[0]
|
||||
|
||||
block_size_chunk = kv_cache_spec.block_size // block_size
|
||||
kv_cache_shape = attn_backend.get_kv_cache_shape(
|
||||
@@ -2600,7 +2600,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
if attn_groups and self.use_hybrid_blocks:
|
||||
# Use the backend's supported block size list
|
||||
backend = attn_groups[0].backend
|
||||
supported_sizes = backend.get_supported_block_size()
|
||||
supported_sizes = backend.get_supported_kernel_block_sizes()
|
||||
# If no specific sizes supported, use cache config
|
||||
# block_size
|
||||
kernel_block_size_list = supported_sizes if supported_sizes else [self.cache_config.block_size]
|
||||
|
||||
Reference in New Issue
Block a user