From c1618a04273e967616e40551a86f370e7a76222b Mon Sep 17 00:00:00 2001
From: debuger <102402761+huangazazaz@users.noreply.github.com>
Date: Mon, 2 Feb 2026 19:16:26 +0800
Subject: [PATCH] [Bugfix] Fix the compatibility issue of
 may_reinitialize_input_batch (#6290)

### What this PR does / why we need it?
Added a check in the `may_reinitialize_input_batch` method to verify whether
the attention backend implements the `get_supported_kernel_block_sizes`
method (previously `get_supported_block_size`), and implemented that method
for the MLA and SFA backends.

### Does this PR introduce _any_ user-facing change?
No user-facing change.

### How was this patch tested?
Only a few lines within the affected methods were modified; the format check
has passed.

- vLLM version: v0.14.1
- vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

---------

Signed-off-by: Debuuuuger
Signed-off-by: debuger <102402761+huangazazaz@users.noreply.github.com>
Signed-off-by: Debuuuuger <12110718@mail.sustech.edu.cn>
Co-authored-by: Debuuuuger
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm_ascend/_310p/model_runner_310p.py | 4 ++--
 vllm_ascend/attention/attention_v1.py  | 4 ++--
 vllm_ascend/attention/mla_v1.py        | 4 ++++
 vllm_ascend/attention/sfa_v1.py        | 4 ++++
 vllm_ascend/worker/model_runner_v1.py  | 6 +++---
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
index 00da9b0f..e3df5c9a 100644
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -134,8 +134,8 @@ class NPUModelRunner310(NPUModelRunner):
             num_blocks = sum_page_size_bytes // kv_cache_spec.page_size_bytes
             assert num_blocks >= kv_cache_config.num_blocks
 
-            if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks:
-                block_size = attn_backend.get_supported_block_size()[0]
+            if hasattr(attn_backend, "get_supported_kernel_block_sizes") and self.use_hybrid_blocks:
+                block_size = attn_backend.get_supported_kernel_block_sizes()[0]
 
                 block_size_chunk = kv_cache_spec.block_size // block_size
                 kv_cache_shape = attn_backend.get_kv_cache_shape(
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index d28d0f26..87d28ec7 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -127,7 +127,7 @@ class AscendAttentionBackend(AttentionBackend):
             value_caches[dst_indices] = value_caches[src_indices]
 
     @staticmethod
-    def get_supported_block_size() -> list[int]:
+    def get_supported_kernel_block_sizes() -> list[int]:
         return [128]
 
 
@@ -227,7 +227,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
         self.compilation_config = vllm_config.compilation_config
         self.device = device
         self.max_num_blocks_per_req = cdiv(
-            self.model_config.max_model_len, AscendAttentionBackend.get_supported_block_size()[0]
+            self.model_config.max_model_len, AscendAttentionBackend.get_supported_kernel_block_sizes()[0]
         )
         self.speculative_config = vllm_config.speculative_config
 
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index b93200c6..e3c4b0e8 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -89,6 +89,10 @@ class AscendMLABackend(AttentionBackend):
             return AscendMlaCPImpl
         return AscendMLAImpl
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int]:
+        return [128]
+
 
 @dataclass
 class ChunkedContextMetadata:
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index ec7d1ecd..26f8c927 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -78,6 +78,10 @@ class AscendSFABackend(AttentionBackend):
     def get_impl_cls() -> type["AscendSFAImpl"]:
         return AscendSFAImpl
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int]:
+        return [128]
+
 
 @dataclass
 class DSACPContext:
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index fe62d3cf..7ecebb60 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2485,8 +2485,8 @@ class NPUModelRunner(GPUModelRunner):
         # the min of all `num_blocks`. Verify it here.
         assert num_blocks >= kv_cache_config.num_blocks
 
-        if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks:
-            block_size = attn_backend.get_supported_block_size()[0]
+        if hasattr(attn_backend, "get_supported_kernel_block_sizes") and self.use_hybrid_blocks:
+            block_size = attn_backend.get_supported_kernel_block_sizes()[0]
 
             block_size_chunk = kv_cache_spec.block_size // block_size
             kv_cache_shape = attn_backend.get_kv_cache_shape(
@@ -2600,7 +2600,7 @@ class NPUModelRunner(GPUModelRunner):
         if attn_groups and self.use_hybrid_blocks:
             # Use the backend's supported block size list
             backend = attn_groups[0].backend
-            supported_sizes = backend.get_supported_block_size()
+            supported_sizes = backend.get_supported_kernel_block_sizes()
             # If no specific sizes supported, use cache config
             # block_size
             kernel_block_size_list = supported_sizes if supported_sizes else [self.cache_config.block_size]
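
For reviewers less familiar with the hybrid-blocks path, the pattern the patched code follows is roughly the sketch below: consult the backend's advertised kernel block sizes only when the backend actually implements `get_supported_kernel_block_sizes`, and otherwise fall back to the cache config's block size. This is a minimal, self-contained illustration, not the actual `NPUModelRunner` code; `SimpleBackend`, `CacheConfig`, and `resolve_kernel_block_size` are hypothetical stand-ins introduced only for this example.

```python
# Minimal sketch of the compatibility check, under simplified assumptions.
# SimpleBackend, CacheConfig, and resolve_kernel_block_size are hypothetical
# stand-ins, not vllm-ascend APIs.
from dataclasses import dataclass


@dataclass
class CacheConfig:
    block_size: int


class SimpleBackend:
    """A backend that advertises the kernel block sizes it supports."""

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int]:
        return [128]


def resolve_kernel_block_size(backend: object, cache_config: CacheConfig,
                              use_hybrid_blocks: bool) -> int:
    # Mirror the patched check: only consult the backend when it actually
    # implements get_supported_kernel_block_sizes(); otherwise fall back to
    # the cache config's block size.
    if use_hybrid_blocks and hasattr(backend, "get_supported_kernel_block_sizes"):
        supported = backend.get_supported_kernel_block_sizes()
        if supported:
            return supported[0]
    return cache_config.block_size


if __name__ == "__main__":
    cfg = CacheConfig(block_size=64)
    # Backend implements the method: its first supported size (128) wins.
    print(resolve_kernel_block_size(SimpleBackend(), cfg, use_hybrid_blocks=True))
    # Backend without the method: the cache config block size (64) is used.
    print(resolve_kernel_block_size(object(), cfg, use_hybrid_blocks=True))
```

The `hasattr` guard is what keeps older or third-party attention backends working: backends that do not expose the renamed method simply take the fallback branch instead of raising `AttributeError`.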