[Bugfix] Fix the compatibility issue of may_reinitialize_input_batch (#6290)
### What this PR does / why we need it?
Added a check in the `may_reinitialize_input_batch` method to verify
whether the attention backend implements the `get_supported_kernel_block_sizes`
method, renamed the former `get_supported_block_size` hook to match, and
implemented the new hook on the MLA and SFA backends.
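For illustration, here is a minimal sketch of the guard pattern this change applies. The standalone helper `resolve_kernel_block_size` is hypothetical, written only to show the pattern; `attn_backend`, `use_hybrid_blocks`, and the cache-config fallback follow the diff below:

```python
def resolve_kernel_block_size(attn_backend, cache_config,
                              use_hybrid_blocks: bool) -> int:
    """Hypothetical helper: pick a kernel block size without assuming
    the backend defines get_supported_kernel_block_sizes()."""
    if hasattr(attn_backend, "get_supported_kernel_block_sizes") and use_hybrid_blocks:
        supported_sizes = attn_backend.get_supported_kernel_block_sizes()
        if supported_sizes:
            # The backend advertises specific kernel block sizes; take the first.
            return supported_sizes[0]
    # Hook absent (older backend) or empty list: fall back to the
    # engine's configured block size.
    return cache_config.block_size
```

Backends that implement the hook, such as `AscendAttentionBackend`, `AscendMLABackend`, and `AscendSFABackend` in the diff below, simply return their supported sizes, e.g. `[128]`.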
### Does this PR introduce _any_ user-facing change?
No user-facing change.
### How was this patch tested?
Only a few lines within the affected methods were modified, and the
format check passed.
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
---------
Signed-off-by: Debuuuuger <huangzr@cmbchina.com>
Signed-off-by: debuger <102402761+huangazazaz@users.noreply.github.com>
Signed-off-by: Debuuuuger <12110718@mail.sustech.edu.cn>
Co-authored-by: Debuuuuger <huangzr@cmbchina.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
@@ -134,8 +134,8 @@ class NPUModelRunner310(NPUModelRunner):
             num_blocks = sum_page_size_bytes // kv_cache_spec.page_size_bytes
             assert num_blocks >= kv_cache_config.num_blocks
 
-            if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks:
-                block_size = attn_backend.get_supported_block_size()[0]
+            if hasattr(attn_backend, "get_supported_kernel_block_sizes") and self.use_hybrid_blocks:
+                block_size = attn_backend.get_supported_kernel_block_sizes()[0]
 
                 block_size_chunk = kv_cache_spec.block_size // block_size
                 kv_cache_shape = attn_backend.get_kv_cache_shape(

@@ -127,7 +127,7 @@ class AscendAttentionBackend(AttentionBackend):
         value_caches[dst_indices] = value_caches[src_indices]
 
     @staticmethod
-    def get_supported_block_size() -> list[int]:
+    def get_supported_kernel_block_sizes() -> list[int]:
         return [128]
 
 

@@ -227,7 +227,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
         self.compilation_config = vllm_config.compilation_config
         self.device = device
         self.max_num_blocks_per_req = cdiv(
-            self.model_config.max_model_len, AscendAttentionBackend.get_supported_block_size()[0]
+            self.model_config.max_model_len, AscendAttentionBackend.get_supported_kernel_block_sizes()[0]
         )
 
         self.speculative_config = vllm_config.speculative_config

@@ -89,6 +89,10 @@ class AscendMLABackend(AttentionBackend):
             return AscendMlaCPImpl
         return AscendMLAImpl
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int]:
+        return [128]
+
 
 @dataclass
 class ChunkedContextMetadata:

@@ -78,6 +78,10 @@ class AscendSFABackend(AttentionBackend):
     def get_impl_cls() -> type["AscendSFAImpl"]:
         return AscendSFAImpl
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int]:
+        return [128]
+
 
 @dataclass
 class DSACPContext:

@@ -2485,8 +2485,8 @@ class NPUModelRunner(GPUModelRunner):
             # the min of all `num_blocks`. Verify it here.
             assert num_blocks >= kv_cache_config.num_blocks
 
-            if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks:
-                block_size = attn_backend.get_supported_block_size()[0]
+            if hasattr(attn_backend, "get_supported_kernel_block_sizes") and self.use_hybrid_blocks:
+                block_size = attn_backend.get_supported_kernel_block_sizes()[0]
 
                 block_size_chunk = kv_cache_spec.block_size // block_size
                 kv_cache_shape = attn_backend.get_kv_cache_shape(

@@ -2600,7 +2600,7 @@ class NPUModelRunner(GPUModelRunner):
         if attn_groups and self.use_hybrid_blocks:
             # Use the backend's supported block size list
             backend = attn_groups[0].backend
-            supported_sizes = backend.get_supported_block_size()
+            supported_sizes = backend.get_supported_kernel_block_sizes()
             # If no specific sizes supported, use cache config
             # block_size
             kernel_block_size_list = supported_sizes if supported_sizes else [self.cache_config.block_size]