diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 6181631d..7a1f79a4 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -127,7 +127,7 @@ class AscendAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_supported_block_size() -> list[int]:
-        return [64]
+        return [128]
 
 
 class AscendAttentionState(Enum):
diff --git a/vllm_ascend/patch/platform/patch_mamba_config.py b/vllm_ascend/patch/platform/patch_mamba_config.py
index ad083f51..1b077b41 100644
--- a/vllm_ascend/patch/platform/patch_mamba_config.py
+++ b/vllm_ascend/patch/platform/patch_mamba_config.py
@@ -58,7 +58,7 @@ def verify_and_update_config(cls, vllm_config) -> None:
             block_size=model_config.max_model_len,
         ).page_size_bytes
 
-        block_alignment_bytes = 64
+        block_alignment_bytes = 128
 
         # some attention backends (e.g. FA) only support setting
         # block size to multiple of 16, so let's suggest a value
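
Note: both hunks raise the same constant from 64 to 128, keeping the attention backend's supported block size and the byte alignment used when sizing mamba state pages consistent with each other. Below is a minimal sketch of the round-up-to-a-multiple arithmetic that alignment constants like block_alignment_bytes feed into; the helper name round_up is hypothetical and not the vllm_ascend implementation.

def round_up(x: int, multiple: int) -> int:
    # Round x up to the nearest multiple of `multiple`
    # (hypothetical helper for illustration only).
    return ((x + multiple - 1) // multiple) * multiple

# Illustration: a 1000-byte page rounded up to a 128-byte
# alignment occupies 1024 bytes (8 * 128).
assert round_up(1000, 128) == 1024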