[Refactor] AttentionBuilder inherit from base class in vllm (#5916)
### What this PR does / why we need it?

This PR makes `AscendMLAMetadataBuilder` and `AscendSFAMetadataBuilder` properly inherit from the base class `MLACommonMetadataBuilder` in vllm by adding `super().__init__()` calls.

**Changes:**
- Add `super().__init__()` call in `AscendMLAMetadataBuilder.__init__()`
- Add `super().__init__()` call in `AscendSFAMetadataBuilder.__init__()`
- Extract `ascend_chunked_prefill_workspace_size()` to `vllm_ascend/attention/utils.py` to avoid code duplication
- Override `determine_chunked_prefill_workspace_size()` to support the Ascend-specific 128k-token workspace size (vs 64k in the parent class)
- Update unit tests to mock the parent class `__init__` for proper isolation

**Why we need it:**
- Follow proper Python inheritance patterns by calling `super().__init__()`
- Reduce code duplication by reusing the parent class initialization logic
- Better maintainability, as parent class changes are automatically inherited

Part of issue #5463 item 10

### Does this PR introduce _any_ user-facing change?

No, this is an internal refactoring that does not change any user-facing behavior.

Signed-off-by: lico67373 <918688502@qq.com>
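For readers skimming the diff, here is a minimal sketch of the resulting builder shape. It is illustrative only: the parent `__init__` arguments are forwarded generically with `*args`/`**kwargs`, and the module path of `MLACommonMetadataBuilder` and the exact signature of `determine_chunked_prefill_workspace_size()` are assumptions based on upstream vllm, not copied from this PR.

```python
# Illustrative sketch only -- not the literal diff. It shows the two ideas in
# this PR: delegate shared setup to the parent via super().__init__(), and
# override the workspace-size hook with the extracted Ascend helper.
from vllm.config import VllmConfig
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder

from vllm_ascend.attention.utils import ascend_chunked_prefill_workspace_size


class AscendMLAMetadataBuilder(MLACommonMetadataBuilder):

    def __init__(self, *args, **kwargs) -> None:
        # Reuse the parent initialization instead of duplicating it.
        super().__init__(*args, **kwargs)
        # ... Ascend-specific initialization continues here ...

    def determine_chunked_prefill_workspace_size(
            self, vllm_config: VllmConfig) -> int:
        # Ascend allows a 128k-token chunked-prefill workspace,
        # versus 64k in the parent class.
        return ascend_chunked_prefill_workspace_size(vllm_config)
```

Overriding this single hook keeps the Ascend-specific 128k cap in one place while the rest of the metadata-builder setup comes from the parent class.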
@@ -12,6 +12,34 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm_ascend.utils import AscendDeviceType, get_ascend_config, get_ascend_device_type


def ascend_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
    scheduler_config = vllm_config.scheduler_config
    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config

    chunked_prefill_workspace_size = min(
        # Make sure there is enough for 8 full length request or at least
        # 4 pages of cache per request
        max(8 * model_config.max_model_len, 4 * scheduler_config.max_num_seqs * cache_config.block_size),
        # For long-context models try not to over-allocate limiting
        # kv-cache space, limiting it to 128k tokens,
        # which would result in the workspace being:
        #   2*(576)*(128*1024) = 288mb
        # (assuming 576 MLA head dim, and fp16)
        # which would result in up-projected context being
        #   2*(192*128)*(128*1024) = 6gb
        # (assuming 192 QK head dim, 128 heads, and fp16)
        128 * 1024,
    )

    chunked_prefill_workspace_size = max(
        chunked_prefill_workspace_size,
        scheduler_config.max_num_seqs * cache_config.block_size,
    )

    return chunked_prefill_workspace_size


def using_paged_attention(runtime_shape: int, vllm_config: VllmConfig) -> bool:
    if vllm_config.speculative_config is not None:
        return False
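The description also mentions updating the unit tests to mock the parent class `__init__` for isolation. The snippet below is a self-contained sketch of that mocking pattern using toy stand-in classes, not the real vllm-ascend test code: patch the parent's `__init__`, construct the subclass, and assert that the delegation happened.

```python
# Self-contained sketch of the test-isolation pattern: patch the parent
# class __init__ and assert the subclass delegates to it. Toy classes stand
# in for MLACommonMetadataBuilder / AscendMLAMetadataBuilder.
from unittest import mock


class ParentBuilder:
    def __init__(self, vllm_config):
        # In the real code this performs heavy setup using vllm_config.
        self.vllm_config = vllm_config


class ChildBuilder(ParentBuilder):
    def __init__(self, vllm_config):
        super().__init__(vllm_config)  # the call this PR adds
        self.extra_state = "ascend-specific setup"


def test_child_calls_parent_init():
    with mock.patch.object(ParentBuilder, "__init__",
                           return_value=None) as parent_init:
        builder = ChildBuilder(vllm_config=mock.MagicMock())
        parent_init.assert_called_once()
        assert builder.extra_state == "ascend-specific setup"
```

Because `super().__init__()` resolves the parent method at call time, patching `ParentBuilder.__init__` with `mock.patch.object` is enough to construct the subclass without running the parent's heavy setup.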