[Refactor] AttentionBuilder inherit from base class in vllm (#5916)
### What this PR does / why we need it?

This PR makes `AscendMLAMetadataBuilder` and `AscendSFAMetadataBuilder` properly inherit from the base class `MLACommonMetadataBuilder` in vllm by adding `super().__init__()` calls.

**Changes:**
- Add `super().__init__()` call in `AscendMLAMetadataBuilder.__init__()`
- Add `super().__init__()` call in `AscendSFAMetadataBuilder.__init__()`
- Extract `ascend_chunked_prefill_workspace_size()` to `vllm_ascend/attention/utils.py` to avoid code duplication
- Override `determine_chunked_prefill_workspace_size()` to support the Ascend-specific 128k-token workspace size (vs 64k in the parent class)
- Update unit tests to mock the parent class `__init__` for proper isolation

**Why we need it:**
- Follow proper Python inheritance patterns by calling `super().__init__()`
- Reduce code duplication by reusing the parent class initialization logic
- Better maintainability, as parent class changes are automatically inherited

Part of issue #5463 item 10

### Does this PR introduce _any_ user-facing change?

No, this is an internal refactoring that does not change any user-facing behavior.

Signed-off-by: lico67373 <918688502@qq.com>
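For readers skimming the diff, here is a minimal sketch of the resulting builder shape. It is illustrative only: the parent `__init__` arguments are forwarded generically with `*args`/`**kwargs`, and the module path of `MLACommonMetadataBuilder` and the exact signature of `determine_chunked_prefill_workspace_size()` are assumptions based on upstream vllm, not copied from this PR.

```python
# Illustrative sketch only -- not the literal diff. It shows the two ideas in
# this PR: delegate shared setup to the parent via super().__init__(), and
# override the workspace-size hook with the extracted Ascend helper.
from vllm.config import VllmConfig
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder

from vllm_ascend.attention.utils import ascend_chunked_prefill_workspace_size


class AscendMLAMetadataBuilder(MLACommonMetadataBuilder):

    def __init__(self, *args, **kwargs) -> None:
        # Reuse the parent initialization instead of duplicating it.
        super().__init__(*args, **kwargs)
        # ... Ascend-specific initialization continues here ...

    def determine_chunked_prefill_workspace_size(
            self, vllm_config: VllmConfig) -> int:
        # Ascend allows a 128k-token chunked-prefill workspace,
        # versus 64k in the parent class.
        return ascend_chunked_prefill_workspace_size(vllm_config)
```

Overriding this single hook keeps the Ascend-specific 128k cap in one place while the rest of the metadata-builder setup comes from the parent class.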
@@ -12,6 +12,34 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm_ascend.utils import AscendDeviceType, get_ascend_config, get_ascend_device_type


def ascend_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
    scheduler_config = vllm_config.scheduler_config
    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config

    chunked_prefill_workspace_size = min(
        # Make sure there is enough for 8 full length request or at least
        # 4 pages of cache per request
        max(8 * model_config.max_model_len, 4 * scheduler_config.max_num_seqs * cache_config.block_size),
        # For long-context models try not to over-allocate limiting
        # kv-cache space, limiting it to 128k tokens,
        # which would result in the workspace being:
        #   2*(576)*(128*1024) = 288mb
        # (assuming 576 MLA head dim, and fp16)
        # which would result in up-projected context being
        #   2*(192*128)*(128*1024) = 6gb
        # (assuming 192 QK head dim, 128 heads, and fp16)
        128 * 1024,
    )

    chunked_prefill_workspace_size = max(
        chunked_prefill_workspace_size,
        scheduler_config.max_num_seqs * cache_config.block_size,
    )

    return chunked_prefill_workspace_size


def using_paged_attention(runtime_shape: int, vllm_config: VllmConfig) -> bool:
    if vllm_config.speculative_config is not None:
        return False
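The description also mentions updating the unit tests to mock the parent class `__init__` for isolation. The snippet below is a self-contained sketch of that mocking pattern using toy stand-in classes, not the real vllm-ascend test code: patch the parent's `__init__`, construct the subclass, and assert that the delegation happened.

```python
# Self-contained sketch of the test-isolation pattern: patch the parent
# class __init__ and assert the subclass delegates to it. Toy classes stand
# in for MLACommonMetadataBuilder / AscendMLAMetadataBuilder.
from unittest import mock


class ParentBuilder:
    def __init__(self, vllm_config):
        # In the real code this performs heavy setup using vllm_config.
        self.vllm_config = vllm_config


class ChildBuilder(ParentBuilder):
    def __init__(self, vllm_config):
        super().__init__(vllm_config)  # the call this PR adds
        self.extra_state = "ascend-specific setup"


def test_child_calls_parent_init():
    with mock.patch.object(ParentBuilder, "__init__",
                           return_value=None) as parent_init:
        builder = ChildBuilder(vllm_config=mock.MagicMock())
        parent_init.assert_called_once()
        assert builder.extra_state == "ascend-specific setup"
```

Because `super().__init__()` resolves the parent method at call time, patching `ParentBuilder.__init__` with `mock.patch.object` is enough to construct the subclass without running the parent's heavy setup.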