[Platform][BugFix] Preserve hybrid block size on Ascend (#7528)
### What this PR does / why we need it
This PR fixes a startup regression for Ascend hybrid attention + mamba
models after upgrading to vLLM `0.18.0`.
After the vLLM `0.18.0` upgrade, worker initialization still calls the
generic platform hook, whose upstream block-size fallback does not preserve
the block size already computed for hybrid models:
- `current_platform.update_block_size_for_backend(vllm_config)`
### How this PR fixes it
This PR keeps the fix strictly inside `vllm-ascend`.
It adds an Ascend override for
`NPUPlatform.update_block_size_for_backend()`:
- for hybrid models, do not run the generic upstream block-size fallback
- preserve the block size that was already computed by the hybrid
model-specific config logic
- for non-hybrid models, keep the original upstream behavior unchanged
- vLLM version: v0.18.0
- vLLM main: 8b6325758c
---------
Signed-off-by: maoxx241 <maomaoyu870@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -307,6 +307,26 @@ class TestNPUPlatform(TestBase):
|
||||
|
||||
self.assertEqual(vllm_config.cache_config.block_size, 128)
|
||||
|
||||
def test_update_block_size_for_backend_preserves_hybrid_block_size(self):
    """Check the block size of a hybrid config is unchanged by the hook.

    The config is marked hybrid with a block size of 1024 that is not
    flagged as user-specified; after calling
    ``update_block_size_for_backend`` the block size must still be 1024.
    """
    expected_block_size = 1024

    config = TestNPUPlatform.mock_vllm_config()
    config.model_config.is_hybrid = True
    cache_config = config.cache_config
    cache_config.block_size = expected_block_size
    cache_config.user_specified_block_size = False

    self.platform.update_block_size_for_backend(config)

    # Re-read through the top-level config so a replaced cache_config
    # object would also be detected.
    self.assertEqual(config.cache_config.block_size, expected_block_size)
|
||||
|
||||
def test_update_block_size_for_backend_preserves_user_block_size(self):
    """Check a user-specified block size is unchanged by the hook.

    The config is marked non-hybrid with a block size of 512 flagged as
    user-specified; after calling ``update_block_size_for_backend`` the
    block size must still be 512.
    """
    expected_block_size = 512

    config = TestNPUPlatform.mock_vllm_config()
    config.model_config.is_hybrid = False
    cache_config = config.cache_config
    cache_config.block_size = expected_block_size
    cache_config.user_specified_block_size = True

    self.platform.update_block_size_for_backend(config)

    # Re-read through the top-level config so a replaced cache_config
    # object would also be detected.
    self.assertEqual(config.cache_config.block_size, expected_block_size)
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
|
||||
Reference in New Issue
Block a user