[P/D] Mooncake Layerwise Connector supports hybrid attention manager with multiple kvcache groups (#7022)
### What this PR does / why we need it?
Enable the Mooncake Layerwise Connector to work with the hybrid attention manager, which splits the KV cache into multiple groups. Concretely, the forced 128-token block size in `refresh_block_size` is now skipped for any model whose type contains `qwen3_next` or `qwen3_5` (previously only an exact `qwen3_next` match was exempt); a sketch of the group-aware transfer idea follows below.
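To make the change concrete, here is a minimal, self-contained sketch (not the actual vllm-ascend API; all names here are hypothetical) of why a layerwise connector must become group-aware: with hybrid attention, layers belong to different KV cache groups, each with its own block layout, so per-layer transfers need a per-group lookup.

```python
# Hypothetical sketch of group-aware layerwise KV transfer planning.
from dataclasses import dataclass


@dataclass
class KVCacheGroup:
    group_id: int
    layer_names: list[str]   # layers that share this cache layout
    block_size: int          # tokens per KV block for this group


def plan_layerwise_transfer(groups: list[KVCacheGroup],
                            request_blocks: dict[int, list[int]]):
    """Yield (layer_name, group_id, block_ids) in layer order.

    `request_blocks` maps group_id -> block ids allocated for the request.
    With a single group the connector could assume one block table; a
    hybrid model needs the per-group lookup done here.
    """
    for group in groups:
        block_ids = request_blocks[group.group_id]
        for layer_name in group.layer_names:
            yield layer_name, group.group_id, block_ids


# Hypothetical hybrid layout: full-attention layers in group 0,
# linear-attention layers in group 1.
groups = [
    KVCacheGroup(0, ["layers.0.attn", "layers.2.attn"], block_size=128),
    KVCacheGroup(1, ["layers.1.linear_attn"], block_size=64),
]
for layer, gid, blocks in plan_layerwise_transfer(groups, {0: [7, 9], 1: [3]}):
    print(layer, gid, blocks)
```

With a single KV cache group, the loop degenerates to the old behavior: every layer resolves to the same block list.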
### Does this PR introduce _any_ user-facing change?
Yes.
### How was this patch tested?
By CI.
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
```diff
@@ -1073,7 +1073,11 @@ def refresh_block_size(vllm_config):
         return

     # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
-    if model_config.hf_text_config.model_type != "qwen3_next" and cache_config.block_size != 128:
+    if (
+        "qwen3_next" not in model_config.hf_text_config.model_type
+        and "qwen3_5" not in model_config.hf_text_config.model_type
+        and cache_config.block_size != 128
+    ):
         if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
             logger.info("Block size is set to 128 if prefix cache or chunked prefill is enabled.")
             cache_config.block_size = 128
```
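For reference, the post-patch check can be read as the following standalone sketch (the vLLM config objects are flattened into plain arguments; the exemption presumably exists because the hybrid-attention model families manage block sizes per KV cache group, as the TODO about `get_kv_cache_groups` suggests):

```python
# Standalone rendering of the patched check; not the real function signature.
def refresh_block_size(model_type: str, block_size: int,
                       enable_prefix_caching: bool,
                       enable_chunked_prefill: bool) -> int:
    # Hybrid families (qwen3_next / qwen3_5) are exempt from the forced
    # 128-token block size; substring matching covers type variants.
    if ("qwen3_next" not in model_type
            and "qwen3_5" not in model_type
            and block_size != 128):
        if enable_prefix_caching or enable_chunked_prefill:
            # Mirrors the logger.info call in the patch.
            print("Block size is set to 128 if prefix cache or "
                  "chunked prefill is enabled.")
            return 128
    return block_size


assert refresh_block_size("qwen2", 64, True, False) == 128
assert refresh_block_size("qwen3_next", 64, True, False) == 64
```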