[P/D] Mooncake Layerwise Connector supports hybrid attention manager with multiple kvcache groups (#7022)
### What this PR does / why we need it?
Enable the Mooncake Layerwise Connector to work with the hybrid attention manager, which splits the KV cache into multiple groups. Concretely, the forced 128-token block size in `refresh_block_size` is now skipped for any model whose type contains `qwen3_next` or `qwen3_5` (previously only an exact `qwen3_next` match was exempt); a sketch of the group-aware transfer idea follows below.
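To make the change concrete, here is a minimal, self-contained sketch (not the actual vllm-ascend API; all names here are hypothetical) of why a layerwise connector must become group-aware: with hybrid attention, layers belong to different KV cache groups, each with its own block layout, so per-layer transfers need a per-group lookup.

```python
# Hypothetical sketch of group-aware layerwise KV transfer planning.
from dataclasses import dataclass


@dataclass
class KVCacheGroup:
    group_id: int
    layer_names: list[str]   # layers that share this cache layout
    block_size: int          # tokens per KV block for this group


def plan_layerwise_transfer(groups: list[KVCacheGroup],
                            request_blocks: dict[int, list[int]]):
    """Yield (layer_name, group_id, block_ids) in layer order.

    `request_blocks` maps group_id -> block ids allocated for the request.
    With a single group the connector could assume one block table; a
    hybrid model needs the per-group lookup done here.
    """
    for group in groups:
        block_ids = request_blocks[group.group_id]
        for layer_name in group.layer_names:
            yield layer_name, group.group_id, block_ids


# Hypothetical hybrid layout: full-attention layers in group 0,
# linear-attention layers in group 1.
groups = [
    KVCacheGroup(0, ["layers.0.attn", "layers.2.attn"], block_size=128),
    KVCacheGroup(1, ["layers.1.linear_attn"], block_size=64),
]
for layer, gid, blocks in plan_layerwise_transfer(groups, {0: [7, 9], 1: [3]}):
    print(layer, gid, blocks)
```

With a single KV cache group, the loop degenerates to the old behavior: every layer resolves to the same block list.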
### Does this PR introduce _any_ user-facing change?
Yes.
### How was this patch tested?
By CI.
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
```diff
@@ -1073,7 +1073,11 @@ def refresh_block_size(vllm_config):
         return

     # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
-    if model_config.hf_text_config.model_type != "qwen3_next" and cache_config.block_size != 128:
+    if (
+        "qwen3_next" not in model_config.hf_text_config.model_type
+        and "qwen3_5" not in model_config.hf_text_config.model_type
+        and cache_config.block_size != 128
+    ):
         if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
             logger.info("Block size is set to 128 if prefix cache or chunked prefill is enabled.")
             cache_config.block_size = 128
```
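For reference, the post-patch check can be read as the following standalone sketch (the vLLM config objects are flattened into plain arguments; the exemption presumably exists because the hybrid-attention model families manage block sizes per KV cache group, as the TODO about `get_kv_cache_groups` suggests):

```python
# Standalone rendering of the patched check; not the real function signature.
def refresh_block_size(model_type: str, block_size: int,
                       enable_prefix_caching: bool,
                       enable_chunked_prefill: bool) -> int:
    # Hybrid families (qwen3_next / qwen3_5) are exempt from the forced
    # 128-token block size; substring matching covers type variants.
    if ("qwen3_next" not in model_type
            and "qwen3_5" not in model_type
            and block_size != 128):
        if enable_prefix_caching or enable_chunked_prefill:
            # Mirrors the logger.info call in the patch.
            print("Block size is set to 128 if prefix cache or "
                  "chunked prefill is enabled.")
            return 128
    return block_size


assert refresh_block_size("qwen2", 64, True, False) == 128
assert refresh_block_size("qwen3_next", 64, True, False) == 64
```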