[P/D]Mooncake Layerwise Connector supports hybrid attention manager with multiple kvcache groups (#7022)

### What this PR does / why we need it?
The Mooncake layerwise connector now supports the hybrid attention manager
with multiple KV-cache groups.

### Does this PR introduce _any_ user-facing change?
Yes.

### How was this patch tested?
By CI.

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
zxr2333
2026-03-10 23:59:20 +08:00
committed by GitHub
parent 0f289fa2a8
commit 239683c7a6
6 changed files with 565 additions and 224 deletions

View File

@@ -29,6 +29,7 @@ from vllm.v1.attention.backend import AttentionMetadata # type: ignore
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from vllm.v1.attention.backends.utils import PAD_SLOT_ID
from vllm_ascend.attention.utils import maybe_save_kv_layer_to_connector
from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import fused_qkvzba_split_reshape_cat
from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch
from vllm_ascend.utils import enable_sp
@@ -85,6 +86,7 @@ class AscendQwen3Next_GatedDeltaNet(Qwen3NextGatedDeltaNet):
# ============================================================
# Part 3: Output Projection
# ============================================================
maybe_save_kv_layer_to_connector("", [])
z_shape_og = z.shape
# Reshape input data into 2D tensor
core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])