From 900086fdc6537b197e1ad4dd6abae1e6927f9682 Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Wed, 29 Oct 2025 14:18:52 +0800
Subject: [PATCH] [HybridKV][Bugfix] Fix Hybrid kvcache sharing bug in same
 attention type (#3760)

### What this PR does / why we need it?
Part of https://github.com/vllm-project/vllm-ascend/pull/3106

Fix a hybrid kvcache sharing bug between layers of the same attention type.
This PR changes the `shared_by` logic so that layers with the same attention
spec share a single buffer instead of each allocating its own HBM. After this
PR, kvcache memory usage on Qwen3-Next drops by 50% compared with before
(with `self_attn:linear_attn = 1:3` in an `attn_group`), and
`gpu_memory_utilization` can be raised to `0.8` when running Qwen3-Next with
tp4 on A2 (64G per card).
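To make the sharing scheme concrete, here is a minimal, hypothetical sketch of
the fixed allocation loop. The names `KVCacheTensor` and `allocate_kv_buffers`
are illustrative stand-ins, not the real API; the actual logic lives in
`NPUModelRunner` in `vllm_ascend/worker/model_runner_v1.py` and additionally
handles the aligned-allocation path used when `kv_transfer_config` is set:

```python
from dataclasses import dataclass, field

import torch


@dataclass
class KVCacheTensor:
    # hypothetical stand-in for the kv cache tensor spec: one buffer of
    # `size` bytes shared by every layer listed in `shared_by`
    size: int
    shared_by: list = field(default_factory=list)


def allocate_kv_buffers(kv_cache_tensors, device="cpu"):
    """Hypothetical sketch of the fixed allocation loop.

    One buffer is allocated per KVCacheTensor and handed to every layer of
    the same attention type in `shared_by`, instead of allocating a fresh
    buffer for each layer as the old code did.
    """
    kv_cache_raw_tensors = {}
    for kv_cache_tensor in kv_cache_tensors:
        for layer_name in kv_cache_tensor.shared_by:
            if layer_name in kv_cache_raw_tensors:
                continue  # this layer already got a buffer
            if "linear_attn" in layer_name:
                # mamba/linear attention: one flat int8 buffer, shared by
                # all linear_attn layers in this group
                tensor = torch.zeros(kv_cache_tensor.size,
                                     dtype=torch.int8,
                                     device=device)
                for inner in kv_cache_tensor.shared_by:
                    if "linear_attn" in inner:
                        kv_cache_raw_tensors[inner] = tensor
            elif "attn" in layer_name:
                # self/sliding-window attention: split the budget into K and
                # V halves, shared by all non-linear attn layers in the group
                half = kv_cache_tensor.size // 2
                k_tensor = torch.zeros(half, dtype=torch.int8, device=device)
                v_tensor = torch.zeros(half, dtype=torch.int8, device=device)
                for inner in kv_cache_tensor.shared_by:
                    if "attn" in inner and "linear_attn" not in inner:
                        kv_cache_raw_tensors[inner] = (k_tensor, v_tensor)
    return kv_cache_raw_tensors


if __name__ == "__main__":
    # hypothetical group mixing one self_attn layer with three linear_attn
    # layers (the 1:3 ratio mentioned above)
    group = KVCacheTensor(size=1024,
                          shared_by=[
                              "layers.0.self_attn",
                              "layers.1.linear_attn",
                              "layers.2.linear_attn",
                              "layers.3.linear_attn",
                          ])
    buffers = allocate_kv_buffers([group])
    # all three linear_attn layers point at the same underlying tensor
    assert buffers["layers.1.linear_attn"] is buffers["layers.2.linear_attn"]
```

The key change is that the old code allocated a fresh buffer for each layer
name it visited, so layers listed in the same `shared_by` group never actually
shared memory; allocating once per group and per attention type is where the
reported 50% kvcache saving on Qwen3-Next comes from.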
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Tests pass with the latest e2e test case on qwen3-next.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/c9461e05a4ed3557cfbf4b15ded1e26761cc39ca

---------

Signed-off-by: MengqingCao
---
 tests/e2e/multicard/test_qwen3_next.py |  4 +--
 vllm_ascend/worker/model_runner_v1.py  | 42 +++++++++++++++-----------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/tests/e2e/multicard/test_qwen3_next.py b/tests/e2e/multicard/test_qwen3_next.py
index fe246d19..2f54b5b2 100644
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -27,12 +27,12 @@ from tests.e2e.conftest import VllmRunner
 def test_models_distributed_Qwen3_NEXT_TP4():
     example_prompts = [
         "Hello, my name is",
-    ]
+    ] * 4
     max_tokens = 5
     with VllmRunner("Qwen/Qwen3-Next-80B-A3B-Instruct",
                     tensor_parallel_size=4,
                     max_model_len=4096,
-                    gpu_memory_utilization=0.7,
+                    gpu_memory_utilization=0.8,
                     distributed_executor_backend="mp",
                     enforce_eager=True) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 3b1226df..e9886bbc 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -3225,25 +3225,26 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # TODO: REFACTOR ME to sharing hybrid cache
             for idx in range(len(kv_cache_tensor.shared_by)):
                 layer_name = kv_cache_tensor.shared_by[idx]
-                if "linear_attn" in layer_name:
+                if "linear_attn" in layer_name and layer_name not in kv_cache_raw_tensors.keys(
+                ):
                     # for mamba linear attention
+                    if self.vllm_config.kv_transfer_config is None:
+                        tensor = torch.zeros(kv_cache_tensor.size,
+                                             dtype=torch.int8,
+                                             device=self.device)
+                    else:
+                        cache_size_aligned = kv_cache_tensor.size + alignment
+                        tensor = torch.zeros(cache_size_aligned,
+                                             dtype=torch.int8,
+                                             device=self.device)
+                        tensor = self._align_memory(
+                            tensor, alignment)[:kv_cache_tensor.size]
                     for layer_name_inner in kv_cache_tensor.shared_by:
-                        if ("attn" in layer_name_inner and "linear_attn" not in layer_name_inner) or \
-                            layer_name_inner in kv_cache_raw_tensors.keys():
-                            continue
-                        if self.vllm_config.kv_transfer_config is None:
-                            tensor = torch.zeros(kv_cache_tensor.size,
-                                                 dtype=torch.int8,
-                                                 device=self.device)
-                        else:
-                            cache_size_aligned = kv_cache_tensor.size + alignment
-                            tensor = torch.zeros(cache_size_aligned,
-                                                 dtype=torch.int8,
-                                                 device=self.device)
-                            tensor = self._align_memory(
-                                tensor, alignment)[:kv_cache_tensor.size]
-                        kv_cache_raw_tensors[layer_name_inner] = tensor
-                elif "attn" in layer_name:
+                        # share the kvcache between the linear_attn specs in the same group
+                        if "linear_attn" in layer_name_inner:
+                            kv_cache_raw_tensors[layer_name_inner] = tensor
+                elif "attn" in layer_name and layer_name not in kv_cache_raw_tensors.keys(
+                ):
+                    # for other attentions, e.g., self_attn, sliding window attn
                     if self.vllm_config.kv_transfer_config is None:
                         k_tensor = torch.zeros(kv_cache_tensor.size // 2,
@@ -3265,7 +3266,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                                      alignment)[:cache_size]
                         v_tensor = self._align_memory(v_tensor,
                                                       alignment)[:cache_size]
-                    kv_cache_raw_tensors[layer_name] = (k_tensor, v_tensor)
+                    for layer_name_inner in kv_cache_tensor.shared_by:
+                        # share the kvcache between the self_attn specs in the same group
+                        if ("attn" in layer_name_inner
+                                and "linear_attn" not in layer_name_inner):
+                            kv_cache_raw_tensors[layer_name_inner] = (k_tensor,
+                                                                      v_tensor)

         layer_names = set()
         for group in kv_cache_config.kv_cache_groups: