[bugfix] Fixing KV Pool Memory Retention and Performance Degradation Issues (#5751)

### What this PR does / why we need it?
1. Fixed memory retention on certain GPUs caused by missing PUT operations.
2. Fixed performance degradation resulting from architectural incompatibilities in the underlying refactor.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef

---------

Signed-off-by: fems14 <1804143737@qq.com>
2026-01-09 17:46:23 +08:00
parent 3ba064f804
commit ff4c1a47b3
6 changed files with 27 additions and 22 deletions
--- a/vllm_ascend/distributed/kvpool/backend/mooncake_backend.py
+++ b/vllm_ascend/distributed/kvpool/backend/mooncake_backend.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from typing import Union

 # Third Party
+from mooncake.store import ReplicateConfig  # type: ignore
 from vllm.config import ParallelConfig
 from vllm.logger import logger
 from vllm.utils.network_utils import get_ip
@@ -56,7 +57,11 @@ class MooncakeBackend(Backend):
    def put(self, keys: list[str], addrs: list[list[int]],
            sizes: list[list[int]]):
        try:
-            res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes)
+            config = ReplicateConfig()
+            config.preferred_segment = self.local_seg
+            config.prefer_alloc_in_same_node = True
+            res = self.store.batch_put_from_multi_buffers(
+                keys, addrs, sizes, config)
            for value in res:
                if value < 0:
                    logger.error(f"Failed to put key {keys},res:{res}")
@@ -66,7 +71,8 @@ class MooncakeBackend(Backend):
    def get(self, keys: list[str], addrs: list[list[int]],
            sizes: list[list[int]]):
        try:
-            res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes)
+            res = self.store.batch_get_into_multi_buffers(
+                keys, addrs, sizes, True)
            for value in res:
                if value < 0:
                    logger.error(f"Failed to get key {keys}, res:{res}")