[bugfix] Fixing KV Pool Memory Retention and Performance Degradation Issues (#5751)

### What this PR does / why we need it?
1. Fixed memory retention on certain GPUs caused by missing PUT
operations.

2. Fixed performance degradation resulting from architectural
incompatibilities in the underlying refactor.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: fems14 <1804143737@qq.com>
This commit is contained in:
fems14
2026-01-09 17:46:23 +08:00
committed by GitHub
parent 3ba064f804
commit ff4c1a47b3
6 changed files with 27 additions and 22 deletions

View File

@@ -6,6 +6,7 @@ from dataclasses import dataclass
from typing import Union
# Third Party
from mooncake.store import ReplicateConfig # type: ignore
from vllm.config import ParallelConfig
from vllm.logger import logger
from vllm.utils.network_utils import get_ip
@@ -56,7 +57,11 @@ class MooncakeBackend(Backend):
def put(self, keys: list[str], addrs: list[list[int]],
sizes: list[list[int]]):
try:
res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes)
config = ReplicateConfig()
config.preferred_segment = self.local_seg
config.prefer_alloc_in_same_node = True
res = self.store.batch_put_from_multi_buffers(
keys, addrs, sizes, config)
for value in res:
if value < 0:
logger.error(f"Failed to put key {keys},res:{res}")
@@ -66,7 +71,8 @@ class MooncakeBackend(Backend):
def get(self, keys: list[str], addrs: list[list[int]],
sizes: list[list[int]]):
try:
res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes)
res = self.store.batch_get_into_multi_buffers(
keys, addrs, sizes, True)
for value in res:
if value < 0:
logger.error(f"Failed to get key {keys}, res:{res}")