[bugfix] Fixing KV Pool Memory Retention and Performance Degradation Issues (#5751)
### What this PR does / why we need it?
1. Fixed memory retention on certain GPUs caused by missing PUT
   operations.
2. Fixed performance degradation resulting from architectural
   incompatibilities in the underlying refactor.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef
---------
Signed-off-by: fems14 <1804143737@qq.com>
This commit is contained in:
@@ -6,6 +6,7 @@ from dataclasses import dataclass
|
||||
from typing import Union
|
||||
|
||||
# Third Party
|
||||
from mooncake.store import ReplicateConfig # type: ignore
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.logger import logger
|
||||
from vllm.utils.network_utils import get_ip
|
||||
@@ -56,7 +57,11 @@ class MooncakeBackend(Backend):
|
||||
def put(self, keys: list[str], addrs: list[list[int]],
|
||||
sizes: list[list[int]]):
|
||||
try:
|
||||
res = self.store.batch_put_from_multi_buffers(keys, addrs, sizes)
|
||||
config = ReplicateConfig()
|
||||
config.preferred_segment = self.local_seg
|
||||
config.prefer_alloc_in_same_node = True
|
||||
res = self.store.batch_put_from_multi_buffers(
|
||||
keys, addrs, sizes, config)
|
||||
for value in res:
|
||||
if value < 0:
|
||||
logger.error(f"Failed to put key {keys},res:{res}")
|
||||
@@ -66,7 +71,8 @@ class MooncakeBackend(Backend):
|
||||
def get(self, keys: list[str], addrs: list[list[int]],
|
||||
sizes: list[list[int]]):
|
||||
try:
|
||||
res = self.store.batch_get_into_multi_buffers(keys, addrs, sizes)
|
||||
res = self.store.batch_get_into_multi_buffers(
|
||||
keys, addrs, sizes, True)
|
||||
for value in res:
|
||||
if value < 0:
|
||||
logger.error(f"Failed to get key {keys}, res:{res}")
|
||||
|
||||
Reference in New Issue
Block a user