[Feature] LMCache Connector Integration (#9741)
Signed-off-by: Oasis-Git <ayw.sirius19@gmail.com>
Signed-off-by: YuhanLiu11 <yliu738@wisc.edu>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
@@ -656,6 +656,21 @@ class Scheduler(
                page_size=self.page_size,
                disable=server_args.disable_radix_cache,
            )
        elif server_args.enable_lmcache:
            from sglang.srt.mem_cache.storage.lmcache.lmc_radix_cache import (
                LMCRadixCache,
            )

            self.tree_cache = LMCRadixCache(
                req_to_token_pool=self.req_to_token_pool,
                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
                page_size=self.page_size,
                disable=server_args.disable_radix_cache,
                model_config=self.model_config,
                tp_size=self.tp_size,
                rank=self.tp_rank,
                tp_group=self.tp_group,
            )
        else:
            self.tree_cache = RadixCache(
                req_to_token_pool=self.req_to_token_pool,
@@ -1411,9 +1426,11 @@ class Scheduler(
        _, _, available_size, evictable_size = self._get_token_info()
        protected_size = self.tree_cache.protected_size()
        memory_leak = (available_size + evictable_size) != (
            # self.max_total_num_tokens
            # if not self.enable_hierarchical_cache
            # else self.max_total_num_tokens - protected_size
            self.max_total_num_tokens
            - protected_size
        )
        token_msg = f"{self.max_total_num_tokens=}, {available_size=}, {evictable_size=}, {protected_size=}\n"
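The check above asserts a simple accounting identity: every token slot is either available, evictable, or protected, and with the LMCache connector the protected slots are excluded unconditionally. A toy sketch of the arithmetic (all numbers are invented for illustration):

```python
# Hypothetical numbers, just to illustrate the leak check above.
max_total_num_tokens = 1000
protected_size = 100   # slots pinned, e.g. by in-flight LMCache stores
available_size = 650   # free slots in the allocator
evictable_size = 250   # slots held by unlocked radix-tree nodes

memory_leak = (available_size + evictable_size) != (
    max_total_num_tokens - protected_size
)
assert not memory_leak  # 650 + 250 == 1000 - 100
```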
@@ -369,7 +369,6 @@ class MHATokenToKVPool(KVCache):
        # same applies to get_value_buffer and get_kv_buffer
        if self.layer_transfer_counter is not None:
            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)

        return self._get_key_buffer(layer_id)

    def _get_value_buffer(self, layer_id: int):
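This hook is what drives the layer-wise loading in the connector introduced below: before the attention backend reads a layer's KV buffer, `wait_until` gives LMCache a chance to finish that layer's transfer. A minimal CPU-only sketch of the handshake, with a hypothetical connector standing in for `LMCacheLayerwiseConnector`:

```python
# Toy sketch of the per-layer handshake; both classes here are mocks.
class MockConnector:
    def __init__(self, num_layers: int):
        self.loaded = [False] * num_layers

    def load_kv_layerwise(self, layer_id: int) -> None:
        # Stands in for the async host->device copy of one layer's KV.
        self.loaded[layer_id] = True


class MockLayerTransferCounter:
    """Mimics the adapter the KV pool calls via `wait_until`."""

    def __init__(self, connector: MockConnector):
        self.connector = connector

    def wait_until(self, layer_id: int) -> None:
        # The real adapter synchronizes a CUDA load stream first.
        self.connector.load_kv_layerwise(layer_id)


connector = MockConnector(num_layers=4)
counter = MockLayerTransferCounter(connector)
for layer_id in range(4):      # what get_key_buffer does, layer by layer
    counter.wait_until(layer_id)
assert all(connector.loaded)
```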
python/sglang/srt/mem_cache/storage/lmcache/README.md (new file, 43 lines)
@@ -0,0 +1,43 @@
# LMCache Connector for SGLang

This document describes how to use LMCache as the KV cache management backend for the SGLang engine.
For more details about LMCache, please refer to: https://lmcache.ai

## Install LMCache

### Method 1: with pip

```bash
pip install lmcache
```

### Method 2: from source

Clone the LMCache project:

```bash
git clone https://github.com/LMCache/LMCache
```

Install:

```bash
cd LMCache
pip install -e . --no-build-isolation
```

## Use LMCache

First, set up the LMCache configuration. An example config is provided in `example_config.yaml`. For more settings, please refer to https://docs.lmcache.ai/api_reference/configurations.html.

Second, launch the SGLang serving engine with LMCache enabled:

```bash
export LMCACHE_USE_EXPERIMENTAL=True
export LMCACHE_CONFIG_FILE=example_config.yaml

python -m sglang.launch_server \
    --model-path MODEL \
    --enable-lmcache
```
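As a quick sanity check of a deployment like the one above, sending the same prompt twice should show the second request served largely from cache. A minimal sketch, assuming the server is listening on SGLang's default port 30000 and using its standard `/generate` endpoint:

```python
import time

import requests

# Send the same prompt twice; with LMCache enabled, the second request's
# prefill should hit the KV cache and return noticeably faster.
payload = {
    "text": "Explain the difference between paging and segmentation.",
    "sampling_params": {"temperature": 0.0, "max_new_tokens": 64},
}
for attempt in range(2):
    start = time.perf_counter()
    resp = requests.post("http://localhost:30000/generate", json=payload)
    resp.raise_for_status()
    print(f"attempt {attempt}: {time.perf_counter() - start:.3f}s")
```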
python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml (new file, 7 lines)
@@ -0,0 +1,7 @@
# Basic configurations
chunk_size: 256

# CPU offloading configurations
local_cpu: true
use_layerwise: true
max_local_cpu_size: 10  # maximum size of the CPU cache, in GB
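To size `max_local_cpu_size`, it helps to estimate how many tokens fit in the CPU cache. A back-of-the-envelope sketch, assuming an MHA/GQA KV layout like the buffers in the unit test below; the model dimensions are illustrative assumptions, not values read from the config:

```python
# Rough capacity estimate for the CPU cache configured above.
layer_num = 36        # hidden layers (assumed)
kv_head_num = 8       # KV heads under GQA (assumed)
head_dim = 128        # per-head dimension (assumed)
dtype_bytes = 2       # bf16

# K and V each store layer_num * kv_head_num * head_dim values per token.
bytes_per_token = 2 * layer_num * kv_head_num * head_dim * dtype_bytes
max_local_cpu_gb = 10  # matches max_local_cpu_size above

capacity_tokens = max_local_cpu_gb * (1 << 30) // bytes_per_token
print(f"{bytes_per_token} B/token -> ~{capacity_tokens} cacheable tokens")
# 147456 B/token -> ~72817 cacheable tokens
```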
python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py (new file, 280 lines)
@@ -0,0 +1,280 @@
from __future__ import annotations

import logging
import threading
from typing import TYPE_CHECKING, List, Optional

import torch

from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
from sglang.srt.mem_cache.base_prefix_cache import MatchResult
from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode

try:
    from lmcache.integration.sglang.sglang_adapter import (
        LMCacheLayerwiseConnector,
        LoadMetadata,
        StoreMetadata,
    )
except ImportError as e:
    raise RuntimeError(
        "LMCache is not installed. Please install it by running `pip install lmcache`"
    ) from e

if TYPE_CHECKING:
    from sglang.srt.configs.model_config import ModelConfig
    from sglang.srt.managers.schedule_batch import Req

logger = logging.getLogger(__name__)


class LayerTransferCounter:
    """Minimal adapter that lets the memory pool notify LMCache per layer.

    The KV pool calls `wait_until(layer_id)` after finishing a layer, which we
    translate into a `load_kv_layerwise(layer_id)` call on the LMCache
    connector within the provided CUDA stream.
    """

    def __init__(
        self,
        num_layers: int,
        load_stream: torch.cuda.Stream,
        lmc_connector: LMCacheLayerwiseConnector,
        printable: bool = False,
    ):
        self.num_layers = num_layers
        self.load_stream = load_stream
        self.lmc_connector = lmc_connector

    def wait_until(self, layer_id: int):
        # Ensure ordering of the async loads w.r.t. the compute stream(s).
        self.load_stream.synchronize()
        with self.load_stream:
            self.lmc_connector.load_kv_layerwise(layer_id)


class LMCRadixCache(RadixCache):
    """RadixCache + LMCache IO.

    This subclass adds:
    - LMCache connector setup (device/host buffers, TP rank/size)
    - Two CUDA streams for async load/store
    - Layer-wise transfer executor wiring to the KV cache
    - Overridden `match_prefix` to fetch missing prefix chunks from LMCache
    - Extended cache-finalization paths to store back into LMCache
    - An eviction barrier that respects any in-flight host->device stores
    """

    def __init__(
        self,
        req_to_token_pool: ReqToTokenPool,
        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
        page_size: int,
        disable: bool = False,
        enable_kv_cache_events: bool = False,
        model_config: Optional["ModelConfig"] = None,
        tp_size: int = 1,
        rank: int = 0,
        tp_group: Optional[torch.distributed.ProcessGroup] = None,
    ):
        super().__init__(
            req_to_token_pool=req_to_token_pool,
            token_to_kv_pool_allocator=token_to_kv_pool_allocator,
            page_size=page_size,
            disable=disable,
            enable_kv_cache_events=enable_kv_cache_events,
        )

        kvcache = self.token_to_kv_pool_allocator.get_kvcache()
        self.lmcache_connector = LMCacheLayerwiseConnector(
            sgl_config=model_config,
            tp_size=tp_size,
            rank=rank,
            # NOTE: The original implementation accessed private buffers via
            # `_kvcache.k_buffer` / `.v_buffer`. We prefer public accessors
            # when available and fall back to the private fields if needed.
            k_pool=getattr(
                kvcache,
                "k_buffer",
                getattr(self.token_to_kv_pool_allocator._kvcache, "k_buffer"),
            ),
            v_pool=getattr(
                kvcache,
                "v_buffer",
                getattr(self.token_to_kv_pool_allocator._kvcache, "v_buffer"),
            ),
            tp_group=tp_group,
        )

        self.load_stream = torch.cuda.Stream()
        self.store_stream = torch.cuda.Stream()

        self.layer_done_executor = LayerTransferCounter(
            num_layers=(
                model_config.num_hidden_layers if model_config is not None else 0
            ),
            load_stream=self.load_stream,
            lmc_connector=self.lmcache_connector,
        )
        kvcache.register_layer_transfer_counter(self.layer_done_executor)

        self._in_flight_nodes: list[TreeNode] = []
        self._node_lock = threading.Lock()

    def reset(self):  # type: ignore[override]
        super().reset()
        if hasattr(self, "_in_flight_nodes"):
            with self._node_lock:
                self._in_flight_nodes.clear()

    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:  # type: ignore[override]
        """Match the cached prefix; on a tail miss, prefetch from LMCache.

        Reuses the base matching logic to obtain (value, last_node). If a
        *page-aligned* uncached suffix remains and there is room (or room can
        be made by eviction), allocate token slots, trigger an async LMCache
        load into those slots, and materialize a new child node for the
        retrieved chunk.
        """
        if self.disable or not key:
            return super().match_prefix(key, **kwargs)

        if self.page_size != 1:
            aligned_len = len(key) // self.page_size * self.page_size
            key = key[:aligned_len]

        base_res = super().match_prefix(key, **kwargs)
        value: torch.Tensor = base_res.device_indices
        last_node: TreeNode = base_res.last_device_node

        if value.numel() == len(key):
            return base_res

        uncached_len = len(key) - value.numel()
        if uncached_len == 0:
            return base_res

        chunk_size = self.lmcache_connector.chunk_size()
        prefix_pad = value.numel() % chunk_size

        if self.token_to_kv_pool_allocator.available_size() < uncached_len:
            self.evict(uncached_len)

        token_slots = self.token_to_kv_pool_allocator.alloc(uncached_len)
        if token_slots is None:
            return base_res

        slot_mapping = torch.cat(
            [
                torch.full((value.numel(),), -1, dtype=torch.int64, device=self.device),
                token_slots.detach().clone().to(torch.int64).to(self.device),
            ]
        )

        with torch.cuda.stream(self.load_stream):
            num_retrieved = self.lmcache_connector.start_load_kv(
                LoadMetadata(
                    token_ids=key,  # full page-aligned key
                    slot_mapping=slot_mapping,
                    offset=value.numel() - prefix_pad,  # LMCache offset convention
                )
            )
        logger.debug("num_retrieved_tokens: %s", num_retrieved)

        if num_retrieved > 0:
            self.token_to_kv_pool_allocator.free(
                token_slots[(num_retrieved - prefix_pad) :]
            )
        else:
            self.token_to_kv_pool_allocator.free(token_slots)

        if num_retrieved > 0:
            fetched = num_retrieved - prefix_pad
            new_node = TreeNode()
            start = value.numel()
            end = start + fetched
            new_node.key = key[start:end]
            new_node.value = token_slots[:fetched]
            new_node.parent = last_node
            last_node.children[self.get_child_key_fn(new_node.key)] = new_node
            last_node = new_node

            value = torch.cat([value, token_slots[:fetched]])
            self.evictable_size_ += fetched

            self._record_store_event(new_node.parent)
            self._record_store_event(new_node)

            return MatchResult(
                device_indices=value,
                last_device_node=last_node,
                last_host_node=last_node,
            )

        return base_res

    def cache_finished_req(self, req: "Req") -> None:  # type: ignore[override]
        """On request completion, insert device KV into the radix tree and store it to LMCache."""
        super().cache_finished_req(req)

        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
        kv_indices = self.req_to_token_pool.req_to_token[
            req.req_pool_idx, : len(token_ids)
        ]

        _, new_last_node, _, _ = self.match_prefix(token_ids)
        assert new_last_node is not None

        self.inc_lock_ref(new_last_node)
        store_md = StoreMetadata(
            last_node=new_last_node,
            token_ids=token_ids,
            kv_indices=kv_indices,
            offset=0,
        )
        with torch.cuda.stream(self.store_stream):
            self.lmcache_connector.store_kv(store_md)
        with self._node_lock:
            self._in_flight_nodes.append(new_last_node)

    def evict(self, num_tokens: int) -> None:  # type: ignore[override]
        """Before base eviction, wait for any outstanding stores and release their locks."""
        if self.disable:
            return

        self.store_stream.synchronize()
        with self._node_lock:
            for node in self._in_flight_nodes:
                self.dec_lock_ref(node)
            self._in_flight_nodes.clear()

        super().evict(num_tokens)

    def pretty_print(self):  # type: ignore[override]
        super().pretty_print()
        try:
            logger.debug(
                "evictable=%d protected=%d", self.evictable_size_, self.protected_size_
            )
        except Exception:  # pragma: no cover
            pass


if __name__ == "__main__":
    cache = LMCRadixCache(
        req_to_token_pool=None,
        token_to_kv_pool_allocator=None,
        page_size=1,
        disable=False,
        enable_kv_cache_events=False,
        model_config=None,
        tp_size=1,
        rank=0,
        tp_group=None,
    )
    cache.insert([1, 2, 3], torch.tensor([10, 11, 12], dtype=torch.int64))
    cache.insert([1, 2, 3, 4], torch.tensor([10, 11, 12, 13], dtype=torch.int64))
    cache.pretty_print()
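The slot bookkeeping in `match_prefix` is easiest to follow with concrete numbers. A standalone sketch of the alignment arithmetic, using made-up values (`chunk_size` mirrors `example_config.yaml`; the "LMCache has chunks up to token 768" scenario is invented):

```python
# Made-up numbers illustrating match_prefix's chunk alignment.
chunk_size = 256          # LMCache chunk granularity
key_len = 1000            # page-aligned prompt length
cached = 300              # tokens already matched in the radix tree

uncached = key_len - cached          # 700 slots allocated for the load
prefix_pad = cached % chunk_size     # 300 % 256 = 44 tokens re-fetched so the
                                     # load starts on a chunk boundary
load_offset = cached - prefix_pad    # LMCache starts loading at token 256

# Suppose LMCache only holds chunks covering tokens [256, 768):
num_retrieved = 768 - load_offset    # 512 tokens come back
fetched = num_retrieved - prefix_pad # 468 genuinely new tokens are kept
freed = uncached - fetched           # 232 unused slots returned to the pool

assert cached + fetched == 768       # cache now covers a chunk-aligned prefix
print(uncached, prefix_pad, load_offset, num_retrieved, fetched, freed)
```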
python/sglang/srt/mem_cache/storage/lmcache/unit_test.py (new file, 121 lines)
@@ -0,0 +1,121 @@
try:
    from lmcache.integration.sglang.sglang_adapter import (
        LMCacheLayerwiseConnector,
        LoadMetadata,
        StoreMetadata,
    )
except ImportError:
    raise RuntimeError(
        "LMCache is not installed. Please install it by running `pip install lmcache` in the root directory of LMCache"
    )

import os

import torch

from sglang.srt.configs.model_config import ModelConfig

os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_CONFIG_FILE"] = "example_config.yaml"


def test_load_store_metadata():
    model_config = ModelConfig(
        model_path="Qwen/Qwen3-4B",
    )

    # Generate a dummy KV cache.
    head_num = model_config.num_key_value_heads
    head_dim = model_config.head_dim
    layer_num = model_config.num_hidden_layers
    buffer_size = 256
    input_id_len = 16

    k_buffer = [
        torch.randn(buffer_size, head_num, head_dim, dtype=torch.bfloat16).cuda()
        for _ in range(layer_num)
    ]
    v_buffer = [
        torch.randn(buffer_size, head_num, head_dim, dtype=torch.bfloat16).cuda()
        for _ in range(layer_num)
    ]

    connector = LMCacheLayerwiseConnector(model_config, 1, 0, k_buffer, v_buffer)

    fake_token_ids = torch.randint(0, model_config.vocab_size, (input_id_len,)).tolist()
    fake_kv_indices = torch.randint(0, buffer_size, (input_id_len,))
    offset = 0

    store_metadata = StoreMetadata(
        last_node=None,
        token_ids=fake_token_ids,
        kv_indices=fake_kv_indices,
        offset=offset,
    )

    load_metadata = LoadMetadata(
        token_ids=fake_token_ids,
        slot_mapping=fake_kv_indices,
        offset=offset,
    )

    current_stream = torch.cuda.current_stream()

    # Nothing has been stored yet, so the first load retrieves zero tokens.
    retrieve_token_num = connector.start_load_kv(load_metadata)
    assert retrieve_token_num == 0

    connector.store_kv(store_metadata)
    current_stream.synchronize()

    # Check retrieval: snapshot the ground-truth KV entries.
    gt_key_buffer = [
        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
        for _ in range(layer_num)
    ]
    gt_value_buffer = [
        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
        for _ in range(layer_num)
    ]

    for i in range(layer_num):
        gt_key_buffer[i] = k_buffer[i][fake_kv_indices]
        gt_value_buffer[i] = v_buffer[i][fake_kv_indices]

    # Clear the k_buffer and v_buffer.
    for i in range(layer_num):
        k_buffer[i].zero_()
        v_buffer[i].zero_()

    retrieve_token_num = connector.start_load_kv(load_metadata)
    assert retrieve_token_num == input_id_len

    for i in range(layer_num):
        current_stream.synchronize()
        connector.load_kv_layerwise(i)

    current_stream.synchronize()
    test_key_buffer = [
        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
        for _ in range(layer_num)
    ]
    test_value_buffer = [
        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
        for _ in range(layer_num)
    ]

    for i in range(layer_num):
        test_key_buffer[i] = k_buffer[i][fake_kv_indices]
        test_value_buffer[i] = v_buffer[i][fake_kv_indices]

    for i in range(layer_num):
        assert torch.allclose(test_key_buffer[i], gt_key_buffer[i])
        assert torch.allclose(test_value_buffer[i], gt_value_buffer[i])

    print("================================================")
    print("TEST_LOAD_STORE_METADATA PASSED!")
    print("================================================")
    connector.close()


if __name__ == "__main__":
    test_load_store_metadata()
@@ -303,6 +303,8 @@ class ServerArgs:
    hicache_storage_backend: Optional[str] = None
    hicache_storage_prefetch_policy: str = "best_effort"
    hicache_storage_backend_extra_config: Optional[str] = None
    # LMCache
    enable_lmcache: bool = False

    # Double Sparsity
    enable_double_sparsity: bool = False

@@ -1735,6 +1737,12 @@ class ServerArgs:
            default=ServerArgs.hicache_storage_backend_extra_config,
            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
        )
        # LMCache
        parser.add_argument(
            "--enable-lmcache",
            action="store_true",
            help="Use LMCache as an alternative hierarchical cache solution.",
        )

        # Double Sparsity
        parser.add_argument(
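The new flag is also reachable programmatically. A minimal sketch, assuming SGLang's offline `Engine` entry point, which forwards keyword arguments into `ServerArgs`; the model path is a placeholder:

```python
import os

# LMCache still needs its config, as in the README above.
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_CONFIG_FILE"] = "example_config.yaml"

import sglang as sgl

# Keyword arguments are forwarded into ServerArgs, so this mirrors
# `--enable-lmcache` on the CLI.
llm = sgl.Engine(model_path="Qwen/Qwen3-4B", enable_lmcache=True)
print(llm.generate("The capital of France is", {"max_new_tokens": 8}))
llm.shutdown()
```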