integrate AIBrix KVcache (#10376)
This commit is contained in:
@@ -41,7 +41,7 @@ repos:
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies: ['tomli']
|
||||
args: ['--toml', 'python/pyproject.toml', '-L', 'cann,thi,makro,wil,rouge']
|
||||
args: ['--toml', 'python/pyproject.toml', '-L', 'cann,thi,makro,wil,rouge,PRIS']
|
||||
exclude: |
|
||||
(?x)^(
|
||||
test/srt/test_reasoning_parser\.py|
|
||||
|
||||
@@ -289,6 +289,14 @@ class HiCacheController:
|
||||
)
|
||||
|
||||
self.storage_backend = MooncakeStore(self.storage_config)
|
||||
elif storage_backend == "aibrix":
|
||||
from sglang.srt.mem_cache.storage.aibrix_kvcache.aibrix_kvcache_storage import (
|
||||
AibrixKVCacheStorage,
|
||||
)
|
||||
|
||||
self.storage_backend = AibrixKVCacheStorage(
|
||||
self.storage_config, self.mem_pool_host
|
||||
)
|
||||
elif storage_backend == "hf3fs":
|
||||
from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
|
||||
HiCacheHF3FS,
|
||||
|
||||
37
python/sglang/srt/mem_cache/storage/aibrix_kvcache/README.md
Normal file
37
python/sglang/srt/mem_cache/storage/aibrix_kvcache/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# AIBrix KVCache as L3 KV Cache
|
||||
This document provides brief instructions for setting up an AIBrix KVCache storage backend + SGLang runtime environment from scratch, describing how to utilize AIBrix KVCache as the L3 KV cache for SGLang.
|
||||
The process consists of three main steps:
|
||||
|
||||
## Step 1: Install AIBrix KVCache
|
||||
Refer to the [AIBrix KVCache documentation](https://github.com/vllm-project/aibrix/blob/main/python/aibrix_kvcache/README.md) to install AIBrix KVCache.
|
||||
|
||||
## Step 2: Deploy AIBrix Distributed KVCache Storage
|
||||
|
||||
AIBrix KVCache currently supports multiple distributed KVCache backends, including ByteDance's open-source InfiniStore and the not-yet-open-source PrisKV incubated by ByteDance's PrisDB & IAAS & DMI teams.
|
||||
|
||||
For the Infinistore installation process, please refer to [this link](https://github.com/bytedance/InfiniStore).
|
||||
|
||||
PrisKV for AIBrix KVCache is currently in the open-source preparation stage, and no public documentation is available yet.
|
||||
|
||||
|
||||
## Step 3: Deploy Model Serving
|
||||
|
||||
For information on configuring a distributed KVCache backend for AIBrixKVCache, please refer to [this link](https://aibrix.readthedocs.io/latest/designs/aibrix-kvcache-offloading-framework.html).
|
||||
|
||||
Using PrisKV as an example, the startup command is as follows:
|
||||
```bash
|
||||
export AIBRIX_KV_CACHE_OL_L1_CACHE_ENABLED="0"
|
||||
export AIBRIX_KV_CACHE_OL_L2_CACHE_BACKEND="PRIS"
|
||||
export AIBRIX_KV_CACHE_OL_PRIS_REMOTE_ADDR="127.0.0.1"
|
||||
export AIBRIX_KV_CACHE_OL_PRIS_REMOTE_PORT="6379"
|
||||
export AIBRIX_KV_CACHE_OL_PRIS_PASSWORD="kvcache-redis"
|
||||
MODEL_LENGTH=32768&&NCCL_MIN_NCHANNELS=24&&NCCL_IB_QPS_PER_CONNECTION=8&&NCCL_DEBUG=INFO \
|
||||
python3 -m sglang.launch_server \
|
||||
--model-path /code/models/Qwen3-32B \
|
||||
--host 0.0.0.0 --port 8080 \
|
||||
--enable-hierarchical-cache \
|
||||
--hicache-storage-backend aibrix \
|
||||
--page-size 16 \
|
||||
--hicache-write-policy write_back \
|
||||
--enable-metrics --hicache-ratio=2
|
||||
```
|
||||
@@ -0,0 +1,151 @@
|
||||
import logging
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import torch
|
||||
from aibrix_kvcache import (
|
||||
BaseKVCacheManager,
|
||||
BlockHashes,
|
||||
KVCacheBlockLayout,
|
||||
KVCacheBlockSpec,
|
||||
KVCacheConfig,
|
||||
KVCacheTensorSpec,
|
||||
ModelSpec,
|
||||
)
|
||||
from aibrix_kvcache.common.absl_logging import log_every_n_seconds
|
||||
|
||||
from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
|
||||
from sglang.srt.mem_cache.memory_pool_host import HostKVCache
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AibrixKVCacheStorage(HiCacheStorage):
    """HiCacheStorage backend backed by AIBrix KVCache (L3 KV cache).

    Wraps aibrix_kvcache's ``BaseKVCacheManager`` so SGLang's hierarchical
    cache can offload KV pages to an AIBrix-managed distributed store.
    MLA models are not supported yet.
    """

    def __init__(self, storage_config: HiCacheStorageConfig, mem_pool: HostKVCache):
        """Build the block spec from the device KV pool and start the manager.

        Args:
            storage_config: SGLang hierarchical-cache config; may be None,
                in which case a single-rank, non-MLA setup is assumed.
            mem_pool: host KV cache whose ``device_pool`` describes the
                layer/head/dtype layout of the KV tensors.
        """
        if storage_config is not None:
            self.is_mla_backend = storage_config.is_mla_model
            self.local_rank = storage_config.tp_rank
        else:
            self.is_mla_backend = False
            self.local_rank = 0
        kv_cache = mem_pool.device_pool
        self.page_size = mem_pool.page_size
        self.kv_cache_dtype = kv_cache.dtype
        self.layer_num = kv_cache.layer_num
        # Globally-unique head ids for this TP rank so different ranks do
        # not collide on the same entries in the shared store.
        self.kv_head_ids = [
            self.local_rank * kv_cache.head_num + i for i in range(kv_cache.head_num)
        ]
        if self.is_mla_backend:
            raise NotImplementedError(
                "MLA is not supported by AibrixKVCacheStorage yet."
            )
        self.layer_ids = range(
            kv_cache.start_layer, kv_cache.end_layer
        )  # for pipeline parallel
        self.block_spec = KVCacheBlockSpec(
            block_ntokens=self.page_size,
            block_dtype=self.kv_cache_dtype,
            block_layout=KVCacheBlockLayout(KVCacheBlockLayout.NCLD),
            tensor_spec=KVCacheTensorSpec(
                heads=self.kv_head_ids,
                layers=self.layer_ids,
                head_size=kv_cache.head_dim,
            ),
        )
        logger.info(self.block_spec)
        # NOTE(review): 102400 is a hard-coded max model length for
        # ModelSpec — consider deriving it from server args.
        config = KVCacheConfig(
            block_spec=self.block_spec, model_spec=ModelSpec(102400)
        )
        self.kv_cache_manager = BaseKVCacheManager(config)

    def _aibrix_kvcache_metrics_report(self) -> str:
        """Return a metrics summary string and reset counters for the next window."""
        summary = self.kv_cache_manager.metrics.summary()
        self.kv_cache_manager.metrics.reset()
        return summary

    def batch_get(
        self,
        keys: List[str],
        target_locations: List[torch.Tensor],
        target_sizes: Optional[Any] = None,
    ) -> List[torch.Tensor | None]:
        """Fetch the pages for ``keys`` into ``target_locations``.

        Returns ``target_locations`` on a successful acquire, otherwise a
        list of ``None`` (one per key).
        """
        block_hash = BlockHashes(keys, self.page_size)
        status = self.kv_cache_manager.acquire(None, block_hash)
        # The log line is throttled to once per second; the report string is
        # still built (and counters reset) on every call.
        log_every_n_seconds(
            logger, logging.INFO, self._aibrix_kvcache_metrics_report(), 1
        )
        if not status.is_ok():
            return [None] * len(keys)
        num_fetched_tokens, handle = status.value
        kv_blocks = handle.to_tensors()
        # NOTE(review): a partial hit would trip this assert — confirm that
        # acquire() only reports ok when every requested block is present.
        assert len(kv_blocks) == len(target_locations)
        for dst, src in zip(target_locations, kv_blocks):
            assert dst.nbytes == src.nbytes, f"{dst.nbytes}, {src.nbytes}"
            dst.copy_(src.flatten())
        handle.release()
        return target_locations

    def get(
        self,
        key: str,
        target_location: Optional[Any] = None,
        target_size: Optional[Any] = None,
    ) -> torch.Tensor | None:
        """Single-key convenience wrapper around :meth:`batch_get`."""
        return self.batch_get([key], [target_location], [target_size])[0]

    def batch_set(
        self,
        keys: List[str],
        values: Optional[Any] = None,
        target_locations: Optional[Any] = None,
        target_sizes: Optional[Any] = None,
    ) -> bool:
        """Store one KV tensor per key; returns True only on a full commit."""
        block_hash = BlockHashes(keys, self.page_size)
        status = self.kv_cache_manager.allocate_for(None, block_hash)
        if not status.is_ok():
            logger.warning(
                f"aibrix_kvcache set allocate failed, error_code {status.error_code}"
            )
            return False
        handle = status.value
        tensors = handle.to_tensors()
        if len(tensors) != len(values):
            logger.warning("aibrix_kvcache set allocate not enough")
            return False
        for i in range(len(tensors)):
            assert (
                tensors[i].nbytes == values[i].nbytes
            ), f"{tensors[i].nbytes}, {values[i].nbytes}"
            # Copy into the allocated block in place. Reshaping the *source*
            # (not the destination) guarantees the write lands in tensors[i]
            # even when a destination reshape would not be a view.
            tensors[i].copy_(values[i].reshape(tensors[i].shape))
        status = self.kv_cache_manager.put(None, block_hash, handle)
        if not status.is_ok():
            logger.info(
                f"AIBrix KVCache Storage set failed, error_code {status.error_code}"
            )
            return False
        completed = status.value
        # put() reports committed tokens; success means every page landed.
        return completed == len(keys) * self.page_size

    def set(
        self,
        key: str,
        value: Optional[Any] = None,
        target_location: Optional[Any] = None,
        target_size: Optional[Any] = None,
    ) -> bool:
        """Single-key convenience wrapper around :meth:`batch_set`."""
        return self.batch_set([key], [value], [target_location], [target_size])

    def batch_exists(self, keys: List[str]) -> int:
        """Return how many of the given page keys exist in the store."""
        block_hash = BlockHashes(keys, self.page_size)
        status = self.kv_cache_manager.exists(None, block_hash)
        if status.is_ok():
            # exists() reports a token count; convert to a page/key count.
            return status.value // self.page_size
        return 0

    def exists(self, key: str) -> bool | dict:
        """Return True if the single page key is present."""
        return self.batch_exists([key]) > 0
|
||||
109
python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
Normal file
109
python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
from aibrix_kvcache import (
|
||||
BaseKVCacheManager,
|
||||
GroupAwareKVCacheManager,
|
||||
KVCacheBlockLayout,
|
||||
KVCacheBlockSpec,
|
||||
KVCacheConfig,
|
||||
KVCacheMetrics,
|
||||
KVCacheTensorSpec,
|
||||
ModelSpec,
|
||||
TokenListView,
|
||||
)
|
||||
from aibrix_kvcache.common.absl_logging import getLogger, log_every_n_seconds, log_if
|
||||
from aibrix_kvcache_storage import AibrixKVCacheStorage
|
||||
from torch.distributed import Backend, ProcessGroup
|
||||
|
||||
from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
|
||||
from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
|
||||
from sglang.srt.mem_cache.memory_pool_host import MHATokenToKVPoolHost
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def setup():
    """Populate the environment for a single-process torch.distributed group."""
    env = {
        "RANK": "0",
        "WORLD_SIZE": "1",
        "MASTER_ADDR": "127.0.0.1",
        "MASTER_PORT": "63886",
    }
    os.environ.update(env)
|
||||
|
||||
|
||||
class AIBrixKVCacheStorageTest:
    """Round-trip test for AibrixKVCacheStorage over several page sizes."""

    def test_with_page_size(self):
        """set/get/exists round trip, full batch and suffix subset, for page sizes 1-2."""
        config = HiCacheStorageConfig(
            tp_rank=0,
            tp_size=1,
            is_mla_model=False,
            is_page_first_layout=True,
            model_name="test",
        )
        for page_size in range(1, 3):
            logger.info(f"page_size: {page_size}")
            batch_size = 2
            head_num = 1
            layer_num = 64
            head_dim = 128
            kv_cache = MHATokenToKVPool(
                1024,
                page_size,
                torch.float16,
                head_num,
                head_dim,
                layer_num,
                "cpu",
                False,
                0,
                layer_num,
            )
            mem_pool = MHATokenToKVPoolHost(kv_cache, 2, 0, page_size, "layer_first")
            query_length = batch_size * 2
            partial = batch_size
            self.aibrix_kvcache = AibrixKVCacheStorage(config, mem_pool)
            # One entry per key: (K/V, layer, token-in-page, head, head_dim).
            target_shape = (2, layer_num, page_size, head_num, head_dim)
            rand_tensor = [
                torch.rand(target_shape, dtype=torch.float16)
                for _ in range(query_length)
            ]
            keys = ["hash" + str(i) for i in range(query_length)]
            partial_keys = keys[batch_size:query_length]
            # Nothing is stored yet.
            assert self.aibrix_kvcache.batch_exists(keys) == 0
            assert self.aibrix_kvcache.batch_set(keys, rand_tensor)
            get_tensor = [
                torch.rand(target_shape, dtype=torch.float16).flatten()
                for _ in range(query_length)
            ]
            self.aibrix_kvcache.batch_get(keys, get_tensor)
            for i in range(query_length):
                assert torch.equal(get_tensor[i], rand_tensor[i].flatten())
            # All keys present; a suffix query sees exactly its own entries.
            assert self.aibrix_kvcache.batch_exists(keys) == query_length
            assert self.aibrix_kvcache.batch_exists(partial_keys) == partial
            partial_get_tensor = [
                torch.rand(target_shape, dtype=torch.float16).flatten()
                for _ in range(partial)
            ]
            self.aibrix_kvcache.batch_get(partial_keys, partial_get_tensor)
            for i in range(partial):
                # partial_keys[i] corresponds to rand_tensor[i + partial]
                # because partial == batch_size (the suffix offset).
                assert torch.equal(
                    partial_get_tensor[i], rand_tensor[i + partial].flatten()
                )
            log_every_n_seconds(
                logger,
                logging.INFO,
                self.aibrix_kvcache.kv_cache_manager.metrics.summary(),
                1,
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Configure the (single-rank) distributed env, then run the test directly.
    setup()
    AIBrixKVCacheStorageTest().test_with_page_size()
|
||||
@@ -2154,7 +2154,7 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--hicache-storage-backend",
|
||||
type=str,
|
||||
choices=["file", "mooncake", "hf3fs", "nixl"],
|
||||
choices=["file", "mooncake", "hf3fs", "nixl", "aibrix"],
|
||||
default=ServerArgs.hicache_storage_backend,
|
||||
help="The storage backend for hierarchical KV cache.",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user