Unit test for Hierarchical Caching (#4486)

2025-03-17 17:45:00 -07:00
parent 9b81f9bd34
commit a98290aea3
7 changed files with 65 additions and 5 deletions
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -445,6 +445,7 @@ class Scheduler(SchedulerOutputProcessorMixin):
                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
                    tp_cache_group=self.tp_worker.get_tp_cpu_group(),
                    page_size=self.page_size,
+                    hicache_ratio=server_args.hicache_ratio,
                )
            else:
                self.tree_cache = RadixCache(
--- a/python/sglang/srt/mem_cache/hiradix_cache.py
+++ b/python/sglang/srt/mem_cache/hiradix_cache.py
@@ -29,6 +29,7 @@ class HiRadixCache(RadixCache):
        token_to_kv_pool_allocator: TokenToKVPoolAllocator,
        tp_cache_group: torch.distributed.ProcessGroup,
        page_size: int,
+        hicache_ratio: float,
    ):
        if page_size != 1:
            raise ValueError(
@@ -36,9 +37,13 @@ class HiRadixCache(RadixCache):
            )
        self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
        if isinstance(self.kv_cache, MHATokenToKVPool):
-            self.token_to_kv_pool_host = MHATokenToKVPoolHost(self.kv_cache)
+            self.token_to_kv_pool_host = MHATokenToKVPoolHost(
+                self.kv_cache, hicache_ratio
+            )
        elif isinstance(self.kv_cache, MLATokenToKVPool):
-            self.token_to_kv_pool_host = MLATokenToKVPoolHost(self.kv_cache)
+            self.token_to_kv_pool_host = MLATokenToKVPoolHost(
+                self.kv_cache, hicache_ratio
+            )
        else:
            raise ValueError(f"Only MHA and MLA supports swap kv_cache to host.")

--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -581,7 +581,7 @@ class HostKVCache(abc.ABC):
    def __init__(
        self,
        device_pool: MHATokenToKVPool,
-        host_to_device_ratio: float = 3.0,
+        host_to_device_ratio: float,
        pin_memory: bool = False,  # no need to use pin memory with the double buffering
        device: str = "cpu",
    ):
@@ -747,7 +747,7 @@ class MHATokenToKVPoolHost(HostKVCache):
    def __init__(
        self,
        device_pool: MHATokenToKVPool,
-        host_to_device_ratio: float = 3.0,
+        host_to_device_ratio: float,
        pin_memory: bool = False,  # no need to use pin memory with the double buffering
        device: str = "cpu",
    ):
@@ -789,7 +789,7 @@ class MLATokenToKVPoolHost(HostKVCache):
    def __init__(
        self,
        device_pool: MLATokenToKVPool,
-        host_to_device_ratio: float = 4.0,
+        host_to_device_ratio: float,
        pin_memory: bool = False,  # no need to use pin memory with the double buffering
        device: str = "cpu",
    ):
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -173,6 +173,7 @@ class ServerArgs:
    enable_custom_logit_processor: bool = False
    tool_call_parser: str = None
    enable_hierarchical_cache: bool = False
+    hicache_ratio: float = 2.0
    enable_flashinfer_mla: bool = False
    enable_flashmla: bool = False
    flashinfer_mla_disable_ragged: bool = False
@@ -1007,6 +1008,13 @@ class ServerArgs:
            action="store_true",
            help="Enable hierarchical cache",
        )
+        parser.add_argument(
+            "--hicache-ratio",
+            type=float,
+            required=False,
+            default=ServerArgs.hicache_ratio,
+            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
+        )

        # Server warmups
        parser.add_argument(