Unit test for Hierarchical Caching (#4486)
This commit is contained in:
@@ -29,6 +29,7 @@ class HiRadixCache(RadixCache):
|
||||
token_to_kv_pool_allocator: TokenToKVPoolAllocator,
|
||||
tp_cache_group: torch.distributed.ProcessGroup,
|
||||
page_size: int,
|
||||
+hicache_ratio: float,
|
||||
):
|
||||
if page_size != 1:
|
||||
raise ValueError(
|
||||
@@ -36,9 +37,13 @@ class HiRadixCache(RadixCache):
|
||||
)
|
||||
self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
|
||||
if isinstance(self.kv_cache, MHATokenToKVPool):
|
||||
-self.token_to_kv_pool_host = MHATokenToKVPoolHost(self.kv_cache)
+self.token_to_kv_pool_host = MHATokenToKVPoolHost(
+    self.kv_cache, hicache_ratio
+)
|
||||
elif isinstance(self.kv_cache, MLATokenToKVPool):
|
||||
-self.token_to_kv_pool_host = MLATokenToKVPoolHost(self.kv_cache)
+self.token_to_kv_pool_host = MLATokenToKVPoolHost(
+    self.kv_cache, hicache_ratio
+)
|
||||
else:
|
||||
raise ValueError(f"Only MHA and MLA supports swap kv_cache to host.")
|
||||
|
||||
|
||||
@@ -581,7 +581,7 @@ class HostKVCache(abc.ABC):
|
||||
def __init__(
|
||||
self,
|
||||
device_pool: MHATokenToKVPool,
|
||||
-host_to_device_ratio: float = 3.0,
+host_to_device_ratio: float,
|
||||
pin_memory: bool = False, # no need to use pin memory with the double buffering
|
||||
device: str = "cpu",
|
||||
):
|
||||
@@ -747,7 +747,7 @@ class MHATokenToKVPoolHost(HostKVCache):
|
||||
def __init__(
|
||||
self,
|
||||
device_pool: MHATokenToKVPool,
|
||||
-host_to_device_ratio: float = 3.0,
+host_to_device_ratio: float,
|
||||
pin_memory: bool = False, # no need to use pin memory with the double buffering
|
||||
device: str = "cpu",
|
||||
):
|
||||
@@ -789,7 +789,7 @@ class MLATokenToKVPoolHost(HostKVCache):
|
||||
def __init__(
|
||||
self,
|
||||
device_pool: MLATokenToKVPool,
|
||||
-host_to_device_ratio: float = 4.0,
+host_to_device_ratio: float,
|
||||
pin_memory: bool = False, # no need to use pin memory with the double buffering
|
||||
device: str = "cpu",
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user