Kernels for efficient KV cache IO (#7313)

This commit is contained in:
Zhiqiang Xie
2025-07-06 22:53:36 -07:00
committed by GitHub
parent 253454de9b
commit 2fc824b84c
7 changed files with 184 additions and 371 deletions

View File

@@ -217,6 +217,7 @@ class ServerArgs:
hicache_ratio: float = 2.0
hicache_size: int = 0
hicache_write_policy: str = "write_through_selective"
hicache_io_backend: str = ""
flashinfer_mla_disable_ragged: bool = False
disable_shared_experts_fusion: bool = False
disable_chunked_prefix_cache: bool = False
@@ -1530,6 +1531,13 @@ class ServerArgs:
default=ServerArgs.hicache_write_policy,
help="The write policy of hierarchical cache.",
)
parser.add_argument(
"--hicache-io-backend",
type=str,
choices=["direct", "kernel"],
default=ServerArgs.hicache_io_backend,
help="The IO backend for KV cache transfer between CPU and GPU",
)
parser.add_argument(
"--flashinfer-mla-disable-ragged",
action="store_true",