Kernels for efficient KV cache IO (#7313)
This commit is contained in:
@@ -217,6 +217,7 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
+    hicache_io_backend: str = ""
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
@@ -1530,6 +1531,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
Reference in New Issue
Block a user