[PD-HiCache]: Support Async Offloading KVCache In Decode Side (#10192)
Signed-off-by: Shangming Cai <csmthu@gmail.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
This commit is contained in:
@@ -44,6 +44,9 @@ from sglang.srt.disaggregation.decode import (
|
||||
DecodeTransferQueue,
|
||||
SchedulerDisaggregationDecodeMixin,
|
||||
)
|
||||
from sglang.srt.disaggregation.decode_kvcache_offload_manager import (
|
||||
DecodeKVCacheOffloadManager,
|
||||
)
|
||||
from sglang.srt.disaggregation.prefill import (
|
||||
PrefillBootstrapQueue,
|
||||
SchedulerDisaggregationPrefillMixin,
|
||||
@@ -755,6 +758,24 @@ class Scheduler(
|
||||
eviction_policy=server_args.radix_eviction_policy,
|
||||
)
|
||||
|
||||
if (
|
||||
server_args.disaggregation_mode == "decode"
|
||||
and server_args.disaggregation_decode_enable_offload_kvcache
|
||||
):
|
||||
self.decode_offload_manager = DecodeKVCacheOffloadManager(
|
||||
req_to_token_pool=self.req_to_token_pool,
|
||||
token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
|
||||
tp_group=(
|
||||
self.attn_tp_cpu_group
|
||||
if self.server_args.enable_dp_attention
|
||||
else self.tp_cpu_group
|
||||
),
|
||||
tree_cache=self.tree_cache,
|
||||
server_args=self.server_args,
|
||||
)
|
||||
else:
|
||||
self.decode_offload_manager = None
|
||||
|
||||
self.decode_mem_cache_buf_multiplier = (
|
||||
1
|
||||
if self.spec_algorithm.is_none()
|
||||
|
||||
Reference in New Issue
Block a user