[PD] Add PD support for hybrid model (Qwen3-Next, DeepSeek V3.2 Exp) (#10912)
Signed-off-by: Shangming Cai <csmthu@gmail.com> Co-authored-by: hzh0425 <hzh0425@apache.org> Co-authored-by: ZeldaHuang <hzm414167@alibaba-inc.com>
This commit is contained in:
@@ -1669,19 +1669,34 @@ class ModelRunner:
|
||||
extra_max_context_len += self.server_args.speculative_num_draft_tokens
|
||||
|
||||
if self.server_args.disaggregation_mode == "decode":
|
||||
from sglang.srt.disaggregation.decode import DecodeReqToTokenPool
|
||||
from sglang.srt.disaggregation.decode import (
|
||||
DecodeReqToTokenPool,
|
||||
HybridMambaDecodeReqToTokenPool,
|
||||
)
|
||||
|
||||
# subscribe memory for pre-allocated requests
|
||||
# if max_num_reqs <= 32, we pre-allocate 2x requests
|
||||
pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
|
||||
self.req_to_token_pool = DecodeReqToTokenPool(
|
||||
size=max_num_reqs,
|
||||
max_context_len=self.model_config.context_len
|
||||
+ extra_max_context_len,
|
||||
device=self.device,
|
||||
enable_memory_saver=self.server_args.enable_memory_saver,
|
||||
pre_alloc_size=pre_alloc_size,
|
||||
)
|
||||
if config := self.mambaish_config:
|
||||
self.req_to_token_pool = HybridMambaDecodeReqToTokenPool(
|
||||
size=max_num_reqs,
|
||||
max_context_len=self.model_config.context_len
|
||||
+ extra_max_context_len,
|
||||
device=self.device,
|
||||
enable_memory_saver=self.server_args.enable_memory_saver,
|
||||
cache_params=config.mamba2_cache_params,
|
||||
speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
|
||||
pre_alloc_size=pre_alloc_size,
|
||||
)
|
||||
else:
|
||||
self.req_to_token_pool = DecodeReqToTokenPool(
|
||||
size=max_num_reqs,
|
||||
max_context_len=self.model_config.context_len
|
||||
+ extra_max_context_len,
|
||||
device=self.device,
|
||||
enable_memory_saver=self.server_args.enable_memory_saver,
|
||||
pre_alloc_size=pre_alloc_size,
|
||||
)
|
||||
elif config := self.mambaish_config:
|
||||
self.req_to_token_pool = HybridReqToTokenPool(
|
||||
size=max_num_reqs,
|
||||
@@ -1807,6 +1822,7 @@ class ModelRunner:
|
||||
),
|
||||
enable_kvcache_transpose=False,
|
||||
device=self.device,
|
||||
mamba_pool=self.req_to_token_pool.mamba_pool,
|
||||
)
|
||||
else:
|
||||
self.token_to_kv_pool = MHATokenToKVPool(
|
||||
|
||||
Reference in New Issue
Block a user