[PD] support spec decode (#6507)
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
@@ -591,6 +591,11 @@ class Scheduler(
|
||||
self.disagg_decode_prealloc_queue = DecodePreallocQueue(
|
||||
req_to_token_pool=self.req_to_token_pool,
|
||||
token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
|
||||
draft_token_to_kv_pool=(
|
||||
None
|
||||
if self.draft_worker is None
|
||||
else self.draft_worker.model_runner.token_to_kv_pool
|
||||
),
|
||||
req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
|
||||
metadata_buffers=metadata_buffers,
|
||||
aux_dtype=aux_dtype,
|
||||
@@ -624,6 +629,11 @@ class Scheduler(
|
||||
|
||||
self.disagg_prefill_bootstrap_queue = PrefillBootstrapQueue(
|
||||
token_to_kv_pool=self.token_to_kv_pool_allocator.get_kvcache(),
|
||||
draft_token_to_kv_pool=(
|
||||
None
|
||||
if self.draft_worker is None
|
||||
else self.draft_worker.model_runner.token_to_kv_pool
|
||||
),
|
||||
req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
|
||||
metadata_buffers=metadata_buffers,
|
||||
aux_dtype=aux_dtype,
|
||||
@@ -1409,6 +1419,13 @@ class Scheduler(
|
||||
self.running_batch.batch_is_full = True
|
||||
break
|
||||
|
||||
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
||||
# In prefill mode, prealloc queue and transfer queue can also take memory,
|
||||
# so we need to check against the size that is actually available in req_to_token_pool.
|
||||
if len(adder.can_run_list) >= self.req_to_token_pool.available_size():
|
||||
self.running_batch.batch_is_full = True
|
||||
break
|
||||
|
||||
req.init_next_round_input(
|
||||
None if prefix_computed else self.tree_cache,
|
||||
self.enable_hierarchical_cache,
|
||||
|
||||
Reference in New Issue
Block a user