[P/D]Mooncake Layerwise Connector supports hybrid attention manager with multiple kvcache groups (#7022)
### What this PR does / why we need it?
Mooncake Layerwise Connector supports hybrid attention manager with
multiple kvcache groups.
### Does this PR introduce _any_ user-facing change?
Yes.
### How was this patch tested?
By CI.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
This commit is contained in:
@@ -91,6 +91,11 @@ class RecomputeScheduler(Scheduler):
|
||||
and self.vllm_config.kv_transfer_config
|
||||
and self.vllm_config.kv_transfer_config.is_kv_consumer
|
||||
)
|
||||
self.is_kv_producer = self.vllm_config.kv_transfer_config and self.vllm_config.kv_transfer_config.is_kv_producer
|
||||
self.is_hybrid_model = (
|
||||
"qwen3_next" in self.vllm_config.model_config.model_type
|
||||
or "qwen3_5" in self.vllm_config.model_config.model_type
|
||||
)
|
||||
|
||||
def add_request(self, request: Request) -> None:
|
||||
existing = self.requests.get(request.request_id)
|
||||
@@ -111,6 +116,10 @@ class RecomputeScheduler(Scheduler):
|
||||
request.streaming_queue = deque()
|
||||
# Fill in placeholder tokens to enable full graph compatibility. Without
|
||||
# placeholders, graph matching may fail, forcing eager mode execution.
|
||||
if self.is_kv_producer and self.is_hybrid_model and request.num_tokens > 1:
|
||||
request.prompt_token_ids.pop()
|
||||
request._all_token_ids.pop()
|
||||
request.num_prompt_tokens -= 1
|
||||
if self.is_mtp_kv_consumer:
|
||||
request.spec_token_ids = [PLACEHOLDER_TOKEN_ID] * self.num_spec_tokens
|
||||
self.waiting.add_request(request)
|
||||
@@ -118,6 +127,55 @@ class RecomputeScheduler(Scheduler):
|
||||
if self.log_stats:
|
||||
request.record_event(EngineCoreEventType.QUEUED)
|
||||
|
||||
def _update_waiting_for_remote_kv(self, request: Request) -> bool:
|
||||
"""
|
||||
KV Connector: check if the request_id is finished_recving.
|
||||
|
||||
The finished_recving_kv_req_ids list is populated
|
||||
on the previous steps()'s update_from_output based
|
||||
on the worker side connector.
|
||||
|
||||
When the kv transfer is ready, we cache the blocks
|
||||
and the request state will be moved back to WAITING from
|
||||
WAITING_FOR_REMOTE_KV.
|
||||
"""
|
||||
assert self.connector is not None
|
||||
if request.request_id not in self.finished_recving_kv_req_ids:
|
||||
return False
|
||||
|
||||
if request.request_id in self.failed_recving_kv_req_ids:
|
||||
# Request had KV load failures; num_computed_tokens was already
|
||||
# updated in _update_requests_with_invalid_blocks
|
||||
if request.num_computed_tokens:
|
||||
# Cache any valid computed tokens.
|
||||
self.kv_cache_manager.cache_blocks(request, request.num_computed_tokens)
|
||||
else:
|
||||
# No valid computed tokens, release allocated blocks.
|
||||
# There may be a local cache hit on retry.
|
||||
self.kv_cache_manager.free(request)
|
||||
|
||||
self.failed_recving_kv_req_ids.remove(request.request_id)
|
||||
else:
|
||||
# Now that the blocks are ready, actually cache them.
|
||||
block_ids = self.kv_cache_manager.get_block_ids(request.request_id)
|
||||
if len(block_ids) == 1:
|
||||
num_computed_tokens = len(block_ids[0]) * self.block_size
|
||||
# Handle the case where num request tokens less than one block.
|
||||
num_computed_tokens = min(num_computed_tokens, request.num_tokens)
|
||||
else:
|
||||
num_computed_tokens = request.num_tokens
|
||||
if num_computed_tokens == request.num_tokens:
|
||||
num_computed_tokens -= 1
|
||||
# This will cache the blocks iff caching is enabled.
|
||||
self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
|
||||
|
||||
# Update the request state for scheduling.
|
||||
request.num_computed_tokens = num_computed_tokens
|
||||
|
||||
# Return that we are ready.
|
||||
self.finished_recving_kv_req_ids.remove(request.request_id)
|
||||
return True
|
||||
|
||||
def schedule(self) -> RecomputeSchedulerOutput:
|
||||
# NOTE(woosuk) on the scheduling algorithm:
|
||||
# There's no "decoding phase" nor "prefill phase" in the scheduler.
|
||||
|
||||
Reference in New Issue
Block a user