[Feat] A Connector that supports Mooncake store (#2913)

### What this PR does / why we need it?
Adds a new connector that integrates with the Mooncake store, enabling
KV cache reuse in scenarios with system prompts or multi-turn dialogues.

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
5963b98b46

---------

Signed-off-by: LCAIZJ <leichao139636@163.com>
Signed-off-by: fems14 <1804143737@qq.com>
Co-authored-by: fems14 <1804143737@qq.com>
Co-authored-by: Dreamerleader <2270923832@qq.com>
Co-authored-by: Pz1116 <zpbzpb123123@gmail.com>
Co-authored-by: lizy124 <1950471827@qq.com>
Co-authored-by: zouyida2052 <zouyida2002@gmail.com>
This commit is contained in:
Chao Lei
2025-09-18 14:04:45 +08:00
committed by GitHub
parent 723d460894
commit cef43b524e
9 changed files with 2033 additions and 9 deletions

View File

@@ -1811,13 +1811,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
hidden_states, aux_hidden_states = hidden_states
kv_connector_output = None
if finished_sending is not None or finished_recving is not None:
kv_connector_output = KVConnectorOutput(
finished_sending=finished_sending,
finished_recving=finished_recving)
else:
kv_connector_output = None
kv_connector_output = KVConnectorOutput(
finished_sending=finished_sending,
finished_recving=finished_recving)
finished_sending = None
finished_recving = None
with ProfileExecuteDuration().capture_async("post process"):
@@ -2067,8 +2063,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# For the case of no forward caused by receiving remote kv,
# one round of dummy inference is necessary
# to prevent hang over the collective calls.
if not finished_sending and not finished_recving:
return EMPTY_MODEL_RUNNER_OUTPUT
output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
output.kv_connector_output = KVConnectorOutput(