[Feat] A Connector that supports Mooncake store (#2913)
### What this PR does / why we need it?
Adds a new connector for Mooncake store integration to enable KV cache
reuse in scenarios with system prompts or multi-turn dialogues.
### How was this patch tested?
- vLLM version: v0.10.2
- vLLM main: 5963b98b46
---------
Signed-off-by: LCAIZJ <leichao139636@163.com>
Signed-off-by: fems14 <1804143737@qq.com>
Co-authored-by: fems14 <1804143737@qq.com>
Co-authored-by: Dreamerleader <2270923832@qq.com>
Co-authored-by: Pz1116 <zpbzpb123123@gmail.com>
Co-authored-by: lizy124 <1950471827@qq.com>
Co-authored-by: zouyida2052 <zouyida2002@gmail.com>
This commit is contained in:
@@ -1811,13 +1811,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
|
||||
hidden_states, aux_hidden_states = hidden_states
|
||||
|
||||
kv_connector_output = None
|
||||
if finished_sending is not None or finished_recving is not None:
|
||||
kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=finished_sending,
|
||||
finished_recving=finished_recving)
|
||||
else:
|
||||
kv_connector_output = None
|
||||
kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=finished_sending,
|
||||
finished_recving=finished_recving)
|
||||
finished_sending = None
|
||||
finished_recving = None
|
||||
with ProfileExecuteDuration().capture_async("post process"):
|
||||
@@ -2067,8 +2063,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# For the case of no forward caused by receiving remote kv,
|
||||
# one round of dummy inference is necessary
|
||||
# to prevent hang over the collective calls.
|
||||
if not finished_sending and not finished_recving:
|
||||
return EMPTY_MODEL_RUNNER_OUTPUT
|
||||
|
||||
output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
output.kv_connector_output = KVConnectorOutput(
|
||||
|
||||
Reference in New Issue
Block a user