[Feat] A Connector that supports Mooncake store (#2913)

### What this PR does / why we need it?

Added a new connector for Mooncake store integration to enable KV cache reuse in scenarios with system prompts or multi-turn dialogues.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main: 5963b98b46

---------

Signed-off-by: LCAIZJ <leichao139636@163.com>
Signed-off-by: fems14 <1804143737@qq.com>
Co-authored-by: fems14 <1804143737@qq.com>
Co-authored-by: Dreamerleader <2270923832@qq.com>
Co-authored-by: Pz1116 <zpbzpb123123@gmail.com>
Co-authored-by: lizy124 <1950471827@qq.com>
Co-authored-by: zouyida2052 <zouyida2002@gmail.com>
2025-09-18 14:04:45 +08:00
parent 723d460894
commit cef43b524e
9 changed files with 2033 additions and 9 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1811,13 +1811,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
                hidden_states, aux_hidden_states = hidden_states

-        kv_connector_output = None
-        if finished_sending is not None or finished_recving is not None:
-            kv_connector_output = KVConnectorOutput(
-                finished_sending=finished_sending,
-                finished_recving=finished_recving)
-        else:
-            kv_connector_output = None
+        kv_connector_output = KVConnectorOutput(
+            finished_sending=finished_sending,
+            finished_recving=finished_recving)
        finished_sending = None
        finished_recving = None
        with ProfileExecuteDuration().capture_async("post process"):
@@ -2067,8 +2063,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            # For the case of no forward caused by receiving remote kv,
            # one round of dummy inference is necessary
            # to prevent hang over the collective calls.
-        if not finished_sending and not finished_recving:
-            return EMPTY_MODEL_RUNNER_OUTPUT

        output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
        output.kv_connector_output = KVConnectorOutput(