From aa6154703aeeb259611326a3fba095b6abc5b791 Mon Sep 17 00:00:00 2001 From: DreamerLeader <88812830+DreamerLeader@users.noreply.github.com> Date: Wed, 15 Oct 2025 23:28:17 +0800 Subject: [PATCH] [BugFix]GPQA Accuracy Issue Bugfix (#3476) ### What this PR does / why we need it? The GPQA dataset accuracy measured in the PD separation test scenario is 33.2, which does not meet the paper's requirement of 70. This PR resolves that accuracy issue. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? GPQA had accuracy issues; with this code change the accuracy meets the standard. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: fjw <2270923832@qq.com> --- .../distributed/mooncake/mooncake_store_connector_v1.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py b/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py index 6254e47..3a7169a 100644 --- a/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py +++ b/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py @@ -163,6 +163,8 @@ class MooncakeStoreConnectorV1Scheduler: self.client = MooncakeLookupClient(vllm_config) self.use_layerwise = use_layerwise self.kv_role = vllm_config.kv_transfer_config.kv_role + self.consumer_is_to_load = vllm_config.kv_transfer_config.kv_connector_extra_config.get( + "consumer_is_to_load", False) # request_id -> (vllm cached tokes, mooncake cached tokens) self.load_specs: dict[str, LoadSpec] = {} self._block_size = vllm_config.cache_config.block_size @@ -192,6 +194,8 @@ class MooncakeStoreConnectorV1Scheduler: the number of tokens that can be loaded from the external KV cache beyond what is already computed. 
""" + if self.kv_role == "kv_consumer" and not self.consumer_is_to_load: + return 0, False if self._discard_partial_chunks: token_block_end = len(request.prompt_token_ids @@ -481,4 +485,4 @@ class MooncakeLookupServer: def close(self): self.socket.close(linger=0) - # TODO: close the thread! \ No newline at end of file + # TODO: close the thread!