From a7f91fce71ac06fb1106542b6379032abe2c356f Mon Sep 17 00:00:00 2001 From: pz1116 <47019764+Pz1116@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:05:34 +0800 Subject: [PATCH] [KV Pool]get_num_new_matched_tokens return 0 if token length < block_size (#7146) ### What this PR does / why we need it? Currently, we call lookup_client to look up token hits in the KV Pool. However, when the token length is less than the block size, the key will be empty, so there is no point in querying the KV Pool backend: there will never be a hit. Hence, add an early return in `get_num_new_matched_tokens` when `token_len` < `block_size`. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d --------- Signed-off-by: Pz1116 Co-authored-by: fems14 <1804143737@qq.com> --- .../kv_transfer/kv_pool/ascend_store/pool_scheduler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py index 80468456..2629b695 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py @@ -78,6 +78,9 @@ class KVPoolScheduler: else: token_len = len(request.prompt_token_ids) + if token_len < self._block_size: + return 0, False + num_external_hit_tokens = self.client.lookup(token_len, request.block_hashes) if num_external_hit_tokens == request.num_tokens: