Fix cache hit rate when using chunked prefill (#2555)
This commit is contained in:
@@ -248,7 +248,7 @@ class PrefillAdder:
|
||||
self.can_run_list.append(req)
|
||||
|
||||
self._prefill_one_req(
|
||||
len(req.prefix_indices),
|
||||
0,
|
||||
req.extend_input_len,
|
||||
(
|
||||
min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
|
||||
|
||||
@@ -629,16 +629,13 @@ class Scheduler:
|
||||
self.waiting_queue.append(req)
|
||||
|
||||
def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
|
||||
if isinstance(self.tree_cache, RadixCache):
|
||||
self.tree_cache_metrics["total"] += (
|
||||
adder.log_input_tokens + adder.log_hit_tokens
|
||||
) / 10**9
|
||||
self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
|
||||
tree_cache_hit_rate = (
|
||||
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
||||
)
|
||||
else:
|
||||
tree_cache_hit_rate = 0.0
|
||||
self.tree_cache_metrics["total"] += (
|
||||
adder.log_input_tokens + adder.log_hit_tokens
|
||||
) / 10**9
|
||||
self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
|
||||
tree_cache_hit_rate = (
|
||||
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
||||
)
|
||||
|
||||
num_used = self.max_total_num_tokens - (
|
||||
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
|
||||
|
||||
Reference in New Issue
Block a user