Fix cache hit rate when chunked prefill (#2555)
This commit is contained in:
@@ -248,7 +248,7 @@ class PrefillAdder:
|
|||||||
self.can_run_list.append(req)
|
self.can_run_list.append(req)
|
||||||
|
|
||||||
self._prefill_one_req(
|
self._prefill_one_req(
|
||||||
len(req.prefix_indices),
|
0,
|
||||||
req.extend_input_len,
|
req.extend_input_len,
|
||||||
(
|
(
|
||||||
min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
|
min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
|
||||||
|
|||||||
@@ -629,16 +629,13 @@ class Scheduler:
|
|||||||
self.waiting_queue.append(req)
|
self.waiting_queue.append(req)
|
||||||
|
|
||||||
def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
|
def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
|
||||||
if isinstance(self.tree_cache, RadixCache):
|
self.tree_cache_metrics["total"] += (
|
||||||
self.tree_cache_metrics["total"] += (
|
adder.log_input_tokens + adder.log_hit_tokens
|
||||||
adder.log_input_tokens + adder.log_hit_tokens
|
) / 10**9
|
||||||
) / 10**9
|
self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
|
||||||
self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
|
tree_cache_hit_rate = (
|
||||||
tree_cache_hit_rate = (
|
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
||||||
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
)
|
||||||
)
|
|
||||||
else:
|
|
||||||
tree_cache_hit_rate = 0.0
|
|
||||||
|
|
||||||
num_used = self.max_total_num_tokens - (
|
num_used = self.max_total_num_tokens - (
|
||||||
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
|
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
|
||||||
|
|||||||
Reference in New Issue
Block a user