diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index 70af61ddb..e92b1ddce 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -248,7 +248,7 @@ class PrefillAdder: self.can_run_list.append(req) self._prefill_one_req( - len(req.prefix_indices), + 0, req.extend_input_len, ( min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index a94a0c8c4..6b4d0276c 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -629,16 +629,13 @@ class Scheduler: self.waiting_queue.append(req) def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked): - if isinstance(self.tree_cache, RadixCache): - self.tree_cache_metrics["total"] += ( - adder.log_input_tokens + adder.log_hit_tokens - ) / 10**9 - self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 - tree_cache_hit_rate = ( - self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] - ) - else: - tree_cache_hit_rate = 0.0 + self.tree_cache_metrics["total"] += ( + adder.log_input_tokens + adder.log_hit_tokens + ) / 10**9 + self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 + tree_cache_hit_rate = ( + self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] + ) num_used = self.max_total_num_tokens - ( self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()