Returning a per-request metric for the number of cached_tokens read (#1599)

2024-10-16 20:49:22 +02:00
parent dbec2f1847
commit ecb8bad276
7 changed files with 245 additions and 3 deletions
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -196,6 +196,9 @@ class Req:
        # this does not include the jump forward tokens.
        self.completion_tokens_wo_jump_forward = 0

+        # The number of tokens that were already cached in the KV store
+        self.cached_tokens = 0
+
        # For vision inputs
        self.image_inputs: Optional[ImageInputs] = None

@@ -499,6 +502,13 @@ class ScheduleBatch:

        pt = 0
        for i, req in enumerate(reqs):
+            already_computed = (
+                req.extend_logprob_start_len + 1 + req.cached_tokens
+                if req.extend_logprob_start_len > 0
+                else 0
+            )
+            req.cached_tokens += len(req.prefix_indices) - already_computed
+
            req.req_pool_idx = req_pool_indices[i]
            pre_len, seq_len = len(req.prefix_indices), len(req.fill_ids)
            seq_lens.append(seq_len)
--- a/python/sglang/srt/managers/schedule_policy.py
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -51,6 +51,7 @@ class SchedulePolicy:
                r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
                    rid=r.rid, key=r.adjust_max_prefix_ids()
                )
+
            prefix_computed = True

        if self.policy == "lpm":
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -978,6 +978,7 @@ class Scheduler:
                        "prompt_tokens": len(req.origin_input_ids),
                        "completion_tokens": len(req.output_ids),
                        "completion_tokens_wo_jump_forward": req.completion_tokens_wo_jump_forward,
+                        "cached_tokens": req.cached_tokens,
                        "finish_reason": (
                            req.finished_reason.to_json()
                            if req.finished_reason is not None