# sglang/python/sglang/srt/mem_cache/chunk_cache.py

from __future__ import annotations

"""Cache for chunked prefill, used when RadixCache is disabled."""
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple

import torch

from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator

if TYPE_CHECKING:
    from sglang.srt.managers.schedule_batch import Req


class ChunkCacheEntry:
    """Pair a request id with the KV indices currently cached for it."""

    def __init__(self, rid: str, value: torch.Tensor):
        # ``rid`` names the owning request; ``value`` is the index tensor
        # stored for that request and may be reassigned by the cache later.
        self.rid, self.value = rid, value


class ChunkCache(BasePrefixCache):
    """Per-request KV index bookkeeping for chunked prefill.

    Used when RadixCache is disabled. Nothing is shared across requests: each
    request id maps to a single ``ChunkCacheEntry`` whose ``value`` is the
    slice of ``req_to_token`` recorded by the latest ``cache_unfinished_req``
    call. The eviction and lock-reference hooks are therefore no-ops.
    """

    def __init__(
        self,
        req_to_token_pool: ReqToTokenPool,
        token_to_kv_pool_allocator: TokenToKVPoolAllocator,
    ):
        """Store the two pools and start with no tracked requests.

        Args:
            req_to_token_pool: Pool whose ``req_to_token`` table is indexed by
                ``req.req_pool_idx`` to obtain a request's KV indices.
            token_to_kv_pool_allocator: Allocator that the KV indices are
                returned to once a request finishes.
        """
        # NOTE(review): presumably tells callers that prefix sharing is off;
        # the flag is never read inside this class.
        self.disable = True
        self.req_to_token_pool = req_to_token_pool
        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
        self.entries: Dict[str, ChunkCacheEntry] = {}

        self.reset()

    def reset(self):
        """Forget every tracked request (the pools themselves are untouched)."""
        self.entries = {}

    def match_prefix(
        self, rid: str, key: List[int]
    ) -> Tuple[List[int], Optional[ChunkCacheEntry]]:
        """Look up the indices already recorded for request ``rid``.

        Args:
            rid: Request id, the same key ``cache_unfinished_req`` stores
                entries under.
            key: Token ids of the request; only its length is used, as an
                upper bound on how many recorded indices are returned.

        Returns:
            ``([], None)`` when ``rid`` is not tracked. Otherwise a tuple of
            ``entry.value[:len(key)]`` (a slice of whatever was stored, not a
            plain list) and the entry itself.
        """
        entry = self.entries.get(rid)
        if entry is None:
            return [], None

        max_prefix_len = len(key)
        return entry.value[:max_prefix_len], entry

    def cache_finished_req(self, req: Req, token_ids: Optional[List[int]] = None):
        """Release all resources held for a finished request.

        Args:
            req: The finished request.
            token_ids: When given, its length decides how many KV indices are
                freed. When ``None`` the count is
                ``len(origin_input_ids) + len(output_ids) - 1``.
        """
        if token_ids is None:
            # NOTE(review): the ``- 1`` presumably excludes the final output
            # token, which has no KV slot yet -- confirm against the scheduler.
            token_id_len = len(req.origin_input_ids) + len(req.output_ids) - 1
        else:
            token_id_len = len(token_ids)

        # Read the indices before the request slot is handed back to the pool.
        kv_indices = self.req_to_token_pool.req_to_token[
            req.req_pool_idx, :token_id_len
        ]
        self.req_to_token_pool.free(req.req_pool_idx)
        self.token_to_kv_pool_allocator.free(kv_indices)

        # The request may never have been chunked, so a missing entry is fine.
        self.entries.pop(req.rid, None)

    def cache_unfinished_req(self, req: Req):
        """Record the KV indices filled so far for a still-running request.

        Creates the request's entry on first use, otherwise refreshes it, and
        points ``req.prefix_indices`` / ``req.last_node`` at the result.
        """
        token_id_len = len(req.fill_ids)

        kv_indices = self.req_to_token_pool.req_to_token[
            req.req_pool_idx, :token_id_len
        ]

        entry = self.entries.get(req.rid)
        if entry is None:
            entry = ChunkCacheEntry(req.rid, kv_indices)
            self.entries[req.rid] = entry
        else:
            entry.value = kv_indices

        req.prefix_indices = kv_indices
        req.last_node = entry

    def insert(self):
        """Unsupported for this cache; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def evict(self, num_tokens: int, evict_callback: Callable):
        """No-op: this cache never evicts anything."""
        pass

    def inc_lock_ref(self, node):
        """No-op lock acquisition; always returns 0."""
        return 0

    def dec_lock_ref(self, node):
        """No-op lock release; always returns 0."""
        return 0

    def evictable_size(self):
        """Return 0; nothing held here is reported as evictable."""
        return 0

    def protected_size(self):
        """Return 0; nothing held here is reported as protected."""
        return 0

    def pretty_print(self):
        """Return an empty string; there is no structure to render."""
        return ""
Improve type annotation (#1029) 2024-08-11 02:44:59 -07:00			`from __future__ import annotations`

Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00			`"""Cache for chunked prefill, used when RadixCache is disabled."""`
Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988) Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu> 2025-03-03 00:12:04 -08:00			`from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple`

			`import torch`
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00
			`from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache`
[Eagle] Refactor eagle speculative decoding (#3986) Co-authored-by: Ke Bao <ISPObaoke@163.com> 2025-03-05 08:06:07 -08:00			`from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator`
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00
			`if TYPE_CHECKING:`
			`from sglang.srt.managers.schedule_batch import Req`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00

			`class ChunkCacheEntry:`
Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988) Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu> 2025-03-03 00:12:04 -08:00			`def __init__(self, rid: str, value: torch.Tensor):`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00			`self.rid = rid`
			`self.value = value`


			`class ChunkCache(BasePrefixCache):`
Improve type annotation (#1029) 2024-08-11 02:44:59 -07:00			`def __init__(`
[Eagle] Refactor eagle speculative decoding (#3986) Co-authored-by: Ke Bao <ISPObaoke@163.com> 2025-03-05 08:06:07 -08:00			`self,`
			`req_to_token_pool: ReqToTokenPool,`
			`token_to_kv_pool_allocator: TokenToKVPoolAllocator,`
Improve type annotation (#1029) 2024-08-11 02:44:59 -07:00			`):`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00			`self.disable = True`
			`self.req_to_token_pool = req_to_token_pool`
[Eagle] Refactor eagle speculative decoding (#3986) Co-authored-by: Ke Bao <ISPObaoke@163.com> 2025-03-05 08:06:07 -08:00			`self.token_to_kv_pool_allocator = token_to_kv_pool_allocator`
Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988) Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu> 2025-03-03 00:12:04 -08:00			`self.entries: Dict[str, ChunkCacheEntry] = {}`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00
			`self.reset()`

			`def reset(self):`
			`self.entries = {}`

[Core] in batch prefix caching by delay scheduling (#2442) 2024-12-11 12:51:50 -08:00			`def match_prefix(self, rid: int, key: List[int]) -> Tuple[List[int], int]:`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00			`if rid not in self.entries:`
			`return [], None`

			`entry = self.entries[rid]`
minor: some potential bugs (#1044) 2024-08-11 22:35:44 -07:00			`max_prefix_len = len(key)`
			`return entry.value[:max_prefix_len], entry`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00
Fix the race condition in overlap mode (#1712) 2024-10-19 06:50:56 -07:00			`def cache_finished_req(self, req: Req, token_ids: Optional[List[int]] = None):`
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00			`if token_ids is None:`
Add a new event loop (#1677) 2024-10-16 01:33:20 -07:00			`token_id_len = len(req.origin_input_ids) + len(req.output_ids) - 1`
			`else:`
			`token_id_len = len(token_ids)`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00			`kv_indices = self.req_to_token_pool.req_to_token[`
Fix the race condition in overlap mode (#1712) 2024-10-19 06:50:56 -07:00			`req.req_pool_idx, :token_id_len`
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00			`]`
			`self.req_to_token_pool.free(req.req_pool_idx)`
[Eagle] Refactor eagle speculative decoding (#3986) Co-authored-by: Ke Bao <ISPObaoke@163.com> 2025-03-05 08:06:07 -08:00			`self.token_to_kv_pool_allocator.free(kv_indices)`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00
Fix wrong assert (#1028) 2024-08-11 02:22:16 -07:00			`if req.rid in self.entries:`
			`del self.entries[req.rid]`

Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988) Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu> 2025-03-03 00:12:04 -08:00			`def cache_unfinished_req(self, req: Req):`
			`token_id_len = len(req.fill_ids)`
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00
			`kv_indices = self.req_to_token_pool.req_to_token[`
Add a new event loop (#1677) 2024-10-16 01:33:20 -07:00			`req.req_pool_idx, :token_id_len`
RadixCache method adjust (#977) 2024-08-07 15:52:24 -07:00			`]`

			`if req.rid not in self.entries:`
			`self.entries[req.rid] = ChunkCacheEntry(req.rid, kv_indices)`

			`entry = self.entries[req.rid]`
			`entry.value = kv_indices`
Fix chunked prefill (#984) 2024-08-07 22:28:42 -07:00			`req.prefix_indices = kv_indices`
			`req.last_node = entry`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00
			`def insert(self):`
Re-organize CI tests (#1052) 2024-08-12 03:39:01 -07:00			`raise NotImplementedError()`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00
Improve type annotation (#1029) 2024-08-11 02:44:59 -07:00			`def evict(self, num_tokens: int, evict_callback: Callable):`
Support chunked prefill when radix cache is disabled (#811) 2024-08-01 00:29:01 -07:00			`pass`

			`def inc_lock_ref(self, node):`
			`return 0`

			`def dec_lock_ref(self, node):`
			`return 0`

			`def evictable_size(self):`
			`return 0`
Sanity check to prevent performance regression (#3171) Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com> 2025-01-27 12:28:17 -08:00
Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988) Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu> 2025-03-03 00:12:04 -08:00			`def pretty_print(self):`
			`return ""`

Sanity check to prevent performance regression (#3171) Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com> 2025-01-27 12:28:17 -08:00			`def protected_size(self):`
			`return 0`
[Eagle] Refactor eagle speculative decoding (#3986) Co-authored-by: Ke Bao <ISPObaoke@163.com> 2025-03-05 08:06:07 -08:00
			`def pretty_print(self):`
			`return ""`