Files
enginex-mthreads-vllm/vllm/v1/core/kv_cache_metrics.py
2026-01-19 10:38:50 +08:00

97 lines
3.1 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""KV cache metrics tracking."""
import random
import time
from collections import deque
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from vllm.v1.core.kv_cache_utils import KVCacheBlock
from vllm.v1.metrics.stats import KVCacheEvictionEvent
class BlockMetricsState:
    """Tracks lifecycle metrics for a single KV cache block.

    All timestamps come from ``time.monotonic_ns()`` and are stored as integer
    nanoseconds; the ``get_*_seconds`` helpers convert to float seconds.

    Attributes are documented inline in ``__init__``.
    """

    # Divisor used to convert nanosecond deltas into seconds.
    _NS_PER_SECOND = 1e9

    def __init__(self):
        """Stamp the block as created (and last accessed) right now."""
        now_ns = time.monotonic_ns()
        # Creation timestamp; never updated afterwards.
        self.birth_time_ns = now_ns
        # Starts equal to the birth time and is refreshed by record_access().
        self.last_access_ns = now_ns
        # Bounded to prevent unbounded growth if a block is accessed many times.
        # With maxlen=4 only the four most recent access timestamps are kept.
        self.access_history: deque[int] = deque(maxlen=4)

    def record_access(self) -> None:
        """Record an access at the current monotonic time."""
        now_ns = time.monotonic_ns()
        self.last_access_ns = now_ns
        self.access_history.append(now_ns)

    def get_lifetime_seconds(self) -> float:
        """Return the seconds elapsed since this state object was created."""
        elapsed_ns = time.monotonic_ns() - self.birth_time_ns
        return elapsed_ns / self._NS_PER_SECOND

    def get_idle_time_seconds(self) -> float:
        """Return the seconds elapsed since the most recent access.

        If ``record_access`` was never called this is measured from creation.
        """
        elapsed_ns = time.monotonic_ns() - self.last_access_ns
        return elapsed_ns / self._NS_PER_SECOND

    def get_reuse_gaps_seconds(self) -> list[float]:
        """Return the gaps, in seconds, between consecutive retained accesses.

        Returns:
            One entry per adjacent pair in ``access_history`` (oldest first);
            an empty list when fewer than two accesses are retained.
        """
        stamps = list(self.access_history)
        # Pairing the list with itself shifted by one yields every adjacent
        # (earlier, later) pair, and nothing at all for 0 or 1 entries.
        return [
            (later - earlier) / self._NS_PER_SECOND
            for earlier, later in zip(stamps, stamps[1:])
        ]
class KVCacheMetricsCollector:
    """Collects KV cache residency metrics with sampling.

    A block is tracked only if it wins a random draw at allocation time. When
    a tracked block is evicted, its lifetime, idle time and reuse gaps are
    wrapped in a ``KVCacheEvictionEvent`` and buffered until ``drain_events``
    hands the buffer to the caller.
    """

    def __init__(self, sample_rate: float = 0.01):
        """Create an empty collector.

        Args:
            sample_rate: Probability, in (0, 1.0], that a newly allocated
                block is tracked.

        Raises:
            AssertionError: If ``sample_rate`` is outside (0, 1.0].
        """
        # NOTE(review): assert statements are skipped under ``python -O``;
        # kept as an assert so the exception type callers see is unchanged.
        assert 0 < sample_rate <= 1.0, (
            f"sample_rate must be in (0, 1.0], got {sample_rate}"
        )
        self.sample_rate = sample_rate
        # Per-block state for the sampled blocks only, keyed by block_id.
        self.block_metrics: dict[int, BlockMetricsState] = {}
        # Events accumulated since the last drain_events() / reset().
        self._eviction_events: list[KVCacheEvictionEvent] = []

    def should_sample_block(self) -> bool:
        """Return True with probability ``sample_rate``."""
        return random.random() < self.sample_rate

    def on_block_allocated(self, block: "KVCacheBlock") -> None:
        """Start tracking ``block`` if it is selected by the sampler.

        A sampled block replaces any state already stored under the same
        ``block_id``; an unsampled block leaves the mapping untouched.
        """
        if self.should_sample_block():
            self.block_metrics[block.block_id] = BlockMetricsState()

    def on_block_accessed(self, block: "KVCacheBlock") -> None:
        """Record an access for ``block``; no-op if it is not tracked."""
        metrics = self.block_metrics.get(block.block_id)
        if metrics:
            metrics.record_access()

    def on_block_evicted(self, block: "KVCacheBlock") -> None:
        """Stop tracking ``block`` and buffer an eviction event for it.

        Does nothing when the block was never sampled.
        """
        metrics = self.block_metrics.pop(block.block_id, None)
        if not metrics:
            return

        event = KVCacheEvictionEvent(
            lifetime_seconds=metrics.get_lifetime_seconds(),
            idle_seconds=metrics.get_idle_time_seconds(),
            reuse_gaps_seconds=tuple(metrics.get_reuse_gaps_seconds()),
        )
        self._eviction_events.append(event)

    def reset(self) -> None:
        """Clear all state on cache reset."""
        self.block_metrics.clear()
        self._eviction_events.clear()

    def drain_events(self) -> list[KVCacheEvictionEvent]:
        """Return the buffered eviction events and start a fresh buffer.

        The returned list is the old buffer itself, so later evictions are
        never appended to a list the caller already holds.
        """
        events, self._eviction_events = self._eviction_events, []
        return events