Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)

Co-authored-by: Kan Wu <wukanustc@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-06-29 23:16:19 -07:00
committed by GitHub
parent c5131f7a2f
commit 22352d47a9
24 changed files with 626 additions and 160 deletions

View File

@@ -123,6 +123,7 @@ class KVCache(abc.ABC):
self.memory_saver_adapter = TorchMemorySaverAdapter.create(
enable=enable_memory_saver
)
self.mem_usage = 0
# used for chunked cpu-offloading
self.cpu_offloading_chunk_size = 8192
@@ -219,6 +220,7 @@ class MHATokenToKVPool(KVCache):
logger.info(
f"KV Cache is allocated. #tokens: {size}, K size: {k_size / GB:.2f} GB, V size: {v_size / GB:.2f} GB"
)
self.mem_usage = (k_size + v_size) / GB
def _create_buffers(self):
with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
@@ -695,6 +697,7 @@ class MLATokenToKVPool(KVCache):
logger.info(
f"KV Cache is allocated. #tokens: {size}, KV size: {kv_size / GB:.2f} GB"
)
self.mem_usage = kv_size / GB
def get_kv_size_bytes(self):
assert hasattr(self, "kv_buffer")