Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)
Co-authored-by: Kan Wu <wukanustc@gmail.com>
This commit is contained in:
@@ -123,6 +123,7 @@ class KVCache(abc.ABC):
|
||||
self.memory_saver_adapter = TorchMemorySaverAdapter.create(
|
||||
enable=enable_memory_saver
|
||||
)
|
||||
self.mem_usage = 0
|
||||
|
||||
# used for chunked cpu-offloading
|
||||
self.cpu_offloading_chunk_size = 8192
|
||||
@@ -219,6 +220,7 @@ class MHATokenToKVPool(KVCache):
|
||||
logger.info(
|
||||
f"KV Cache is allocated. #tokens: {size}, K size: {k_size / GB:.2f} GB, V size: {v_size / GB:.2f} GB"
|
||||
)
|
||||
self.mem_usage = (k_size + v_size) / GB
|
||||
|
||||
def _create_buffers(self):
|
||||
with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
|
||||
@@ -695,6 +697,7 @@ class MLATokenToKVPool(KVCache):
|
||||
logger.info(
|
||||
f"KV Cache is allocated. #tokens: {size}, KV size: {kv_size / GB:.2f} GB"
|
||||
)
|
||||
self.mem_usage = kv_size / GB
|
||||
|
||||
def get_kv_size_bytes(self):
|
||||
assert hasattr(self, "kv_buffer")
|
||||
|
||||
Reference in New Issue
Block a user