[Optimization] Update estimated_num_new_pages logic in TokenToKVPoolAllocator (#8794)

Signed-off-by: Xingrui Yi <yixingrui@linux.alibaba.com>
Co-authored-by: Xingrui Yi <yixingrui@linux.alibaba.com>
This commit is contained in:
YiXR
2025-08-11 07:01:51 +08:00
committed by GitHub
parent e322a94d1f
commit 0418b9d4ea
2 changed files with 56 additions and 49 deletions

View File

@@ -1300,6 +1300,8 @@ class ModelRunner:
dtype=self.kv_cache_dtype,
device=self.device,
kvcache=self.token_to_kv_pool,
need_sort=self.server_args.disaggregation_mode
in ("decode", "prefill"),
)
else:
self.token_to_kv_pool_allocator = TokenToKVPoolAllocator(
@@ -1307,6 +1309,8 @@ class ModelRunner:
dtype=self.kv_cache_dtype,
device=self.device,
kvcache=self.token_to_kv_pool,
need_sort=self.server_args.disaggregation_mode
in ("decode", "prefill"),
)
else:
if _is_npu:
@@ -1316,6 +1320,8 @@ class ModelRunner:
dtype=self.kv_cache_dtype,
device=self.device,
kvcache=self.token_to_kv_pool,
need_sort=self.server_args.disaggregation_mode
in ("decode", "prefill"),
)
else:
self.token_to_kv_pool_allocator = PagedTokenToKVPoolAllocator(
@@ -1324,6 +1330,8 @@ class ModelRunner:
dtype=self.kv_cache_dtype,
device=self.device,
kvcache=self.token_to_kv_pool,
need_sort=self.server_args.disaggregation_mode
in ("decode", "prefill"),
)
else:
assert self.is_draft_worker