Fuse top_k and top_p in the sampler (#1457)

This commit is contained in:
Lianmin Zheng
2024-09-18 04:35:35 -07:00
committed by GitHub
parent 1acccb364a
commit 7f24ea95c3
3 changed files with 12 additions and 4 deletions

View File

@@ -400,8 +400,8 @@ class ModelRunner:
)
self.req_to_token_pool = ReqToTokenPool(
- max_num_reqs,
- self.model_config.context_len + 8,
+ max_num_reqs + 1,
+ self.model_config.context_len + 4,
)
if (
self.model_config.attention_arch == AttentionArch.MLA