Fuse top_k and top_p in the sampler (#1457)

2024-09-18 04:35:35 -07:00
parent 1acccb364a
commit 7f24ea95c3
3 changed files with 12 additions and 4 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -400,8 +400,8 @@ class ModelRunner:
            )

        self.req_to_token_pool = ReqToTokenPool(
-            max_num_reqs,
-            self.model_config.context_len + 8,
+            max_num_reqs + 1,
+            self.model_config.context_len + 4,
        )
        if (
            self.model_config.attention_arch == AttentionArch.MLA