Split the overlapped version of TpModelWorkerClient into a separate file (#1726)

This commit is contained in:
Lianmin Zheng
2024-10-20 00:29:29 -07:00
committed by GitHub
parent 593b19f29d
commit b48edff67f
7 changed files with 217 additions and 131 deletions

View File

@@ -461,6 +461,7 @@ class ModelRunner:
size=max_num_reqs + 1,
max_context_len=self.model_config.context_len + 4,
device=self.device,
use_records=False,
)
if (
self.model_config.attention_arch == AttentionArch.MLA