Move scheduler code from tp_worker.py to scheduler.py (#1538)

Lianmin Zheng
2024-09-29 17:42:45 -07:00
committed by GitHub
parent acaffd233f
commit f86c1e611f
8 changed files with 933 additions and 870 deletions


@@ -87,6 +87,7 @@ class ModelRunner:
             self.model_config.hf_config.architectures
         )
 
+        # Model-specific adjustment
         if (
             self.model_config.attention_arch == AttentionArch.MLA
             and not self.server_args.disable_mla
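The comment added here labels the MLA adjustment that follows: when the model uses Multi-head Latent Attention and --disable-mla is not set, the runner overrides the requested attention backend with triton. A minimal sketch of that selection logic, where pick_attention_backend is a hypothetical helper (only AttentionArch, the flag names, and the "triton" override come from the diff):

# Sketch only: pick_attention_backend is hypothetical; the real logic lives
# inline in ModelRunner as shown in the surrounding hunks.
from enum import Enum, auto

class AttentionArch(Enum):
    MHA = auto()
    MLA = auto()  # Multi-head Latent Attention

def pick_attention_backend(arch: AttentionArch, disable_mla: bool, requested: str) -> str:
    # MLA models need the triton backend unless MLA is explicitly disabled.
    if arch == AttentionArch.MLA and not disable_mla:
        return "triton"
    return requested

assert pick_attention_backend(AttentionArch.MLA, False, "flashinfer") == "triton"
assert pick_attention_backend(AttentionArch.MHA, False, "flashinfer") == "flashinfer"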
@@ -94,6 +95,13 @@ class ModelRunner:
             logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
 
+        if self.is_multimodal_model:
+            logger.info(
+                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
+            )
+            server_args.chunked_prefill_size = None
+            server_args.mem_fraction_static *= 0.95
+
         global_server_args_dict.update(
             {
                 "attention_backend": server_args.attention_backend,
@@ -104,14 +112,6 @@ class ModelRunner:
             }
         )
 
-        # Model-specific adjustment
-        if self.is_multimodal_model:
-            logger.info(
-                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
-            )
-            server_args.chunked_prefill_size = None
-            server_args.mem_fraction_static *= 0.95
-
         # Init components
         min_per_gpu_memory = self.init_torch_distributed()
         self.sampler = Sampler()
@@ -400,8 +400,7 @@ class ModelRunner:
         )
         self.req_to_token_pool = ReqToTokenPool(
-            max_num_reqs + 1,
-            self.model_config.context_len + 4,
+            max_num_reqs + 1, self.model_config.context_len + 4, device="cuda"
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
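The last hunk collapses the ReqToTokenPool construction onto one line and passes the device explicitly. The pool is sized max_num_reqs + 1 request slots by context_len + 4 token positions; each row maps one running request to the locations of its tokens. A rough sketch of that layout (assumed shape and methods, not sglang's actual class):

# Sketch only: an assumed layout for a request-to-token lookup table.
import torch

class ReqToTokenPoolSketch:
    def __init__(self, size: int, max_context_len: int, device: str = "cuda"):
        # One row per request slot, one column per token position; each cell
        # holds an index into the KV-cache token pool.
        self.req_to_token = torch.zeros(
            (size, max_context_len), dtype=torch.int32, device=device
        )
        self.free_slots = list(range(size))

    def alloc(self) -> int:
        # Reserve a free request slot; returns its row index.
        return self.free_slots.pop()

    def free(self, slot: int) -> None:
        self.free_slots.append(slot)

pool = ReqToTokenPoolSketch(size=8, max_context_len=16, device="cpu")
slot = pool.alloc()
pool.free(slot)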