Move scheduler code from tp_worker.py to scheduler.py (#1538)
This commit is contained in:
@@ -87,6 +87,7 @@ class ModelRunner:
|
||||
self.model_config.hf_config.architectures
|
||||
)
|
||||
|
||||
# Model-specific adjustment
|
||||
if (
|
||||
self.model_config.attention_arch == AttentionArch.MLA
|
||||
and not self.server_args.disable_mla
|
||||
@@ -94,6 +95,13 @@ class ModelRunner:
|
||||
logger.info("MLA optimization is tunred on. Use triton backend.")
|
||||
self.server_args.attention_backend = "triton"
|
||||
|
||||
if self.is_multimodal_model:
|
||||
logger.info(
|
||||
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
|
||||
)
|
||||
server_args.chunked_prefill_size = None
|
||||
server_args.mem_fraction_static *= 0.95
|
||||
|
||||
global_server_args_dict.update(
|
||||
{
|
||||
"attention_backend": server_args.attention_backend,
|
||||
@@ -104,14 +112,6 @@ class ModelRunner:
|
||||
}
|
||||
)
|
||||
|
||||
# Model-specific adjustment
|
||||
if self.is_multimodal_model:
|
||||
logger.info(
|
||||
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
|
||||
)
|
||||
server_args.chunked_prefill_size = None
|
||||
server_args.mem_fraction_static *= 0.95
|
||||
|
||||
# Init components
|
||||
min_per_gpu_memory = self.init_torch_distributed()
|
||||
self.sampler = Sampler()
|
||||
@@ -400,8 +400,7 @@ class ModelRunner:
|
||||
)
|
||||
|
||||
self.req_to_token_pool = ReqToTokenPool(
|
||||
max_num_reqs + 1,
|
||||
self.model_config.context_len + 4,
|
||||
max_num_reqs + 1, self.model_config.context_len + 4, device="cuda"
|
||||
)
|
||||
if (
|
||||
self.model_config.attention_arch == AttentionArch.MLA
|
||||
|
||||
Reference in New Issue
Block a user