[VLM] Support chunk prefill for VLM (#6355)
Co-authored-by: yizhang2077 <1109276519@qq.com>
This commit is contained in:
@@ -166,6 +166,9 @@ class ModelRunner:
|
||||
self.is_draft_worker = is_draft_worker
|
||||
self.is_generation = model_config.is_generation
|
||||
self.is_multimodal = model_config.is_multimodal
|
||||
self.is_multimodal_chunked_prefill_supported = (
|
||||
model_config.is_multimodal_chunked_prefill_supported
|
||||
)
|
||||
self.spec_algorithm = SpeculativeAlgorithm.from_string(
|
||||
server_args.speculative_algorithm
|
||||
)
|
||||
@@ -389,12 +392,15 @@ class ModelRunner:
|
||||
if self.is_multimodal:
|
||||
self.mem_fraction_static *= 0.90
|
||||
logger.info(
|
||||
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} because this is a multimodal model."
|
||||
)
|
||||
server_args.chunked_prefill_size = -1
|
||||
logger.info(
|
||||
"Automatically turn off --chunked-prefill-size for multimodal model."
|
||||
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
|
||||
f"because this is a multimodal model."
|
||||
)
|
||||
if not self.is_multimodal_chunked_prefill_supported:
|
||||
server_args.chunked_prefill_size = -1
|
||||
logger.info(
|
||||
f"Automatically turn off --chunked-prefill-size as it is not supported for "
|
||||
f"{self.model_config.hf_config.model_type}"
|
||||
)
|
||||
|
||||
if not self.use_mla_backend:
|
||||
server_args.disable_chunked_prefix_cache = True
|
||||
|
||||
Reference in New Issue
Block a user