[VLM] Support chunk prefill for VLM (#6355)

Co-authored-by: yizhang2077 <1109276519@qq.com>
This commit is contained in:
Chang Su
2025-05-22 20:32:41 -07:00
committed by GitHub
parent 0a4fc73b48
commit 4685fbb888
20 changed files with 510 additions and 184 deletions

View File

@@ -166,6 +166,9 @@ class ModelRunner:
self.is_draft_worker = is_draft_worker
self.is_generation = model_config.is_generation
self.is_multimodal = model_config.is_multimodal
self.is_multimodal_chunked_prefill_supported = (
model_config.is_multimodal_chunked_prefill_supported
)
self.spec_algorithm = SpeculativeAlgorithm.from_string(
server_args.speculative_algorithm
)
@@ -389,12 +392,15 @@ class ModelRunner:
if self.is_multimodal:
self.mem_fraction_static *= 0.90
logger.info(
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} because this is a multimodal model."
)
server_args.chunked_prefill_size = -1
logger.info(
"Automatically turn off --chunked-prefill-size for multimodal model."
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
f"because this is a multimodal model."
)
if not self.is_multimodal_chunked_prefill_supported:
server_args.chunked_prefill_size = -1
logger.info(
f"Automatically turn off --chunked-prefill-size as it is not supported for "
f"{self.model_config.hf_config.model_type}"
)
if not self.use_mla_backend:
server_args.disable_chunked_prefix_cache = True