[VLM] Support chunk prefill for VLM (#6355)

Co-authored-by: yizhang2077 <1109276519@qq.com>
This commit is contained in:
Chang Su
2025-05-22 20:32:41 -07:00
committed by GitHub
parent 0a4fc73b48
commit 4685fbb888
20 changed files with 510 additions and 184 deletions

View File

@@ -102,6 +102,7 @@ from sglang.srt.managers.io_struct import (
UpdateWeightsFromTensorReqInput,
UpdateWeightsFromTensorReqOutput,
)
from sglang.srt.managers.mm_utils import init_embedding_cache
from sglang.srt.managers.schedule_batch import (
FINISH_ABORT,
MultimodalInputs,
@@ -2282,6 +2283,10 @@ def run_scheduler_process(
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
embedding_cache_size = 100
if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
init_embedding_cache(embedding_cache_size * 1024 * 1024)
# Create a scheduler and run the event loop
try:
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)