[VLM] Support chunked prefill for VLM (#6355)
Co-authored-by: yizhang2077 <1109276519@qq.com>
This commit is contained in:
@@ -102,6 +102,7 @@ from sglang.srt.managers.io_struct import (
|
||||
UpdateWeightsFromTensorReqInput,
|
||||
UpdateWeightsFromTensorReqOutput,
|
||||
)
|
||||
from sglang.srt.managers.mm_utils import init_embedding_cache
|
||||
from sglang.srt.managers.schedule_batch import (
|
||||
FINISH_ABORT,
|
||||
MultimodalInputs,
|
||||
@@ -2282,6 +2283,10 @@ def run_scheduler_process(
|
||||
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
|
||||
set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
|
||||
|
||||
embedding_cache_size = 100
|
||||
if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
|
||||
embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
|
||||
init_embedding_cache(embedding_cache_size * 1024 * 1024)
|
||||
# Create a scheduler and run the event loop
|
||||
try:
|
||||
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
|
||||
|
||||
Reference in New Issue
Block a user