vlm: optimize tensor transport (#6003)
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
This commit is contained in:
@@ -112,6 +112,7 @@ from sglang.srt.managers.io_struct import (
|
||||
UpdateWeightsFromTensorReqInput,
|
||||
UpdateWeightsFromTensorReqOutput,
|
||||
)
|
||||
from sglang.srt.managers.mm_utils import TensorTransportMode
|
||||
from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
|
||||
from sglang.srt.metrics.collector import TokenizerMetricsCollector
|
||||
from sglang.srt.sampling.sampling_params import SamplingParams
|
||||
@@ -166,6 +167,16 @@ class ReqState:
|
||||
output_token_ids_logprobs_idx: List = dataclasses.field(default_factory=list)
|
||||
|
||||
|
||||
def _determine_tensor_transport_mode(server_args: ServerArgs) -> TensorTransportMode:
|
||||
is_cross_node = server_args.dist_init_addr
|
||||
|
||||
if is_cross_node:
|
||||
# Fallback to default CPU transport for multi-node
|
||||
return "default"
|
||||
else:
|
||||
return "cuda_ipc"
|
||||
|
||||
|
||||
class TokenizerManager:
|
||||
"""TokenizerManager is a process that tokenizes the text."""
|
||||
|
||||
@@ -216,12 +227,13 @@ class TokenizerManager:
|
||||
revision=server_args.revision,
|
||||
use_fast=not server_args.disable_fast_image_processor,
|
||||
)
|
||||
transport_mode = _determine_tensor_transport_mode(self.server_args)
|
||||
|
||||
# We want to parallelize the image pre-processing so we create an executor for it
|
||||
# We create mm_processor for any skip_tokenizer_init to make sure we still encode
|
||||
# images even with skip_tokenizer_init=False.
|
||||
self.mm_processor = get_mm_processor(
|
||||
self.model_config.hf_config, server_args, _processor
|
||||
self.model_config.hf_config, server_args, _processor, transport_mode
|
||||
)
|
||||
|
||||
if server_args.skip_tokenizer_init:
|
||||
|
||||
Reference in New Issue
Block a user