model: adapt mllama4 to VisionAttention (#8512)

Co-authored-by: root <mickjagger19@icloud.com>
This commit is contained in:
Wenchen Lo
2025-08-02 00:39:40 -07:00
committed by GitHub
parent 4bec99ecd0
commit ea93079b30
6 changed files with 518 additions and 52 deletions

View File

@@ -70,7 +70,6 @@ from sglang.srt.managers.io_struct import (
BatchMultimodalOut,
BatchStrOut,
BatchTokenIDOut,
BlockReqType,
CloseSessionReqInput,
ConfigureLoggingReq,
EmbeddingReqInput,
@@ -202,13 +201,29 @@ class TokenizerManager:
if self.model_config.is_multimodal:
import_processors()
_processor = get_processor(
server_args.tokenizer_path,
tokenizer_mode=server_args.tokenizer_mode,
trust_remote_code=server_args.trust_remote_code,
revision=server_args.revision,
use_fast=not server_args.disable_fast_image_processor,
)
try:
_processor = get_processor(
server_args.tokenizer_path,
tokenizer_mode=server_args.tokenizer_mode,
trust_remote_code=server_args.trust_remote_code,
revision=server_args.revision,
use_fast=not server_args.disable_fast_image_processor,
)
except ValueError as e:
error_message = str(e)
if "does not have a slow version" in error_message:
logger.info(
f"Processor {server_args.tokenizer_path} does not have a slow version. Automatically use fast version"
)
_processor = get_processor(
server_args.tokenizer_path,
tokenizer_mode=server_args.tokenizer_mode,
trust_remote_code=server_args.trust_remote_code,
revision=server_args.revision,
use_fast=True,
)
else:
raise e
transport_mode = _determine_tensor_transport_mode(self.server_args)
# We want to parallelize the image pre-processing so we create an executor for it