model: adapt mllama4 to VisionAttention (#8512)

Co-authored-by: root <mickjagger19@icloud.com>
This commit is contained in:
Wenchen Lo
2025-08-02 00:39:40 -07:00
committed by GitHub
parent 4bec99ecd0
commit ea93079b30
6 changed files with 518 additions and 52 deletions

View File

@@ -12,7 +12,6 @@ import torch
from PIL import Image
from transformers import BaseImageProcessorFast
from sglang.srt.managers.mm_utils import TransportProxyTensor
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.utils import load_audio, load_image, load_video, logger
@@ -218,8 +217,10 @@ class BaseMultimodalProcessor(ABC):
kwargs["audio"] = audios
processor = self._processor
if hasattr(processor, "image_processor") and isinstance(
processor.image_processor, BaseImageProcessorFast
if (
hasattr(processor, "image_processor")
and isinstance(processor.image_processor, BaseImageProcessorFast)
and not self.server_args.disable_fast_image_processor
):
kwargs["device"] = "cuda"
result = processor.__call__(