Move multimodal processors into a separate folder (#7581)

This commit is contained in:
Lianmin Zheng
2025-06-27 11:58:24 -07:00
committed by GitHub
parent 41650b0d70
commit ce3a3e8783
29 changed files with 63 additions and 84 deletions

View File

@@ -1,8 +0,0 @@
# COPIED FROM DeepGEMM
def ceil_div(x: int, y: int) -> int:
    """Ceiling integer division: the smallest integer >= x / y."""
    # Equivalent to (x + y - 1) // y: floor((x - 1) / y) + 1 == ceil(x / y).
    return (x - 1) // y + 1
# COPIED FROM DeepGEMM
def align(x: int, y: int) -> int:
    """Round ``x`` up to the nearest multiple of ``y``."""
    return y * ceil_div(x, y)

View File

@@ -19,7 +19,7 @@ from transformers import (
from transformers.image_utils import to_numpy_array from transformers.image_utils import to_numpy_array
from sglang.srt.configs.utils import register_image_processor, register_processor from sglang.srt.configs.utils import register_image_processor, register_processor
from sglang.srt.mm_utils import expand2square from sglang.srt.multimodal.mm_utils import expand2square
class DictToObject(dict): class DictToObject(dict):

View File

@@ -4,9 +4,8 @@ from typing import List, Optional
import torch import torch
import triton import triton
from sglang.math_utils import ceil_div
from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
from sglang.srt.utils import dispose_tensor, is_cuda from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@@ -12,7 +12,6 @@ import torch
import triton import triton
import triton.language as tl import triton.language as tl
from sglang.math_utils import ceil_div
from sglang.srt.layers.moe.topk import select_experts from sglang.srt.layers.moe.topk import select_experts
from sglang.srt.layers.quantization.fp8_kernel import ( from sglang.srt.layers.quantization.fp8_kernel import (
per_token_group_quant_fp8, per_token_group_quant_fp8,
@@ -25,6 +24,7 @@ from sglang.srt.layers.quantization.int8_kernel import (
sglang_per_token_group_quant_int8, sglang_per_token_group_quant_int8,
) )
from sglang.srt.utils import ( from sglang.srt.utils import (
ceil_div,
cpu_has_amx_support, cpu_has_amx_support,
direct_register_custom_op, direct_register_custom_op,
get_bool_env_var, get_bool_env_var,
@@ -32,7 +32,6 @@ from sglang.srt.utils import (
is_cpu, is_cpu,
is_cuda, is_cuda,
is_hip, is_hip,
log_info_on_rank0,
next_power_of_2, next_power_of_2,
) )

View File

@@ -23,9 +23,9 @@ import torch
import triton import triton
import triton.language as tl import triton.language as tl
from sglang.math_utils import align
from sglang.srt.layers.quantization import deep_gemm_wrapper from sglang.srt.layers.quantization import deep_gemm_wrapper
from sglang.srt.utils import ( from sglang.srt.utils import (
align,
direct_register_custom_op, direct_register_custom_op,
get_device_core_count, get_device_core_count,
get_device_name, get_device_name,

View File

@@ -1,9 +1,7 @@
from typing import Callable, List, Optional, Tuple from typing import Callable, List, Optional, Tuple
import einops
import torch import torch
from sglang.math_utils import align
from sglang.srt.layers.quantization import deep_gemm_wrapper from sglang.srt.layers.quantization import deep_gemm_wrapper
from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
from sglang.srt.layers.utils import is_sm100_supported from sglang.srt.layers.utils import is_sm100_supported
@@ -27,6 +25,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
w8a8_block_fp8_matmul_triton, w8a8_block_fp8_matmul_triton,
) )
from sglang.srt.utils import ( from sglang.srt.utils import (
align,
get_bool_env_var, get_bool_env_var,
get_cuda_version, get_cuda_version,
get_device_capability, get_device_capability,

View File

@@ -22,7 +22,7 @@ from dataclasses import dataclass, field
from enum import Enum from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from sglang.srt.mm_utils import has_valid_data from sglang.srt.multimodal.mm_utils import has_valid_data
# handle serialization of Image for pydantic # handle serialization of Image for pydantic
if TYPE_CHECKING: if TYPE_CHECKING:

View File

@@ -2,8 +2,6 @@
Multi-modality utils Multi-modality utils
""" """
import dataclasses
import logging
from abc import abstractmethod from abc import abstractmethod
from typing import Callable, List, Optional, Tuple from typing import Callable, List, Optional, Tuple

View File

@@ -5,9 +5,7 @@ import logging
import pkgutil import pkgutil
from functools import lru_cache from functools import lru_cache
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
BaseMultimodalProcessor,
)
from sglang.srt.server_args import ServerArgs from sglang.srt.server_args import ServerArgs
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -29,7 +27,7 @@ def get_dummy_processor():
@lru_cache() @lru_cache()
def import_processors(): def import_processors():
package_name = "sglang.srt.managers.multimodal_processors" package_name = "sglang.srt.multimodal.processors"
package = importlib.import_module(package_name) package = importlib.import_module(package_name)
for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."): for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
if not ispkg: if not ispkg:

View File

@@ -41,16 +41,16 @@ from sglang.srt.managers.schedule_batch import (
MultimodalDataItem, MultimodalDataItem,
MultimodalInputs, MultimodalInputs,
) )
from sglang.srt.mm_utils import (
get_anyres_image_grid_shape,
unpad_image,
unpad_image_shape,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.llama import LlamaForCausalLM from sglang.srt.models.llama import LlamaForCausalLM
from sglang.srt.models.mistral import MistralForCausalLM from sglang.srt.models.mistral import MistralForCausalLM
from sglang.srt.models.qwen2 import Qwen2ForCausalLM from sglang.srt.models.qwen2 import Qwen2ForCausalLM
from sglang.srt.multimodal.mm_utils import (
get_anyres_image_grid_shape,
unpad_image,
unpad_image_shape,
)
from sglang.srt.utils import add_prefix, flatten_nested_list, logger from sglang.srt.utils import add_prefix, flatten_nested_list, logger

View File

@@ -1,10 +1,8 @@
from typing import List, Union from typing import List, Union
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.clip import CLIPModel from sglang.srt.models.clip import CLIPModel
from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
from sglang.srt.utils import load_image from sglang.srt.utils import load_image

View File

@@ -20,12 +20,12 @@ from typing import List, Union
import torch import torch
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
class DeepseekVL2ImageProcessor(BaseMultimodalProcessor): class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):

View File

@@ -4,11 +4,9 @@ from typing import Dict, List, Union
from sglang.srt.managers.multimodal_processor import ( from sglang.srt.managers.multimodal_processor import (
BaseMultimodalProcessor as SGLangBaseProcessor, BaseMultimodalProcessor as SGLangBaseProcessor,
) )
from sglang.srt.managers.multimodal_processors.base_processor import (
MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
# will be removed in the future # will be removed in the future

View File

@@ -18,10 +18,8 @@ from typing import Dict, List, Optional, Union
from sglang.srt.managers.multimodal_processor import ( from sglang.srt.managers.multimodal_processor import (
BaseMultimodalProcessor as SGLangBaseProcessor, BaseMultimodalProcessor as SGLangBaseProcessor,
) )
from sglang.srt.managers.multimodal_processors.base_processor import (
MultimodalSpecialTokens,
)
from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
class Gemma3nSGLangProcessor(SGLangBaseProcessor): class Gemma3nSGLangProcessor(SGLangBaseProcessor):

View File

@@ -5,12 +5,12 @@ import torch
from decord import VideoReader, cpu from decord import VideoReader, cpu
from PIL import Image from PIL import Image
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.internvl import InternVLChatModel
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.internvl import InternVLChatModel
class InternVLImageProcessor(BaseMultimodalProcessor): class InternVLImageProcessor(BaseMultimodalProcessor):

View File

@@ -1,11 +1,11 @@
from typing import List, Union from typing import List, Union
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
class JanusProImageProcessor(BaseMultimodalProcessor): class JanusProImageProcessor(BaseMultimodalProcessor):

View File

@@ -3,14 +3,12 @@ from typing import Any, Dict, List, Optional, Union
import torch import torch
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor as SGLangBaseProcessor,
)
from sglang.srt.managers.multimodal_processors.base_processor import (
MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor as SGLangBaseProcessor,
)
from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
# Compatible with KimiVLForConditionalGeneration # Compatible with KimiVLForConditionalGeneration

View File

@@ -7,11 +7,7 @@ from transformers.models.auto.processing_auto import (
) )
import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.mm_utils import expand2square, process_anyres_image
from sglang.srt.models.llava import ( from sglang.srt.models.llava import (
LlavaForConditionalGeneration, LlavaForConditionalGeneration,
LlavaLlamaForCausalLM, LlavaLlamaForCausalLM,
@@ -20,6 +16,8 @@ from sglang.srt.models.llava import (
) )
from sglang.srt.models.llavavid import LlavaVidForCausalLM from sglang.srt.models.llavavid import LlavaVidForCausalLM
from sglang.srt.models.mistral import Mistral3ForConditionalGeneration from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
from sglang.srt.multimodal.mm_utils import expand2square, process_anyres_image
from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
from sglang.srt.utils import load_image, logger from sglang.srt.utils import load_image, logger
from sglang.utils import get_exception_traceback from sglang.utils import get_exception_traceback

View File

@@ -2,13 +2,13 @@ from typing import List, Union
import torch import torch
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor,
MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.minicpmo import MiniCPMO from sglang.srt.models.minicpmo import MiniCPMO
from sglang.srt.models.minicpmv import MiniCPMV from sglang.srt.models.minicpmv import MiniCPMV
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor,
MultimodalSpecialTokens,
)
# Compatible with both 'O' and 'V' # Compatible with both 'O' and 'V'

View File

@@ -1,10 +1,8 @@
from typing import List, Union from typing import List, Union
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.mllama import MllamaForConditionalGeneration from sglang.srt.models.mllama import MllamaForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
from sglang.srt.utils import load_image from sglang.srt.utils import load_image

View File

@@ -7,12 +7,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
get_best_fit, get_best_fit,
) )
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
class Mllama4ImageProcessor(BaseMultimodalProcessor): class Mllama4ImageProcessor(BaseMultimodalProcessor):

View File

@@ -1,12 +1,12 @@
import logging import logging
from typing import List, Union from typing import List, Union
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.phi4mm import Phi4MMForCausalLM
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.phi4mm import Phi4MMForCausalLM
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@@ -6,12 +6,12 @@ from transformers.models.pixtral.image_processing_pixtral import (
_num_image_tokens as _get_pixtral_hf_num_image_tokens, _num_image_tokens as _get_pixtral_hf_num_image_tokens,
) )
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.pixtral import PixtralVisionModel
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.pixtral import PixtralVisionModel
class PixtralProcessor(BaseMultimodalProcessor): class PixtralProcessor(BaseMultimodalProcessor):

View File

@@ -7,15 +7,13 @@ import torch
from PIL import Image from PIL import Image
from sglang.srt.layers.rotary_embedding import MRotaryEmbedding from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor as SGLangBaseProcessor,
)
from sglang.srt.managers.multimodal_processors.base_processor import (
MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor as SGLangBaseProcessor,
)
from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
# Compatible with Qwen2VL and Qwen2_5VL # Compatible with Qwen2VL and Qwen2_5VL

View File

@@ -10,12 +10,12 @@ from sglang.srt.managers.io_struct import (
GenerateReqInput, GenerateReqInput,
ImageDataItem, ImageDataItem,
) )
from sglang.srt.managers.multimodal_processors.base_processor import ( from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.vila import VILAForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
MultimodalSpecialTokens, MultimodalSpecialTokens,
) )
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.vila import VILAForConditionalGeneration
from sglang.srt.server_args import ServerArgs from sglang.srt.server_args import ServerArgs

View File

@@ -2577,3 +2577,13 @@ def configure_gc_logger():
) )
gc.callbacks.append(gc_callback) gc.callbacks.append(gc_callback)
# COPIED FROM DeepGEMM
def ceil_div(x: int, y: int) -> int:
    """Return the ceiling of x / y using only integer arithmetic."""
    # Same value as (x + y - 1) // y for any nonzero y.
    return (x - 1) // y + 1
# COPIED FROM DeepGEMM
def align(x: int, y: int) -> int:
    """Round x up to the nearest multiple of the alignment y."""
    return ceil_div(x, y) * y

View File

@@ -23,15 +23,13 @@ from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.conversation import generate_chat_conv from sglang.srt.conversation import generate_chat_conv
from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
from sglang.srt.managers.mm_utils import embed_mm_inputs, init_embedding_cache from sglang.srt.managers.mm_utils import embed_mm_inputs, init_embedding_cache
from sglang.srt.managers.multimodal_processors.base_processor import (
BaseMultimodalProcessor,
)
from sglang.srt.managers.schedule_batch import ( from sglang.srt.managers.schedule_batch import (
Modality, Modality,
MultimodalDataItem, MultimodalDataItem,
MultimodalInputs, MultimodalInputs,
) )
from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.model_executor.model_runner import ModelRunner
from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
from sglang.srt.server_args import ServerArgs from sglang.srt.server_args import ServerArgs