Tiny move files to utils folder (#11166)

2025-10-03 22:40:06 +08:00
parent 04b86b3c5c
commit fdc4e1e570
66 changed files with 91 additions and 79 deletions
--- a/benchmark/json_schema/bench_sglang.py
+++ b/benchmark/json_schema/bench_sglang.py
@@ -8,7 +8,7 @@ from datasets import load_dataset
 import sglang as sgl
 from sglang.global_config import global_config
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    add_common_sglang_args_and_parse,
    select_sglang_backend,
--- a/benchmark/multi_turn_chat/long_prompt_multi_turn.py
+++ b/benchmark/multi_turn_chat/long_prompt_multi_turn.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from tqdm import tqdm
 import sglang as sgl
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    add_common_sglang_args_and_parse,
    select_sglang_backend,
--- a/examples/runtime/token_in_token_out/token_in_token_out_llm_engine.py
+++ b/examples/runtime/token_in_token_out/token_in_token_out_llm_engine.py
@@ -3,7 +3,7 @@ This example demonstrates how to provide tokenized ids to LLM as input instead o
 """
 import sglang as sgl
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
--- a/examples/runtime/token_in_token_out/token_in_token_out_llm_server.py
+++ b/examples/runtime/token_in_token_out/token_in_token_out_llm_server.py
@@ -7,7 +7,7 @@ python token_in_token_out_llm_server.py
 import requests
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import is_in_ci
 from sglang.utils import terminate_process, wait_for_server
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -60,7 +60,6 @@ import torch.distributed as dist
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.layers.moe import initialize_moe_config
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
@@ -78,6 +77,7 @@ from sglang.srt.utils import (
    set_gpu_proc_affinity,
    suppress_other_loggers,
 )
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
@dataclasses.dataclass
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -635,7 +635,7 @@ def get_tokenizer(
    if pretrained_model_name_or_path.endswith(
        ".json"
    ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer
        return get_tokenizer(pretrained_model_name_or_path)
--- a/python/sglang/lang/backend/runtime_endpoint.py
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -433,7 +433,7 @@ class Runtime:
        self.endpoint.cache_prefix(prefix)
    def get_tokenizer(self):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer
        return get_tokenizer(
            self.server_args.tokenizer_path,
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -23,16 +23,16 @@ import torch
 from transformers import PretrainedConfig
 from sglang.srt.environ import envs
-from sglang.srt.hf_transformers_utils import (
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import is_hip, retry
 from sglang.srt.utils.hf_transformers_utils import (
    get_config,
    get_context_length,
    get_generation_config,
    get_hf_text_config,
    get_sparse_attention_config,
 )
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import is_hip, retry
 from sglang.utils import is_in_ci
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/lora/lora.py
+++ b/python/sglang/srt/lora/lora.py
@@ -26,12 +26,12 @@ import torch
 from torch import nn
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
 from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader
 from sglang.srt.utils.hf_transformers_utils import AutoConfig
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/lora/lora_manager.py
+++ b/python/sglang/srt/lora/lora_manager.py
@@ -21,7 +21,6 @@ from typing import Dict, Iterable, List, Optional, Set, Tuple
 import torch
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
 from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
 from sglang.srt.lora.lora import LoRAAdapter
@@ -39,6 +38,7 @@ from sglang.srt.managers.io_struct import LoRAUpdateOutput
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import replace_submodule
 from sglang.srt.utils.hf_transformers_utils import AutoConfig
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/lora/mem_pool.py
+++ b/python/sglang/srt/lora/mem_pool.py
@@ -4,7 +4,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 import torch
 from sglang.srt.distributed import divide
 from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.layers import BaseLayerWithLoRA
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.lora_config import LoRAConfig
@@ -17,6 +16,7 @@ from sglang.srt.lora.utils import (
    get_stacked_multiply,
    get_target_module_name,
 )
 from sglang.srt.utils.hf_transformers_utils import AutoConfig
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/lora/utils.py
+++ b/python/sglang/srt/lora/utils.py
@@ -5,7 +5,7 @@ from typing import Iterable, Optional, Set, Tuple
 import torch
-from sglang.srt.hf_transformers_utils import AutoConfig
+from sglang.srt.utils.hf_transformers_utils import AutoConfig
@dataclass
--- a/python/sglang/srt/managers/detokenizer_manager.py
+++ b/python/sglang/srt/managers/detokenizer_manager.py
@@ -24,7 +24,6 @@ import psutil
 import setproctitle
 import zmq
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
    BatchEmbeddingOutput,
    BatchMultimodalDecodeReq,
@@ -42,6 +41,7 @@ from sglang.srt.utils import (
    get_zmq_socket,
    kill_itself_when_parent_died,
 )
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.utils import (
    TypeBasedDispatcher,
    find_printable_text,
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -60,11 +60,6 @@ from sglang.srt.disaggregation.utils import (
 )
 from sglang.srt.distributed import get_pp_group, get_world_group
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
 from sglang.srt.hf_transformers_utils import (
    get_processor,
    get_tokenizer,
    get_tokenizer_from_processor,
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.moe import initialize_moe_config
@@ -190,6 +185,11 @@ from sglang.srt.utils import (
    set_random_seed,
    suppress_other_loggers,
 )
 from sglang.srt.utils.hf_transformers_utils import (
    get_processor,
    get_tokenizer,
    get_tokenizer_from_processor,
 )
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/managers/scheduler_input_blocker.py
+++ b/python/sglang/srt/managers/scheduler_input_blocker.py
@@ -17,7 +17,7 @@ from enum import Enum, auto
 from typing import Any, List, Optional
 from sglang.srt.managers.io_struct import BlockReqInput, BlockReqType
-from sglang.srt.poll_based_barrier import PollBasedBarrier
+from sglang.srt.utils.poll_based_barrier import PollBasedBarrier
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/managers/scheduler_profiler_mixin.py
+++ b/python/sglang/srt/managers/scheduler_profiler_mixin.py
@@ -204,7 +204,7 @@ class SchedulerProfilerMixin:
            torch.distributed.barrier(self.tp_cpu_group)
            if self.tp_rank == 0:
-                from sglang.srt.rpd_utils import rpd_to_chrome_trace
+                from sglang.srt.utils.rpd_utils import rpd_to_chrome_trace
                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
            self.rpd_profiler = None
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -43,11 +43,6 @@ from fastapi import BackgroundTasks
 from sglang.srt.aio_rwlock import RWLock
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.hf_transformers_utils import (
    get_processor,
    get_tokenizer,
    get_tokenizer_from_processor,
 )
 from sglang.srt.lora.lora_registry import LoRARegistry
 from sglang.srt.managers.async_dynamic_batch_tokenizer import AsyncDynamicbatchTokenizer
 from sglang.srt.managers.disagg_service import start_disagg_service
@@ -99,6 +94,11 @@ from sglang.srt.utils import (
    get_zmq_socket,
    kill_process_tree,
 )
 from sglang.srt.utils.hf_transformers_utils import (
    get_processor,
    get_tokenizer,
    get_tokenizer_from_processor,
 )
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
--- a/python/sglang/srt/managers/tp_worker.py
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -22,11 +22,6 @@ import torch
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed import get_pp_group, get_world_group
 from sglang.srt.hf_transformers_utils import (
    get_processor,
    get_tokenizer,
    get_tokenizer_from_processor,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.io_struct import (
    DestroyWeightsUpdateGroupReqInput,
@@ -49,9 +44,14 @@ from sglang.srt.model_executor.forward_batch_info import (
    PPProxyTensors,
 )
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj, set_random_seed
 from sglang.srt.utils.hf_transformers_utils import (
    get_processor,
    get_tokenizer,
    get_tokenizer_from_processor,
 )
 from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
 if TYPE_CHECKING:
    from sglang.srt.managers.cache_controller import LayerDoneCounter
--- a/python/sglang/srt/model_executor/cpu_graph_runner.py
+++ b/python/sglang/srt/model_executor/cpu_graph_runner.py
@@ -34,7 +34,6 @@ from sglang.srt.model_executor.forward_batch_info import (
    ForwardMode,
    PPProxyTensors,
 )
 from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
    log_info_on_rank0,
@@ -43,6 +42,7 @@ from sglang.srt.utils import (
    require_mlp_sync,
    require_mlp_tp_gather,
 )
 from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -48,7 +48,6 @@ from sglang.srt.model_executor.forward_batch_info import (
    PPProxyTensors,
    enable_num_token_non_padded,
 )
 from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
 from sglang.srt.utils import (
    empty_context,
@@ -62,6 +61,7 @@ from sglang.srt.utils import (
    require_mlp_sync,
    require_mlp_tp_gather,
 )
 from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
 _is_hip = is_hip()
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -29,7 +29,6 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.distributed as dist
 from sglang.srt import slow_rank_detector
 from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig, LoadFormat
 from sglang.srt.configs.model_config import AttentionArch, ModelConfig
@@ -115,7 +114,6 @@ from sglang.srt.offloader import (
    get_offloader,
    set_offloader,
 )
 from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
@@ -140,7 +138,9 @@ from sglang.srt.utils import (
    monkey_patch_p2p_access_check,
    monkey_patch_vllm_gguf_config,
    set_cuda_arch,
    slow_rank_detector,
 )
 from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.weight_sync.tensor_bucket import (
    FlattenedTensorBucket,
    FlattenedTensorMetadata,
--- a/python/sglang/srt/models/dots_ocr.py
+++ b/python/sglang/srt/models/dots_ocr.py
@@ -9,7 +9,6 @@ import torch.nn as nn
 from transformers.activations import ACT2FN
 from sglang.srt.configs import DotsOCRConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
@@ -23,6 +22,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/gemma3_mm.py
+++ b/python/sglang/srt/models/gemma3_mm.py
@@ -23,7 +23,6 @@ import torch
 from torch import nn
 from transformers import Gemma3Config, PreTrainedModel
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.layernorm import Gemma3RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -44,6 +43,7 @@ from sglang.srt.model_loader.weight_utils import (
 from sglang.srt.models.gemma3_causal import Gemma3ForCausalLM
 from sglang.srt.models.siglip import SiglipVisionModel
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/gemma3n_mm.py
+++ b/python/sglang/srt/models/gemma3n_mm.py
@@ -14,7 +14,6 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import AutoModel
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -38,6 +37,7 @@ from sglang.srt.model_loader.weight_utils import (
 from sglang.srt.models.gemma3n_audio import Gemma3nAudioEncoder
 from sglang.srt.models.gemma3n_causal import Gemma3nRMSNorm, Gemma3nTextModel
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/glm4v.py
+++ b/python/sglang/srt/models/glm4v.py
@@ -7,7 +7,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.attention import vision_utils
 from sglang.srt.layers.layernorm import RMSNorm
@@ -28,6 +27,7 @@ from sglang.srt.models.qwen2_5_vl import (
    Qwen2_5_VLForConditionalGeneration,
 )
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/glm4v_moe.py
+++ b/python/sglang/srt/models/glm4v_moe.py
@@ -10,7 +10,6 @@ from sglang.srt.distributed import (
    get_moe_expert_parallel_world_size,
    get_tensor_model_parallel_world_size,
 )
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.attention import vision_utils
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
@@ -22,6 +21,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.glm4_moe import Glm4MoeModel
 from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
 from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
 from sglang.srt.utils.hf_transformers_utils import get_processor
 _is_cuda = is_cuda()
--- a/python/sglang/srt/models/qwen2_5_vl.py
+++ b/python/sglang/srt/models/qwen2_5_vl.py
@@ -40,7 +40,6 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
    Qwen2_5_VisionRotaryEmbedding,
 )
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -61,6 +60,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2Model
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/qwen2_audio.py
+++ b/python/sglang/srt/models/qwen2_audio.py
@@ -39,7 +39,6 @@ from transformers.models.qwen2_audio.modeling_qwen2_audio import (
    Qwen2AudioMultiModalProjector,
 )
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -61,6 +60,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/qwen2_vl.py
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -33,7 +33,6 @@ from einops import rearrange
 from transformers import Qwen2VLConfig
 from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -50,6 +49,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2Model
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/qwen3_vl.py
+++ b/python/sglang/srt/models/qwen3_vl.py
@@ -28,7 +28,6 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
 )
 from sglang.srt.configs.qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -45,6 +44,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_vl import Qwen2VLVideoInputs
 from sglang.srt.models.qwen3 import Qwen3Model
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/models/qwen3_vl_moe.py
+++ b/python/sglang/srt/models/qwen3_vl_moe.py
@@ -34,7 +34,6 @@ from sglang.srt.distributed import (
    get_pp_group,
    get_tensor_model_parallel_rank,
 )
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.pooler import Pooler, PoolingType
@@ -58,6 +57,7 @@ from sglang.srt.models.qwen3_vl import (
    Qwen3VLForConditionalGeneration,
 )
 from sglang.srt.utils import add_prefix
 from sglang.srt.utils.hf_transformers_utils import get_processor
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -24,7 +24,6 @@ from typing import List, Literal, Optional, Union
 from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -47,6 +46,7 @@ from sglang.srt.utils import (
    nullable_str,
    parse_connector_type,
 )
 from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
 from sglang.utils import is_in_ci
 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/utils/init.py
+++ b/python/sglang/srt/utils/init.py
@@ -0,0 +1,2 @@
 # Temporarily do this to avoid changing all imports in the repo
 from .common import *
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
--- a/python/sglang/srt/utils/hf_transformers_utils.py
+++ b/python/sglang/srt/utils/hf_transformers_utils.py
--- a/python/sglang/srt/utils/patch_torch.py
+++ b/python/sglang/srt/utils/patch_torch.py
--- a/python/sglang/srt/utils/poll_based_barrier.py
+++ b/python/sglang/srt/utils/poll_based_barrier.py
--- a/python/sglang/srt/utils/rpd_utils.py
+++ b/python/sglang/srt/utils/rpd_utils.py
--- a/python/sglang/srt/utils/slow_rank_detector.py
+++ b/python/sglang/srt/utils/slow_rank_detector.py
--- a/python/sglang/srt/weight_sync/utils.py
+++ b/python/sglang/srt/weight_sync/utils.py
@@ -33,7 +33,7 @@ async def update_weights(
    """
    infer_tp_size = device_mesh[device_mesh_key].mesh.size()[0]
    infer_tp_rank = device_mesh[device_mesh_key].get_local_rank()
-    from sglang.srt.patch_torch import monkey_patch_torch_reductions
+    from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
    monkey_patch_torch_reductions()
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -30,8 +30,8 @@ from transformers import (
 )
 from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import load_image
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
 DEFAULT_PROMPTS = [
--- a/python/sglang/test/test_programs.py
+++ b/python/sglang/test/test_programs.py
@@ -551,7 +551,7 @@ def test_gen_min_new_tokens():
    We verify that the number of tokens in the answer is >= the min_tokens threshold.
    """
    import sglang as sgl
-    from sglang.srt.hf_transformers_utils import get_tokenizer
+    from sglang.srt.utils.hf_transformers_utils import get_tokenizer
    model_path = sgl.global_config.default_backend.endpoint.get_model_name()
    MIN_TOKENS, MAX_TOKENS = 64, 128
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -921,7 +921,7 @@ def run_score_benchmark(
    async def _run_benchmark():
        # Load tokenizer for generating test data
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer
        tokenizer = get_tokenizer(model)
--- a/scripts/playground/load_tokenizer.py
+++ b/scripts/playground/load_tokenizer.py
@@ -1,7 +1,7 @@
 import argparse
 import code
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
--- a/scripts/playground/reference_hf.py
+++ b/scripts/playground/reference_hf.py
@@ -38,7 +38,7 @@ from transformers import (
    AutoProcessor,
 )
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
@torch.no_grad()
--- a/test/srt/openai_server/basic/test_openai_server.py
+++ b/test/srt/openai_server/basic/test_openai_server.py
@@ -13,8 +13,8 @@ import numpy as np
 import openai
 import requests
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.runners import TEST_RERANK_QUERY_DOCS
 from sglang.test.test_utils import (
    DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST,
--- a/test/srt/openai_server/features/test_enable_thinking.py
+++ b/test/srt/openai_server/features/test_enable_thinking.py
@@ -16,8 +16,8 @@ import unittest
 import openai
 import requests
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/openai_server/features/test_openai_server_ebnf.py
+++ b/test/srt/openai_server/features/test_openai_server_ebnf.py
@@ -2,8 +2,8 @@ import re
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/openai_server/features/test_openai_server_hidden_states.py
+++ b/test/srt/openai_server/features/test_openai_server_hidden_states.py
@@ -8,8 +8,8 @@ import numpy as np
 import openai
 import torch
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
--- a/test/srt/openai_server/function_call/test_openai_function_calling.py
+++ b/test/srt/openai_server/function_call/test_openai_function_calling.py
@@ -4,8 +4,8 @@ import unittest
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/openai_server/function_call/test_tool_choice.py
+++ b/test/srt/openai_server/function_call/test_tool_choice.py
@@ -12,8 +12,8 @@ import unittest
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
--- a/test/srt/openai_server/validation/test_large_max_new_tokens.py
+++ b/test/srt/openai_server/validation/test_large_max_new_tokens.py
@@ -9,8 +9,8 @@ from concurrent.futures import ThreadPoolExecutor
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/openai_server/validation/test_openai_server_ignore_eos.py
+++ b/test/srt/openai_server/validation/test_openai_server_ignore_eos.py
@@ -1,7 +1,7 @@
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/rl/test_verl_engine_2_gpu.py
+++ b/test/srt/rl/test_verl_engine_2_gpu.py
@@ -19,8 +19,8 @@ from torch.distributed.fsdp.api import (
 from transformers import AutoModelForCausalLM
 from sglang.srt.entrypoints.verl_engine import VerlEngine
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import is_port_available
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.runners import (
    HFRunner,
    SRTRunner,
--- a/test/srt/rl/test_verl_engine_4_gpu.py
+++ b/test/srt/rl/test_verl_engine_4_gpu.py
@@ -19,8 +19,8 @@ from torch.distributed.fsdp.api import (
 from transformers import AutoModelForCausalLM
 from sglang.srt.entrypoints.verl_engine import VerlEngine
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import is_port_available
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.runners import (
    HFRunner,
    SRTRunner,
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -4,7 +4,7 @@ import unittest
 import requests
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
--- a/test/srt/test_eagle_infer_a.py
+++ b/test/srt/test_eagle_infer_a.py
@@ -4,8 +4,8 @@ import requests
 import torch
 import sglang as sgl
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
--- a/test/srt/test_fim_completion.py
+++ b/test/srt/test_fim_completion.py
@@ -2,8 +2,8 @@ import unittest
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
--- a/test/srt/test_forward_split_prefill.py
+++ b/test/srt/test_forward_split_prefill.py
@@ -13,13 +13,13 @@ import numpy as np
 import torch
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
--- a/test/srt/test_function_call_parser.py
+++ b/test/srt/test_function_call_parser.py
@@ -15,7 +15,7 @@ from sglang.srt.function_call.mistral_detector import MistralDetector
 from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
 from sglang.srt.function_call.qwen25_detector import Qwen25Detector
-from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
--- a/test/srt/test_patch_torch.py
+++ b/test/srt/test_patch_torch.py
@@ -6,7 +6,7 @@ from typing import Dict, List
 import torch
 import torch.multiprocessing as mp
-from sglang.srt.patch_torch import monkey_patch_torch_reductions
+from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
 class TestReleaseMemoryOccupation(unittest.TestCase):
--- a/test/srt/test_sagemaker_server.py
+++ b/test/srt/test_sagemaker_server.py
@@ -7,8 +7,8 @@ import unittest
 import requests
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/test_session_control.py
+++ b/test/srt/test_session_control.py
@@ -13,8 +13,8 @@ import unittest
 import aiohttp
 import requests
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
--- a/test/srt/test_srt_engine.py
+++ b/test/srt/test_srt_engine.py
@@ -12,8 +12,8 @@ import torch
 import sglang as sgl
 from sglang.bench_offline_throughput import BenchArgs, throughput_test
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.few_shot_gsm8k_engine import run_eval
 from sglang.test.test_utils import (
    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
--- a/test/srt/test_tokenizer_batch_encode.py
+++ b/test/srt/test_tokenizer_batch_encode.py
@@ -34,7 +34,9 @@ class TestTokenizerBatchEncode(unittest.TestCase):
        with patch("zmq.asyncio.Context"), patch(
            "sglang.srt.utils.get_zmq_socket"
-        ), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
+        ), patch(
            "sglang.srt.utils.hf_transformers_utils.get_tokenizer"
        ) as mock_tokenizer:
            mock_tokenizer.return_value = Mock(vocab_size=32000)
            self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
--- a/test/srt/test_tokenizer_manager.py
+++ b/test/srt/test_tokenizer_manager.py
@@ -31,7 +31,9 @@ class TestInputFormatDetection(unittest.TestCase):
        with patch("zmq.asyncio.Context"), patch(
            "sglang.srt.utils.get_zmq_socket"
-        ), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
+        ), patch(
            "sglang.srt.utils.hf_transformers_utils.get_tokenizer"
        ) as mock_tokenizer:
            mock_tokenizer.return_value = Mock(vocab_size=32000)
            self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
@@ -125,7 +127,9 @@ class TestTokenizerInputPreparation(unittest.TestCase):
        with patch("zmq.asyncio.Context"), patch(
            "sglang.srt.utils.get_zmq_socket"
-        ), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
+        ), patch(
            "sglang.srt.utils.hf_transformers_utils.get_tokenizer"
        ) as mock_tokenizer:
            mock_tokenizer.return_value = Mock(vocab_size=32000)
            self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
@@ -177,7 +181,9 @@ class TestTokenizerResultExtraction(unittest.TestCase):
        with patch("zmq.asyncio.Context"), patch(
            "sglang.srt.utils.get_zmq_socket"
-        ), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
+        ), patch(
            "sglang.srt.utils.hf_transformers_utils.get_tokenizer"
        ) as mock_tokenizer:
            mock_tokenizer.return_value = Mock(vocab_size=32000)
            self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
@@ -279,7 +285,9 @@ class TestTokenizerManagerIntegration(unittest.TestCase):
        with patch("zmq.asyncio.Context"), patch(
            "sglang.srt.utils.get_zmq_socket"
-        ), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
+        ), patch(
            "sglang.srt.utils.hf_transformers_utils.get_tokenizer"
        ) as mock_tokenizer:
            mock_tokenizer.return_value = Mock(vocab_size=32000)
            self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
		`@@ -0,0 +1,2 @@`
							`# Temporarily do this to avoid changing all imports in the repo`
							`from .common import *`