Tiny move files to utils folder (#11166)
This commit is contained in:
@@ -8,7 +8,7 @@ from datasets import load_dataset
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.global_config import global_config
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
add_common_sglang_args_and_parse,
|
||||
select_sglang_backend,
|
||||
|
||||
@@ -7,7 +7,7 @@ from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
add_common_sglang_args_and_parse,
|
||||
select_sglang_backend,
|
||||
|
||||
@@ -3,7 +3,7 @@ This example demonstrates how to provide tokenized ids to LLM as input instead o
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ python token_in_token_out_llm_server.py
|
||||
|
||||
import requests
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import is_in_ci
|
||||
from sglang.utils import terminate_process, wait_for_server
|
||||
|
||||
|
||||
@@ -60,7 +60,6 @@ import torch.distributed as dist
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.distributed.parallel_state import destroy_distributed_environment
|
||||
from sglang.srt.entrypoints.engine import _set_envs_and_config
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.layers.moe import initialize_moe_config
|
||||
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
||||
from sglang.srt.managers.scheduler import Scheduler
|
||||
@@ -78,6 +77,7 @@ from sglang.srt.utils import (
|
||||
set_gpu_proc_affinity,
|
||||
suppress_other_loggers,
|
||||
)
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
|
||||
@@ -635,7 +635,7 @@ def get_tokenizer(
|
||||
if pretrained_model_name_or_path.endswith(
|
||||
".json"
|
||||
) or pretrained_model_name_or_path.endswith(".model"):
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
return get_tokenizer(pretrained_model_name_or_path)
|
||||
|
||||
|
||||
@@ -433,7 +433,7 @@ class Runtime:
|
||||
self.endpoint.cache_prefix(prefix)
|
||||
|
||||
def get_tokenizer(self):
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
return get_tokenizer(
|
||||
self.server_args.tokenizer_path,
|
||||
|
||||
@@ -23,16 +23,16 @@ import torch
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from sglang.srt.environ import envs
|
||||
from sglang.srt.hf_transformers_utils import (
|
||||
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import is_hip, retry
|
||||
from sglang.srt.utils.hf_transformers_utils import (
|
||||
get_config,
|
||||
get_context_length,
|
||||
get_generation_config,
|
||||
get_hf_text_config,
|
||||
get_sparse_attention_config,
|
||||
)
|
||||
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import is_hip, retry
|
||||
from sglang.utils import is_in_ci
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -26,12 +26,12 @@ import torch
|
||||
from torch import nn
|
||||
|
||||
from sglang.srt.configs.load_config import LoadConfig
|
||||
from sglang.srt.hf_transformers_utils import AutoConfig
|
||||
from sglang.srt.lora.backend.base_backend import BaseLoRABackend
|
||||
from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
|
||||
from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
|
||||
from sglang.srt.lora.lora_config import LoRAConfig
|
||||
from sglang.srt.model_loader.loader import DefaultModelLoader
|
||||
from sglang.srt.utils.hf_transformers_utils import AutoConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||
import torch
|
||||
|
||||
from sglang.srt.configs.load_config import LoadConfig
|
||||
from sglang.srt.hf_transformers_utils import AutoConfig
|
||||
from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
|
||||
from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
|
||||
from sglang.srt.lora.lora import LoRAAdapter
|
||||
@@ -39,6 +38,7 @@ from sglang.srt.managers.io_struct import LoRAUpdateOutput
|
||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import replace_submodule
|
||||
from sglang.srt.utils.hf_transformers_utils import AutoConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
|
||||
import torch
|
||||
|
||||
from sglang.srt.distributed import divide
|
||||
from sglang.srt.hf_transformers_utils import AutoConfig
|
||||
from sglang.srt.lora.layers import BaseLayerWithLoRA
|
||||
from sglang.srt.lora.lora import LoRAAdapter
|
||||
from sglang.srt.lora.lora_config import LoRAConfig
|
||||
@@ -17,6 +16,7 @@ from sglang.srt.lora.utils import (
|
||||
get_stacked_multiply,
|
||||
get_target_module_name,
|
||||
)
|
||||
from sglang.srt.utils.hf_transformers_utils import AutoConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ from typing import Iterable, Optional, Set, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.srt.hf_transformers_utils import AutoConfig
|
||||
from sglang.srt.utils.hf_transformers_utils import AutoConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -24,7 +24,6 @@ import psutil
|
||||
import setproctitle
|
||||
import zmq
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.managers.io_struct import (
|
||||
BatchEmbeddingOutput,
|
||||
BatchMultimodalDecodeReq,
|
||||
@@ -42,6 +41,7 @@ from sglang.srt.utils import (
|
||||
get_zmq_socket,
|
||||
kill_itself_when_parent_died,
|
||||
)
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.utils import (
|
||||
TypeBasedDispatcher,
|
||||
find_printable_text,
|
||||
|
||||
@@ -60,11 +60,6 @@ from sglang.srt.disaggregation.utils import (
|
||||
)
|
||||
from sglang.srt.distributed import get_pp_group, get_world_group
|
||||
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||
from sglang.srt.hf_transformers_utils import (
|
||||
get_processor,
|
||||
get_tokenizer,
|
||||
get_tokenizer_from_processor,
|
||||
)
|
||||
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.layers.moe import initialize_moe_config
|
||||
@@ -190,6 +185,11 @@ from sglang.srt.utils import (
|
||||
set_random_seed,
|
||||
suppress_other_loggers,
|
||||
)
|
||||
from sglang.srt.utils.hf_transformers_utils import (
|
||||
get_processor,
|
||||
get_tokenizer,
|
||||
get_tokenizer_from_processor,
|
||||
)
|
||||
from sglang.utils import TypeBasedDispatcher, get_exception_traceback
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -17,7 +17,7 @@ from enum import Enum, auto
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from sglang.srt.managers.io_struct import BlockReqInput, BlockReqType
|
||||
from sglang.srt.poll_based_barrier import PollBasedBarrier
|
||||
from sglang.srt.utils.poll_based_barrier import PollBasedBarrier
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -204,7 +204,7 @@ class SchedulerProfilerMixin:
|
||||
|
||||
torch.distributed.barrier(self.tp_cpu_group)
|
||||
if self.tp_rank == 0:
|
||||
from sglang.srt.rpd_utils import rpd_to_chrome_trace
|
||||
from sglang.srt.utils.rpd_utils import rpd_to_chrome_trace
|
||||
|
||||
rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
|
||||
self.rpd_profiler = None
|
||||
|
||||
@@ -43,11 +43,6 @@ from fastapi import BackgroundTasks
|
||||
from sglang.srt.aio_rwlock import RWLock
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.disaggregation.utils import DisaggregationMode
|
||||
from sglang.srt.hf_transformers_utils import (
|
||||
get_processor,
|
||||
get_tokenizer,
|
||||
get_tokenizer_from_processor,
|
||||
)
|
||||
from sglang.srt.lora.lora_registry import LoRARegistry
|
||||
from sglang.srt.managers.async_dynamic_batch_tokenizer import AsyncDynamicbatchTokenizer
|
||||
from sglang.srt.managers.disagg_service import start_disagg_service
|
||||
@@ -99,6 +94,11 @@ from sglang.srt.utils import (
|
||||
get_zmq_socket,
|
||||
kill_process_tree,
|
||||
)
|
||||
from sglang.srt.utils.hf_transformers_utils import (
|
||||
get_processor,
|
||||
get_tokenizer,
|
||||
get_tokenizer_from_processor,
|
||||
)
|
||||
from sglang.utils import TypeBasedDispatcher, get_exception_traceback
|
||||
|
||||
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
||||
|
||||
@@ -22,11 +22,6 @@ import torch
|
||||
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.distributed import get_pp_group, get_world_group
|
||||
from sglang.srt.hf_transformers_utils import (
|
||||
get_processor,
|
||||
get_tokenizer,
|
||||
get_tokenizer_from_processor,
|
||||
)
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.managers.io_struct import (
|
||||
DestroyWeightsUpdateGroupReqInput,
|
||||
@@ -49,9 +44,14 @@ from sglang.srt.model_executor.forward_batch_info import (
|
||||
PPProxyTensors,
|
||||
)
|
||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||
from sglang.srt.patch_torch import monkey_patch_torch_reductions
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj, set_random_seed
|
||||
from sglang.srt.utils.hf_transformers_utils import (
|
||||
get_processor,
|
||||
get_tokenizer,
|
||||
get_tokenizer_from_processor,
|
||||
)
|
||||
from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.managers.cache_controller import LayerDoneCounter
|
||||
|
||||
@@ -34,7 +34,6 @@ from sglang.srt.model_executor.forward_batch_info import (
|
||||
ForwardMode,
|
||||
PPProxyTensors,
|
||||
)
|
||||
from sglang.srt.patch_torch import monkey_patch_torch_compile
|
||||
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
||||
from sglang.srt.utils import (
|
||||
log_info_on_rank0,
|
||||
@@ -43,6 +42,7 @@ from sglang.srt.utils import (
|
||||
require_mlp_sync,
|
||||
require_mlp_tp_gather,
|
||||
)
|
||||
from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -48,7 +48,6 @@ from sglang.srt.model_executor.forward_batch_info import (
|
||||
PPProxyTensors,
|
||||
enable_num_token_non_padded,
|
||||
)
|
||||
from sglang.srt.patch_torch import monkey_patch_torch_compile
|
||||
from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
|
||||
from sglang.srt.utils import (
|
||||
empty_context,
|
||||
@@ -62,6 +61,7 @@ from sglang.srt.utils import (
|
||||
require_mlp_sync,
|
||||
require_mlp_tp_gather,
|
||||
)
|
||||
from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
|
||||
|
||||
_is_hip = is_hip()
|
||||
|
||||
|
||||
@@ -29,7 +29,6 @@ from typing import List, Optional, Tuple, Union
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from sglang.srt import slow_rank_detector
|
||||
from sglang.srt.configs.device_config import DeviceConfig
|
||||
from sglang.srt.configs.load_config import LoadConfig, LoadFormat
|
||||
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
|
||||
@@ -115,7 +114,6 @@ from sglang.srt.offloader import (
|
||||
get_offloader,
|
||||
set_offloader,
|
||||
)
|
||||
from sglang.srt.patch_torch import monkey_patch_torch_reductions
|
||||
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
||||
@@ -140,7 +138,9 @@ from sglang.srt.utils import (
|
||||
monkey_patch_p2p_access_check,
|
||||
monkey_patch_vllm_gguf_config,
|
||||
set_cuda_arch,
|
||||
slow_rank_detector,
|
||||
)
|
||||
from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
|
||||
from sglang.srt.weight_sync.tensor_bucket import (
|
||||
FlattenedTensorBucket,
|
||||
FlattenedTensorMetadata,
|
||||
|
||||
@@ -9,7 +9,6 @@ import torch.nn as nn
|
||||
from transformers.activations import ACT2FN
|
||||
|
||||
from sglang.srt.configs import DotsOCRConfig
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
||||
from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
@@ -23,6 +22,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer
|
||||
from sglang.srt.models.qwen2 import Qwen2ForCausalLM
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ import torch
|
||||
from torch import nn
|
||||
from transformers import Gemma3Config, PreTrainedModel
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.layernorm import Gemma3RMSNorm
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
||||
@@ -44,6 +43,7 @@ from sglang.srt.model_loader.weight_utils import (
|
||||
from sglang.srt.models.gemma3_causal import Gemma3ForCausalLM
|
||||
from sglang.srt.models.siglip import SiglipVisionModel
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@ from transformers import (
|
||||
)
|
||||
from transformers.models.auto.modeling_auto import AutoModel
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.layernorm import RMSNorm
|
||||
from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
@@ -38,6 +37,7 @@ from sglang.srt.model_loader.weight_utils import (
|
||||
from sglang.srt.models.gemma3n_audio import Gemma3nAudioEncoder
|
||||
from sglang.srt.models.gemma3n_causal import Gemma3nRMSNorm, Gemma3nTextModel
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.activation import SiluAndMul
|
||||
from sglang.srt.layers.attention import vision_utils
|
||||
from sglang.srt.layers.layernorm import RMSNorm
|
||||
@@ -28,6 +27,7 @@ from sglang.srt.models.qwen2_5_vl import (
|
||||
Qwen2_5_VLForConditionalGeneration,
|
||||
)
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ from sglang.srt.distributed import (
|
||||
get_moe_expert_parallel_world_size,
|
||||
get_tensor_model_parallel_world_size,
|
||||
)
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.attention import vision_utils
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
|
||||
@@ -22,6 +21,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.glm4_moe import Glm4MoeModel
|
||||
from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
|
||||
from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
_is_cuda = is_cuda()
|
||||
|
||||
|
||||
@@ -40,7 +40,6 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
|
||||
Qwen2_5_VisionRotaryEmbedding,
|
||||
)
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.attention.vision import VisionAttention
|
||||
from sglang.srt.layers.layernorm import RMSNorm
|
||||
from sglang.srt.layers.linear import (
|
||||
@@ -61,6 +60,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.qwen2 import Qwen2Model
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@ from transformers.models.qwen2_audio.modeling_qwen2_audio import (
|
||||
Qwen2AudioMultiModalProjector,
|
||||
)
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.activation import QuickGELU
|
||||
from sglang.srt.layers.attention.vision import VisionAttention
|
||||
from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
@@ -61,6 +60,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.qwen2 import Qwen2ForCausalLM
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ from einops import rearrange
|
||||
from transformers import Qwen2VLConfig
|
||||
from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.activation import QuickGELU
|
||||
from sglang.srt.layers.attention.vision import VisionAttention
|
||||
from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
@@ -50,6 +49,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.qwen2 import Qwen2Model
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -28,7 +28,6 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
|
||||
)
|
||||
|
||||
from sglang.srt.configs.qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.attention.vision import VisionAttention
|
||||
from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
@@ -45,6 +44,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.qwen2_vl import Qwen2VLVideoInputs
|
||||
from sglang.srt.models.qwen3 import Qwen3Model
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -34,7 +34,6 @@ from sglang.srt.distributed import (
|
||||
get_pp_group,
|
||||
get_tensor_model_parallel_rank,
|
||||
)
|
||||
from sglang.srt.hf_transformers_utils import get_processor
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
|
||||
from sglang.srt.layers.pooler import Pooler, PoolingType
|
||||
@@ -58,6 +57,7 @@ from sglang.srt.models.qwen3_vl import (
|
||||
Qwen3VLForConditionalGeneration,
|
||||
)
|
||||
from sglang.srt.utils import add_prefix
|
||||
from sglang.srt.utils.hf_transformers_utils import get_processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -24,7 +24,6 @@ from typing import List, Literal, Optional, Union
|
||||
|
||||
from sglang.srt.connector import ConnectorType
|
||||
from sglang.srt.function_call.function_call_parser import FunctionCallParser
|
||||
from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
|
||||
from sglang.srt.lora.lora_registry import LoRARef
|
||||
from sglang.srt.parser.reasoning_parser import ReasoningParser
|
||||
from sglang.srt.utils import (
|
||||
@@ -47,6 +46,7 @@ from sglang.srt.utils import (
|
||||
nullable_str,
|
||||
parse_connector_type,
|
||||
)
|
||||
from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
|
||||
from sglang.utils import is_in_ci
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
2
python/sglang/srt/utils/__init__.py
Normal file
2
python/sglang/srt/utils/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# Temporarily do this to avoid changing all imports in the repo
|
||||
from .common import *
|
||||
@@ -33,7 +33,7 @@ async def update_weights(
|
||||
"""
|
||||
infer_tp_size = device_mesh[device_mesh_key].mesh.size()[0]
|
||||
infer_tp_rank = device_mesh[device_mesh_key].get_local_rank()
|
||||
from sglang.srt.patch_torch import monkey_patch_torch_reductions
|
||||
from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
|
||||
|
||||
monkey_patch_torch_reductions()
|
||||
|
||||
|
||||
@@ -30,8 +30,8 @@ from transformers import (
|
||||
)
|
||||
|
||||
from sglang.srt.entrypoints.engine import Engine
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import load_image
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
|
||||
|
||||
DEFAULT_PROMPTS = [
|
||||
|
||||
@@ -551,7 +551,7 @@ def test_gen_min_new_tokens():
|
||||
We verify that the number of tokens in the answer is >= the min_tokens threshold.
|
||||
"""
|
||||
import sglang as sgl
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
model_path = sgl.global_config.default_backend.endpoint.get_model_name()
|
||||
MIN_TOKENS, MAX_TOKENS = 64, 128
|
||||
|
||||
@@ -921,7 +921,7 @@ def run_score_benchmark(
|
||||
async def _run_benchmark():
|
||||
|
||||
# Load tokenizer for generating test data
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
tokenizer = get_tokenizer(model)
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import argparse
|
||||
import code
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@@ -38,7 +38,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
)
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
|
||||
@@ -13,8 +13,8 @@ import numpy as np
|
||||
import openai
|
||||
import requests
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.runners import TEST_RERANK_QUERY_DOCS
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST,
|
||||
|
||||
@@ -16,8 +16,8 @@ import unittest
|
||||
import openai
|
||||
import requests
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -2,8 +2,8 @@ import re
|
||||
|
||||
import openai
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -8,8 +8,8 @@ import numpy as np
|
||||
import openai
|
||||
import torch
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
|
||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
|
||||
|
||||
@@ -4,8 +4,8 @@ import unittest
|
||||
|
||||
import openai
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -12,8 +12,8 @@ import unittest
|
||||
|
||||
import openai
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
|
||||
@@ -9,8 +9,8 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import openai
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import openai
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -19,8 +19,8 @@ from torch.distributed.fsdp.api import (
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from sglang.srt.entrypoints.verl_engine import VerlEngine
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import is_port_available
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.runners import (
|
||||
HFRunner,
|
||||
SRTRunner,
|
||||
|
||||
@@ -19,8 +19,8 @@ from torch.distributed.fsdp.api import (
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from sglang.srt.entrypoints.verl_engine import VerlEngine
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import is_port_available
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.runners import (
|
||||
HFRunner,
|
||||
SRTRunner,
|
||||
|
||||
@@ -4,7 +4,7 @@ import unittest
|
||||
|
||||
import requests
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
|
||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
|
||||
|
||||
@@ -4,8 +4,8 @@ import requests
|
||||
import torch
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
|
||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
|
||||
|
||||
@@ -2,8 +2,8 @@ import unittest
|
||||
|
||||
import openai
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
|
||||
@@ -13,13 +13,13 @@ import numpy as np
|
||||
import torch
|
||||
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||
from sglang.srt.sampling.sampling_params import SamplingParams
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
|
||||
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from sglang.srt.function_call.mistral_detector import MistralDetector
|
||||
from sglang.srt.function_call.pythonic_detector import PythonicDetector
|
||||
from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
|
||||
from sglang.srt.function_call.qwen25_detector import Qwen25Detector
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import Dict, List
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
from sglang.srt.patch_torch import monkey_patch_torch_reductions
|
||||
from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
|
||||
|
||||
|
||||
class TestReleaseMemoryOccupation(unittest.TestCase):
|
||||
|
||||
@@ -7,8 +7,8 @@ import unittest
|
||||
|
||||
import requests
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -13,8 +13,8 @@ import unittest
|
||||
import aiohttp
|
||||
import requests
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
|
||||
@@ -12,8 +12,8 @@ import torch
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.bench_offline_throughput import BenchArgs, throughput_test
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
||||
from sglang.test.few_shot_gsm8k_engine import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
|
||||
|
||||
@@ -34,7 +34,9 @@ class TestTokenizerBatchEncode(unittest.TestCase):
|
||||
|
||||
with patch("zmq.asyncio.Context"), patch(
|
||||
"sglang.srt.utils.get_zmq_socket"
|
||||
), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
|
||||
), patch(
|
||||
"sglang.srt.utils.hf_transformers_utils.get_tokenizer"
|
||||
) as mock_tokenizer:
|
||||
|
||||
mock_tokenizer.return_value = Mock(vocab_size=32000)
|
||||
self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
|
||||
|
||||
@@ -31,7 +31,9 @@ class TestInputFormatDetection(unittest.TestCase):
|
||||
|
||||
with patch("zmq.asyncio.Context"), patch(
|
||||
"sglang.srt.utils.get_zmq_socket"
|
||||
), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
|
||||
), patch(
|
||||
"sglang.srt.utils.hf_transformers_utils.get_tokenizer"
|
||||
) as mock_tokenizer:
|
||||
mock_tokenizer.return_value = Mock(vocab_size=32000)
|
||||
self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
|
||||
|
||||
@@ -125,7 +127,9 @@ class TestTokenizerInputPreparation(unittest.TestCase):
|
||||
|
||||
with patch("zmq.asyncio.Context"), patch(
|
||||
"sglang.srt.utils.get_zmq_socket"
|
||||
), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
|
||||
), patch(
|
||||
"sglang.srt.utils.hf_transformers_utils.get_tokenizer"
|
||||
) as mock_tokenizer:
|
||||
mock_tokenizer.return_value = Mock(vocab_size=32000)
|
||||
self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
|
||||
|
||||
@@ -177,7 +181,9 @@ class TestTokenizerResultExtraction(unittest.TestCase):
|
||||
|
||||
with patch("zmq.asyncio.Context"), patch(
|
||||
"sglang.srt.utils.get_zmq_socket"
|
||||
), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
|
||||
), patch(
|
||||
"sglang.srt.utils.hf_transformers_utils.get_tokenizer"
|
||||
) as mock_tokenizer:
|
||||
mock_tokenizer.return_value = Mock(vocab_size=32000)
|
||||
self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
|
||||
|
||||
@@ -279,7 +285,9 @@ class TestTokenizerManagerIntegration(unittest.TestCase):
|
||||
|
||||
with patch("zmq.asyncio.Context"), patch(
|
||||
"sglang.srt.utils.get_zmq_socket"
|
||||
), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
|
||||
), patch(
|
||||
"sglang.srt.utils.hf_transformers_utils.get_tokenizer"
|
||||
) as mock_tokenizer:
|
||||
mock_tokenizer.return_value = Mock(vocab_size=32000)
|
||||
self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user