diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04eb1ecc3..1e09712ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,9 +27,9 @@ repos: rev: v0.11.7 hooks: - id: ruff - args: [--select=F401, --fixable=F401] - files: ^(benchmark/|docs/|examples/) - exclude: \.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$ + args: ["--select=F401,F821", "--fixable=F401"] + files: ^(benchmark/|docs/|examples/|python/sglang/) + exclude: __init__\.py$|\.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$ - repo: https://github.com/psf/black rev: 24.10.0 hooks: diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py index 5ed175312..de47707c1 100644 --- a/python/sglang/srt/_custom_ops.py +++ b/python/sglang/srt/_custom_ops.py @@ -15,7 +15,7 @@ if not is_hpu(): # ROCm does not use vllm custom allreduce if use_vllm_custom_allreduce and not is_hip(): try: - import vllm._C + import vllm._C # noqa: F401 except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) else: diff --git a/python/sglang/srt/compilation/cuda_piecewise_backend.py b/python/sglang/srt/compilation/cuda_piecewise_backend.py index 9f4b8cc8e..44e3803ff 100644 --- a/python/sglang/srt/compilation/cuda_piecewise_backend.py +++ b/python/sglang/srt/compilation/cuda_piecewise_backend.py @@ -9,7 +9,6 @@ from unittest.mock import patch import torch import torch.fx as fx -import sglang.srt.compilation.weak_ref_tensor_jit from sglang.srt.compilation.compilation_config import CompilationConfig from sglang.srt.compilation.compilation_counter import compilation_counter diff --git a/python/sglang/srt/configs/deepseekvl2.py b/python/sglang/srt/configs/deepseekvl2.py index bcb0afe5a..9621f058b 100644 --- 
a/python/sglang/srt/configs/deepseekvl2.py +++ b/python/sglang/srt/configs/deepseekvl2.py @@ -1,5 +1,4 @@ import math -import os from dataclasses import dataclass from typing import Dict, List, Optional, Tuple diff --git a/python/sglang/srt/configs/dots_vlm.py b/python/sglang/srt/configs/dots_vlm.py index 155d6ee47..dc921582c 100644 --- a/python/sglang/srt/configs/dots_vlm.py +++ b/python/sglang/srt/configs/dots_vlm.py @@ -1,10 +1,5 @@ -from typing import Any, List, Optional, Union - -from transformers import AutoProcessor, LlamaTokenizerFast, PretrainedConfig -from transformers.feature_extraction_utils import BatchFeature -from transformers.image_utils import ImageInput -from transformers.processing_utils import ProcessingKwargs, Unpack -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from transformers import AutoProcessor, PretrainedConfig +from transformers.processing_utils import ProcessingKwargs try: from transformers import Qwen2_5_VLProcessor diff --git a/python/sglang/srt/configs/falcon_h1.py b/python/sglang/srt/configs/falcon_h1.py index d323b056d..b8869b4ff 100644 --- a/python/sglang/srt/configs/falcon_h1.py +++ b/python/sglang/srt/configs/falcon_h1.py @@ -14,17 +14,12 @@ # limitations under the License. 
"""Falcon-H1 model configuration""" -import enum from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_rope_utils import rope_config_validation from transformers.utils import logging from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape -from sglang.srt.layers.dp_attention import ( - get_attention_tp_size, - get_tensor_model_parallel_world_size, -) +from sglang.srt.layers.dp_attention import get_tensor_model_parallel_world_size logger = logging.get_logger(__name__) diff --git a/python/sglang/srt/configs/qwen3_next.py b/python/sglang/srt/configs/qwen3_next.py index 09c9b5a1b..630227a2c 100644 --- a/python/sglang/srt/configs/qwen3_next.py +++ b/python/sglang/srt/configs/qwen3_next.py @@ -21,7 +21,6 @@ from transformers.modeling_rope_utils import rope_config_validation from transformers.utils import logging from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape -from sglang.srt.distributed.utils import divide from sglang.srt.layers.dp_attention import get_attention_tp_size logger = logging.get_logger(__name__) diff --git a/python/sglang/srt/connector/remote_instance.py b/python/sglang/srt/connector/remote_instance.py index e1f00037f..0a4e67cfd 100644 --- a/python/sglang/srt/connector/remote_instance.py +++ b/python/sglang/srt/connector/remote_instance.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Generator, List, Optional, Tuple +from typing import Generator, Optional, Tuple from urllib.parse import urlparse import torch diff --git a/python/sglang/srt/disaggregation/ascend/transfer_engine.py b/python/sglang/srt/disaggregation/ascend/transfer_engine.py index a1fe58ce6..a701838b6 100644 --- a/python/sglang/srt/disaggregation/ascend/transfer_engine.py +++ b/python/sglang/srt/disaggregation/ascend/transfer_engine.py @@ -1,6 +1,6 @@ import logging import os -from typing import List, Optional +from typing import List import torch diff --git 
a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 45589ec51..5e05cdd74 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -25,7 +25,7 @@ import time from collections import deque from dataclasses import dataclass from http import HTTPStatus -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union import torch from torch.distributed import ProcessGroup @@ -48,10 +48,7 @@ from sglang.srt.disaggregation.utils import ( ) from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch -from sglang.srt.mem_cache.allocator import ( - BaseTokenToKVPoolAllocator, - SWATokenToKVPoolAllocator, -) +from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache from sglang.srt.mem_cache.memory_pool import ( HybridLinearKVPool, @@ -61,7 +58,6 @@ from sglang.srt.mem_cache.memory_pool import ( ReqToTokenPool, SWAKVPool, ) -from sglang.srt.model_executor.forward_batch_info import ForwardMode from sglang.srt.utils import get_int_env_var, require_mlp_sync from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 23cd0dd17..86ef0498f 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -20,7 +20,6 @@ Life cycle of a request in the prefill server from __future__ import annotations import logging -import threading import time from collections import deque from http import HTTPStatus @@ -54,7 +53,7 @@ from sglang.srt.mem_cache.memory_pool import ( NSATokenToKVPool, SWAKVPool, ) -from sglang.srt.model_executor.forward_batch_info import ForwardMode, 
PPProxyTensors +from sglang.srt.model_executor.forward_batch_info import PPProxyTensors from sglang.srt.utils import ( DynamicGradMode, broadcast_pyobj, diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py index bb7128206..72668bf2e 100644 --- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py +++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py @@ -32,7 +32,7 @@ try: ops.meta_size() else: # Use custom allreduce from sgl kernel (ROCM and TRT-LLM) - import sgl_kernel + import sgl_kernel # noqa: F401 custom_ar = True except Exception: # For CPUs diff --git a/python/sglang/srt/distributed/device_communicators/pymscclpp.py b/python/sglang/srt/distributed/device_communicators/pymscclpp.py index 78269ed05..5d7511c2c 100644 --- a/python/sglang/srt/distributed/device_communicators/pymscclpp.py +++ b/python/sglang/srt/distributed/device_communicators/pymscclpp.py @@ -4,7 +4,7 @@ import math import os from contextlib import contextmanager from enum import IntEnum -from typing import Any, Callable, List, Optional, TypeVar, Union +from typing import Optional, Union import torch import torch.distributed as dist @@ -24,7 +24,7 @@ if _is_hip: mscclpp_is_available = False if _is_cuda: try: - import sgl_kernel + import sgl_kernel # noqa: F401 mscclpp_is_available = True except: diff --git a/python/sglang/srt/distributed/device_communicators/symm_mem.py b/python/sglang/srt/distributed/device_communicators/symm_mem.py index 0d69a33a2..48e20627e 100644 --- a/python/sglang/srt/distributed/device_communicators/symm_mem.py +++ b/python/sglang/srt/distributed/device_communicators/symm_mem.py @@ -9,7 +9,7 @@ from torch.distributed import ProcessGroup from sglang.srt.distributed.device_communicators.all_reduce_utils import ( SYMM_MEM_ALL_REDUCE_MAX_SIZES, ) -from sglang.srt.utils import get_device_capability, is_cuda, is_hip +from 
sglang.srt.utils import is_cuda, is_hip try: import torch.distributed._symmetric_memory as torch_symm_mem diff --git a/python/sglang/srt/distributed/naive_distributed.py b/python/sglang/srt/distributed/naive_distributed.py index 61165d90c..b340ff44d 100644 --- a/python/sglang/srt/distributed/naive_distributed.py +++ b/python/sglang/srt/distributed/naive_distributed.py @@ -1,5 +1,4 @@ import base64 -import os import pickle import time from pathlib import Path diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py index 9314083b4..972c0f4f3 100644 --- a/python/sglang/srt/entrypoints/context.py +++ b/python/sglang/srt/entrypoints/context.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copied from vLLM -import json import logging from abc import ABC, abstractmethod from typing import Union diff --git a/python/sglang/srt/entrypoints/harmony_utils.py b/python/sglang/srt/entrypoints/harmony_utils.py index ad6350d16..68bbbf094 100644 --- a/python/sglang/srt/entrypoints/harmony_utils.py +++ b/python/sglang/srt/entrypoints/harmony_utils.py @@ -3,7 +3,6 @@ # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py # Slight differences in processing chat messages import datetime -import json from collections.abc import Iterable from typing import Literal, Optional, Union diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index 335be026d..00fe4ca17 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -19,7 +19,6 @@ This file implements HTTP APIs for the inference engine via fastapi. 
import asyncio import dataclasses -import json import logging import multiprocessing as multiprocessing import os diff --git a/python/sglang/srt/entrypoints/http_server_engine.py b/python/sglang/srt/entrypoints/http_server_engine.py index d1db80d65..9ab665a05 100644 --- a/python/sglang/srt/entrypoints/http_server_engine.py +++ b/python/sglang/srt/entrypoints/http_server_engine.py @@ -1,15 +1,9 @@ -import copy -import dataclasses import multiprocessing -import pickle -import threading import time -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple -import pybase64 import requests import torch -import torch.distributed as dist from sglang.srt.entrypoints.EngineBase import EngineBase from sglang.srt.entrypoints.http_server import launch_server diff --git a/python/sglang/srt/eplb/eplb_algorithms/deepseek.py b/python/sglang/srt/eplb/eplb_algorithms/deepseek.py index 180ccdee4..34bbc4910 100644 --- a/python/sglang/srt/eplb/eplb_algorithms/deepseek.py +++ b/python/sglang/srt/eplb/eplb_algorithms/deepseek.py @@ -3,8 +3,6 @@ from typing import Tuple import torch -from sglang.srt.utils import get_bool_env_var - def balanced_packing( weight: torch.Tensor, num_packs: int diff --git a/python/sglang/srt/function_call/glm4_moe_detector.py b/python/sglang/srt/function_call/glm4_moe_detector.py index 845b5d41f..301d0e0de 100644 --- a/python/sglang/srt/function_call/glm4_moe_detector.py +++ b/python/sglang/srt/function_call/glm4_moe_detector.py @@ -6,11 +6,7 @@ from typing import List from sglang.srt.entrypoints.openai.protocol import Tool from sglang.srt.function_call.base_format_detector import BaseFormatDetector -from sglang.srt.function_call.core_types import ( - StreamingParseResult, - StructureInfo, - _GetInfoFunc, -) +from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc from sglang.srt.function_call.ebnf_composer import EBNFComposer logger = logging.getLogger(__name__) diff --git 
a/python/sglang/srt/function_call/json_array_parser.py b/python/sglang/srt/function_call/json_array_parser.py index 5144cb83b..6d6bffc99 100644 --- a/python/sglang/srt/function_call/json_array_parser.py +++ b/python/sglang/srt/function_call/json_array_parser.py @@ -1,5 +1,3 @@ -import json -import re from typing import List from sglang.srt.entrypoints.openai.protocol import Tool diff --git a/python/sglang/srt/function_call/utils.py b/python/sglang/srt/function_call/utils.py index 5ad3f6e89..d85e5e6c0 100644 --- a/python/sglang/srt/function_call/utils.py +++ b/python/sglang/srt/function_call/utils.py @@ -1,4 +1,3 @@ -import json from json import JSONDecodeError, JSONDecoder from json.decoder import WHITESPACE from typing import Any, List, Literal, Optional, Tuple, Union diff --git a/python/sglang/srt/grpc/compile_proto.py b/python/sglang/srt/grpc/compile_proto.py index 7aa145075..c2c4c0aa6 100755 --- a/python/sglang/srt/grpc/compile_proto.py +++ b/python/sglang/srt/grpc/compile_proto.py @@ -70,7 +70,7 @@ def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> b # Check if grpc_tools is available try: - import grpc_tools.protoc + import grpc_tools.protoc # noqa: F401 except ImportError: print("Error: grpcio-tools not installed") print( diff --git a/python/sglang/srt/grpc/grpc_request_manager.py b/python/sglang/srt/grpc/grpc_request_manager.py index a8acb4bc4..81845388b 100644 --- a/python/sglang/srt/grpc/grpc_request_manager.py +++ b/python/sglang/srt/grpc/grpc_request_manager.py @@ -27,7 +27,6 @@ from sglang.srt.managers.io_struct import ( TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, ) -from sglang.srt.managers.scheduler import is_health_check_generate_req from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.utils import get_zmq_socket, kill_process_tree from sglang.utils import get_exception_traceback diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index 
5dc48821a..f9bb6d6f5 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -380,4 +380,7 @@ if not ( logger.info( "sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries." ) - from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul + from vllm.model_executor.layers.activation import ( # noqa: F401 + GeluAndMul, + SiluAndMul, + ) diff --git a/python/sglang/srt/layers/attention/ascend_backend.py b/python/sglang/srt/layers/attention/ascend_backend.py index bc118d6c5..f795c65d0 100644 --- a/python/sglang/srt/layers/attention/ascend_backend.py +++ b/python/sglang/srt/layers/attention/ascend_backend.py @@ -20,7 +20,6 @@ if TYPE_CHECKING: from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.model_runner import ModelRunner -import os import numpy as np diff --git a/python/sglang/srt/layers/attention/base_attn_backend.py b/python/sglang/srt/layers/attention/base_attn_backend.py index d0ab5ca82..dcbf1c8fd 100644 --- a/python/sglang/srt/layers/attention/base_attn_backend.py +++ b/python/sglang/srt/layers/attention/base_attn_backend.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional import torch diff --git a/python/sglang/srt/layers/attention/fla/chunk.py b/python/sglang/srt/layers/attention/fla/chunk.py index a48a9e649..21d93ac00 100644 --- a/python/sglang/srt/layers/attention/fla/chunk.py +++ b/python/sglang/srt/layers/attention/fla/chunk.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang -import warnings from typing import Optional import torch diff --git a/python/sglang/srt/layers/attention/fla/chunk_o.py b/python/sglang/srt/layers/attention/fla/chunk_o.py index d672c646b..b2ae826f7 100644 --- a/python/sglang/srt/layers/attention/fla/chunk_o.py 
+++ b/python/sglang/srt/layers/attention/fla/chunk_o.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang -from typing import Optional, Tuple +from typing import Optional import torch import triton diff --git a/python/sglang/srt/layers/attention/fla/index.py b/python/sglang/srt/layers/attention/fla/index.py index 754b98714..31b2e524e 100644 --- a/python/sglang/srt/layers/attention/fla/index.py +++ b/python/sglang/srt/layers/attention/fla/index.py @@ -3,9 +3,7 @@ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang import torch -import torch.nn.functional as F import triton -import triton.language as tl from sglang.srt.layers.attention.fla.utils import tensor_cache diff --git a/python/sglang/srt/layers/attention/fla/layernorm_gated.py b/python/sglang/srt/layers/attention/fla/layernorm_gated.py index 50b7244c6..b7dd39b12 100644 --- a/python/sglang/srt/layers/attention/fla/layernorm_gated.py +++ b/python/sglang/srt/layers/attention/fla/layernorm_gated.py @@ -5,7 +5,6 @@ # This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling. # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. 
-import math import torch import torch.nn.functional as F diff --git a/python/sglang/srt/layers/attention/fla/wy_fast.py b/python/sglang/srt/layers/attention/fla/wy_fast.py index d51500eb4..fa39312df 100644 --- a/python/sglang/srt/layers/attention/fla/wy_fast.py +++ b/python/sglang/srt/layers/attention/fla/wy_fast.py @@ -9,8 +9,6 @@ import triton import triton.language as tl from sglang.srt.layers.attention.fla.index import prepare_chunk_indices -from sglang.srt.layers.attention.fla.op import safe_exp -from sglang.srt.layers.attention.fla.utils import check_shared_mem @triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index ab4398b0b..33ff82ca6 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -50,7 +50,6 @@ if is_flashinfer_available(): fast_decode_plan, ) from flashinfer.cascade import merge_state - from flashinfer.decode import _get_range_buf, get_seq_lens class WrapperDispatch(Enum): diff --git a/python/sglang/srt/layers/attention/hybrid_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_attn_backend.py index 7a78fd4d1..4f1439c26 100644 --- a/python/sglang/srt/layers/attention/hybrid_attn_backend.py +++ b/python/sglang/srt/layers/attention/hybrid_attn_backend.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional import torch diff --git a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py index 7f2e90255..5ea9e6c8e 100644 --- a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py +++ b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @@ -1,9 +1,6 @@ -from dataclasses import astuple, dataclass -from functools import lru_cache from typing import Optional, Union import torch 
-import torch.nn.functional as F from sglang.srt.layers.attention.base_attn_backend import AttentionBackend from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule diff --git a/python/sglang/srt/layers/attention/intel_amx_backend.py b/python/sglang/srt/layers/attention/intel_amx_backend.py index 39e5c7428..4b2974c44 100644 --- a/python/sglang/srt/layers/attention/intel_amx_backend.py +++ b/python/sglang/srt/layers/attention/intel_amx_backend.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: class IntelAMXAttnBackend(AttentionBackend): def __init__(self, model_runner: ModelRunner): - import sgl_kernel + import sgl_kernel # noqa: F401 super().__init__() self.forward_metadata = None diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py index dbd9dac34..88a65ddd0 100644 --- a/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py @@ -4,7 +4,6 @@ from typing import List, Optional, Union -import numpy as np import torch import triton import triton.language as tl diff --git a/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py b/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py index d27fc562e..6e2e74752 100644 --- a/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py +++ b/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py @@ -10,7 +10,6 @@ import torch import triton -import triton.language as tl from einops import rearrange from packaging import version diff --git a/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py b/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py index 06a552545..76f802bd2 100644 --- a/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py +++ b/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py @@ -13,7 +13,7 @@ def is_mla_preprocess_enabled() -> bool: if is_mla_preprocess_enabled(): - import 
sgl_kernel_npu + import sgl_kernel_npu # noqa: F401 import torch_npu torch.npu.config.allow_internal_format = True diff --git a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py index ebb5b85da..b9f399899 100644 --- a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py +++ b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Optional import torch import torch.nn.functional as F @@ -547,7 +547,7 @@ class Indexer(CustomOp): forward_batch: ForwardBatch, layer_id: int, ) -> torch.Tensor: - import custom_ops + import custom_ops # noqa: F401 import torch_npu from sglang.srt.layers.dp_attention import ( diff --git a/python/sglang/srt/layers/attention/nsa_backend.py b/python/sglang/srt/layers/attention/nsa_backend.py index 74d293fd3..6ec4652f4 100644 --- a/python/sglang/srt/layers/attention/nsa_backend.py +++ b/python/sglang/srt/layers/attention/nsa_backend.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Literal, Optional, TypeAlias @@ -34,18 +33,18 @@ _is_hip = is_hip() if _is_hip: try: - from aiter import ( + from aiter import ( # noqa: F401 flash_attn_varlen_func, mha_batch_prefill_func, paged_attention_ragged, ) - from aiter.mla import mla_decode_fwd, mla_prefill_fwd + from aiter.mla import mla_decode_fwd, mla_prefill_fwd # noqa: F401 except ImportError: print( "aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device." 
) else: - from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache + from sgl_kernel.flash_attn import flash_attn_with_kvcache @dataclass(frozen=True) diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index a0b75780b..c60314ad9 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -372,4 +372,4 @@ if not ( logger.info( "sgl-kernel layernorm implementation is not available on current platform. Fallback to other kernel libraries." ) - from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm + from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm # noqa: F401 diff --git a/python/sglang/srt/layers/moe/cutlass_moe.py b/python/sglang/srt/layers/moe/cutlass_moe.py index d0fb4e3ef..870749d47 100755 --- a/python/sglang/srt/layers/moe/cutlass_moe.py +++ b/python/sglang/srt/layers/moe/cutlass_moe.py @@ -116,8 +116,6 @@ def cutlass_fused_experts_fp8( if is_cuda: from sglang.srt.layers.quantization.fp8_kernel import ( - per_group_transpose, - per_token_group_quant_fp8_hopper_moe_mn_major, sglang_per_token_group_quant_fp8, ) diff --git a/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py b/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py index 2a84dedc4..800c8c83a 100644 --- a/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py +++ b/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Cutlass W4A8 MoE kernel.""" -import logging from typing import Optional import torch diff --git a/python/sglang/srt/layers/moe/ep_moe/kernels.py b/python/sglang/srt/layers/moe/ep_moe/kernels.py index ef4262a1c..89bab802c 100644 --- a/python/sglang/srt/layers/moe/ep_moe/kernels.py +++ b/python/sglang/srt/layers/moe/ep_moe/kernels.py @@ -1,12 +1,9 @@ import logging -from typing import List, Optional import torch import triton -from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8 -from 
sglang.srt.utils import ceil_div, dispose_tensor, is_cuda -from sglang.utils import is_in_ci +from sglang.srt.utils import ceil_div, is_cuda logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py b/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py index 1d37236e0..8026b1e67 100644 --- a/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +++ b/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Optional, Union import torch from flashinfer.cute_dsl.blockscaled_gemm import grouped_gemm_nt_masked diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 1ff778184..0eb2a9170 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -43,13 +43,7 @@ from sglang.srt.utils import ( ) if is_flashinfer_available(): - from flashinfer import ( - RoutingMethodType, - fp4_quantize, - reorder_rows_for_gated_act_gemm, - shuffle_matrix_a, - shuffle_matrix_sf_a, - ) + from flashinfer import RoutingMethodType, fp4_quantize _is_hip = is_hip() _is_cpu_amx_available = cpu_has_amx_support() diff --git a/python/sglang/srt/layers/moe/moe_runner/triton.py b/python/sglang/srt/layers/moe/moe_runner/triton.py index 116fdcaa0..8c77d7580 100644 --- a/python/sglang/srt/layers/moe/moe_runner/triton.py +++ b/python/sglang/srt/layers/moe/moe_runner/triton.py @@ -51,7 +51,9 @@ elif _is_hip: if _is_cuda or _is_hip: - from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size + from sgl_kernel import ( # noqa: F401 + moe_align_block_size as sgl_moe_align_block_size, + ) @dataclass diff --git a/python/sglang/srt/layers/moe/rocm_moe_utils.py b/python/sglang/srt/layers/moe/rocm_moe_utils.py index 5fe2de1e5..efa6bb1bb 100644 --- a/python/sglang/srt/layers/moe/rocm_moe_utils.py +++ 
b/python/sglang/srt/layers/moe/rocm_moe_utils.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import IntEnum -from functools import cache from typing import Optional import torch diff --git a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py index 618c4cf9e..8667d8747 100644 --- a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py +++ b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging from contextlib import nullcontext from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder from sglang.srt.layers.moe.token_dispatcher.base import ( diff --git a/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py b/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py index d6d561865..54ba8f1b5 100644 --- a/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py +++ b/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py @@ -22,7 +22,7 @@ try: except ImportError: use_mooncake_ep = False -from enum import Enum, IntEnum, auto +from enum import Enum, auto import torch import torch.distributed as dist diff --git a/python/sglang/srt/layers/quantization/awq.py b/python/sglang/srt/layers/quantization/awq.py index 9cba60c2b..d796008c8 100644 --- a/python/sglang/srt/layers/quantization/awq.py +++ b/python/sglang/srt/layers/quantization/awq.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging import warnings -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch diff --git 
a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py index 4a5b7905e..183005177 100644 --- a/python/sglang/srt/layers/quantization/base_config.py +++ b/python/sglang/srt/layers/quantization/base_config.py @@ -3,7 +3,6 @@ from __future__ import annotations import inspect from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type import torch diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e2ff25e68..3517bc5e2 100644 --- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -5,7 +5,7 @@ from __future__ import annotations import enum import logging from enum import Enum -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List import torch from compressed_tensors import CompressionFormat @@ -21,14 +21,7 @@ from sglang.srt.layers.quantization.utils import ( per_tensor_dequantize, replace_parameter, ) -from sglang.srt.utils import ( - get_bool_env_var, - is_cpu, - is_cuda, - is_hip, - is_npu, - set_weight_attrs, -) +from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs if TYPE_CHECKING: from sglang.srt.layers.moe.fused_moe_triton import FusedMoE @@ -49,7 +42,7 @@ if _use_aiter: from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1 try: - import vllm + import vllm # noqa: F401 VLLM_AVAILABLE = True except ImportError: diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py index 6a7ae00d0..9bb34046d 100644 --- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +++ 
b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py @@ -12,7 +12,7 @@ def _compute_enable_deep_gemm(): return False try: - import deep_gemm + import deep_gemm # noqa: F401 except ImportError: return False diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py index 02945f449..1f2f4542a 100644 --- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py @@ -5,7 +5,7 @@ from typing import Tuple import torch from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils -from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import ( +from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import ( # noqa: F401 DEEPGEMM_BLACKWELL, DEEPGEMM_SCALE_UE8M0, ENABLE_JIT_DEEPGEMM, @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) if ENABLE_JIT_DEEPGEMM: import deep_gemm - from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor + from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor # noqa: F401 _SANITY_CHECK = get_bool_env_var("SGLANG_DEEPGEMM_SANITY_CHECK") diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py index 580f103f2..bd9628916 100644 --- a/python/sglang/srt/layers/quantization/fp8_kernel.py +++ b/python/sglang/srt/layers/quantization/fp8_kernel.py @@ -67,7 +67,7 @@ if _is_hip: raise ImportError("aiter is required when SGLANG_USE_AITER is set to True") else: try: - import vllm._C + import vllm._C # noqa: F401 except ImportError: raise ImportError("vllm is required when SGLANG_USE_AITER is set to False") diff --git a/python/sglang/srt/layers/quantization/fpgemm_fp8.py b/python/sglang/srt/layers/quantization/fpgemm_fp8.py index 5a78626ff..0c7030101 100644 --- a/python/sglang/srt/layers/quantization/fpgemm_fp8.py +++ 
b/python/sglang/srt/layers/quantization/fpgemm_fp8.py @@ -11,7 +11,6 @@ from torch.nn.parameter import Parameter from sglang.srt.layers.linear import LinearBase from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter from sglang.srt.layers.quantization.base_config import ( - FusedMoEMethodBase, LinearMethodBase, QuantizationConfig, QuantizeMethodBase, @@ -28,7 +27,7 @@ from sglang.srt.layers.quantization.marlin_utils_fp8 import ( prepare_fp8_layer_for_marlin, ) from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod -from sglang.srt.layers.quantization.utils import is_layer_skipped, replace_parameter +from sglang.srt.layers.quantization.utils import is_layer_skipped from sglang.srt.utils import get_bool_env_var, is_cuda _is_cuda = is_cuda() diff --git a/python/sglang/srt/layers/quantization/gptq.py b/python/sglang/srt/layers/quantization/gptq.py index ccd3d46f7..be28f07f8 100644 --- a/python/sglang/srt/layers/quantization/gptq.py +++ b/python/sglang/srt/layers/quantization/gptq.py @@ -199,7 +199,6 @@ class GPTQConfig(QuantizationConfig): self, layer: torch.nn.Module, prefix: str ) -> Optional[LinearMethodBase]: # Delay the import to avoid circular dependency - from sglang.srt.layers.linear import LinearBase from sglang.srt.layers.moe.fused_moe_triton import FusedMoE if isinstance(layer, FusedMoE): diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py index 9e92412ac..91cba1c32 100644 --- a/python/sglang/srt/layers/quantization/int8_kernel.py +++ b/python/sglang/srt/layers/quantization/int8_kernel.py @@ -8,7 +8,7 @@ import torch import triton import triton.language as tl -from sglang.srt.utils import get_bool_env_var, get_device_name, is_cuda +from sglang.srt.utils import get_device_name, is_cuda _is_cuda = is_cuda() if _is_cuda: diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py 
b/python/sglang/srt/layers/quantization/modelopt_quant.py index d5c1db3a8..f1c6dafb5 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -1059,16 +1059,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase): intermediate_size, num_experts, ): - from flashinfer import ( - RoutingMethodType, - e2m1_and_ufp8sf_scale_to_float, - fp4_quantize, - next_positive_power_of_2, - nvfp4_block_scale_interleave, - reorder_rows_for_gated_act_gemm, - shuffle_matrix_a, - shuffle_matrix_sf_a, - ) + from flashinfer import nvfp4_block_scale_interleave from flashinfer.fused_moe.core import ( _maybe_get_cached_w2_permute_indices, _maybe_get_cached_w3_w1_permute_indices, diff --git a/python/sglang/srt/layers/quantization/petit.py b/python/sglang/srt/layers/quantization/petit.py index 2c608507c..daac52ee2 100644 --- a/python/sglang/srt/layers/quantization/petit.py +++ b/python/sglang/srt/layers/quantization/petit.py @@ -2,7 +2,7 @@ import logging -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Dict, List, Optional import regex as re import torch diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py index d1ad13f48..3d2d52cd2 100644 --- a/python/sglang/srt/layers/quantization/quark/quark_moe.py +++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py @@ -3,16 +3,16 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import TYPE_CHECKING, Any import torch -from aiter import ActivationType, QuantType, biased_grouped_topk +from aiter import ActivationType, QuantType from aiter.fused_moe import fused_moe from aiter.utility.fp4_utils import e8m0_shuffle from sglang.srt.layers.moe import MoeRunnerConfig from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase -from sglang.srt.utils import get_bool_env_var, 
is_hip, mxfp_supported, set_weight_attrs +from sglang.srt.utils import is_hip, set_weight_attrs if TYPE_CHECKING: from sglang.srt.layers.moe.token_dispatcher import ( diff --git a/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py index a0787baaf..a8322b496 100644 --- a/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +++ b/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py @@ -2,20 +2,13 @@ from typing import Any, Callable, Optional -import aiter import torch -import torch.nn.functional as F -from aiter.ops.gemm_op_a4w4 import gemm_a4w4 -from aiter.ops.shuffle import shuffle_weight from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4 from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant from aiter.ops.triton.quant import dynamic_mxfp4_quant -from aiter.utility import dtypes -from aiter.utility.fp4_utils import e8m0_shuffle from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter from sglang.srt.layers.quantization.quark.schemes import QuarkScheme -from sglang.srt.utils import get_bool_env_var __all__ = ["QuarkW4A4MXFP4"] diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py index 63b8b6eb7..d407b95f2 100644 --- a/python/sglang/srt/layers/quantization/utils.py +++ b/python/sglang/srt/layers/quantization/utils.py @@ -11,7 +11,6 @@ import numpy import torch from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant -from sglang.srt.utils import is_cuda if TYPE_CHECKING: from sglang.srt.layers.quantization.base_config import QuantizationConfig diff --git a/python/sglang/srt/layers/quantization/w4afp8.py b/python/sglang/srt/layers/quantization/w4afp8.py index e97de07d7..7c5d4554a 100644 --- a/python/sglang/srt/layers/quantization/w4afp8.py +++ b/python/sglang/srt/layers/quantization/w4afp8.py @@ -1,14 
+1,13 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch from torch.nn import Module from torch.nn.parameter import Parameter -from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size -from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod +from sglang.srt.layers.linear import UnquantizedLinearMethod from sglang.srt.layers.quantization.base_config import ( FusedMoEMethodBase, QuantizationConfig, @@ -17,11 +16,11 @@ from sglang.srt.layers.quantization.base_config import ( from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.layers.quantization.utils import is_layer_skipped -from sglang.srt.utils import is_npu, set_weight_attrs +from sglang.srt.utils import set_weight_attrs if TYPE_CHECKING: from sglang.srt.layers.moe import MoeRunnerConfig - from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, EPMoE + from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE from sglang.srt.layers.moe.token_dispatcher import ( CombineInput, DeepEPNormalOutput, diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py index 17a79190d..77be31163 100644 --- a/python/sglang/srt/layers/quantization/w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/w8a8_int8.py @@ -1,28 +1,12 @@ from __future__ import annotations -import importlib -import sys from types import MappingProxyType -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Mapping, - Optional, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, cast import torch from torch.nn.parameter import Parameter -from sglang.srt.distributed import ( - get_tensor_model_parallel_rank, - 
get_tensor_model_parallel_world_size, -) +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo diff --git a/python/sglang/srt/layers/utils.py b/python/sglang/srt/layers/utils.py index 45e154791..e88f3a938 100644 --- a/python/sglang/srt/layers/utils.py +++ b/python/sglang/srt/layers/utils.py @@ -1,6 +1,5 @@ import logging import re -from functools import lru_cache import torch diff --git a/python/sglang/srt/lora/backend/triton_backend.py b/python/sglang/srt/lora/backend/triton_backend.py index f99e2c006..722915efc 100644 --- a/python/sglang/srt/lora/backend/triton_backend.py +++ b/python/sglang/srt/lora/backend/triton_backend.py @@ -11,7 +11,6 @@ from sglang.srt.lora.triton_ops import ( ) from sglang.srt.lora.utils import LoRABatchInfo from sglang.srt.model_executor.forward_batch_info import ForwardBatch -from sglang.srt.server_args import ServerArgs class TritonLoRABackend(BaseLoRABackend): diff --git a/python/sglang/srt/lora/eviction_policy.py b/python/sglang/srt/lora/eviction_policy.py index 7d1f5f91a..d4b29612f 100644 --- a/python/sglang/srt/lora/eviction_policy.py +++ b/python/sglang/srt/lora/eviction_policy.py @@ -20,7 +20,7 @@ import logging import time from abc import ABC, abstractmethod from collections import OrderedDict -from typing import Any, Dict, List, Optional, Set +from typing import Optional, Set logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index 30d3386e2..19ff874dc 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -16,7 +16,7 @@ # and "Punica: Multi-Tenant LoRA Serving" import logging -from typing import Dict, Iterable, List, Optional, Set, Tuple +from typing import Dict, 
Iterable, List, Optional import torch diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py index f36d61ee0..b5c4aa172 100644 --- a/python/sglang/srt/managers/cache_controller.py +++ b/python/sglang/srt/managers/cache_controller.py @@ -14,11 +14,10 @@ limitations under the License. """ import logging -import math import threading import time -from queue import Empty, Full, PriorityQueue, Queue -from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple +from queue import Empty, Full, Queue +from typing import TYPE_CHECKING, List, NamedTuple, Optional import torch @@ -41,7 +40,7 @@ from sglang.srt.layers.dp_attention import ( get_attention_tp_size, is_dp_attention_enabled, ) -from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool +from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 264d89bb9..a39a7a535 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -59,11 +59,10 @@ from sglang.srt.mem_cache.allocator import ( SWATokenToKVPoolAllocator, ) from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache -from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache +from sglang.srt.mem_cache.chunk_cache import SWAChunkCache from sglang.srt.mem_cache.common import ( alloc_for_decode, alloc_for_extend, - alloc_token_slots, evict_from_tree_cache, ) from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache @@ -76,7 +75,6 @@ from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.server_args import ServerArgs, get_global_server_args from sglang.srt.utils import flatten_nested_list -from sglang.srt.utils.common import next_power_of_2 if TYPE_CHECKING: 
from sglang.srt.configs.model_config import ModelConfig diff --git a/python/sglang/srt/managers/scheduler_metrics_mixin.py b/python/sglang/srt/managers/scheduler_metrics_mixin.py index 91fff9e9b..34832f3e3 100644 --- a/python/sglang/srt/managers/scheduler_metrics_mixin.py +++ b/python/sglang/srt/managers/scheduler_metrics_mixin.py @@ -3,13 +3,10 @@ from __future__ import annotations import logging import time from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Union - -import torch +from typing import TYPE_CHECKING, List, Optional from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch from sglang.srt.disaggregation.utils import DisaggregationMode -from sglang.srt.managers.io_struct import TokenizedGenerateReqInput from sglang.srt.managers.schedule_policy import PrefillAdder from sglang.srt.managers.scheduler import Req, ScheduleBatch from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 03c15fde9..3e325ca4d 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -16,7 +16,6 @@ import asyncio import copy import dataclasses -import json import logging import math import os diff --git a/python/sglang/srt/managers/utils.py b/python/sglang/srt/managers/utils.py index ccd3f0fe2..fa3435198 100644 --- a/python/sglang/srt/managers/utils.py +++ b/python/sglang/srt/managers/utils.py @@ -1,8 +1,7 @@ from __future__ import annotations import logging -import multiprocessing as mp -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, Optional from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.managers.schedule_batch import Req diff --git a/python/sglang/srt/mem_cache/allocator_ascend.py b/python/sglang/srt/mem_cache/allocator_ascend.py 
index 2c606187a..4adbf592a 100644 --- a/python/sglang/srt/mem_cache/allocator_ascend.py +++ b/python/sglang/srt/mem_cache/allocator_ascend.py @@ -92,7 +92,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator): ) if num_new_pages_item < 200: - import sgl_kernel_npu + import sgl_kernel_npu # noqa: F401 torch.ops.npu.alloc_extend( prefix_lens, diff --git a/python/sglang/srt/mem_cache/base_prefix_cache.py b/python/sglang/srt/mem_cache/base_prefix_cache.py index 34df99689..fb85497c3 100644 --- a/python/sglang/srt/mem_cache/base_prefix_cache.py +++ b/python/sglang/srt/mem_cache/base_prefix_cache.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, List, NamedTuple, Optional, Tuple +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Tuple import torch diff --git a/python/sglang/srt/mem_cache/evict_policy.py b/python/sglang/srt/mem_cache/evict_policy.py index ddd2ab6c3..491d3d846 100644 --- a/python/sglang/srt/mem_cache/evict_policy.py +++ b/python/sglang/srt/mem_cache/evict_policy.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Tuple, Union +from typing import TYPE_CHECKING, Tuple, Union if TYPE_CHECKING: from sglang.srt.mem_cache.radix_cache import TreeNode diff --git a/python/sglang/srt/mem_cache/mamba_radix_cache.py b/python/sglang/srt/mem_cache/mamba_radix_cache.py index 7467daa5d..739b204ed 100644 --- a/python/sglang/srt/mem_cache/mamba_radix_cache.py +++ b/python/sglang/srt/mem_cache/mamba_radix_cache.py @@ -22,7 +22,6 @@ The radix tree data structure for managing the hybrid (full and Mamba) KV cache. 
import heapq import time from collections import defaultdict -from functools import partial from typing import TYPE_CHECKING, List, Optional, Tuple import torch @@ -33,7 +32,6 @@ from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool from sglang.srt.mem_cache.radix_cache import ( RadixKey, _key_match_page_size1, - _key_match_paged, get_child_key, ) diff --git a/python/sglang/srt/mem_cache/memory_pool_host.py b/python/sglang/srt/mem_cache/memory_pool_host.py index f6d655af0..edfae2cfe 100644 --- a/python/sglang/srt/mem_cache/memory_pool_host.py +++ b/python/sglang/srt/mem_cache/memory_pool_host.py @@ -1,7 +1,6 @@ import abc import logging import threading -from enum import IntEnum from functools import wraps from typing import Optional diff --git a/python/sglang/srt/mem_cache/multimodal_cache.py b/python/sglang/srt/mem_cache/multimodal_cache.py index 63a177543..42c31a8e8 100644 --- a/python/sglang/srt/mem_cache/multimodal_cache.py +++ b/python/sglang/srt/mem_cache/multimodal_cache.py @@ -1,6 +1,5 @@ import logging from collections import OrderedDict -from typing import Dict import torch diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py index f82594330..9009d4e92 100644 --- a/python/sglang/srt/mem_cache/radix_cache.py +++ b/python/sglang/srt/mem_cache/radix_cache.py @@ -23,7 +23,7 @@ import heapq import time from collections import defaultdict from functools import lru_cache, partial -from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Iterator, List, Optional, Tuple, Union import torch diff --git a/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py b/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py index 2e54e9816..14494d819 100644 --- a/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +++ b/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py @@ -3,20 +3,8 @@ import os import torch import 
torch.distributed -from aibrix_kvcache import ( - BaseKVCacheManager, - GroupAwareKVCacheManager, - KVCacheBlockLayout, - KVCacheBlockSpec, - KVCacheConfig, - KVCacheMetrics, - KVCacheTensorSpec, - ModelSpec, - TokenListView, -) -from aibrix_kvcache.common.absl_logging import getLogger, log_every_n_seconds, log_if +from aibrix_kvcache.common.absl_logging import log_every_n_seconds from aibrix_kvcache_storage import AibrixKVCacheStorage -from torch.distributed import Backend, ProcessGroup from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool diff --git a/python/sglang/srt/mem_cache/storage/eic/eic_storage.py b/python/sglang/srt/mem_cache/storage/eic/eic_storage.py index 0acd5b65f..f3cc15632 100644 --- a/python/sglang/srt/mem_cache/storage/eic/eic_storage.py +++ b/python/sglang/srt/mem_cache/storage/eic/eic_storage.py @@ -2,21 +2,18 @@ import json import logging import os import time -import uuid -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, List, Optional, Tuple import eic import torch import yaml -from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size from sglang.srt.mem_cache.hicache_storage import ( HiCacheStorage, HiCacheStorageConfig, HiCacheStorageExtraInfo, ) -from sglang.srt.mem_cache.memory_pool_host import HostKVCache, MLATokenToKVPoolHost +from sglang.srt.mem_cache.memory_pool_host import HostKVCache logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py index c7a485fa0..d789a2053 100644 --- a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +++ b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py @@ -1,6 +1,5 @@ import logging import os -import threading from abc import ABC, abstractmethod from typing import List diff --git 
a/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py b/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py index bf31cbb38..9fdadf6ac 100644 --- a/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +++ b/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py @@ -2,7 +2,7 @@ from __future__ import annotations import logging import threading -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional import torch diff --git a/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py b/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py index 55b3dd976..8965acb4a 100644 --- a/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +++ b/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py @@ -1,9 +1,8 @@ -import hashlib import logging import os import time import uuid -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import torch diff --git a/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py b/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py index 6e3d2a900..b04f9e58d 100644 --- a/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py +++ b/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import torch diff --git a/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py b/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py index 3784ab91a..aea004a6d 100755 --- a/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +++ b/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py @@ -2,7 +2,7 @@ import os import unittest -from typing import List, Optional +from typing import List from unittest.mock import MagicMock import torch diff --git a/python/sglang/srt/metrics/func_timer.py 
b/python/sglang/srt/metrics/func_timer.py index fbb01bac8..51d445ab4 100644 --- a/python/sglang/srt/metrics/func_timer.py +++ b/python/sglang/srt/metrics/func_timer.py @@ -18,7 +18,7 @@ Records the latency of some functions import asyncio import time from functools import wraps -from typing import Any, Callable, List, Optional +from typing import Any, Callable, Optional from sglang.srt.metrics.utils import exponential_buckets diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index b1b8b7ff3..ef780899d 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -104,11 +104,7 @@ from sglang.srt.mem_cache.memory_pool import ( ) from sglang.srt.model_executor.cpu_graph_runner import CPUGraphRunner from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner -from sglang.srt.model_executor.forward_batch_info import ( - ForwardBatch, - ForwardMode, - PPProxyTensors, -) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_executor.npu_graph_runner import NPUGraphRunner from sglang.srt.model_executor.piecewise_cuda_graph_runner import ( PiecewiseCudaGraphRunner, diff --git a/python/sglang/srt/model_executor/npu_graph_runner.py b/python/sglang/srt/model_executor/npu_graph_runner.py index db7dcd159..cfd9abbcf 100644 --- a/python/sglang/srt/model_executor/npu_graph_runner.py +++ b/python/sglang/srt/model_executor/npu_graph_runner.py @@ -19,10 +19,9 @@ import logging import threading from typing import TYPE_CHECKING, Optional, Union -import numpy as np import torch -from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa +from sglang.srt.configs.model_config import is_deepseek_nsa from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/models/bailing_moe.py 
b/python/sglang/srt/models/bailing_moe.py index 2cb7d5961..e768c0a53 100644 --- a/python/sglang/srt/models/bailing_moe.py +++ b/python/sglang/srt/models/bailing_moe.py @@ -19,7 +19,7 @@ # limitations under the License. """SGLang BailingMoE model.""" import logging -from typing import Any, Dict, Iterable, Optional, Tuple, Union +from typing import Iterable, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -59,7 +59,6 @@ from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher from sglang.srt.layers.moe.topk import TopK -from sglang.srt.layers.moe.utils import DeepEPMode from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope diff --git a/python/sglang/srt/models/bert.py b/python/sglang/srt/models/bert.py index d7f3301c6..45494423f 100644 --- a/python/sglang/srt/models/bert.py +++ b/python/sglang/srt/models/bert.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Iterable, Optional, Set, Tuple +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index fb9cd4f6c..f24923a73 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -183,9 +183,9 @@ elif _is_hip: awq_dequantize_triton as awq_dequantize, ) elif _is_npu: - import custom_ops - import sgl_kernel_npu - import torch_npu + import custom_ops # noqa: F401 + import sgl_kernel_npu # noqa: F401 + import torch_npu # noqa: F401 else: pass diff --git a/python/sglang/srt/models/dots_ocr.py b/python/sglang/srt/models/dots_ocr.py index ee48909ed..d1f60fecc 100644 --- a/python/sglang/srt/models/dots_ocr.py +++ 
b/python/sglang/srt/models/dots_ocr.py @@ -6,7 +6,6 @@ from typing import Iterable, List, Optional, Tuple import torch import torch.nn as nn -from transformers.activations import ACT2FN from sglang.srt.configs import DotsOCRConfig from sglang.srt.layers.logits_processor import LogitsProcessor @@ -22,7 +21,6 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer from sglang.srt.models.qwen2 import Qwen2ForCausalLM from sglang.srt.utils import add_prefix -from sglang.srt.utils.hf_transformers_utils import get_processor logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/models/dots_vlm.py b/python/sglang/srt/models/dots_vlm.py index 95475058f..d626b1ef6 100644 --- a/python/sglang/srt/models/dots_vlm.py +++ b/python/sglang/srt/models/dots_vlm.py @@ -23,7 +23,6 @@ import torch from torch import nn from sglang.srt.configs.dots_vlm import DotsVLMConfig -from sglang.srt.distributed import parallel_state from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.managers.mm_utils import ( MultiModalityDataPaddingPatternMultimodalTokens, diff --git a/python/sglang/srt/models/falcon_h1.py b/python/sglang/srt/models/falcon_h1.py index c35613bcb..0fab9e410 100644 --- a/python/sglang/srt/models/falcon_h1.py +++ b/python/sglang/srt/models/falcon_h1.py @@ -1,4 +1,3 @@ -import enum import logging from typing import Any, Iterable, List, Optional, Set, Tuple diff --git a/python/sglang/srt/models/gemma3n_mm.py b/python/sglang/srt/models/gemma3n_mm.py index 3c52635dd..86f7fd516 100644 --- a/python/sglang/srt/models/gemma3n_mm.py +++ b/python/sglang/srt/models/gemma3n_mm.py @@ -14,8 +14,7 @@ from transformers import ( ) from transformers.models.auto.modeling_auto import AutoModel -from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear +from sglang.srt.layers.linear import 
RowParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py index 35ce0c40d..2d4bf41f1 100644 --- a/python/sglang/srt/models/glm4_moe.py +++ b/python/sglang/srt/models/glm4_moe.py @@ -44,10 +44,8 @@ from sglang.srt.layers.dp_attention import ( ) from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( - ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from sglang.srt.layers.logits_processor import LogitsProcessor @@ -78,16 +76,12 @@ from sglang.srt.utils import ( BumpAllocator, LazyValue, add_prefix, - bind_or_assign, cpu_has_amx_support, get_bool_env_var, get_device_sm, - get_int_env_var, is_cpu, is_cuda, - is_flashinfer_available, is_hip, - is_non_idle_and_non_empty, log_info_on_rank0, use_intel_amx_backend, ) diff --git a/python/sglang/srt/models/gpt_oss.py b/python/sglang/srt/models/gpt_oss.py index 1f280f37e..6d80adf0f 100644 --- a/python/sglang/srt/models/gpt_oss.py +++ b/python/sglang/srt/models/gpt_oss.py @@ -85,7 +85,7 @@ _is_sm100_supported = is_cuda() and is_sm100_supported() if _is_cuda: - from sgl_kernel import FusedSetKVBufferArg + from sgl_kernel import FusedSetKVBufferArg # noqa: F401 class GptOssConfig(PretrainedConfig): diff --git a/python/sglang/srt/models/hunyuan.py b/python/sglang/srt/models/hunyuan.py index c1ed2543c..7c6fd9e48 100644 --- a/python/sglang/srt/models/hunyuan.py +++ b/python/sglang/srt/models/hunyuan.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only HunYuan model compatible with HuggingFace weights.""" -import logging import re -from dataclasses import dataclass -from enum import Enum, auto -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig from sglang.srt.distributed import ( - get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, @@ -46,7 +42,6 @@ from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope from sglang.srt.layers.sampler import Sampler from sglang.srt.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -56,7 +51,7 @@ from sglang.srt.model_loader.weight_utils import ( kv_cache_scales_loader, maybe_remap_kv_scale_name, ) -from sglang.srt.utils import add_prefix, is_hip +from sglang.srt.utils import is_hip expert_distribution_recorder = ExpertDistributionRecorder() diff --git a/python/sglang/srt/models/interns1.py b/python/sglang/srt/models/interns1.py index c7383ed25..e896843ff 100644 --- a/python/sglang/srt/models/interns1.py +++ b/python/sglang/srt/models/interns1.py @@ -5,7 +5,6 @@ from torch import nn from transformers import PretrainedConfig from sglang.srt.layers.attention import vision_utils -from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.managers.mm_utils import ( diff --git a/python/sglang/srt/models/llama_eagle3.py b/python/sglang/srt/models/llama_eagle3.py index 87ae7ade5..d0605d08d 100644 --- a/python/sglang/srt/models/llama_eagle3.py +++ b/python/sglang/srt/models/llama_eagle3.py @@ -27,7 +27,7 @@ from transformers import LlamaConfig from sglang.srt.distributed 
import get_pp_group from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear +from sglang.srt.layers.linear import QKVParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.vocab_parallel_embedding import ( diff --git a/python/sglang/srt/models/longcat_flash.py b/python/sglang/srt/models/longcat_flash.py index edfadfa0a..ffca2bad0 100644 --- a/python/sglang/srt/models/longcat_flash.py +++ b/python/sglang/srt/models/longcat_flash.py @@ -44,9 +44,7 @@ from sglang.srt.distributed import ( ) from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation -from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo from sglang.srt.layers.activation import SiluAndMul -from sglang.srt.layers.amx_utils import PackWeightMethod from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes from sglang.srt.layers.dp_attention import ( get_attention_tp_rank, @@ -87,20 +85,15 @@ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import ( BumpAllocator, - LazyValue, add_prefix, bind_or_assign, cpu_has_amx_support, get_bool_env_var, get_device_sm, - get_int_env_var, is_cpu, is_cuda, - is_flashinfer_available, is_hip, - is_non_idle_and_non_empty, is_npu, - is_sm100_supported, ) _is_hip = is_hip() @@ -113,13 +106,7 @@ _is_cpu = is_cpu() _device_sm = get_device_sm() if _is_cuda: - from sgl_kernel import ( - awq_dequantize, - bmm_fp8, - dsv3_fused_a_gemm, - dsv3_router_gemm, - merge_state_v2, - ) + from sgl_kernel import awq_dequantize elif _is_cpu and _is_cpu_amx_available: pass elif _is_hip: diff --git a/python/sglang/srt/models/longcat_flash_nextn.py 
b/python/sglang/srt/models/longcat_flash_nextn.py index 69bd1548d..a6092785a 100644 --- a/python/sglang/srt/models/longcat_flash_nextn.py +++ b/python/sglang/srt/models/longcat_flash_nextn.py @@ -32,14 +32,10 @@ import concurrent.futures import logging -import os -from enum import IntEnum, auto -from typing import Any, Dict, Iterable, Optional, Tuple, Union +from typing import Iterable, Optional, Tuple import torch -import torch.nn.functional as F from torch import nn -from tqdm import tqdm from sglang.srt.configs import LongcatFlashConfig from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder @@ -75,7 +71,6 @@ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP from sglang.srt.utils import ( BumpAllocator, - LazyValue, add_prefix, bind_or_assign, cpu_has_amx_support, @@ -97,13 +92,7 @@ _is_cpu = is_cpu() _device_sm = get_device_sm() if _is_cuda: - from sgl_kernel import ( - awq_dequantize, - bmm_fp8, - dsv3_fused_a_gemm, - dsv3_router_gemm, - merge_state_v2, - ) + from sgl_kernel import awq_dequantize elif _is_cpu and _is_cpu_amx_available: pass elif _is_hip: diff --git a/python/sglang/srt/models/mimo.py b/python/sglang/srt/models/mimo.py index 2a89e7706..15aad8f41 100644 --- a/python/sglang/srt/models/mimo.py +++ b/python/sglang/srt/models/mimo.py @@ -1,28 +1,17 @@ # Adapted from qwen2.py -from functools import partial -from typing import Any, Dict, Iterable, Optional, Tuple +from typing import Iterable, Optional, Tuple import torch from torch import nn -from sglang.srt.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, -) -from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor from 
sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.rotary_embedding import get_rope from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader -from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP, Qwen2Model +from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model from sglang.srt.utils import add_prefix MiMoConfig = None diff --git a/python/sglang/srt/models/mimo_mtp.py b/python/sglang/srt/models/mimo_mtp.py index 89e8c02cd..2702a637d 100644 --- a/python/sglang/srt/models/mimo_mtp.py +++ b/python/sglang/srt/models/mimo_mtp.py @@ -1,7 +1,6 @@ # Adapted from https://github.com/vllm-project/vllm/pull/17433/files and deepseek_nextn.py -from functools import partial -from typing import Any, Dict, Iterable, Optional, Tuple +from typing import Iterable, Optional, Tuple import torch from torch import nn diff --git a/python/sglang/srt/models/minicpmo.py b/python/sglang/srt/models/minicpmo.py index 2f8271c6c..b83a86e22 100644 --- a/python/sglang/srt/models/minicpmo.py +++ b/python/sglang/srt/models/minicpmo.py @@ -43,7 +43,6 @@ from sglang.srt.managers.mm_utils import ( general_mm_embed_routine, ) from sglang.srt.managers.schedule_batch import ( - Modality, MultimodalDataItem, MultimodalInputs, flatten_nested_list, @@ -59,8 +58,6 @@ from sglang.srt.utils import logger try: from transformers import LogitsWarper from vector_quantize_pytorch import GroupedResidualFSQ - from vocos import Vocos - from vocos.pretrained import instantiate_class _tts_deps = True except: diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 81026f9bb..cb55848cf 100644 --- a/python/sglang/srt/models/mixtral.py +++ 
b/python/sglang/srt/models/mixtral.py @@ -24,7 +24,6 @@ from torch import nn from transformers import MixtralConfig from sglang.srt.distributed import ( - get_moe_expert_parallel_world_size, get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, diff --git a/python/sglang/srt/models/opt.py b/python/sglang/srt/models/opt.py index a571e8937..bf989f6e8 100644 --- a/python/sglang/srt/models/opt.py +++ b/python/sglang/srt/models/opt.py @@ -17,7 +17,6 @@ from collections.abc import Iterable from typing import Optional, Union import torch -import torch.nn.functional as F from torch import nn from transformers import OPTConfig @@ -26,10 +25,8 @@ from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear, @@ -38,7 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.utils import PPMissingLayer, get_layer_id +from sglang.srt.layers.utils import get_layer_id from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -47,7 +44,6 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTe from sglang.srt.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, - maybe_remap_kv_scale_name, ) from sglang.srt.utils import add_prefix, make_layers diff --git a/python/sglang/srt/models/phi.py b/python/sglang/srt/models/phi.py index f48895c67..5679bc987 100644 --- a/python/sglang/srt/models/phi.py +++ b/python/sglang/srt/models/phi.py @@ -1,5 +1,5 @@ # Adapted from 
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/phi.py -from typing import Iterable, Optional, Union +from typing import Iterable, Optional import torch from torch import nn diff --git a/python/sglang/srt/models/phi4mm.py b/python/sglang/srt/models/phi4mm.py index 37a638acb..6d00144d2 100644 --- a/python/sglang/srt/models/phi4mm.py +++ b/python/sglang/srt/models/phi4mm.py @@ -24,7 +24,7 @@ from typing import List, Optional, Tuple import numpy as np import torch from torch import nn -from transformers import PretrainedConfig, SiglipVisionConfig +from transformers import PretrainedConfig from sglang.srt.layers.quantization import QuantizationConfig from sglang.srt.managers.mm_utils import ( diff --git a/python/sglang/srt/models/phimoe.py b/python/sglang/srt/models/phimoe.py index 4604aeef9..0d147c2b1 100644 --- a/python/sglang/srt/models/phimoe.py +++ b/python/sglang/srt/models/phimoe.py @@ -18,7 +18,6 @@ from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope -from sglang.srt.layers.utils import PPMissingLayer from sglang.srt.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, diff --git a/python/sglang/srt/models/pixtral.py b/python/sglang/srt/models/pixtral.py index 04a7362d8..209b40645 100644 --- a/python/sglang/srt/models/pixtral.py +++ b/python/sglang/srt/models/pixtral.py @@ -16,13 +16,10 @@ Using mistral-community/pixtral-12b as reference. 
""" -import logging -import math from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn as nn -import torch.nn.functional as F from transformers import PixtralVisionConfig, PretrainedConfig from transformers.models.pixtral.modeling_pixtral import PixtralRotaryEmbedding from transformers.models.pixtral.modeling_pixtral import ( diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 009650411..206908b49 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -15,7 +15,6 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/qwen.py#L1 -import time from typing import Any, Dict, Iterable, Optional, Tuple import torch diff --git a/python/sglang/srt/models/qwen2_audio.py b/python/sglang/srt/models/qwen2_audio.py index 8609758a9..98f30636a 100644 --- a/python/sglang/srt/models/qwen2_audio.py +++ b/python/sglang/srt/models/qwen2_audio.py @@ -23,30 +23,18 @@ # limitations under the License. 
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" import logging -import math -from functools import lru_cache, partial -from typing import Any, Iterable, List, Optional, Tuple, Type, TypedDict +from typing import Any, Iterable, List, Optional, Tuple import torch import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from transformers import AutoTokenizer, Qwen2AudioEncoderConfig, Qwen2Config -from transformers.activations import ACT2FN +from transformers import Qwen2AudioEncoderConfig, Qwen2Config from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioConfig from transformers.models.qwen2_audio.modeling_qwen2_audio import ( Qwen2AudioEncoder, Qwen2AudioMultiModalProjector, ) -from sglang.srt.layers.activation import QuickGELU -from sglang.srt.layers.attention.vision import VisionAttention -from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.utils import get_layer_id -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead from sglang.srt.managers.mm_utils import ( MultiModalityDataPaddingPatternMultimodalTokens, general_mm_embed_routine, @@ -60,7 +48,6 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen2 import Qwen2ForCausalLM from sglang.srt.utils import add_prefix -from sglang.srt.utils.hf_transformers_utils import get_processor logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/models/qwen3_next.py b/python/sglang/srt/models/qwen3_next.py index 1b11aa30b..9fe9e7748 100644 --- a/python/sglang/srt/models/qwen3_next.py +++ b/python/sglang/srt/models/qwen3_next.py @@ -1,18 +1,12 @@ import enum 
import logging -from typing import Any, Dict, Iterable, Optional, Set, Tuple +from typing import Any, Iterable, Optional, Set, Tuple import torch -import torch.nn.functional as F from torch import nn from sglang.srt.configs.qwen3_next import Qwen3NextConfig -from sglang.srt.distributed import ( - divide, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) +from sglang.srt.distributed import divide, get_pp_group from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated @@ -23,10 +17,9 @@ from sglang.srt.layers.dp_attention import ( get_attention_tp_size, is_dp_attention_enabled, ) -from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm +from sglang.srt.layers.layernorm import GemmaRMSNorm from sglang.srt.layers.linear import ( ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear, ) diff --git a/python/sglang/srt/models/qwen3_vl.py b/python/sglang/srt/models/qwen3_vl.py index c41eb0403..be81eef62 100644 --- a/python/sglang/srt/models/qwen3_vl.py +++ b/python/sglang/srt/models/qwen3_vl.py @@ -20,18 +20,13 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F from einops import rearrange from transformers.activations import ACT2FN from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VisionRotaryEmbedding, ) -from sglang.srt.configs.qwen3_vl import ( - Qwen3VLConfig, - Qwen3VLTextConfig, - Qwen3VLVisionConfig, -) +from sglang.srt.configs.qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig from sglang.srt.layers.attention.vision import VisionAttention from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear from sglang.srt.layers.logits_processor import 
LogitsProcessor @@ -47,11 +42,7 @@ from sglang.srt.managers.schedule_batch import ( MultimodalDataItem, MultimodalInputs, ) -from sglang.srt.model_executor.forward_batch_info import ( - ForwardBatch, - ForwardMode, - PPProxyTensors, -) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen3 import Qwen3Model from sglang.srt.utils import add_prefix diff --git a/python/sglang/srt/models/qwen3_vl_moe.py b/python/sglang/srt/models/qwen3_vl_moe.py index c4d56a257..3bf0b1123 100644 --- a/python/sglang/srt/models/qwen3_vl_moe.py +++ b/python/sglang/srt/models/qwen3_vl_moe.py @@ -25,12 +25,8 @@ from sglang.srt.distributed import ( get_moe_expert_parallel_world_size, get_tensor_model_parallel_rank, ) -from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead -from sglang.srt.managers.mm_utils import general_mm_embed_routine -from sglang.srt.managers.schedule_batch import MultimodalDataItem from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen3_moe import Qwen3MoeModel diff --git a/python/sglang/srt/models/roberta.py b/python/sglang/srt/models/roberta.py index 209be1296..9fad5cfa3 100644 --- a/python/sglang/srt/models/roberta.py +++ b/python/sglang/srt/models/roberta.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import itertools from typing import Iterable, Optional, Tuple import torch diff --git a/python/sglang/srt/models/sarashina2_vision.py b/python/sglang/srt/models/sarashina2_vision.py index eae341349..f58908b5d 100644 --- a/python/sglang/srt/models/sarashina2_vision.py +++ 
b/python/sglang/srt/models/sarashina2_vision.py @@ -17,7 +17,6 @@ import logging from typing import Iterable, List, Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from transformers import LlamaConfig diff --git a/python/sglang/srt/models/step3_vl.py b/python/sglang/srt/models/step3_vl.py index 14d277f9f..5a9e74ab6 100644 --- a/python/sglang/srt/models/step3_vl.py +++ b/python/sglang/srt/models/step3_vl.py @@ -1,8 +1,7 @@ import logging import math -from collections.abc import Iterable from math import sqrt -from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple import torch from torch import nn diff --git a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py index b09402d0b..26708e8dc 100644 --- a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py +++ b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py @@ -18,9 +18,6 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
from typing import List, Union -import torch - -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/dots_vlm.py b/python/sglang/srt/multimodal/processors/dots_vlm.py index 3b95beff3..5f095d150 100644 --- a/python/sglang/srt/multimodal/processors/dots_vlm.py +++ b/python/sglang/srt/multimodal/processors/dots_vlm.py @@ -1,5 +1,4 @@ import asyncio -import math import re from typing import Dict, List, Union diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py index e3c8edc92..2051a426f 100644 --- a/python/sglang/srt/multimodal/processors/glm4v.py +++ b/python/sglang/srt/multimodal/processors/glm4v.py @@ -1,4 +1,3 @@ -import re from typing import List, Union from decord import VideoReader @@ -9,10 +8,7 @@ from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor as SGLangBaseProcessor, ) -from sglang.srt.multimodal.processors.base_processor import ( - BaseMultiModalProcessorOutput, - MultimodalSpecialTokens, -) +from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens class Glm4vImageProcessor(SGLangBaseProcessor): diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py index c9a2d97ef..a1ef6b675 100644 --- a/python/sglang/srt/multimodal/processors/internvl.py +++ b/python/sglang/srt/multimodal/processors/internvl.py @@ -4,10 +4,8 @@ from functools import lru_cache import numpy as np import torch -import torchvision.transforms as T from decord import VideoReader, cpu, gpu from PIL import Image -from torchvision.transforms import InterpolationMode from sglang.srt.managers.schedule_batch import 
Modality, MultimodalDataItem from sglang.srt.models.interns1 import InternS1ForConditionalGeneration diff --git a/python/sglang/srt/multimodal/processors/janus_pro.py b/python/sglang/srt/multimodal/processors/janus_pro.py index 54d6c1978..044e31dd2 100644 --- a/python/sglang/srt/multimodal/processors/janus_pro.py +++ b/python/sglang/srt/multimodal/processors/janus_pro.py @@ -1,6 +1,5 @@ from typing import List, Union -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/mllama4.py b/python/sglang/srt/multimodal/processors/mllama4.py index 6a01f2aeb..4f04688b8 100644 --- a/python/sglang/srt/multimodal/processors/mllama4.py +++ b/python/sglang/srt/multimodal/processors/mllama4.py @@ -1,13 +1,5 @@ from typing import List, Union -import torch -from transformers.image_utils import SizeDict -from transformers.models.llama4.image_processing_llama4_fast import ( - find_supported_resolutions, - get_best_fit, -) - -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/phi4mm.py b/python/sglang/srt/multimodal/processors/phi4mm.py index 1487d2ca2..c59a41685 100644 --- a/python/sglang/srt/multimodal/processors/phi4mm.py +++ b/python/sglang/srt/multimodal/processors/phi4mm.py @@ -3,7 +3,6 @@ from typing import List, Union from transformers.processing_utils import ProcessorMixin -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.phi4mm import Phi4MMForCausalLM from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git 
a/python/sglang/srt/multimodal/processors/step3_vl.py b/python/sglang/srt/multimodal/processors/step3_vl.py index ee537e68e..6bd691ecf 100644 --- a/python/sglang/srt/multimodal/processors/step3_vl.py +++ b/python/sglang/srt/multimodal/processors/step3_vl.py @@ -1,7 +1,7 @@ import math import re from itertools import product -from typing import List, Literal, Optional, TypedDict, Union +from typing import List, Optional, Union import numpy as np import torch diff --git a/python/sglang/srt/parser/reasoning_parser.py b/python/sglang/srt/parser/reasoning_parser.py index f50368aed..0c01ede9c 100644 --- a/python/sglang/srt/parser/reasoning_parser.py +++ b/python/sglang/srt/parser/reasoning_parser.py @@ -1,4 +1,3 @@ -import re from typing import Dict, Optional, Tuple, Type from sglang.srt.parser.harmony_parser import HarmonyParser diff --git a/python/sglang/srt/server_args_config_parser.py b/python/sglang/srt/server_args_config_parser.py index 74dc67677..2fee7fc0c 100644 --- a/python/sglang/srt/server_args_config_parser.py +++ b/python/sglang/srt/server_args_config_parser.py @@ -5,7 +5,7 @@ Handles merging of YAML configuration files with command-line arguments. 
import logging from pathlib import Path -from typing import Any, Dict, List, Union +from typing import Any, Dict, List import yaml diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index e141a0238..cb59b31f8 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -56,7 +56,7 @@ from sglang.srt.utils import ( ) if is_cuda(): - from sgl_kernel import segment_packbits + from sgl_kernel import segment_packbits # noqa: F401 logger = logging.getLogger(__name__) SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") diff --git a/python/sglang/srt/speculative/spec_utils.py b/python/sglang/srt/speculative/spec_utils.py index d89236dbe..c00391bcb 100644 --- a/python/sglang/srt/speculative/spec_utils.py +++ b/python/sglang/srt/speculative/spec_utils.py @@ -22,8 +22,6 @@ from sglang.srt.managers.schedule_batch import Req from sglang.srt.utils import is_cuda, is_hip if TYPE_CHECKING: - from sglang.srt.mem_cache.allocator import TokenToKVPoolAllocator - from sglang.srt.mem_cache.memory_pool import ReqToTokenPool from sglang.srt.speculative.eagle_info import EagleVerifyInput diff --git a/python/sglang/srt/speculative/standalone_worker.py b/python/sglang/srt/speculative/standalone_worker.py index 23f9b9dd2..302799cc6 100644 --- a/python/sglang/srt/speculative/standalone_worker.py +++ b/python/sglang/srt/speculative/standalone_worker.py @@ -11,7 +11,7 @@ from sglang.srt.speculative.spec_utils import draft_tp_context, load_token_map from sglang.srt.utils import empty_context, get_bool_env_var, is_cuda if is_cuda(): - from sgl_kernel import segment_packbits + from sgl_kernel import segment_packbits # noqa: F401 logger = logging.getLogger(__name__) SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py index e4188d050..e8a4256c9 
100644 --- a/python/sglang/srt/utils/common.py +++ b/python/sglang/srt/utils/common.py @@ -228,7 +228,7 @@ def support_triton(backend: str) -> bool: try: - import sgl_kernel + import sgl_kernel # noqa: F401 is_intel_amx_backend_available = hasattr( torch.ops.sgl_kernel, "convert_weight_packed" @@ -1556,7 +1556,7 @@ def get_hpu_memory_capacity(): def get_npu_memory_capacity(): try: - import torch_npu + import torch_npu # noqa: F401 return torch.npu.mem_get_info()[1] // 1024 // 1024 # unit: MB except ImportError as e: @@ -1743,7 +1743,7 @@ def get_device(device_id: Optional[int] = None) -> str: if is_habana_available(): try: - import habana_frameworks.torch.hpu + import habana_frameworks.torch.hpu # noqa: F401 if torch.hpu.is_available(): if device_id == None: @@ -1773,7 +1773,7 @@ def get_device_count() -> int: if is_habana_available(): try: - import habana_frameworks.torch.hpu + import habana_frameworks.torch.hpu # noqa: F401 if torch.hpu.is_available(): return torch.hpu.device_count() diff --git a/python/sglang/srt/utils/host_shared_memory.py b/python/sglang/srt/utils/host_shared_memory.py index c599527f9..20ddf8fc7 100644 --- a/python/sglang/srt/utils/host_shared_memory.py +++ b/python/sglang/srt/utils/host_shared_memory.py @@ -1,5 +1,4 @@ import logging -import os from dataclasses import dataclass from multiprocessing import shared_memory from pathlib import Path diff --git a/python/sglang/test/attention/test_flashattn_mla_backend.py b/python/sglang/test/attention/test_flashattn_mla_backend.py index ebfd0b395..16f94a2b2 100644 --- a/python/sglang/test/attention/test_flashattn_mla_backend.py +++ b/python/sglang/test/attention/test_flashattn_mla_backend.py @@ -4,7 +4,6 @@ import torch from sglang.srt.configs.model_config import AttentionArch from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend -from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend from sglang.srt.layers.radix_attention import 
RadixAttention from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode diff --git a/python/sglang/test/attention/test_prefix_chunk_info.py b/python/sglang/test/attention/test_prefix_chunk_info.py index c02d4d1d6..2b85b695b 100644 --- a/python/sglang/test/attention/test_prefix_chunk_info.py +++ b/python/sglang/test/attention/test_prefix_chunk_info.py @@ -2,8 +2,6 @@ import unittest import torch -from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend -from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.test.test_utils import CustomTestCase diff --git a/python/sglang/test/few_shot_gsm8k_engine.py b/python/sglang/test/few_shot_gsm8k_engine.py index 05b095713..567816cfc 100644 --- a/python/sglang/test/few_shot_gsm8k_engine.py +++ b/python/sglang/test/few_shot_gsm8k_engine.py @@ -1,16 +1,13 @@ import argparse import ast import asyncio -import json import re import time import numpy as np import sglang as sgl -from sglang.lang.api import set_default_backend -from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl +from sglang.utils import download_and_cache_file, read_jsonl INVALID = -9999999 diff --git a/python/sglang/test/simple_eval_gpqa.py b/python/sglang/test/simple_eval_gpqa.py index b77ca773e..b39366ef5 100644 --- a/python/sglang/test/simple_eval_gpqa.py +++ b/python/sglang/test/simple_eval_gpqa.py @@ -18,7 +18,6 @@ from sglang.test.simple_eval_common import ( HTML_JINJA, Eval, EvalResult, - MessageList, SamplerBase, SingleEvalResult, format_multichoice_question, diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py index 25dcdd53a..efd03af38 100644 
--- a/python/sglang/test/simple_eval_humaneval.py +++ b/python/sglang/test/simple_eval_humaneval.py @@ -11,8 +11,6 @@ import re from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Dict, List, Optional -import tqdm - try: from human_eval.data import read_problems from human_eval.evaluation import estimate_pass_at_k @@ -41,7 +39,6 @@ def evaluate_functional_correctness( Evaluates the functional correctness of generated samples, and writes results to f"{sample_file}_results.jsonl.gz" """ - import copy # Check the generated samples against test suites. with ThreadPoolExecutor(max_workers=n_workers) as executor: diff --git a/python/sglang/test/test_block_fp8.py b/python/sglang/test/test_block_fp8.py index 80202d15e..2390489ca 100644 --- a/python/sglang/test/test_block_fp8.py +++ b/python/sglang/test/test_block_fp8.py @@ -1,5 +1,4 @@ import itertools -import os import unittest import torch @@ -577,7 +576,7 @@ class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase): if not torch.cuda.is_available(): raise unittest.SkipTest("CUDA is not available") try: - import deep_gemm + import deep_gemm # noqa: F401 except ImportError: raise unittest.SkipTest("DeepGEMM is not available") torch.set_default_device("cuda") diff --git a/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py b/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py index 36d7acddb..ac7239ea0 100644 --- a/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py +++ b/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py @@ -1,5 +1,4 @@ import itertools -import os import unittest from typing import List, Tuple diff --git a/python/sglang/test/test_cutlass_moe.py b/python/sglang/test/test_cutlass_moe.py index 377534a49..fdab5a3ac 100755 --- a/python/sglang/test/test_cutlass_moe.py +++ b/python/sglang/test/test_cutlass_moe.py @@ -1,5 +1,4 @@ import argparse -import time import torch import triton # Added import diff --git a/python/sglang/test/test_cutlass_w4a8_moe.py 
b/python/sglang/test/test_cutlass_w4a8_moe.py index 7d96cccd5..e75154ef4 100644 --- a/python/sglang/test/test_cutlass_w4a8_moe.py +++ b/python/sglang/test/test_cutlass_w4a8_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Literal, Optional +from typing import Optional import pytest import torch diff --git a/python/sglang/test/test_marlin_moe.py b/python/sglang/test/test_marlin_moe.py index 77b0109df..d58200edd 100644 --- a/python/sglang/test/test_marlin_moe.py +++ b/python/sglang/test/test_marlin_moe.py @@ -1,4 +1,3 @@ -import types from typing import Optional import pytest