From 9acc6e350475a64207a6702a579850c93ab27b43 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Mon, 22 Apr 2024 22:38:09 +0800 Subject: [PATCH] add `.isort.cfg` (#378) --- .isort.cfg | 3 +++ python/sglang/api.py | 4 ---- python/sglang/backend/anthropic.py | 3 --- python/sglang/backend/base_backend.py | 2 +- python/sglang/backend/openai.py | 3 ++- python/sglang/backend/runtime_endpoint.py | 8 ++++---- python/sglang/backend/vertexai.py | 2 -- python/sglang/lang/chat_template.py | 4 ++-- python/sglang/lang/compiler.py | 8 +------- python/sglang/lang/interpreter.py | 4 ++-- python/sglang/lang/ir.py | 2 +- python/sglang/lang/tracer.py | 6 +----- python/sglang/srt/constrained/jump_forward.py | 1 + python/sglang/srt/hf_transformers_utils.py | 5 +++-- .../srt/layers/context_flashattention_nopad.py | 1 + python/sglang/srt/layers/extend_attention.py | 1 + python/sglang/srt/layers/logits_processor.py | 3 ++- python/sglang/srt/layers/radix_attention.py | 3 ++- python/sglang/srt/layers/token_attention.py | 1 + .../sglang/srt/managers/detokenizer_manager.py | 3 ++- .../sglang/srt/managers/router/infer_batch.py | 1 + python/sglang/srt/managers/router/manager.py | 1 + python/sglang/srt/managers/router/model_rpc.py | 3 ++- .../sglang/srt/managers/router/model_runner.py | 9 +++++---- .../sglang/srt/managers/router/radix_cache.py | 2 -- python/sglang/srt/managers/tokenizer_manager.py | 1 + python/sglang/srt/models/commandr.py | 9 +++++---- python/sglang/srt/models/dbrx.py | 9 +++++---- python/sglang/srt/models/gemma.py | 7 ++++--- python/sglang/srt/models/llama2.py | 9 +++++---- python/sglang/srt/models/llava.py | 17 +++++++++-------- python/sglang/srt/models/mixtral.py | 9 +++++---- python/sglang/srt/models/qwen.py | 9 +++++---- python/sglang/srt/models/qwen2.py | 9 +++++---- python/sglang/srt/models/stablelm.py | 7 ++++--- python/sglang/srt/models/yivl.py | 13 ++++++------- python/sglang/srt/server.py | 13 +++++++------ python/sglang/srt/utils.py | 1 - python/sglang/test/test_utils.py | 1 + test/lang/run_all.py | 1 - test/lang/test_anthropic_backend.py | 4 +--- test/lang/test_bind_pin.py | 3 +-- test/lang/test_openai_backend.py | 3 +-- test/lang/test_srt_backend.py | 6 +----- test/lang/test_tracing.py | 5 ++--- test/lang/test_vertexai_backend.py | 3 +-- test/srt/model/bench_llama_low_api.py | 13 +++++++------ test/srt/model/reference_hf.py | 1 - test/srt/model/test_llama_extend.py | 5 +---- test/srt/model/test_llama_low_api.py | 1 + test/srt/model/test_llava_low_api.py | 6 ++---- test/srt/test_flashinfer.py | 1 + test/srt/test_httpserver_concurrent.py | 3 --- test/srt/test_httpserver_llava.py | 1 - test/srt/test_httpserver_reuse.py | 1 - test/srt/test_jump_forward.py | 4 ++-- test/srt/test_robust.py | 6 +++--- 57 files changed, 125 insertions(+), 139 deletions(-) create mode 100644 .isort.cfg diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 000000000..482b916dd --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,3 @@ +[settings] +profile=black +known_first_party=sglang \ No newline at end of file diff --git a/python/sglang/api.py b/python/sglang/api.py index 9470b1425..21a9a13fa 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -3,11 +3,7 @@ import re from typing import Callable, List, Optional, Union -from sglang.backend.anthropic import Anthropic from sglang.backend.base_backend import BaseBackend -from sglang.backend.openai import OpenAI -from sglang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.backend.vertexai import VertexAI from sglang.global_config import global_config from sglang.lang.ir import ( SglExpr, diff --git a/python/sglang/backend/anthropic.py b/python/sglang/backend/anthropic.py index ea3213e9d..82b3ab7b0 100644 --- a/python/sglang/backend/anthropic.py +++ b/python/sglang/backend/anthropic.py @@ -1,6 +1,3 @@ -from typing import List, Optional, Union - -import numpy as np from sglang.backend.base_backend import BaseBackend from sglang.lang.chat_template import get_chat_template from sglang.lang.interpreter import StreamExecutor diff --git a/python/sglang/backend/base_backend.py b/python/sglang/backend/base_backend.py index cb504f51b..606b821a8 100644 --- a/python/sglang/backend/base_backend.py +++ b/python/sglang/backend/base_backend.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional, Union +from typing import List, Optional, Union from sglang.lang.chat_template import get_chat_template from sglang.lang.interpreter import StreamExecutor diff --git a/python/sglang/backend/openai.py b/python/sglang/backend/openai.py index 6cad2f6aa..06f80c341 100644 --- a/python/sglang/backend/openai.py +++ b/python/sglang/backend/openai.py @@ -1,8 +1,9 @@ import logging import time -from typing import Callable, List, Optional, Union +from typing import List, Optional import numpy as np + from sglang.backend.base_backend import BaseBackend from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path from sglang.lang.interpreter import StreamExecutor diff --git a/python/sglang/backend/runtime_endpoint.py b/python/sglang/backend/runtime_endpoint.py index aba69f00c..13e905e3e 100644 --- a/python/sglang/backend/runtime_endpoint.py +++ b/python/sglang/backend/runtime_endpoint.py @@ -1,14 +1,14 @@ import json -from typing import Callable, List, Optional, Union +from typing import List, Optional import numpy as np -import requests + from sglang.backend.base_backend import BaseBackend from sglang.global_config import global_config from sglang.lang.chat_template import get_chat_template_by_model_path from sglang.lang.interpreter import StreamExecutor -from sglang.lang.ir import SglArgument, SglSamplingParams -from sglang.utils import encode_image_base64, find_printable_text, http_request +from sglang.lang.ir import SglSamplingParams +from sglang.utils import find_printable_text, http_request class RuntimeEndpoint(BaseBackend): diff --git a/python/sglang/backend/vertexai.py b/python/sglang/backend/vertexai.py index 5c3c307e2..30829ebf9 100644 --- a/python/sglang/backend/vertexai.py +++ b/python/sglang/backend/vertexai.py @@ -1,8 +1,6 @@ import os import warnings -from typing import List, Optional, Union -import numpy as np from sglang.backend.base_backend import BaseBackend from sglang.lang.chat_template import get_chat_template from sglang.lang.interpreter import StreamExecutor diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py index d91dee365..187e0b885 100644 --- a/python/sglang/lang/chat_template.py +++ b/python/sglang/lang/chat_template.py @@ -1,6 +1,6 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass from enum import Enum, auto -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Tuple class ChatTemplateStyle(Enum): diff --git a/python/sglang/lang/compiler.py b/python/sglang/lang/compiler.py index 2c071e407..b2a83ea3c 100644 --- a/python/sglang/lang/compiler.py +++ b/python/sglang/lang/compiler.py @@ -5,13 +5,7 @@ from typing import List, Union from sglang.global_config import global_config from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program -from sglang.lang.ir import ( - SglArgument, - SglConstantText, - SglExpr, - SglSamplingParams, - SglVariable, -) +from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable def compile_func(function, backend): diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index ef3d9fb1f..d9cf9f839 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -7,9 +7,10 @@ import threading import uuid from concurrent.futures import ThreadPoolExecutor from contextlib import contextmanager -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional import tqdm + from sglang.global_config import global_config from sglang.lang.ir import ( SglCommitLazy, @@ -17,7 +18,6 @@ from sglang.lang.ir import ( SglConstantText, SglExpr, SglExprList, - SglFunction, SglGen, SglImage, SglRoleBegin, diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index 9895786dc..66f515686 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -472,4 +472,4 @@ class SglCommitLazy(SglExpr): super().__init__() def __repr__(self): - return f"CommitLazy()" + return "CommitLazy()" diff --git a/python/sglang/lang/tracer.py b/python/sglang/lang/tracer.py index 74ac9b998..adfe1af0a 100644 --- a/python/sglang/lang/tracer.py +++ b/python/sglang/lang/tracer.py @@ -1,20 +1,16 @@ """Tracing a program.""" import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from sglang.backend.base_backend import BaseBackend -from sglang.global_config import global_config from sglang.lang.interpreter import ProgramState, ProgramStateGroup from sglang.lang.ir import ( SglArgument, - SglCommitLazy, - SglConcateAndAppend, SglConstantText, SglExpr, SglExprList, SglFork, - SglFunction, SglGen, SglGetForkItem, SglRoleBegin, diff --git a/python/sglang/srt/constrained/jump_forward.py b/python/sglang/srt/constrained/jump_forward.py index 85922e3cf..5955c6147 100644 --- a/python/sglang/srt/constrained/jump_forward.py +++ b/python/sglang/srt/constrained/jump_forward.py @@ -1,4 +1,5 @@ import interegular + from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm from sglang.srt.constrained.base_cache import BaseCache diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index fde8457a3..d88e13616 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -3,10 +3,9 @@ import json import os import warnings -from typing import List, Optional, Tuple, Union +from typing import Optional, Union from huggingface_hub import snapshot_download -from sglang.srt.utils import is_multimodal_model from transformers import ( AutoConfig, AutoProcessor, @@ -15,6 +14,8 @@ from transformers import ( PreTrainedTokenizerFast, ) +from sglang.srt.utils import is_multimodal_model + def download_from_hf(model_path: str): if os.path.exists(model_path): diff --git a/python/sglang/srt/layers/context_flashattention_nopad.py b/python/sglang/srt/layers/context_flashattention_nopad.py index 2ac3d39e9..0c3102c3f 100644 --- a/python/sglang/srt/layers/context_flashattention_nopad.py +++ b/python/sglang/srt/layers/context_flashattention_nopad.py @@ -3,6 +3,7 @@ import torch import triton import triton.language as tl + from sglang.srt.utils import wrap_kernel_launcher CUDA_CAPABILITY = torch.cuda.get_device_capability() diff --git a/python/sglang/srt/layers/extend_attention.py b/python/sglang/srt/layers/extend_attention.py index 62167a582..ce402a910 100644 --- a/python/sglang/srt/layers/extend_attention.py +++ b/python/sglang/srt/layers/extend_attention.py @@ -1,6 +1,7 @@ import torch import triton import triton.language as tl + from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd from sglang.srt.utils import wrap_kernel_launcher diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index f96471e63..617dcdf3e 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -1,11 +1,12 @@ import torch -from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata from torch import nn from vllm.model_executor.parallel_utils.communication_op import ( get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, ) +from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata + class LogitsProcessor(nn.Module): def __init__(self, config): diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 5adc31d3e..cef2c3b7f 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -1,9 +1,10 @@ import torch +from torch import nn + from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd from sglang.srt.layers.extend_attention import extend_attention_fwd from sglang.srt.layers.token_attention import token_attention_fwd from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata -from torch import nn class RadixAttention(nn.Module): diff --git a/python/sglang/srt/layers/token_attention.py b/python/sglang/srt/layers/token_attention.py index b0dac1759..7b03e3ffe 100644 --- a/python/sglang/srt/layers/token_attention.py +++ b/python/sglang/srt/layers/token_attention.py @@ -4,6 +4,7 @@ import torch import triton import triton.language as tl + from sglang.srt.managers.router.model_runner import global_server_args_dict from sglang.srt.utils import wrap_kernel_launcher diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index 566d40d13..5076a57f8 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -3,6 +3,7 @@ import asyncio import uvloop import zmq import zmq.asyncio + from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut from sglang.srt.server_args import PortArgs, ServerArgs @@ -83,7 +84,7 @@ def start_detokenizer_process( ): try: manager = DetokenizerManager(server_args, port_args) - except Exception as e: + except Exception: pipe_writer.send(get_exception_traceback()) raise pipe_writer.send("init ok") diff --git a/python/sglang/srt/managers/router/infer_batch.py b/python/sglang/srt/managers/router/infer_batch.py index 9d1b366e3..c52892c14 100644 --- a/python/sglang/srt/managers/router/infer_batch.py +++ b/python/sglang/srt/managers/router/infer_batch.py @@ -4,6 +4,7 @@ from typing import List import numpy as np import torch + from sglang.srt.managers.router.radix_cache import RadixCache from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool diff --git a/python/sglang/srt/managers/router/manager.py b/python/sglang/srt/managers/router/manager.py index 7562a3022..c331ae2bb 100644 --- a/python/sglang/srt/managers/router/manager.py +++ b/python/sglang/srt/managers/router/manager.py @@ -4,6 +4,7 @@ import logging import uvloop import zmq import zmq.asyncio + from sglang.srt.backend_config import GLOBAL_BACKEND_CONFIG from sglang.srt.managers.router.model_rpc import ModelRpcClient from sglang.srt.server_args import PortArgs, ServerArgs diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py index cbf504b99..883bb12f9 100644 --- a/python/sglang/srt/managers/router/model_rpc.py +++ b/python/sglang/srt/managers/router/model_rpc.py @@ -10,6 +10,8 @@ import rpyc import torch from rpyc.utils.classic import obtain from rpyc.utils.server import ThreadedServer +from vllm.logger import _default_handler as vllm_default_handler + from sglang.srt.constrained.fsm_cache import FSMCache from sglang.srt.constrained.jump_forward import JumpForwardCache from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer @@ -30,7 +32,6 @@ from sglang.srt.utils import ( is_multimodal_model, set_random_seed, ) -from vllm.logger import _default_handler as vllm_default_handler logger = logging.getLogger("model_rpc") diff --git a/python/sglang/srt/managers/router/model_runner.py b/python/sglang/srt/managers/router/model_runner.py index d5f120418..0837c51bb 100644 --- a/python/sglang/srt/managers/router/model_runner.py +++ b/python/sglang/srt/managers/router/model_runner.py @@ -9,16 +9,17 @@ from typing import List import numpy as np import torch -from sglang.srt.managers.router.infer_batch import Batch, ForwardMode -from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool -from sglang.srt.utils import is_multimodal_model -from sglang.utils import get_available_gpu_memory from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.model_loader import _set_default_torch_dtype from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel +from sglang.srt.managers.router.infer_batch import Batch, ForwardMode +from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool +from sglang.srt.utils import is_multimodal_model +from sglang.utils import get_available_gpu_memory + QUANTIZATION_CONFIG_MAPPING = { "awq": AWQConfig, "gptq": GPTQConfig, diff --git a/python/sglang/srt/managers/router/radix_cache.py b/python/sglang/srt/managers/router/radix_cache.py index c7bd9cb6b..ccf7f4af4 100644 --- a/python/sglang/srt/managers/router/radix_cache.py +++ b/python/sglang/srt/managers/router/radix_cache.py @@ -1,8 +1,6 @@ import heapq import time from collections import defaultdict -from dataclasses import dataclass -from typing import Tuple import torch diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 183c34bef..78241b74b 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -10,6 +10,7 @@ import transformers import uvloop import zmq import zmq.asyncio + from sglang.srt.hf_transformers_utils import ( get_config, get_context_length, diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index 6f53bf2ec..60aa095d1 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -20,13 +20,10 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import List, Optional, Tuple +from typing import Optional, Tuple import torch import torch.utils.checkpoint -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from torch.nn.parameter import Parameter from transformers import PretrainedConfig @@ -49,6 +46,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + @torch.compile def layer_norm_func(hidden_states, weight, variance_epsilon): diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 4742982cf..50215a2ef 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -5,10 +5,6 @@ from typing import Optional import torch import torch.nn as nn -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata -from sglang.srt.models.dbrx_config import DbrxConfig from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.linear import ( LinearMethodBase, @@ -35,6 +31,11 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata +from sglang.srt.models.dbrx_config import DbrxConfig + class DbrxRouter(nn.Module): """A Router implementation for DBRX that returns logits for each expert diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 624f8fd5c..37b352803 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -4,9 +4,6 @@ from typing import Optional, Tuple import torch -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from transformers import PretrainedConfig from vllm.config import LoRAConfig @@ -28,6 +25,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + class GemmaMLP(nn.Module): def __init__( diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index e5c28fa12..212c4cf87 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -1,12 +1,9 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/671af2b1c0b3ed6d856d37c21a561cc429a10701/vllm/model_executor/models/llama.py#L1 """Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import torch -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from transformers import LlamaConfig from vllm.model_executor.layers.activation import SiluAndMul @@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + class LlamaMLP(nn.Module): def __init__( diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py index 8e42d48c7..e7db6a543 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -4,6 +4,15 @@ from typing import List, Optional import numpy as np import torch +from torch import nn +from transformers import CLIPVisionModel, LlavaConfig +from transformers.models.llava.modeling_llava import LlavaMultiModalProjector +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.weight_utils import ( + default_weight_loader, + hf_model_weights_iterator, +) + from sglang.srt.managers.router.infer_batch import ForwardMode from sglang.srt.managers.router.model_runner import InputMetadata from sglang.srt.mm_utils import ( @@ -12,14 +21,6 @@ from sglang.srt.mm_utils import ( unpad_image_shape, ) from sglang.srt.models.llama2 import LlamaForCausalLM -from torch import nn -from transformers import CLIPVisionModel, LlamaConfig, LlavaConfig -from transformers.models.llava.modeling_llava import LlavaMultiModalProjector -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.weight_utils import ( - default_weight_loader, - hf_model_weights_iterator, -) class LlavaLlamaForCausalLM(nn.Module): diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 01a830807..9d3742535 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -1,14 +1,11 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/d0215a58e78572d91dadafe9d832a2db89b09a13/vllm/model_executor/models/mixtral.py#L1 """Inference-only Mixtral model.""" -from typing import List, Optional, Tuple +from typing import Optional import numpy as np import torch import torch.nn.functional as F -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from transformers import MixtralConfig from vllm.model_executor.layers.layernorm import RMSNorm @@ -35,6 +32,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + class MixtralMLP(nn.Module): def __init__( diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 111ad704b..12480016d 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -1,9 +1,6 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional import torch -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from transformers import PretrainedConfig from vllm.model_executor.layers.activation import SiluAndMul @@ -27,6 +24,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + class QWenMLP(nn.Module): def __init__( diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 26f0a5ae1..2314e5a33 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -1,12 +1,9 @@ # Adapted from llama2.py # Modify details for the adaptation of Qwen2 model. """Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import torch -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -29,6 +26,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + Qwen2Config = None diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index f839ea0b9..9d559ecfa 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -5,9 +5,6 @@ model compatible with HuggingFace weights.""" from typing import Optional, Tuple import torch -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.router.model_runner import InputMetadata from torch import nn from transformers import PretrainedConfig from vllm.model_executor.layers.activation import SiluAndMul @@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import ( hf_model_weights_iterator, ) +from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.managers.router.model_runner import InputMetadata + class StablelmMLP(nn.Module): def __init__( diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py index 014e40c6e..f2d7b1948 100644 --- a/python/sglang/srt/models/yivl.py +++ b/python/sglang/srt/models/yivl.py @@ -1,21 +1,20 @@ """Inference-only Yi-VL model.""" -import os -from typing import List, Optional +from typing import Optional import torch import torch.nn as nn -from sglang.srt.models.llava import ( - LlavaLlamaForCausalLM, - clip_vision_embed_forward, - monkey_path_clip_vision_embed_forward, -) from transformers import CLIPVisionModel, LlavaConfig from vllm.model_executor.weight_utils import ( default_weight_loader, hf_model_weights_iterator, ) +from sglang.srt.models.llava import ( + LlavaLlamaForCausalLM, + monkey_path_clip_vision_embed_forward, +) + class YiVLForCausalLM(LlavaLlamaForCausalLM): def __init__(self, *args, **kwargs): diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index aa3e5291b..5643f0ad4 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -10,9 +10,6 @@ import threading import time from typing import List, Optional, Union -# Fix a Python bug -setattr(threading, "_register_atexit", lambda *args, **kwargs: None) - import aiohttp import psutil import pydantic @@ -22,6 +19,9 @@ import uvloop from fastapi import FastAPI, HTTPException, Request from fastapi.responses import Response, StreamingResponse from pydantic import BaseModel +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.responses import JSONResponse + from sglang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.constrained import disable_cache from sglang.srt.conversation import ( @@ -54,8 +54,9 @@ from sglang.srt.managers.router.manager import start_router_process from sglang.srt.managers.tokenizer_manager import TokenizerManager from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.utils import enable_show_time_cost, handle_port_init -from starlette.middleware.base import BaseHTTPMiddleware -from starlette.responses import JSONResponse + +# Fix a Python bug +setattr(threading, "_register_atexit", lambda *args, **kwargs: None) asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) @@ -618,7 +619,7 @@ def launch_server(server_args, pipe_finish_writer): try: requests.get(url + "/get_model_info", timeout=5, headers=headers) break - except requests.exceptions.RequestException as e: + except requests.exceptions.RequestException: pass else: if pipe_finish_writer is not None: diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 0f7322bb6..479bdda09 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -157,7 +157,6 @@ def get_exception_traceback(): def get_int_token_logit_bias(tokenizer, vocab_size): - from transformers import LlamaTokenizer, LlamaTokenizerFast # a bug when model's vocab size > tokenizer.vocab_size vocab_size = tokenizer.vocab_size diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index beddd6255..4d5e18211 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -2,6 +2,7 @@ import numpy as np import requests + from sglang.backend.openai import OpenAI from sglang.backend.runtime_endpoint import RuntimeEndpoint from sglang.global_config import global_config diff --git a/test/lang/run_all.py b/test/lang/run_all.py index cb5da1585..75d5d5c2b 100644 --- a/test/lang/run_all.py +++ b/test/lang/run_all.py @@ -1,7 +1,6 @@ import argparse import glob import multiprocessing -import os import time import unittest diff --git a/test/lang/test_anthropic_backend.py b/test/lang/test_anthropic_backend.py index b0da88838..83f6c76f4 100644 --- a/test/lang/test_anthropic_backend.py +++ b/test/lang/test_anthropic_backend.py @@ -1,9 +1,7 @@ -import json import unittest -from sglang.test.test_programs import test_mt_bench, test_stream - from sglang import Anthropic, set_default_backend +from sglang.test.test_programs import test_mt_bench, test_stream class TestAnthropicBackend(unittest.TestCase): diff --git a/test/lang/test_bind_pin.py b/test/lang/test_bind_pin.py index 38e5daa41..626d6ff05 100644 --- a/test/lang/test_bind_pin.py +++ b/test/lang/test_bind_pin.py @@ -1,8 +1,7 @@ import unittest -from sglang.backend.runtime_endpoint import RuntimeEndpoint - import sglang as sgl +from sglang.backend.runtime_endpoint import RuntimeEndpoint class TestBind(unittest.TestCase): diff --git a/test/lang/test_openai_backend.py b/test/lang/test_openai_backend.py index 236c548a8..bb8b9f77e 100644 --- a/test/lang/test_openai_backend.py +++ b/test/lang/test_openai_backend.py @@ -1,5 +1,6 @@ import unittest +from sglang import OpenAI, set_default_backend from sglang.test.test_programs import ( test_decode_int, test_decode_json, @@ -15,8 +16,6 @@ from sglang.test.test_programs import ( test_tool_use, ) -from sglang import OpenAI, set_default_backend - class TestOpenAIBackend(unittest.TestCase): backend = None diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py index 82a9f1ad4..007d96257 100644 --- a/test/lang/test_srt_backend.py +++ b/test/lang/test_srt_backend.py @@ -2,9 +2,9 @@ python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 """ -import json import unittest +import sglang as sgl from sglang.test.test_programs import ( test_decode_int, test_decode_json_regex, @@ -12,16 +12,12 @@ from sglang.test.test_programs import ( test_few_shot_qa, test_mt_bench, test_parallel_decoding, - test_parallel_encoding, - test_react, test_regex, test_select, test_stream, test_tool_use, ) -import sglang as sgl - class TestSRTBackend(unittest.TestCase): backend = None diff --git a/test/lang/test_tracing.py b/test/lang/test_tracing.py index cdc9000d8..f77b50752 100644 --- a/test/lang/test_tracing.py +++ b/test/lang/test_tracing.py @@ -1,10 +1,9 @@ import unittest +import sglang as sgl from sglang.backend.base_backend import BaseBackend from sglang.lang.chat_template import get_chat_template -import sglang as sgl - class TestTracing(unittest.TestCase): def test_few_shot_qa(self): @@ -111,7 +110,7 @@ class TestTracing(unittest.TestCase): forks = s.fork(3) for i in range(3): forks[i] += f"Now, expand tip {i+1} into a paragraph:\n" - forks[i] += sgl.gen(f"detailed_tip") + forks[i] += sgl.gen("detailed_tip") s += "Tip 1:" + forks[0]["detailed_tip"] + "\n" s += "Tip 2:" + forks[1]["detailed_tip"] + "\n" diff --git a/test/lang/test_vertexai_backend.py b/test/lang/test_vertexai_backend.py index a17ab4ba7..aae840101 100644 --- a/test/lang/test_vertexai_backend.py +++ b/test/lang/test_vertexai_backend.py @@ -1,5 +1,6 @@ import unittest +from sglang import VertexAI, set_default_backend from sglang.test.test_programs import ( test_expert_answer, test_few_shot_qa, @@ -10,8 +11,6 @@ from sglang.test.test_programs import ( test_stream, ) -from sglang import VertexAI, set_default_backend - class TestVertexAIBackend(unittest.TestCase): backend = None diff --git a/test/srt/model/bench_llama_low_api.py b/test/srt/model/bench_llama_low_api.py index 34c64cd6c..9c6bce91d 100644 --- a/test/srt/model/bench_llama_low_api.py +++ b/test/srt/model/bench_llama_low_api.py @@ -4,6 +4,7 @@ from dataclasses import dataclass import torch import torch.distributed as dist + from sglang.srt.managers.router.model_runner import ModelRunner from sglang.srt.model_config import ModelConfig @@ -66,9 +67,9 @@ class BenchBatch: p_idx = prefix_req_idx[i // fork_num].item() n_idx = self.req_pool_indices[i].item() req_to_token[n_idx, :prefix_len] = req_to_token[p_idx, :prefix_len] - req_to_token[ - n_idx, prefix_len : prefix_len + extend_len - ] = self.out_cache_loc[i * extend_len : (i + 1) * extend_len] + req_to_token[n_idx, prefix_len : prefix_len + extend_len] = ( + self.out_cache_loc[i * extend_len : (i + 1) * extend_len] + ) def update_decode(self, predict_ids, batch_size): assert predict_ids.shape[0] == batch_size @@ -81,9 +82,9 @@ class BenchBatch: self.out_cache_cont_start, self.out_cache_cont_end, ) = self.token_to_kv_pool.alloc_contiguous(batch_size) - self.req_to_token_pool.req_to_token[ - self.req_pool_indices, self.seq_lens - ] = self.out_cache_loc + self.req_to_token_pool.req_to_token[self.req_pool_indices, self.seq_lens] = ( + self.out_cache_loc + ) self.seq_lens.add_(1) diff --git a/test/srt/model/reference_hf.py b/test/srt/model/reference_hf.py index e63866f02..4060f9212 100644 --- a/test/srt/model/reference_hf.py +++ b/test/srt/model/reference_hf.py @@ -1,5 +1,4 @@ import argparse -import os import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/srt/model/test_llama_extend.py b/test/srt/model/test_llama_extend.py index 2931dfa5d..cf589be7b 100644 --- a/test/srt/model/test_llama_extend.py +++ b/test/srt/model/test_llama_extend.py @@ -1,11 +1,8 @@ import multiprocessing import os -import time -import numpy as np -import torch -import torch.distributed as dist import transformers + from sglang.srt.managers.router.infer_batch import Batch, ForwardMode, Req from sglang.srt.managers.router.model_runner import ModelRunner from sglang.srt.model_config import ModelConfig diff --git a/test/srt/model/test_llama_low_api.py b/test/srt/model/test_llama_low_api.py index a8917ee4a..80a79e0c6 100644 --- a/test/srt/model/test_llama_low_api.py +++ b/test/srt/model/test_llama_low_api.py @@ -4,6 +4,7 @@ import time import numpy as np import torch import torch.distributed as dist + from sglang.srt.managers.router.model_runner import ModelRunner from sglang.srt.model_config import ModelConfig diff --git a/test/srt/model/test_llava_low_api.py b/test/srt/model/test_llava_low_api.py index 322ba4855..38b030d07 100644 --- a/test/srt/model/test_llava_low_api.py +++ b/test/srt/model/test_llava_low_api.py @@ -1,12 +1,10 @@ import multiprocessing -import time import numpy as np import torch -import torch.distributed as dist + from sglang.srt.hf_transformers_utils import get_processor -from sglang.srt.managers.router.infer_batch import ForwardMode -from sglang.srt.managers.router.model_runner import InputMetadata, ModelRunner +from sglang.srt.managers.router.model_runner import ModelRunner from sglang.srt.model_config import ModelConfig from sglang.srt.utils import load_image diff --git a/test/srt/test_flashinfer.py b/test/srt/test_flashinfer.py index 3fef32e99..40deb23d9 100644 --- a/test/srt/test_flashinfer.py +++ b/test/srt/test_flashinfer.py @@ -1,6 +1,7 @@ import flashinfer import pytest import torch + from sglang.srt.layers.extend_attention import extend_attention_fwd from sglang.srt.layers.token_attention import token_attention_fwd diff --git a/test/srt/test_httpserver_concurrent.py b/test/srt/test_httpserver_concurrent.py index 855e51f33..6cdd5332d 100644 --- a/test/srt/test_httpserver_concurrent.py +++ b/test/srt/test_httpserver_concurrent.py @@ -9,11 +9,8 @@ The capital of the United Kindom is London.\nThe capital of the United Kingdom i import argparse import asyncio -import json -import time import aiohttp -import requests async def send_request(url, data, delay=0): diff --git a/test/srt/test_httpserver_llava.py b/test/srt/test_httpserver_llava.py index 0f6571b45..6db4ab930 100644 --- a/test/srt/test_httpserver_llava.py +++ b/test/srt/test_httpserver_llava.py @@ -10,7 +10,6 @@ The image features a man standing on the back of a yellow taxi cab, holding import argparse import asyncio import json -import time import aiohttp import requests diff --git a/test/srt/test_httpserver_reuse.py b/test/srt/test_httpserver_reuse.py index c3f786589..ef866afc6 100644 --- a/test/srt/test_httpserver_reuse.py +++ b/test/srt/test_httpserver_reuse.py @@ -6,7 +6,6 @@ The capital of France is Paris.\nThe capital of the United States is Washington, """ import argparse -import time import requests diff --git a/test/srt/test_jump_forward.py b/test/srt/test_jump_forward.py index 15ec2caff..60074a040 100644 --- a/test/srt/test_jump_forward.py +++ b/test/srt/test_jump_forward.py @@ -2,14 +2,14 @@ import argparse from enum import Enum from pydantic import BaseModel, constr + +import sglang as sgl from sglang.srt.constrained import build_regex_from_object from sglang.test.test_utils import ( add_common_sglang_args_and_parse, select_sglang_backend, ) -import sglang as sgl - IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)" ip_jump_forward = ( diff --git a/test/srt/test_robust.py b/test/srt/test_robust.py index 5b479318f..633e2e649 100644 --- a/test/srt/test_robust.py +++ b/test/srt/test_robust.py @@ -2,13 +2,13 @@ import argparse import random import string +from vllm.transformers_utils.tokenizer import get_tokenizer + +import sglang as sgl from sglang.test.test_utils import ( add_common_sglang_args_and_parse, select_sglang_backend, ) -from vllm.transformers_utils.tokenizer import get_tokenizer - -import sglang as sgl TOKENIZER = None RANDOM_PREFILL_LEN = None