From 033c715b4662576bade850fc42b20ce07151fa46 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 17 Jan 2025 23:46:48 +0800 Subject: [PATCH] cleanup models dependencies 1/n (#2948) --- python/sglang/srt/layers/moe/ep_moe/layer.py | 2 +- python/sglang/srt/lora/lora.py | 10 +--------- python/sglang/srt/models/baichuan.py | 10 +++++----- python/sglang/srt/models/gpt2.py | 3 +-- python/sglang/srt/models/minicpm3.py | 12 ++++++------ python/sglang/srt/models/olmo2.py | 2 +- python/sglang/srt/models/olmoe.py | 11 +++++------ python/sglang/srt/models/qwen2_vl.py | 4 ++-- python/sglang/srt/models/xverse.py | 12 ++++++------ python/sglang/srt/models/xverse_moe.py | 16 ++++++++-------- 10 files changed, 36 insertions(+), 46 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index eaa65c544..8f5a71dff 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -5,7 +5,6 @@ import torch from torch.nn import Module from vllm import _custom_ops as ops from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -25,6 +24,7 @@ from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) +from sglang.srt.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod from sglang.srt.utils import is_hip, set_weight_attrs logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py index 839d10222..c8cbe3660 100644 --- a/python/sglang/srt/lora/lora.py +++ b/python/sglang/srt/lora/lora.py @@ -19,18 +19,11 @@ # https://github.com/vllm-project/vllm/blob/4abf6336ec65c270343eb895e7b18786e9274176/vllm/lora/layers.py -import json -import os import re -from typing import Any, Dict, List, Optional, Tuple -import safetensors.torch import torch from torch import nn -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from sglang.srt.layers.linear import ( ColumnParallelLinear, @@ -38,7 +31,6 @@ from sglang.srt.layers.linear import ( QKVParallelLinear, RowParallelLinear, ) -from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.model_loader.loader import DefaultModelLoader diff --git a/python/sglang/srt/models/baichuan.py b/python/sglang/srt/models/baichuan.py index c973e64c7..d8916abac 100644 --- a/python/sglang/srt/models/baichuan.py +++ b/python/sglang/srt/models/baichuan.py @@ -24,11 +24,6 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.model_executor.layers.linear import ( - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) from vllm.model_executor.layers.rotary_embedding import get_rope from sglang.srt.distributed import ( @@ -37,6 +32,11 @@ from sglang.srt.distributed import ( ) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/models/gpt2.py b/python/sglang/srt/models/gpt2.py index a99232dc2..280ff152a 100644 --- a/python/sglang/srt/models/gpt2.py +++ b/python/sglang/srt/models/gpt2.py @@ -22,10 +22,9 @@ from typing import Iterable, List, Optional, Tuple import torch from torch import nn from transformers import GPT2Config -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size +from sglang.srt.layers.activation import get_act_fn # from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/minicpm3.py b/python/sglang/srt/models/minicpm3.py index f5e722a14..2d15af43f 100644 --- a/python/sglang/srt/models/minicpm3.py +++ b/python/sglang/srt/models/minicpm3.py @@ -19,17 +19,17 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, - MergedColumnParallelLinear, - ReplicatedLinear, - RowParallelLinear, -) from vllm.model_executor.layers.rotary_embedding import get_rope from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/models/olmo2.py b/python/sglang/srt/models/olmo2.py index df0121930..fafe39d71 100755 --- a/python/sglang/srt/models/olmo2.py +++ b/python/sglang/srt/models/olmo2.py @@ -22,7 +22,6 @@ import torch from torch import nn from transformers import PretrainedConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from sglang.srt.distributed import ( get_tensor_model_parallel_rank, @@ -45,6 +44,7 @@ from sglang.srt.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ) from sglang.srt.model_executor.forward_batch_info import ForwardBatch +from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import make_layers diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py index 74bc98372..9abe9ff25 100644 --- a/python/sglang/srt/models/olmoe.py +++ b/python/sglang/srt/models/olmoe.py @@ -23,12 +23,6 @@ import torch import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig -from vllm.model_executor.layers.linear import ( - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear, -) from vllm.model_executor.layers.rotary_embedding import get_rope from sglang.srt.distributed import ( @@ -37,6 +31,11 @@ from sglang.srt.distributed import ( ) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.linear import ( + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput from sglang.srt.layers.moe.fused_moe_triton import FusedMoE from sglang.srt.layers.quantization.base_config import QuantizationConfig diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py index fc5dd49ea..83912e894 100644 --- a/python/sglang/srt/models/qwen2_vl.py +++ b/python/sglang/srt/models/qwen2_vl.py @@ -22,6 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" +import logging from functools import lru_cache, partial from typing import Iterable, List, Optional, Tuple, Type, TypedDict @@ -30,7 +31,6 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig @@ -50,7 +50,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen2 import Qwen2Model -logger = init_logger(__name__) +logger = logging.getLogger(__name__) # === Vision Inputs === # diff --git a/python/sglang/srt/models/xverse.py b/python/sglang/srt/models/xverse.py index 7d6158a9b..799e513ae 100644 --- a/python/sglang/srt/models/xverse.py +++ b/python/sglang/srt/models/xverse.py @@ -21,16 +21,16 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import LlamaConfig -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import get_tensor_model_parallel_world_size +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - -from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/models/xverse_moe.py b/python/sglang/srt/models/xverse_moe.py index 27f763011..97b62815a 100644 --- a/python/sglang/srt/models/xverse_moe.py +++ b/python/sglang/srt/models/xverse_moe.py @@ -18,14 +18,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear, -) from vllm.model_executor.layers.rotary_embedding import get_rope from sglang.srt.distributed import ( @@ -33,6 +25,14 @@ from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.moe.fused_moe_triton import fused_moe from sglang.srt.layers.quantization.base_config import QuantizationConfig