From 5dc54f1a627a1e412c6732777dc2622363df7c21 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 17 Jan 2025 22:31:51 +0800 Subject: [PATCH] feat: remove vllm distributed (#2907) Co-authored-by: Zhangyi <1109276519@qq.com> --- python/sglang/srt/layers/activation.py | 6 +++--- python/sglang/srt/layers/dp_attention.py | 3 ++- python/sglang/srt/layers/linear.py | 4 ++-- python/sglang/srt/layers/logits_processor.py | 4 ++-- python/sglang/srt/layers/moe/ep_moe/layer.py | 8 ++++---- .../srt/layers/moe/fused_moe_triton/layer.py | 6 +++--- python/sglang/srt/layers/parameter.py | 3 ++- python/sglang/srt/layers/quantization/fp8.py | 2 +- .../sglang/srt/layers/vocab_parallel_embedding.py | 4 ++-- .../sglang/srt/model_executor/cuda_graph_runner.py | 4 ++-- python/sglang/srt/model_executor/model_runner.py | 14 +++++++++----- python/sglang/srt/model_loader/loader.py | 14 ++++++++------ python/sglang/srt/model_loader/weight_utils.py | 2 +- python/sglang/srt/models/baichuan.py | 8 ++++---- python/sglang/srt/models/chatglm.py | 2 +- python/sglang/srt/models/commandr.py | 6 +++--- python/sglang/srt/models/dbrx.py | 8 ++++---- python/sglang/srt/models/deepseek.py | 6 +++--- python/sglang/srt/models/deepseek_v2.py | 6 +++--- python/sglang/srt/models/exaone.py | 2 +- python/sglang/srt/models/gemma.py | 2 +- python/sglang/srt/models/gemma2.py | 2 +- python/sglang/srt/models/gpt2.py | 3 ++- python/sglang/srt/models/gpt_bigcode.py | 2 +- python/sglang/srt/models/granite.py | 2 +- python/sglang/srt/models/grok.py | 6 +++--- python/sglang/srt/models/internlm2.py | 2 +- python/sglang/srt/models/llama.py | 8 ++++---- python/sglang/srt/models/minicpm.py | 2 +- python/sglang/srt/models/minicpm3.py | 2 +- python/sglang/srt/models/mixtral.py | 6 +++--- python/sglang/srt/models/mixtral_quant.py | 6 +++--- python/sglang/srt/models/mllama.py | 4 ++-- python/sglang/srt/models/olmo.py | 2 +- python/sglang/srt/models/olmo2.py | 8 ++++---- python/sglang/srt/models/olmoe.py | 8 ++++---- python/sglang/srt/models/phi3_small.py | 2 +- python/sglang/srt/models/qwen.py | 2 +- python/sglang/srt/models/qwen2.py | 2 +- python/sglang/srt/models/qwen2_moe.py | 6 +++--- python/sglang/srt/models/qwen2_vl.py | 4 ++-- python/sglang/srt/models/stablelm.py | 2 +- python/sglang/srt/models/torch_native_llama.py | 6 +++--- python/sglang/srt/models/xverse.py | 2 +- python/sglang/srt/models/xverse_moe.py | 10 +++++----- 45 files changed, 111 insertions(+), 102 deletions(-) diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index c4c54f0b0..ebb0652c5 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -25,13 +25,13 @@ from sglang.srt.utils import is_flashinfer_available if is_flashinfer_available(): from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul -from vllm.distributed import ( +from vllm.model_executor.custom_op import CustomOp + +from sglang.srt.distributed import ( divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from vllm.model_executor.custom_op import CustomOp - from sglang.srt.layers.custom_op_util import register_custom_op from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.utils import set_weight_attrs diff --git a/python/sglang/srt/layers/dp_attention.py b/python/sglang/srt/layers/dp_attention.py index 41bcb2181..65efa0feb 100644 --- a/python/sglang/srt/layers/dp_attention.py +++ b/python/sglang/srt/layers/dp_attention.py @@ -1,5 +1,6 @@ import torch -from vllm.distributed import GroupCoordinator, get_tp_group + +from sglang.srt.distributed import GroupCoordinator, get_tp_group _ATTN_TP_GROUP = None _ATTN_TP_RANK = None diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py index 4596f3d78..bfa5d2b66 100644 --- a/python/sglang/srt/layers/linear.py +++ b/python/sglang/srt/layers/linear.py @@ -7,7 +7,8 @@ from typing import Dict, List, Optional, Tuple import torch import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter -from vllm.distributed import ( + +from sglang.srt.distributed import ( divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -15,7 +16,6 @@ from vllm.distributed import ( tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce, ) - from sglang.srt.layers.parameter import ( BasevLLMParameter, PackedColumnParameter, diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index e1dc94548..a4fe49051 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -20,11 +20,11 @@ import torch import triton import triton.language as tl from torch import nn -from vllm.distributed import ( + +from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, ) - from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding from sglang.srt.model_executor.forward_batch_info import ( CaptureHiddenMode, diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py index 96e02e312..eaa65c544 100644 --- a/python/sglang/srt/layers/moe/ep_moe/layer.py +++ b/python/sglang/srt/layers/moe/ep_moe/layer.py @@ -4,13 +4,13 @@ from typing import Callable, List, Optional, Tuple import torch from torch.nn import Module from vllm import _custom_ops as ops -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod +from sglang.srt.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from sglang.srt.layers.custom_op_util import register_custom_op from sglang.srt.layers.moe.ep_moe.kernels import ( grouped_gemm_triton, diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index a484c2ef9..75d4c5ead 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -5,13 +5,13 @@ from enum import Enum from typing import Callable, List, Optional, Tuple import torch -from vllm.distributed import ( +from vllm.model_executor.custom_op import CustomOp + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.custom_op import CustomOp - from sglang.srt.layers.custom_op_util import register_custom_op from sglang.srt.layers.moe.fused_moe_native import moe_forward_native from sglang.srt.layers.moe.topk import select_experts diff --git a/python/sglang/srt/layers/parameter.py b/python/sglang/srt/layers/parameter.py index fe999baa2..d99b2efe8 100644 --- a/python/sglang/srt/layers/parameter.py +++ b/python/sglang/srt/layers/parameter.py @@ -6,7 +6,8 @@ from typing import Callable, Optional, Union import torch from torch.nn import Parameter -from vllm.distributed import get_tensor_model_parallel_rank + +from sglang.srt.distributed import get_tensor_model_parallel_rank __all__ = [ "BasevLLMParameter", diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 5ccac960f..bd59352a7 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -8,7 +8,6 @@ import torch.nn.functional as F from torch.nn import Module from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, @@ -24,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( requantize_with_max_scale, ) +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index a346a2cbd..ed9d67ef9 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -6,13 +6,13 @@ from typing import List, Optional, Sequence, Tuple import torch import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter -from vllm.distributed import ( + +from sglang.srt.distributed import ( divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) - from sglang.srt.layers.parameter import BasevLLMParameter from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index e167ff16a..9fdf7a8ac 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -21,10 +21,10 @@ from typing import TYPE_CHECKING, Callable import torch import tqdm -from vllm.distributed import get_tensor_model_parallel_rank -from vllm.distributed.parallel_state import graph_capture from vllm.model_executor.custom_op import CustomOp +from sglang.srt.distributed import get_tensor_model_parallel_rank +from sglang.srt.distributed.parallel_state import graph_capture from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native from sglang.srt.layers.torchao_utils import save_gemlite_cache diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 86d59582f..261914694 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -21,16 +21,17 @@ from typing import List, Optional, Tuple import torch import torch.distributed as dist -from vllm.distributed import ( + +from sglang.srt.configs.device_config import DeviceConfig +from sglang.srt.configs.load_config import LoadConfig +from sglang.srt.configs.model_config import AttentionArch, ModelConfig +from sglang.srt.distributed import ( get_tp_group, init_distributed_environment, initialize_model_parallel, set_custom_all_reduce, ) - -from sglang.srt.configs.device_config import DeviceConfig -from sglang.srt.configs.load_config import LoadConfig -from sglang.srt.configs.model_config import AttentionArch, ModelConfig +from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state from sglang.srt.layers.attention.double_sparsity_backend import DoubleSparseAttnBackend from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend @@ -295,12 +296,15 @@ class ModelRunner: monkey_patch_vllm_gguf_config() # Load the model + # Remove monkey_patch when linear.py quant remove dependencies with vllm + monkey_patch_vllm_parallel_state() with self.memory_saver_adapter.region(): self.model = get_model( model_config=self.model_config, load_config=self.load_config, device_config=DeviceConfig(self.device), ) + monkey_patch_vllm_parallel_state(reverse=True) if self.server_args.kv_cache_dtype == "fp8_e4m3": if self.server_args.quantization_param_path is not None: diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py index 776b69aaf..677d716d4 100644 --- a/python/sglang/srt/model_loader/loader.py +++ b/python/sglang/srt/model_loader/loader.py @@ -21,14 +21,14 @@ from huggingface_hub import HfApi, hf_hub_download from torch import nn from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) from sglang.srt.configs.device_config import DeviceConfig from sglang.srt.configs.load_config import LoadConfig, LoadFormat from sglang.srt.configs.model_config import ModelConfig +from sglang.srt.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.model_loader.utils import ( get_model_architecture, @@ -496,7 +496,8 @@ class ShardedStateLoader(BaseModelLoader): device_config: DeviceConfig, ) -> nn.Module: from safetensors.torch import safe_open - from vllm.distributed import get_tensor_model_parallel_rank + + from sglang.srt.distributed import get_tensor_model_parallel_rank local_model_path = self._prepare_weights( model_config.model_path, model_config.revision @@ -556,7 +557,8 @@ class ShardedStateLoader(BaseModelLoader): max_size: Optional[int] = None, ) -> None: from safetensors.torch import save_file - from vllm.distributed import get_tensor_model_parallel_rank + + from sglang.srt.distributed import get_tensor_model_parallel_rank if pattern is None: pattern = ShardedStateLoader.DEFAULT_PATTERN diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 13b323b5d..015c65145 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -19,10 +19,10 @@ import torch from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm -from vllm.distributed import get_tensor_model_parallel_rank from sglang.srt.configs.load_config import LoadConfig from sglang.srt.configs.model_config import ModelConfig +from sglang.srt.distributed import get_tensor_model_parallel_rank from sglang.srt.layers.quantization import QuantizationConfig, get_quantization_config from sglang.srt.utils import print_warning_once diff --git a/python/sglang/srt/models/baichuan.py b/python/sglang/srt/models/baichuan.py index 3bd60c25d..c973e64c7 100644 --- a/python/sglang/srt/models/baichuan.py +++ b/python/sglang/srt/models/baichuan.py @@ -24,10 +24,6 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -35,6 +31,10 @@ from vllm.model_executor.layers.linear import ( ) from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index b69a9e116..4d73aa0de 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -21,10 +21,10 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from torch.nn import LayerNorm -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope from sglang.srt.configs import ChatGLMConfig +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index 83ac3d867..d701c10a7 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -44,12 +44,12 @@ import torch.utils.checkpoint from torch import nn from torch.nn.parameter import Parameter from transformers import PretrainedConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.linear import ( MergedColumnParallelLinear, diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index f838cfa57..206f24d61 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -19,14 +19,14 @@ from typing import Iterable, Optional, Tuple import torch import torch.nn as nn -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.configs import DbrxConfig +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - -from sglang.srt.configs import DbrxConfig from sglang.srt.layers.linear import ( QKVParallelLinear, ReplicatedLinear, diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index d840cb866..cbd123c9e 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -21,13 +21,13 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 626722121..5a76c8ac9 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -23,14 +23,14 @@ import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig from vllm import _custom_ops as ops -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/exaone.py b/python/sglang/srt/models/exaone.py index 536c253c3..5bb0ea538 100644 --- a/python/sglang/srt/models/exaone.py +++ b/python/sglang/srt/models/exaone.py @@ -20,9 +20,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 10949a2f5..bed496c69 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -21,9 +21,9 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 58d9ce02f..af51ba41b 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -20,8 +20,8 @@ from typing import Iterable, Optional, Set, Tuple, Union import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.layernorm import GemmaRMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/gpt2.py b/python/sglang/srt/models/gpt2.py index 144ad8bbf..a99232dc2 100644 --- a/python/sglang/srt/models/gpt2.py +++ b/python/sglang/srt/models/gpt2.py @@ -22,10 +22,11 @@ from typing import Iterable, List, Optional, Tuple import torch from torch import nn from transformers import GPT2Config -from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size + # from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( ColumnParallelLinear, diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py index f2f5ebd52..0d705fb41 100644 --- a/python/sglang/srt/models/gpt_bigcode.py +++ b/python/sglang/srt/models/gpt_bigcode.py @@ -21,8 +21,8 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import GPTBigCodeConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( ColumnParallelLinear, diff --git a/python/sglang/srt/models/granite.py b/python/sglang/srt/models/granite.py index d207ff61b..1383e0ef0 100644 --- a/python/sglang/srt/models/granite.py +++ b/python/sglang/srt/models/granite.py @@ -22,9 +22,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import GraniteConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 33a055a8f..490798b5b 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -22,12 +22,12 @@ import torch import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py index 0a737c138..31617db5e 100644 --- a/python/sglang/srt/models/internlm2.py +++ b/python/sglang/srt/models/internlm2.py @@ -19,9 +19,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index 4f09fd185..198d53995 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -22,13 +22,13 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import LlamaConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import kv_cache_scales_loader +from sglang.srt.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index 3482a8281..b0853c3ee 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -18,9 +18,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/minicpm3.py b/python/sglang/srt/models/minicpm3.py index b0c93274e..f5e722a14 100644 --- a/python/sglang/srt/models/minicpm3.py +++ b/python/sglang/srt/models/minicpm3.py @@ -19,7 +19,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, @@ -28,6 +27,7 @@ from vllm.model_executor.layers.linear import ( ) from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 9dbdb46ff..c2c8d2294 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -21,12 +21,12 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import MixtralConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( QKVParallelLinear, diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index e5f49f566..c38328369 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -23,13 +23,13 @@ import torch import torch.nn.functional as F from torch import nn from transformers import MixtralConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( QKVParallelLinear, diff --git a/python/sglang/srt/models/mllama.py b/python/sglang/srt/models/mllama.py index 019d21c20..43f6793e4 100644 --- a/python/sglang/srt/models/mllama.py +++ b/python/sglang/srt/models/mllama.py @@ -8,14 +8,14 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint import transformers.models.mllama.configuration_mllama as config_mllama -import vllm.distributed.parallel_state as ps from torch import nn from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast from transformers.models.mllama.modeling_mllama import ( _prepare_aspect_ratio_attention_mask, ) -from vllm.distributed import get_tensor_model_parallel_world_size +import sglang.srt.distributed.parallel_state as ps +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/olmo.py b/python/sglang/srt/models/olmo.py index 1cfa27309..e8fe9a7a0 100644 --- a/python/sglang/srt/models/olmo.py +++ b/python/sglang/srt/models/olmo.py @@ -20,9 +20,9 @@ from typing import Iterable, List, Optional, Tuple import torch from torch import nn from transformers import OlmoConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.linear import ( MergedColumnParallelLinear, diff --git a/python/sglang/srt/models/olmo2.py b/python/sglang/srt/models/olmo2.py index 0944b5720..df0121930 100755 --- a/python/sglang/srt/models/olmo2.py +++ b/python/sglang/srt/models/olmo2.py @@ -21,15 +21,15 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather, ) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.model_loader.weight_utils import default_weight_loader - from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py index df96be3bc..74bc98372 100644 --- a/python/sglang/srt/models/olmoe.py +++ b/python/sglang/srt/models/olmoe.py @@ -23,10 +23,6 @@ import torch import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, -) from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -35,6 +31,10 @@ from vllm.model_executor.layers.linear import ( ) from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import ( + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput diff --git a/python/sglang/srt/models/phi3_small.py b/python/sglang/srt/models/phi3_small.py index 1e70c7d78..e59f88013 100644 --- a/python/sglang/srt/models/phi3_small.py +++ b/python/sglang/srt/models/phi3_small.py @@ -5,9 +5,9 @@ import torch from torch import nn from transformers import Phi3Config from transformers.configuration_utils import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 5492a3e12..ed9ca02b7 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -20,9 +20,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index bc3f10997..f015b2872 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -20,9 +20,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 9db2d5382..b0f08f975 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -22,12 +22,12 @@ import torch import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py index 2e9ec9d8f..fc5dd49ea 100644 --- a/python/sglang/srt/models/qwen2_vl.py +++ b/python/sglang/srt/models/qwen2_vl.py @@ -30,12 +30,12 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from vllm.distributed import parallel_state -from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig +from sglang.srt.distributed import parallel_state +from sglang.srt.distributed import utils as dist_utils from sglang.srt.hf_transformers_utils import get_processor from sglang.srt.layers.attention.triton_ops.prefill_attention import ( context_attention_fwd, diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 079d54e3c..2f144dcb1 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -24,9 +24,9 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.linear import ( MergedColumnParallelLinear, diff --git a/python/sglang/srt/models/torch_native_llama.py b/python/sglang/srt/models/torch_native_llama.py index 7a55d5045..04d21f0d4 100644 --- a/python/sglang/srt/models/torch_native_llama.py +++ b/python/sglang/srt/models/torch_native_llama.py @@ -47,12 +47,12 @@ import torch from torch import nn from torch.nn.parameter import Parameter from transformers import LlamaConfig -from vllm.distributed import ( +from vllm.model_executor.layers.rotary_embedding import get_rope + +from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from vllm.model_executor.layers.rotary_embedding import get_rope - from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput diff --git a/python/sglang/srt/models/xverse.py b/python/sglang/srt/models/xverse.py index e65514215..7d6158a9b 100644 --- a/python/sglang/srt/models/xverse.py +++ b/python/sglang/srt/models/xverse.py @@ -21,7 +21,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import LlamaConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -31,6 +30,7 @@ from vllm.model_executor.layers.linear import ( ) from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/models/xverse_moe.py b/python/sglang/srt/models/xverse_moe.py index 9b4b27f07..27f763011 100644 --- a/python/sglang/srt/models/xverse_moe.py +++ b/python/sglang/srt/models/xverse_moe.py @@ -18,11 +18,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, -) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -33,6 +28,11 @@ from vllm.model_executor.layers.linear import ( ) from vllm.model_executor.layers.rotary_embedding import get_rope +from sglang.srt.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.moe.fused_moe_triton import fused_moe from sglang.srt.layers.quantization.base_config import QuantizationConfig