From c16b33ccac854f6a90a00401c33c68f82f6cdc1b Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 18 Mar 2025 00:11:36 -0700 Subject: [PATCH] cleanup deps 3/n (#4541) --- python/sglang/srt/layers/quantization/fp8.py | 4 ---- python/sglang/srt/models/deepseek_nextn.py | 3 ++- python/sglang/srt/models/deepseek_v2.py | 3 ++- python/sglang/srt/utils.py | 18 ++++++++++++------ 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 58ae51436..7e668fb95 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - from sglang.srt.layers.moe.fused_moe_triton import FusedMoE if isinstance(layer, LinearBase): @@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig): return Fp8LinearMethod(self) elif isinstance(layer, FusedMoE): return Fp8MoEMethod(self) - elif isinstance(layer, Attention): - return Fp8KVCacheMethod(self) return None def get_scaled_act_names(self) -> List[str]: diff --git a/python/sglang/srt/models/deepseek_nextn.py b/python/sglang/srt/models/deepseek_nextn.py index 721c41ff1..be3466197 100644 --- a/python/sglang/srt/models/deepseek_nextn.py +++ b/python/sglang/srt/models/deepseek_nextn.py @@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig -from vllm import _custom_ops as ops from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ReplicatedLinear @@ -48,6 +47,8 @@ _is_cuda = is_cuda() if _is_cuda: from sgl_kernel import awq_dequantize +else: + from vllm import _custom_ops as ops class DeepseekModelNextN(nn.Module): diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index f654b3d03..c9701f166 100755 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -23,7 +23,6 @@ import torch import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig -from vllm import _custom_ops as ops from sglang.srt.distributed import ( get_tensor_model_parallel_world_size, @@ -75,6 +74,8 @@ _is_cuda = is_cuda() if _is_cuda: from sgl_kernel import awq_dequantize, bmm_fp8 +else: + from vllm import _custom_ops as ops class DeepseekV2MLP(nn.Module): diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 2522ee324..8a8be4550 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]): def suppress_other_loggers(): - from vllm.logger import logger as vllm_default_logger + try: + from vllm.logger import logger as vllm_default_logger + except ImportError: + return vllm_default_logger.setLevel(logging.WARN) logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel( @@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check(): def monkey_patch_vllm_gguf_config(): - from vllm.model_executor.layers.quantization.gguf import ( - GGUFConfig, - GGUFEmbeddingMethod, - GGUFLinearMethod, - ) + try: + from vllm.model_executor.layers.quantization.gguf import ( + GGUFConfig, + GGUFEmbeddingMethod, + GGUFLinearMethod, + ) + except ImportError: + return from sglang.srt.layers.linear import LinearBase from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding