cleanup deps 3/n (#4541)

2025-03-18 00:11:36 -07:00
parent 2d0045125f
commit c16b33ccac
4 changed files with 16 additions and 12 deletions
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
-
        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

        if isinstance(layer, LinearBase):
@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
            return Fp8LinearMethod(self)
        elif isinstance(layer, FusedMoE):
            return Fp8MoEMethod(self)
-        elif isinstance(layer, Attention):
-            return Fp8KVCacheMethod(self)
        return None

    def get_scaled_act_names(self) -> List[str]:
--- a/python/sglang/srt/models/deepseek_nextn.py
+++ b/python/sglang/srt/models/deepseek_nextn.py
@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm import _custom_ops as ops

 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import ReplicatedLinear
@@ -48,6 +47,8 @@ _is_cuda = is_cuda()

 if _is_cuda:
    from sgl_kernel import awq_dequantize
+else:
+    from vllm import _custom_ops as ops


 class DeepseekModelNextN(nn.Module):
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -23,7 +23,6 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm import _custom_ops as ops

 from sglang.srt.distributed import (
    get_tensor_model_parallel_world_size,
@@ -75,6 +74,8 @@ _is_cuda = is_cuda()

 if _is_cuda:
    from sgl_kernel import awq_dequantize, bmm_fp8
+else:
+    from vllm import _custom_ops as ops


 class DeepseekV2MLP(nn.Module):
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):


 def suppress_other_loggers():
-    from vllm.logger import logger as vllm_default_logger
+    try:
+        from vllm.logger import logger as vllm_default_logger
+    except ImportError:
+        return

    vllm_default_logger.setLevel(logging.WARN)
    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():


 def monkey_patch_vllm_gguf_config():
-    from vllm.model_executor.layers.quantization.gguf import (
-        GGUFConfig,
-        GGUFEmbeddingMethod,
-        GGUFLinearMethod,
-    )
+    try:
+        from vllm.model_executor.layers.quantization.gguf import (
+            GGUFConfig,
+            GGUFEmbeddingMethod,
+            GGUFLinearMethod,
+        )
+    except ImportError:
+        return

    from sglang.srt.layers.linear import LinearBase
    from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding