cleanup deps 3/n (#4541)

This commit is contained in:
Yineng Zhang
2025-03-18 00:11:36 -07:00
committed by GitHub
parent 2d0045125f
commit c16b33ccac
4 changed files with 16 additions and 12 deletions

View File

@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
def get_quant_method(
self, layer: torch.nn.Module, prefix: str
) -> Optional["QuantizeMethodBase"]:
from vllm.attention.layer import Attention # Avoid circular import
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
if isinstance(layer, LinearBase):
@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
return Fp8LinearMethod(self)
elif isinstance(layer, FusedMoE):
return Fp8MoEMethod(self)
elif isinstance(layer, Attention):
return Fp8KVCacheMethod(self)
return None
def get_scaled_act_names(self) -> List[str]:

View File

@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
import torch
from torch import nn
from transformers import PretrainedConfig
from vllm import _custom_ops as ops
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import ReplicatedLinear
@@ -48,6 +47,8 @@ _is_cuda = is_cuda()
if _is_cuda:
from sgl_kernel import awq_dequantize
else:
from vllm import _custom_ops as ops
class DeepseekModelNextN(nn.Module):

View File

@@ -23,7 +23,6 @@ import torch
import torch.nn.functional as F
from torch import nn
from transformers import PretrainedConfig
from vllm import _custom_ops as ops
from sglang.srt.distributed import (
get_tensor_model_parallel_world_size,
@@ -75,6 +74,8 @@ _is_cuda = is_cuda()
if _is_cuda:
from sgl_kernel import awq_dequantize, bmm_fp8
else:
from vllm import _custom_ops as ops
class DeepseekV2MLP(nn.Module):

View File

@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):
def suppress_other_loggers():
from vllm.logger import logger as vllm_default_logger
try:
from vllm.logger import logger as vllm_default_logger
except ImportError:
return
vllm_default_logger.setLevel(logging.WARN)
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():
def monkey_patch_vllm_gguf_config():
from vllm.model_executor.layers.quantization.gguf import (
GGUFConfig,
GGUFEmbeddingMethod,
GGUFLinearMethod,
)
try:
from vllm.model_executor.layers.quantization.gguf import (
GGUFConfig,
GGUFEmbeddingMethod,
GGUFLinearMethod,
)
except ImportError:
return
from sglang.srt.layers.linear import LinearBase
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding