cleanup deps 3/n (#4541)
This commit is contained in:
@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
|
||||
def get_quant_method(
|
||||
self, layer: torch.nn.Module, prefix: str
|
||||
) -> Optional["QuantizeMethodBase"]:
|
||||
from vllm.attention.layer import Attention # Avoid circular import
|
||||
|
||||
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
|
||||
|
||||
if isinstance(layer, LinearBase):
|
||||
@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
|
||||
return Fp8LinearMethod(self)
|
||||
elif isinstance(layer, FusedMoE):
|
||||
return Fp8MoEMethod(self)
|
||||
elif isinstance(layer, Attention):
|
||||
return Fp8KVCacheMethod(self)
|
||||
return None
|
||||
|
||||
def get_scaled_act_names(self) -> List[str]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
from sglang.srt.layers.layernorm import RMSNorm
|
||||
from sglang.srt.layers.linear import ReplicatedLinear
|
||||
@@ -48,6 +47,8 @@ _is_cuda = is_cuda()
|
||||
|
||||
if _is_cuda:
|
||||
from sgl_kernel import awq_dequantize
|
||||
else:
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
|
||||
class DeepseekModelNextN(nn.Module):
|
||||
|
||||
@@ -23,7 +23,6 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
from sglang.srt.distributed import (
|
||||
get_tensor_model_parallel_world_size,
|
||||
@@ -75,6 +74,8 @@ _is_cuda = is_cuda()
|
||||
|
||||
if _is_cuda:
|
||||
from sgl_kernel import awq_dequantize, bmm_fp8
|
||||
else:
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
|
||||
class DeepseekV2MLP(nn.Module):
|
||||
|
||||
@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):
|
||||
|
||||
|
||||
def suppress_other_loggers():
|
||||
from vllm.logger import logger as vllm_default_logger
|
||||
try:
|
||||
from vllm.logger import logger as vllm_default_logger
|
||||
except ImportError:
|
||||
return
|
||||
|
||||
vllm_default_logger.setLevel(logging.WARN)
|
||||
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
||||
@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():
|
||||
|
||||
|
||||
def monkey_patch_vllm_gguf_config():
|
||||
from vllm.model_executor.layers.quantization.gguf import (
|
||||
GGUFConfig,
|
||||
GGUFEmbeddingMethod,
|
||||
GGUFLinearMethod,
|
||||
)
|
||||
try:
|
||||
from vllm.model_executor.layers.quantization.gguf import (
|
||||
GGUFConfig,
|
||||
GGUFEmbeddingMethod,
|
||||
GGUFLinearMethod,
|
||||
)
|
||||
except ImportError:
|
||||
return
|
||||
|
||||
from sglang.srt.layers.linear import LinearBase
|
||||
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||
|
||||
Reference in New Issue
Block a user