cleanup deps 3/n (#4541)
This commit is contained in:
@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
|
|||||||
def get_quant_method(
|
def get_quant_method(
|
||||||
self, layer: torch.nn.Module, prefix: str
|
self, layer: torch.nn.Module, prefix: str
|
||||||
) -> Optional["QuantizeMethodBase"]:
|
) -> Optional["QuantizeMethodBase"]:
|
||||||
from vllm.attention.layer import Attention # Avoid circular import
|
|
||||||
|
|
||||||
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
|
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
|
||||||
|
|
||||||
if isinstance(layer, LinearBase):
|
if isinstance(layer, LinearBase):
|
||||||
@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
|
|||||||
return Fp8LinearMethod(self)
|
return Fp8LinearMethod(self)
|
||||||
elif isinstance(layer, FusedMoE):
|
elif isinstance(layer, FusedMoE):
|
||||||
return Fp8MoEMethod(self)
|
return Fp8MoEMethod(self)
|
||||||
elif isinstance(layer, Attention):
|
|
||||||
return Fp8KVCacheMethod(self)
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_scaled_act_names(self) -> List[str]:
|
def get_scaled_act_names(self) -> List[str]:
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
|
|
||||||
from sglang.srt.layers.layernorm import RMSNorm
|
from sglang.srt.layers.layernorm import RMSNorm
|
||||||
from sglang.srt.layers.linear import ReplicatedLinear
|
from sglang.srt.layers.linear import ReplicatedLinear
|
||||||
@@ -48,6 +47,8 @@ _is_cuda = is_cuda()
|
|||||||
|
|
||||||
if _is_cuda:
|
if _is_cuda:
|
||||||
from sgl_kernel import awq_dequantize
|
from sgl_kernel import awq_dequantize
|
||||||
|
else:
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
|
||||||
|
|
||||||
class DeepseekModelNextN(nn.Module):
|
class DeepseekModelNextN(nn.Module):
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
|
|
||||||
from sglang.srt.distributed import (
|
from sglang.srt.distributed import (
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
@@ -75,6 +74,8 @@ _is_cuda = is_cuda()
|
|||||||
|
|
||||||
if _is_cuda:
|
if _is_cuda:
|
||||||
from sgl_kernel import awq_dequantize, bmm_fp8
|
from sgl_kernel import awq_dequantize, bmm_fp8
|
||||||
|
else:
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
|
||||||
|
|
||||||
class DeepseekV2MLP(nn.Module):
|
class DeepseekV2MLP(nn.Module):
|
||||||
|
|||||||
@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):
|
|||||||
|
|
||||||
|
|
||||||
def suppress_other_loggers():
|
def suppress_other_loggers():
|
||||||
from vllm.logger import logger as vllm_default_logger
|
try:
|
||||||
|
from vllm.logger import logger as vllm_default_logger
|
||||||
|
except ImportError:
|
||||||
|
return
|
||||||
|
|
||||||
vllm_default_logger.setLevel(logging.WARN)
|
vllm_default_logger.setLevel(logging.WARN)
|
||||||
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
||||||
@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():
|
|||||||
|
|
||||||
|
|
||||||
def monkey_patch_vllm_gguf_config():
|
def monkey_patch_vllm_gguf_config():
|
||||||
from vllm.model_executor.layers.quantization.gguf import (
|
try:
|
||||||
GGUFConfig,
|
from vllm.model_executor.layers.quantization.gguf import (
|
||||||
GGUFEmbeddingMethod,
|
GGUFConfig,
|
||||||
GGUFLinearMethod,
|
GGUFEmbeddingMethod,
|
||||||
)
|
GGUFLinearMethod,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
return
|
||||||
|
|
||||||
from sglang.srt.layers.linear import LinearBase
|
from sglang.srt.layers.linear import LinearBase
|
||||||
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||||
|
|||||||
Reference in New Issue
Block a user