feat: remove the dependency on FusedMoE (#2153)

This commit is contained in:
Yineng Zhang
2024-11-24 20:09:27 +08:00
committed by GitHub
parent dbe1729395
commit b509db5832
7 changed files with 1602 additions and 7 deletions

View File

@@ -57,12 +57,23 @@ __all__ = [
"QUANTIZATION_METHODS",
]
"""
def fp8_get_quant_method(
self, layer: torch.nn.Module, prefix: str
) -> Optional["QuantizeMethodBase"]:
def fp8_get_quant_method(self, layer, prefix):
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.fp8 import (
Fp8LinearMethod,
Fp8MoEMethod,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
is_layer_skipped,
)
from sglang.srt.layers.triton_fused_moe.layer import FusedMoE
if isinstance(layer, LinearBase):
if is_layer_skipped(prefix, self.ignored_layers):
from sglang.srt.layers.linear import UnquantizedLinearMethod
return UnquantizedLinearMethod()
return Fp8LinearMethod(self)
elif isinstance(layer, FusedMoE):
@@ -71,4 +82,3 @@ def fp8_get_quant_method(
setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
"""