Enable Nvidia's ModelOpt fp8 quantized models (#2535)
@@ -44,6 +44,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "MarlinLinearMethod",
     "GPTQLinearMethod",
     "QQQLinearMethod",
+    "ModelOptFp8LinearMethod",
 ]
python/sglang/srt/layers/modelopt_quant.py (new file, 173 lines)
@@ -0,0 +1,173 @@
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py

import logging
from typing import Any, Dict, List, Optional

import torch
from torch.nn.parameter import Parameter
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    apply_fp8_linear,
    cutlass_fp8_supported,
    requantize_with_max_scale,
)
from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter

from sglang.srt.layers.linear import LinearMethodBase
from sglang.srt.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
)

# Initialize logger for the module
logger = logging.getLogger(__name__)

# Supported activation schemes for the current configuration
ACTIVATION_SCHEMES = ["static"]


class ModelOptFp8Config(QuantizationConfig):
    """Configuration for ModelOpt FP8 quantization, including serialization and compatibility checks."""

    def __init__(self, is_checkpoint_fp8_serialized: bool = False) -> None:
        """
        Args:
            is_checkpoint_fp8_serialized (bool): Indicates if the checkpoint uses serialized FP8 format.
        """
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        if is_checkpoint_fp8_serialized:
            logger.warning(
                "Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change."
            )

    @classmethod
    def get_name(cls) -> str:
        return "modelopt"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.bfloat16, torch.half]
    @classmethod
    def get_min_capability(cls) -> int:
        # Minimum hardware capability: compute capability 8.9 (Ada Lovelace)
        # or newer, e.g., Hopper.
        return 89
    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return ["hf_quant_config.json"]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "ModelOptFp8Config":
        quant_method = cls.get_from_keys(config, ["quantization"]).get("quant_algo")

        if "FP8" not in quant_method:
            raise ValueError(
                "ModelOpt only supports static FP8 quantization in SGLang. "
                "Check the `hf_quant_config.json` file for your model's configuration."
            )

        return cls(is_checkpoint_fp8_serialized=True)
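
    # For illustration (assumed, based on the check above): a minimal
    # `hf_quant_config.json` that from_config accepts would contain
    #
    #     {"quantization": {"quant_algo": "FP8"}}
    #
    # Only the nested `quant_algo` value is inspected here; any other
    # ModelOpt fields in the file are ignored by this config class.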

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        return ModelOptFp8LinearMethod(self) if isinstance(layer, LinearBase) else None

    def get_scaled_act_names(self) -> List[str]:
        return []

class ModelOptFp8LinearMethod(LinearMethodBase):
    """Linear method for ModelOpt static FP8 quantization.

    Supports loading FP8 checkpoints with static weight and activation scales.
    Future support may include dynamic scales.

    **Limitations**:
    1. Only supports per-tensor quantization due to `torch._scaled_mm` limitations.
    2. Only supports the `float8_e4m3fn` data type.

    Args:
        quant_config (ModelOptFp8Config): The ModelOpt quantization configuration.
    """

    def __init__(self, quant_config: ModelOptFp8Config):
        super().__init__()
        self.quant_config = quant_config
        self.cutlass_fp8_supported = cutlass_fp8_supported()

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        """Creates and registers weights, weight scales, and input scales for FP8 quantization."""
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")
        weight_dtype = (
            torch.float8_e4m3fn
            if self.quant_config.is_checkpoint_fp8_serialized
            else params_dtype
        )

        # Set layer attributes
        layer.logical_widths = output_partition_sizes
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition

        # Register weight
        layer.register_parameter(
            "weight",
            ModelWeightParameter(
                data=torch.empty(
                    output_size_per_partition,
                    input_size_per_partition,
                    dtype=weight_dtype,
                ),
                input_dim=1,
                output_dim=0,
                weight_loader=weight_loader,
            ),
        )

        if self.quant_config.is_checkpoint_fp8_serialized:
            # Register weight and input scales
            for scale_name in ["weight_scale", "input_scale"]:
                layer.register_parameter(
                    scale_name,
                    PerTensorScaleParameter(
                        data=torch.full(
                            (len(output_partition_sizes),),
                            torch.finfo(torch.float32).min,
                        ),
                        weight_loader=weight_loader,
                    ),
                )
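
    # The scale tensors above hold one entry per logical shard (e.g., a
    # fused QKV projection contributes three), initialized to float32's
    # minimum as a placeholder so that any scale loaded from the checkpoint
    # overrides it. process_weights_after_loading below collapses these
    # per-shard entries into a single per-tensor scale.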

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Requantizes weights after loading using the maximum scale."""
        max_w_scale, quantized_weight = requantize_with_max_scale(
            layer.weight, layer.weight_scale, layer.logical_widths
        )
        layer.weight = Parameter(quantized_weight.t(), requires_grad=False)
        layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
        layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
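
    # Sketch of the requantization step (the actual logic lives in vLLM's
    # requantize_with_max_scale; this assumes that helper's standard
    # behavior): given per-shard scales s_i for the fused weight, take
    # s_max = max_i(s_i), then rescale each shard
    #
    #     w_fp8_new = to_fp8(w_fp8_old.float() * s_i / s_max)
    #
    # so that one per-tensor scale (s_max) serves every logical shard, as
    # torch._scaled_mm requires.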

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Applies FP8 linear transformation."""
        return apply_fp8_linear(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            input_scale=layer.input_scale,
            bias=bias,
            cutlass_fp8_supported=self.cutlass_fp8_supported,
        )
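
Conceptually, the apply_fp8_linear call above performs a statically scaled, per-tensor FP8 GEMM. A minimal reference sketch of the assumed semantics (the function and argument names here are illustrative, and the CUTLASS fast path and kernel-level details of vLLM's real implementation are ignored):

import torch

def fp8_linear_reference(x, weight_fp8, weight_scale, input_scale, bias=None):
    # Statically quantize the activation with the calibrated input scale.
    finfo = torch.finfo(torch.float8_e4m3fn)
    x_fp8 = (x / input_scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    # Dequantize and matmul in the activation dtype. weight_fp8 is the
    # (in_features, out_features) tensor stored by
    # process_weights_after_loading; real kernels fuse all of this into
    # torch._scaled_mm or a CUTLASS kernel instead of materializing it.
    out = (x_fp8.to(x.dtype) * input_scale) @ (weight_fp8.to(x.dtype) * weight_scale)
    return out if bias is None else out + bias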
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config
 from vllm.model_executor.layers.quantization.qqq import QQQConfig
 from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
@@ -32,6 +33,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "fp8": Fp8Config,
     "fbgemm_fp8": FBGEMMFp8Config,
     "marlin": MarlinConfig,
+    "modelopt": ModelOptFp8Config,
     "gguf": GGUFConfig,
     "gptq_marlin_24": GPTQMarlin24Config,
     "gptq_marlin": GPTQMarlinConfig,
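
For context, QUANTIZATION_METHODS maps the user-facing quantization name to its config class, so the new entry is resolved like any other; a minimal lookup sketch:

# Resolve the config class registered above from the method name.
quant_config_cls = QUANTIZATION_METHODS["modelopt"]  # -> ModelOptFp8Config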
@@ -150,6 +150,7 @@ class ModelRunner:
                 "enable_nan_detection": server_args.enable_nan_detection,
                 "enable_dp_attention": server_args.enable_dp_attention,
                 "enable_ep_moe": server_args.enable_ep_moe,
+                "modelopt_config": server_args.modelopt_config,
             }
         )
@@ -149,6 +149,7 @@ class ServerArgs:
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
+    modelopt_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -361,6 +362,7 @@ class ServerArgs:
                 "awq_marlin",
                 "bitsandbytes",
                 "gguf",
+                "modelopt",
             ],
             help="The quantization method.",
         )
@@ -808,6 +810,12 @@ class ServerArgs:
             default=ServerArgs.torchao_config,
             help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row",
         )
+        parser.add_argument(
+            "--modelopt-config",
+            type=str,
+            default=ServerArgs.modelopt_config,
+            help="Optimize the model with nvidia-modelopt. Experimental feature. Current choices are: fp8",
+        )
         parser.add_argument(
             "--enable-nan-detection",
             action="store_true",
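
With all of the above wired in, a ModelOpt FP8 checkpoint can be served by selecting the new method on the command line. A usage sketch (the model path is a placeholder; any ModelOpt-exported FP8 checkpoint that ships an hf_quant_config.json should work):

python -m sglang.launch_server --model-path <path-to-modelopt-fp8-checkpoint> --quantization modelopt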