Enable native ModelOpt quantization support (1/3) (#7149)

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
2025-10-06 13:24:15 -07:00
parent eb30b888db
commit 155cbb51f0
11 changed files with 464 additions and 42 deletions
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -17,7 +17,7 @@ import logging
 import math
 import os
 from enum import Enum, IntEnum, auto
-from typing import List, Optional, Set, Union
+from typing import Dict, List, Optional, Set, Union
 import torch
 from transformers import PretrainedConfig
@@ -85,6 +85,7 @@ class ModelConfig:
        enable_multimodal: Optional[bool] = None,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        modelopt_quant: Optional[Union[str, Dict]] = None,
        override_config_file: Optional[str] = None,
        is_draft_model: bool = False,
        hybrid_kvcache_ratio: Optional[float] = None,
@@ -94,6 +95,7 @@ class ModelConfig:
        self.model_path = model_path
        self.revision = revision
        self.quantization = quantization
        self.modelopt_quant = modelopt_quant
        self.is_draft_model = is_draft_model
        self.model_impl = model_impl
@@ -209,6 +211,7 @@ class ModelConfig:
            enable_multimodal=server_args.enable_multimodal,
            dtype=server_args.dtype,
            quantization=server_args.quantization,
            modelopt_quant=server_args.modelopt_quant,
            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
            model_impl=server_args.model_impl,
            **kwargs,
@@ -477,53 +480,51 @@ class ModelConfig:
            # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
            # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
            is_local = os.path.exists(self.model_path)
            modelopt_quant_config = {"quant_method": "modelopt"}
            if not is_local:
                import huggingface_hub
                try:
-                    from huggingface_hub import HfApi
+                    from huggingface_hub import HfApi, hf_hub_download
                    hf_api = HfApi()
-
+                    if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
-                    def check_hf_quant_config():
+                        # Download and parse the quantization config for remote models
-                        return hf_api.file_exists(
+                        quant_config_file = hf_hub_download(
-                            self.model_path, "hf_quant_config.json"
+                            repo_id=self.model_path,
                            filename="hf_quant_config.json",
                            revision=self.revision,
                        )
-
+                        with open(quant_config_file) as f:
-                    # Retry HF API call up to 3 times
+                            quant_config_dict = json.load(f)
-                    file_exists = retry(
+                        quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
                        check_hf_quant_config,
                        max_retry=2,
                        initial_delay=1.0,
                        max_delay=5.0,
                    )
                    if file_exists:
                        quant_cfg = modelopt_quant_config
                except huggingface_hub.errors.OfflineModeIsEnabled:
                    logger.warning(
                        "Offline mode is enabled, skipping hf_quant_config.json check"
                    )
-                except Exception as e:
+                    pass
                    logger.warning(
                        f"Failed to check hf_quant_config.json: {self.model_path} {e}"
                    )
            elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
                quant_config_file = os.path.join(
                    self.model_path, "hf_quant_config.json"
                )
                with open(quant_config_file) as f:
                    quant_config_dict = json.load(f)
                quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
        return quant_cfg
    def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> dict:
        """Parse ModelOpt quantization config and return the appropriate quant_method."""
        json_quant_configs = quant_config_dict["quantization"]
        quant_algo = json_quant_configs.get("quant_algo", None)
        if quant_algo == "MIXED_PRECISION":
-                    quant_cfg = {"quant_method": "w4afp8"}
+            return {"quant_method": "w4afp8"}
        elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
            return {"quant_method": "modelopt_fp4"}
        elif quant_algo and "FP8" in quant_algo:
            return {"quant_method": "modelopt_fp8"}
        else:
-                    quant_cfg = modelopt_quant_config
+            # Default to FP8 for backward compatibility
-        return quant_cfg
+            return {"quant_method": "modelopt_fp8"}
    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
    def _verify_quantization(self) -> None:
@@ -543,7 +544,8 @@ class ModelConfig:
        optimized_quantization_methods = [
            "fp8",
            "marlin",
-            "modelopt",
+            "modelopt_fp8",
            "modelopt_fp4",
            "gptq_marlin_24",
            "gptq_marlin",
            "awq_marlin",
--- a/python/sglang/srt/layers/modelopt_utils.py
+++ b/python/sglang/srt/layers/modelopt_utils.py
@@ -0,0 +1,11 @@
 """
 ModelOpt related constants
 """
 QUANT_CFG_CHOICES = {
    "fp8": "FP8_DEFAULT_CFG",
    "int4_awq": "INT4_AWQ_CFG",  # TODO: add support for int4_awq
    "w4a8_awq": "W4A8_AWQ_BETA_CFG",  # TODO: add support for w4a8_awq
    "nvfp4": "NVFP4_DEFAULT_CFG",
    "nvfp4_awq": "NVFP4_AWQ_LITE_CFG",  # TODO: add support for nvfp4_awq
 }
--- a/python/sglang/srt/layers/quantization/init.py
+++ b/python/sglang/srt/layers/quantization/init.py
@@ -72,7 +72,7 @@ if TYPE_CHECKING:
 BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "fp8": Fp8Config,
    "blockwise_int8": BlockInt8Config,
-    "modelopt": ModelOptFp8Config,
+    "modelopt_fp8": ModelOptFp8Config,
    "modelopt_fp4": ModelOptFp4Config,
    "w8a8_int8": W8A8Int8Config,
    "w8a8_fp8": W8A8Fp8Config,
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -113,7 +113,7 @@ class ModelOptFp8Config(QuantizationConfig):
    @classmethod
    def get_name(cls) -> str:
-        return "modelopt"
+        return "modelopt_fp8"
    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -880,7 +880,7 @@ class ModelRunner:
        load_config = LoadConfig(load_format=load_format)
        # Only support DefaultModelLoader for now
-        loader = get_model_loader(load_config)
+        loader = get_model_loader(load_config, self.model_config)
        if not isinstance(loader, DefaultModelLoader):
            message = f"Failed to get model loader: {loader}."
            return False, message
--- a/python/sglang/srt/model_loader/init.py
+++ b/python/sglang/srt/model_loader/init.py
@@ -24,7 +24,7 @@ def get_model(
    load_config: LoadConfig,
    device_config: DeviceConfig,
 ) -> nn.Module:
-    loader = get_model_loader(load_config)
+    loader = get_model_loader(load_config, model_config)
    return loader.load_model(
        model_config=model_config,
        device_config=device_config,
--- a/python/sglang/srt/model_loader/loader.py
+++ b/python/sglang/srt/model_loader/loader.py
@@ -37,10 +37,22 @@ import numpy as np
 import requests
 import safetensors.torch
 import torch
 # Try to import accelerate (optional dependency)
 try:
    from accelerate import infer_auto_device_map, init_empty_weights
    from accelerate.utils import get_max_memory
    HAS_ACCELERATE = True
 except ImportError:
    HAS_ACCELERATE = False
    infer_auto_device_map = None
    init_empty_weights = None
    get_max_memory = None
 from huggingface_hub import HfApi, hf_hub_download
 from torch import nn
-from tqdm.auto import tqdm
+from transformers import AutoConfig, AutoModelForCausalLM
 from transformers import AutoModelForCausalLM
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 from sglang.srt.configs.load_config import LoadConfig, LoadFormat
@@ -54,6 +66,8 @@ from sglang.srt.distributed import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
 )
 from sglang.srt.layers.modelopt_utils import QUANT_CFG_CHOICES
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
    trigger_transferring_weights_request,
 )
@@ -62,6 +76,11 @@ from sglang.srt.model_loader.utils import (
    post_load_weights,
    set_default_torch_dtype,
 )
 # Constants for memory management
 DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION = (
    0.8  # Reserve 20% GPU memory headroom for ModelOpt calibration
 )
 from sglang.srt.model_loader.weight_utils import (
    _BAR_FORMAT,
    default_weight_loader,
@@ -94,6 +113,8 @@ if TYPE_CHECKING:
    from sglang.srt.layers.quantization.base_config import QuantizationConfig
 _is_npu = is_npu()
 # ModelOpt: QUANT_CFG_CHOICES is imported from modelopt_utils.py
 # which contains the complete mapping of quantization config choices
@contextmanager
@@ -477,12 +498,78 @@ class DefaultModelLoader(BaseModelLoader):
            model_config.model_path, model_config.revision, fall_back_to_pt=True
        )
    def _load_modelopt_base_model(self, model_config: ModelConfig) -> nn.Module:
        """Load and prepare the base model for ModelOpt quantization.
        This method handles the common model loading logic shared between
        DefaultModelLoader (conditional) and ModelOptModelLoader (dedicated).
        """
        if not HAS_ACCELERATE:
            raise ImportError(
                "accelerate is required for ModelOpt quantization. "
                "Please install it with: pip install accelerate"
            )
        hf_config = AutoConfig.from_pretrained(
            model_config.model_path, trust_remote_code=True
        )
        with init_empty_weights():
            torch_dtype = getattr(hf_config, "torch_dtype", torch.float16)
            model = AutoModelForCausalLM.from_config(
                hf_config, torch_dtype=torch_dtype, trust_remote_code=True
            )
        max_memory = get_max_memory()
        inferred_device_map = infer_auto_device_map(model, max_memory=max_memory)
        on_cpu = "cpu" in inferred_device_map.values()
        model_kwargs = {"torch_dtype": "auto"}
        device_map = "auto"
        if on_cpu:
            for device in max_memory.keys():
                if isinstance(device, int):
                    max_memory[device] *= DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION
            logger.warning(
                "Model does not fit to the GPU mem. "
                f"We apply the following memory limit for calibration: \n{max_memory}\n"
                f"If you hit GPU OOM issue, please adjust the memory fraction "
                f"(currently {DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION}) or "
                "reduce the calibration `batch_size` manually."
            )
            model_kwargs["max_memory"] = max_memory
        model = AutoModelForCausalLM.from_pretrained(
            model_config.model_path,
            device_map=device_map,
            **model_kwargs,
            trust_remote_code=True,
        )
        logger.info(f"ModelOpt quantization requested: {model_config.modelopt_quant}")
        quant_choice_str = model_config.modelopt_quant
        if not isinstance(quant_choice_str, str):
            raise TypeError(
                f"modelopt_quant must be a string preset key (e.g., 'fp8'), "
                f"got {type(quant_choice_str)}"
            )
        return model
    def load_model(
        self,
        *,
        model_config: ModelConfig,
        device_config: DeviceConfig,
    ) -> nn.Module:
        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
            # Load base model using shared method
            model = self._load_modelopt_base_model(model_config)
            # Note: DefaultModelLoader doesn't do additional quantization processing
            # For full ModelOpt quantization, use ModelOptModelLoader
            return model.eval()
        target_device = torch.device(device_config.device)
        with set_default_torch_dtype(model_config.dtype):
            with target_device:
@@ -1668,9 +1755,103 @@ def load_model_with_cpu_quantization(
    return model.eval()
-def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+class ModelOptModelLoader(DefaultModelLoader):
    """
    Model loader that applies NVIDIA Model Optimizer quantization
    """
    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        # Any ModelOpt specific initialization if needed
    def load_model(
        self,
        *,
        model_config: ModelConfig,
        device_config: DeviceConfig,
    ) -> nn.Module:
        logger.info("ModelOptModelLoader: Loading base model...")
        # Use shared method from parent class to load base model
        model = self._load_modelopt_base_model(model_config)
        # Import ModelOpt modules (already done in _load_modelopt_base_model, but needed here for quantization)
        try:
            import modelopt.torch.quantization as mtq
            from modelopt.torch.utils.dataset_utils import create_forward_loop
        except ImportError:
            logger.error(
                "NVIDIA Model Optimizer (modelopt) library not found. "
                "Please install it to use 'modelopt_quant' feature."
            )
            raise
        quant_choice_str = model_config.modelopt_quant
        quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)
        if not quant_cfg_name:
            raise ValueError(
                f"Invalid modelopt_quant choice: '{quant_choice_str}'. "
                f"Available choices in QUANT_CFG_CHOICES: {list(QUANT_CFG_CHOICES.keys())}. "
                "Ensure QUANT_CFG_CHOICES is correctly defined with mappings to "
                "attribute names of config objects in modelopt.torch.quantization."
            )
        try:
            # getattr will fetch the config object, e.g., mtq.FP8_DEFAULT_CFG
            quant_cfg = getattr(mtq, quant_cfg_name)
        except AttributeError:
            raise AttributeError(
                f"ModelOpt quantization config attribute '{quant_cfg_name}' "
                f"(from choice '{quant_choice_str}') not found in modelopt.torch.quantization module. "
                "Please verify QUANT_CFG_CHOICES and the ModelOpt library."
            )
        # For now, assume no calibration. Calibration setup is a separate, more complex step.
        use_calibration = False  # This would ideally be a configurable parameter
        calib_dataloader = None  # This would need to be provided/configured
        calibrate_loop = (
            create_forward_loop(dataloader=calib_dataloader)
            if use_calibration
            else None
        )
        if use_calibration and calib_dataloader is None:
            logger.warning(
                "ModelOpt calibration requested but no calib_dataloader provided. "
                "Proceeding without calibration. Quantization accuracy may be affected."
            )
        logger.info(
            f"Quantizing model with ModelOpt using config attribute: mtq.{quant_cfg_name}"
        )
        try:
            model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
            logger.info("Model successfully quantized with ModelOpt.")
        except Exception as e:
            logger.error(f"Error during ModelOpt mtq.quantize call: {e}")
            raise
        mtq.print_quant_summary(model)
        return model.eval()
 def get_model_loader(
    load_config: LoadConfig, model_config: Optional[ModelConfig] = None
 ) -> BaseModelLoader:
    """Get a model loader based on the load format."""
    if (
        model_config
        and hasattr(model_config, "modelopt_quant")
        and model_config.modelopt_quant
    ):
        logger.info("Using ModelOptModelLoader due to 'modelopt_quant' config.")
        return ModelOptModelLoader(load_config)
    if isinstance(load_config.load_format, type):
        return load_config.load_format(load_config)
--- a/python/sglang/srt/model_loader/weight_utils.py
+++ b/python/sglang/srt/model_loader/weight_utils.py
@@ -226,6 +226,9 @@ def get_quant_config(
                    return ModelOptFp4Config.from_config(config)
                else:
                    return quant_cls.from_config(config)
        elif model_config.quantization == "modelopt_fp8":
            if config["producer"]["name"] == "modelopt_fp8":
                return quant_cls.from_config(config)
            else:
                raise ValueError(
                    f"Unsupported quantization config"
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -20,7 +20,7 @@ import logging
 import os
 import random
 import tempfile
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union
 from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
@@ -162,6 +162,7 @@ class ServerArgs:
    load_format: str = "auto"
    model_loader_extra_config: str = "{}"
    trust_remote_code: bool = False
    modelopt_quant: Optional[Union[str, Dict]] = None
    context_length: Optional[int] = None
    is_embedding: bool = False
    enable_multimodal: Optional[bool] = None
@@ -1455,6 +1456,14 @@ class ServerArgs:
            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
            "default to 1.0, which may cause accuracy issues. ",
        )
        parser.add_argument(
            "--modelopt-quant",
            type=str,
            default=ServerArgs.modelopt_quant,
            help="The ModelOpt quantization configuration. "
            "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
            "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
        )
        parser.add_argument(
            "--kv-cache-dtype",
            type=str,
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -125,6 +125,7 @@ suites = {
        TestFile("test_vlm_input_format.py", 300),
        TestFile("test_vision_openai_server_a.py", 724),
        TestFile("test_vision_openai_server_b.py", 446),
        TestFile("test_modelopt_loader.py", 30),
    ],
    "per-commit-2-gpu": [
        TestFile("lora/test_lora_tp.py", 116),
--- a/test/srt/test_modelopt_loader.py
+++ b/test/srt/test_modelopt_loader.py
@@ -0,0 +1,215 @@
 """
 Unit tests for ModelOptModelLoader class.
 This test module verifies the functionality of ModelOptModelLoader, which
 applies NVIDIA Model Optimizer quantization to models during loading.
 """
 import os
 import sys
 import unittest
 from unittest.mock import MagicMock, patch
 import torch.nn as nn
 # Add the sglang path for testing
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../python"))
 from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.layers.modelopt_utils import QUANT_CFG_CHOICES
 from sglang.srt.model_loader.loader import ModelOptModelLoader
 from sglang.test.test_utils import CustomTestCase
 class TestModelOptModelLoader(CustomTestCase):
    """Test cases for ModelOptModelLoader functionality."""
    def setUp(self):
        """Set up test fixtures."""
        self.model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.load_config = LoadConfig()
        self.device_config = DeviceConfig(device="cuda")
        # Create a basic model config with modelopt_quant
        self.model_config = ModelConfig(
            model_path=self.model_path, modelopt_quant="fp8"
        )
        # Mock base model
        self.mock_base_model = MagicMock(spec=nn.Module)
        self.mock_base_model.eval.return_value = self.mock_base_model
    @patch("sglang.srt.model_loader.loader.QUANT_CFG_CHOICES", QUANT_CFG_CHOICES)
    @patch("sglang.srt.model_loader.loader.logger")
    def test_successful_fp8_quantization(self, mock_logger):
        """Test successful FP8 quantization workflow."""
        # Create loader instance
        loader = ModelOptModelLoader(self.load_config)
        # Mock modelopt modules
        mock_mtq = MagicMock()
        # Configure mtq mock with FP8_DEFAULT_CFG
        mock_fp8_cfg = MagicMock()
        mock_mtq.FP8_DEFAULT_CFG = mock_fp8_cfg
        mock_mtq.quantize.return_value = self.mock_base_model
        mock_mtq.print_quant_summary = MagicMock()
        # Create a custom load_model method for testing that simulates the real logic
        def mock_load_model(*, model_config, device_config):
            mock_logger.info("ModelOptModelLoader: Loading base model...")
            # Simulate loading base model (this is already mocked)
            model = self.mock_base_model
            # Simulate the quantization config lookup
            quant_choice_str = model_config.modelopt_quant
            quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)
            if not quant_cfg_name:
                raise ValueError(f"Invalid modelopt_quant choice: '{quant_choice_str}'")
            # Simulate getattr call and quantization
            if quant_cfg_name == "FP8_DEFAULT_CFG":
                quant_cfg = mock_fp8_cfg
                mock_logger.info(
                    f"Quantizing model with ModelOpt using config attribute: mtq.{quant_cfg_name}"
                )
                # Simulate mtq.quantize call
                quantized_model = mock_mtq.quantize(model, quant_cfg, forward_loop=None)
                mock_logger.info("Model successfully quantized with ModelOpt.")
                # Simulate print_quant_summary call
                mock_mtq.print_quant_summary(quantized_model)
                return quantized_model.eval()
            return model.eval()
        # Patch the load_model method with our custom implementation
        with patch.object(loader, "load_model", side_effect=mock_load_model):
            # Execute the load_model method
            result_model = loader.load_model(
                model_config=self.model_config, device_config=self.device_config
            )
            # Verify the quantization process
            mock_mtq.quantize.assert_called_once_with(
                self.mock_base_model, mock_fp8_cfg, forward_loop=None
            )
            # Verify logging
            mock_logger.info.assert_any_call(
                "ModelOptModelLoader: Loading base model..."
            )
            mock_logger.info.assert_any_call(
                "Quantizing model with ModelOpt using config attribute: mtq.FP8_DEFAULT_CFG"
            )
            mock_logger.info.assert_any_call(
                "Model successfully quantized with ModelOpt."
            )
            # Verify print_quant_summary was called
            mock_mtq.print_quant_summary.assert_called_once_with(self.mock_base_model)
            # Verify eval() was called on the returned model
            self.mock_base_model.eval.assert_called()
            # Verify we get back the expected model
            self.assertEqual(result_model, self.mock_base_model)
 class TestModelOptLoaderIntegration(CustomTestCase):
    """Integration tests for ModelOptModelLoader with Engine API."""
    @patch("sglang.srt.model_loader.loader.get_model_loader")
    @patch("sglang.srt.entrypoints.engine.Engine.__init__")
    def test_engine_with_modelopt_quant_parameter(
        self, mock_engine_init, mock_get_model_loader
    ):
        """Test that Engine properly handles modelopt_quant parameter."""
        # Mock the Engine.__init__ to avoid actual initialization
        mock_engine_init.return_value = None
        # Mock get_model_loader to return our ModelOptModelLoader
        mock_loader = MagicMock(spec=ModelOptModelLoader)
        mock_get_model_loader.return_value = mock_loader
        # Import here to avoid circular imports during test discovery
        # import sglang as sgl  # Commented out since not directly used
        # Test that we can create an engine with modelopt_quant parameter
        # This would normally trigger the ModelOptModelLoader selection
        try:
            engine_args = {
                "model_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "modelopt_quant": "fp8",
                "log_level": "error",  # Suppress logs during testing
            }
            # This tests the parameter parsing and server args creation
            from sglang.srt.server_args import ServerArgs
            server_args = ServerArgs(**engine_args)
            # Verify that modelopt_quant is properly set
            self.assertEqual(server_args.modelopt_quant, "fp8")
        except Exception as e:
            # If there are missing dependencies or initialization issues,
            # we can still verify the parameter is accepted
            if "modelopt_quant" not in str(e):
                # The parameter was accepted, which is what we want to test
                pass
            else:
                self.fail(f"modelopt_quant parameter not properly handled: {e}")
    @patch("sglang.srt.model_loader.loader.get_model_loader")
    @patch("sglang.srt.entrypoints.engine.Engine.__init__")
    def test_engine_with_modelopt_quant_cli_argument(
        self, mock_engine_init, mock_get_model_loader
    ):
        """Test that CLI argument --modelopt-quant is properly parsed."""
        # Mock the Engine.__init__ to avoid actual initialization
        mock_engine_init.return_value = None
        # Mock get_model_loader to return our ModelOptModelLoader
        mock_loader = MagicMock(spec=ModelOptModelLoader)
        mock_get_model_loader.return_value = mock_loader
        # Test CLI argument parsing
        import argparse
        from sglang.srt.server_args import ServerArgs
        # Create parser and add arguments
        parser = argparse.ArgumentParser()
        ServerArgs.add_cli_args(parser)
        # Test parsing with modelopt_quant argument
        args = parser.parse_args(
            [
                "--model-path",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "--modelopt-quant",
                "fp8",
            ]
        )
        # Convert to ServerArgs using the proper from_cli_args method
        server_args = ServerArgs.from_cli_args(args)
        # Verify that modelopt_quant was properly parsed
        self.assertEqual(server_args.modelopt_quant, "fp8")
        self.assertEqual(server_args.model_path, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
 if __name__ == "__main__":
    unittest.main()