Enable native ModelOpt quantization support (3/3) (#10154)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
@@ -110,6 +110,157 @@ python3 -m sglang.launch_server \
    --port 30000 --host 0.0.0.0
```

#### Using [NVIDIA ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer)

NVIDIA Model Optimizer (ModelOpt) provides advanced quantization techniques optimized for NVIDIA hardware. SGLang includes a streamlined workflow for quantizing models with ModelOpt and automatically exporting them for deployment.

##### Installation

First, install ModelOpt. You can either install it directly or as an optional SGLang dependency:

```bash
# Option 1: Install ModelOpt directly
pip install nvidia-modelopt

# Option 2: Install SGLang with ModelOpt support (recommended)
pip install "sglang[modelopt]"
```

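To quickly confirm the installation, import the package and print its version (a minimal check; the `__version__` attribute is assumed to follow the usual Python packaging convention):

```python
# Minimal sanity check that ModelOpt is importable
import modelopt

print(modelopt.__version__)
```
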
##### Quantization and Export Workflow

SGLang provides an example script that demonstrates the complete ModelOpt quantization and export workflow:

```bash
# Quantize and export a model using ModelOpt FP8 quantization
python examples/usage/modelopt_quantize_and_export.py quantize \
    --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --export-dir ./quantized_tinyllama_fp8 \
    --quantization-method modelopt_fp8

# For FP4 quantization
python examples/usage/modelopt_quantize_and_export.py quantize \
    --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --export-dir ./quantized_tinyllama_fp4 \
    --quantization-method modelopt_fp4
```

##### Available Quantization Methods

- `modelopt_fp8`: FP8 quantization with optimal performance on NVIDIA Hopper and Blackwell GPUs
- `modelopt_fp4`: FP4 quantization with optimal performance on NVIDIA Blackwell GPUs

##### Python API Usage

You can also use ModelOpt quantization programmatically:

```python
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.model_loader.loader import get_model_loader

# Configure model with ModelOpt quantization and export
model_config = ModelConfig(
    model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    quantization="modelopt_fp8",  # or "modelopt_fp4"
    trust_remote_code=True,
)

load_config = LoadConfig(
    modelopt_export_path="./exported_model",
    modelopt_checkpoint_save_path="./checkpoint.pth",  # optional: save the fake-quantized checkpoint
)
device_config = DeviceConfig(device="cuda")

# Load and quantize the model (export happens automatically)
model_loader = get_model_loader(load_config, model_config)
quantized_model = model_loader.load_model(
    model_config=model_config,
    device_config=device_config,
)
```

##### Deploying Quantized Models

After quantization and export, you can deploy the model with SGLang:

```bash
# Deploy the exported quantized model
python -m sglang.launch_server \
    --model-path ./quantized_tinyllama_fp8 \
    --quantization modelopt \
    --port 30000 --host 0.0.0.0
```
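
Once the server is up, you can send a request to SGLang's native `/generate` endpoint, for example:

```bash
# Example request against the running server
curl -s http://localhost:30000/generate \
  -H "Content-Type: application/json" \
  -d '{"text": "What is the capital of France?", "sampling_params": {"temperature": 0.8, "max_new_tokens": 64}}'
```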

Or using the Python API:

```python
import sglang as sgl

# Deploy exported ModelOpt quantized model
llm = sgl.Engine(
    model_path="./quantized_tinyllama_fp8",
    quantization="modelopt",
)

# Run inference
prompts = ["Hello, how are you?", "What is the capital of France?"]
sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 100}
outputs = llm.generate(prompts, sampling_params)

for i, output in enumerate(outputs):
    print(f"Prompt: {prompts[i]}")
    print(f"Output: {output['text']}")
```
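
When you are done, call `llm.shutdown()` to release the engine's GPU resources.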

##### Advanced Features

**Checkpoint Management**: Save and restore fake-quantized checkpoints for reuse:

```bash
# Save the fake-quantized checkpoint during quantization
python examples/usage/modelopt_quantize_and_export.py quantize \
    --model-path meta-llama/Llama-3.2-1B-Instruct \
    --export-dir ./quantized_model \
    --quantization-method modelopt_fp8 \
    --checkpoint-save-path ./my_checkpoint.pth

# The checkpoint can be reused in future quantization runs to skip calibration
```

**Export-only Workflow**: If you have a pre-existing fake-quantized ModelOpt checkpoint, you can export it directly:

```python
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.model_loader.loader import get_model_loader

model_config = ModelConfig(
    model_path="meta-llama/Llama-3.2-1B-Instruct",
    quantization="modelopt_fp8",
    trust_remote_code=True,
)

load_config = LoadConfig(
    modelopt_checkpoint_restore_path="./my_checkpoint.pth",
    modelopt_export_path="./exported_model",
)

# Load and export the model
model_loader = get_model_loader(load_config, model_config)
model_loader.load_model(model_config=model_config, device_config=DeviceConfig())
```
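
The same paths can also be supplied when launching the server. A launch might look like the following sketch; the export flag is shown in the server arguments added by this change, while the exact spelling of the restore flag is assumed from the `modelopt_checkpoint_restore_path` field:

```bash
# Hypothetical launch reusing a saved ModelOpt checkpoint
python -m sglang.launch_server \
    --model-path meta-llama/Llama-3.2-1B-Instruct \
    --quantization modelopt_fp8 \
    --modelopt-checkpoint-restore-path ./my_checkpoint.pth \
    --modelopt-export-path ./exported_model
```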

##### Benefits of ModelOpt

- **Hardware Optimization**: Specifically optimized for NVIDIA GPU architectures
- **Advanced Quantization**: Supports cutting-edge FP8 and FP4 quantization techniques
- **Seamless Integration**: Automatic export to HuggingFace format for easy deployment
- **Calibration-based**: Uses calibration datasets for optimal quantization quality
- **Production Ready**: Enterprise-grade quantization with NVIDIA support

## Online Quantization

To enable online quantization, simply specify `--quantization` on the command line. For example, you can launch the server with the following command to enable `FP8` quantization for the model `meta-llama/Meta-Llama-3.1-8B-Instruct`:
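
The command itself is unchanged context elided from this diff; it presumably resembles:

```bash
# Presumed shape of the elided launch command
python3 -m sglang.launch_server \
    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
    --quantization fp8 \
    --port 30000 --host 0.0.0.0
```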
@@ -148,5 +299,6 @@ python3 -m sglang.launch_server \

- [GPTQModel](https://github.com/ModelCloud/GPTQModel)
- [LLM Compressor](https://github.com/vllm-project/llm-compressor/)
- [NVIDIA Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
- [Torchao: PyTorch Architecture Optimization](https://github.com/pytorch/ao)
- [vLLM Quantization](https://docs.vllm.ai/en/latest/quantization/)

examples/usage/modelopt_quantize_and_export.py (new executable file, 303 lines)
@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Example: ModelOpt Quantization and Export with SGLang

This example demonstrates the streamlined workflow for quantizing a model with
ModelOpt and automatically exporting it for deployment with SGLang.
"""

import argparse
import os
from typing import Optional

import torch

import sglang as sgl
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.distributed.parallel_state import (
    init_distributed_environment,
    initialize_model_parallel,
)
from sglang.srt.model_loader.loader import get_model_loader


def _validate_export(export_dir: str) -> bool:
    """Validate that an exported model directory contains the expected files."""
    import glob

    required_files = ["config.json", "tokenizer_config.json"]

    if not os.path.exists(export_dir):
        return False

    # Check required files
    for file in required_files:
        if not os.path.exists(os.path.join(export_dir, file)):
            return False

    # Check for model files using pattern matching to handle sharded models
    model_patterns = [
        "model*.safetensors",
        "pytorch_model*.bin",
    ]

    has_model_file = False
    for pattern in model_patterns:
        matching_files = glob.glob(os.path.join(export_dir, pattern))
        if matching_files:
            has_model_file = True
            break

    return has_model_file


def _get_export_info(export_dir: str) -> Optional[dict]:
    """Get information about an exported model."""
    import json

    if not _validate_export(export_dir):
        return None

    try:
        config_path = os.path.join(export_dir, "config.json")
        with open(config_path, "r") as f:
            config = json.load(f)

        return {
            "model_type": config.get("model_type", "unknown"),
            "architectures": config.get("architectures", []),
            "quantization_config": config.get("quantization_config", {}),
            "export_dir": export_dir,
        }
    except Exception:
        return None


def quantize_and_export_model(
    model_path: str,
    export_dir: str,
    quantization_method: str = "modelopt_fp8",
    checkpoint_save_path: Optional[str] = None,
    device: str = "cuda",
) -> None:
    """
    Quantize a model with ModelOpt and export it for SGLang deployment.

    Args:
        model_path: Path to the original model
        export_dir: Directory to export the quantized model
        quantization_method: Quantization method ("modelopt_fp8" or "modelopt_fp4")
        checkpoint_save_path: Optional path to save ModelOpt checkpoint
        device: Device to use for quantization
    """
    print("🚀 Starting ModelOpt quantization and export workflow")
    print(f"📥 Input model: {model_path}")
    print(f"📤 Export directory: {export_dir}")
    print(f"⚙️ Quantization method: {quantization_method}")

    # Initialize minimal distributed environment for single GPU quantization
    if not torch.distributed.is_initialized():
        print("🔧 Initializing distributed environment...")
        # Set up environment variables for single-process distributed
        os.environ["RANK"] = "0"
        os.environ["WORLD_SIZE"] = "1"
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"  # Use a different port than tests
        os.environ["LOCAL_RANK"] = "0"

        init_distributed_environment(
            world_size=1,
            rank=0,
            local_rank=0,
            backend="nccl" if device == "cuda" else "gloo",
        )
        initialize_model_parallel(
            tensor_model_parallel_size=1,
            pipeline_model_parallel_size=1,
        )

    # Configure model loading with ModelOpt quantization and export
    model_config = ModelConfig(
        model_path=model_path,
        quantization=quantization_method,  # Use unified quantization flag
        trust_remote_code=True,
    )

    load_config = LoadConfig(
        modelopt_checkpoint_save_path=checkpoint_save_path,
        modelopt_export_path=export_dir,
    )
    device_config = DeviceConfig(device=device)

    # Load and quantize the model (export happens automatically)
    print("🔄 Loading and quantizing model...")
    model_loader = get_model_loader(load_config, model_config)

    try:
        model_loader.load_model(
            model_config=model_config,
            device_config=device_config,
        )
        print("✅ Model quantized successfully!")

        # Validate the export
        if _validate_export(export_dir):
            print("✅ Export validation passed!")

            info = _get_export_info(export_dir)
            if info:
                print("📋 Model info:")
                print(f"   - Type: {info['model_type']}")
                print(f"   - Architecture: {info['architectures']}")
                print(f"   - Quantization: {info['quantization_config']}")
        else:
            print("❌ Export validation failed!")
            return

    except Exception as e:
        print(f"❌ Quantization failed: {e}")
        return

    print("\n🎉 Workflow completed successfully!")
    print(f"📁 Quantized model exported to: {export_dir}")
    print("\n🚀 To use the exported model:")
    print(
        f"   python -m sglang.launch_server --model-path {export_dir} --quantization modelopt"
    )
    print("\n # Or in Python:")
    print("   import sglang as sgl")
    print(f"   llm = sgl.Engine(model_path='{export_dir}', quantization='modelopt')")
    print("   # Note: 'modelopt' auto-detects FP4/FP8 from model config")


def deploy_exported_model(
    export_dir: str,
    host: str = "127.0.0.1",
    port: int = 30000,
) -> None:
    """
    Deploy an exported ModelOpt quantized model with SGLang.

    Args:
        export_dir: Directory containing the exported model
        host: Host to bind the server to
        port: Port to bind the server to
    """
    print(f"🚀 Deploying exported model from: {export_dir}")

    # Validate export first
    if not _validate_export(export_dir):
        print("❌ Invalid export directory!")
        return

    try:
        # Launch SGLang engine with the exported model
        # Using generic "modelopt" for auto-detection of FP4/FP8
        llm = sgl.Engine(
            model_path=export_dir,
            quantization="modelopt",
            host=host,
            port=port,
        )

        print("✅ Model deployed successfully!")
        print(f"🌐 Server running at http://{host}:{port}")

        # Example inference
        prompts = ["Hello, how are you?", "What is the capital of France?"]
        sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 100}

        print("\n🧪 Running example inference...")
        outputs = llm.generate(prompts, sampling_params)

        for i, output in enumerate(outputs):
            print(f"Prompt {i+1}: {prompts[i]}")
            print(f"Output: {output['text']}")
            print()

    except Exception as e:
        print(f"❌ Deployment failed: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="ModelOpt Quantization and Export with SGLang",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Quantize and export a model (recommended workflow)
    python modelopt_quantize_and_export.py quantize \\
        --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \\
        --export-dir ./quantized_model \\
        --quantization-method modelopt_fp8

    # Deploy a pre-exported model
    python modelopt_quantize_and_export.py deploy \\
        --export-dir ./quantized_model
        """,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Quantize command
    quantize_parser = subparsers.add_parser(
        "quantize", help="Quantize and export a model"
    )
    quantize_parser.add_argument(
        "--model-path", required=True, help="Path to the model to quantize"
    )
    quantize_parser.add_argument(
        "--export-dir", required=True, help="Directory to export the quantized model"
    )
    quantize_parser.add_argument(
        "--quantization-method",
        choices=["modelopt_fp8", "modelopt_fp4"],
        default="modelopt_fp8",
        help="Quantization method to use",
    )
    quantize_parser.add_argument(
        "--checkpoint-save-path", help="Optional path to save ModelOpt checkpoint"
    )
    quantize_parser.add_argument(
        "--device", default="cuda", help="Device to use for quantization"
    )

    # TODO: Quantize-and-serve command removed due to compatibility issues
    # Use the separate quantize-then-deploy workflow instead

    # Deploy command
    deploy_parser = subparsers.add_parser("deploy", help="Deploy an exported model")
    deploy_parser.add_argument(
        "--export-dir", required=True, help="Directory containing the exported model"
    )
    deploy_parser.add_argument(
        "--host", default="127.0.0.1", help="Host to bind the server to"
    )
    deploy_parser.add_argument(
        "--port", type=int, default=30000, help="Port to bind the server to"
    )

    args = parser.parse_args()

    if args.command == "quantize":
        quantize_and_export_model(
            model_path=args.model_path,
            export_dir=args.export_dir,
            quantization_method=args.quantization_method,
            checkpoint_save_path=args.checkpoint_save_path,
            device=args.device,
        )
    elif args.command == "deploy":
        deploy_exported_model(
            export_dir=args.export_dir,
            host=args.host,
            port=args.port,
        )
    else:
        parser.print_help()


if __name__ == "__main__":
    main()

@@ -75,12 +75,7 @@ dependencies = [
]

[project.optional-dependencies]
tracing = [
    "opentelemetry-api",
    "opentelemetry-exporter-otlp",
    "opentelemetry-exporter-otlp-proto-grpc",
    "opentelemetry-sdk",
]
modelopt = ["nvidia-modelopt"]
test = [
    "accelerate",
    "expecttest",
@@ -107,6 +102,12 @@ cu130_all = [
    "sglang[decord]",
    "sglang[cu130]"
]
tracing = [
    "opentelemetry-api",
    "opentelemetry-exporter-otlp",
    "opentelemetry-exporter-otlp-proto-grpc",
    "opentelemetry-sdk",
]

# To be deprecated in 2 weeks
blackwell = ["sglang[dev]"]

@@ -6,6 +6,7 @@ from typing import List, Optional, Union

import orjson

from sglang.srt.configs.modelopt_config import ModelOptConfig
from sglang.srt.utils import is_hip

logger = logging.getLogger(__name__)
@@ -51,6 +52,11 @@ class LoadConfig:
        decryption_key_file: If set, decrypts the output files with a password read
            from this file (after PBKDF2).
        decrypt_max_concurrency: The maximum number of concurrent processes to decrypt the safetensor files. -1 means no limit.

        # ModelOpt-specific loading options
        modelopt_checkpoint_restore_path: Optional[str] = None
        modelopt_checkpoint_save_path: Optional[str] = None
        modelopt_export_path: Optional[str] = None
    """

    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
@@ -64,6 +70,14 @@ class LoadConfig:
    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None

    # ModelOpt-specific loading options
    modelopt_checkpoint_restore_path: Optional[str] = None
    modelopt_checkpoint_save_path: Optional[str] = None
    modelopt_export_path: Optional[str] = None

    # ModelOpt configuration object
    modelopt_config: Optional[ModelOptConfig] = None

    def __post_init__(self):
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
@@ -78,6 +92,14 @@ class LoadConfig:
        else:
            self.ignore_patterns = ["original/**/*"]

        # Create ModelOptConfig if not provided
        if self.modelopt_config is None:
            self.modelopt_config = ModelOptConfig(
                checkpoint_restore_path=self.modelopt_checkpoint_restore_path,
                checkpoint_save_path=self.modelopt_checkpoint_save_path,
                export_path=self.modelopt_export_path,
            )

    def _verify_load_format(self) -> None:
        if not isinstance(self.load_format, str):
            return

@@ -17,7 +17,7 @@ import logging
import math
import os
from enum import Enum, IntEnum, auto
from typing import Any, Dict, List, Optional, Set, Union
from typing import Any, List, Optional, Set, Union

import torch
from transformers import PretrainedConfig
@@ -89,7 +89,6 @@ class ModelConfig:
        enable_multimodal: Optional[bool] = None,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        modelopt_quant: Optional[Union[str, Dict]] = None,
        override_config_file: Optional[str] = None,
        is_draft_model: bool = False,
        hybrid_kvcache_ratio: Optional[
@@ -97,15 +96,19 @@ class ModelConfig:
        ] = None,  # TODO: remove this, it is not a model config
        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
        sampling_defaults: str = "openai",
        quantize_and_serve: bool = False,
    ) -> None:
        # Parse args
        self.model_path = model_path
        self.revision = revision
        self.quantization = quantization
        self.modelopt_quant = modelopt_quant
        self.is_draft_model = is_draft_model
        self.model_impl = model_impl
        self.sampling_defaults = sampling_defaults
        self.quantize_and_serve = quantize_and_serve

        # Validate quantize_and_serve configuration
        self._validate_quantize_and_serve_config()

        # Get hf config
        self._maybe_pull_model_tokenizer_from_remote()
@@ -219,10 +222,10 @@ class ModelConfig:
            enable_multimodal=server_args.enable_multimodal,
            dtype=server_args.dtype,
            quantization=server_args.quantization,
            modelopt_quant=server_args.modelopt_quant,
            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
            model_impl=server_args.model_impl,
            sampling_defaults=server_args.sampling_defaults,
            quantize_and_serve=server_args.quantize_and_serve,
            **kwargs,
        )

@@ -547,6 +550,56 @@ class ModelConfig:
        # Default to FP8 for backward compatibility
        return {"quant_method": "modelopt_fp8"}

    def _is_already_quantized(self) -> bool:
        """Check if the model is already quantized based on config files."""
        # Check for HuggingFace quantization config
        from sglang.srt.utils import has_hf_quant_config

        return has_hf_quant_config(self.model_path)

    def _get_modelopt_quant_type(self) -> str:
        """Extract ModelOpt quantization type from unified quantization flag."""
        if self.quantization == "modelopt_fp8":
            return "fp8"
        elif self.quantization == "modelopt_fp4":
            return "nvfp4"
        elif self.quantization == "modelopt":
            # Auto-detect from model config
            quant_cfg = self._parse_quant_hf_config()
            if quant_cfg:
                quant_method = quant_cfg.get("quant_method", "").lower()
                if "fp4" in quant_method:
                    return "fp4"
                elif "fp8" in quant_method:
                    return "fp8"
            # Default to fp8 if can't detect
            return "fp8"
        else:
            return "fp8"  # Default fallback

    def _validate_quantize_and_serve_config(self):
        """Validate quantize_and_serve configuration."""
        if not self.quantize_and_serve:
            return

        # Check if ModelOpt quantization is specified
        modelopt_quantization_specified = self.quantization in [
            "modelopt",
            "modelopt_fp8",
            "modelopt_fp4",
        ]

        if not modelopt_quantization_specified:
            raise ValueError("quantize_and_serve requires ModelOpt quantization")

        # quantize_and_serve is disabled due to compatibility issues
        raise NotImplementedError(
            "quantize_and_serve functionality is currently disabled due to compatibility issues. "
            "Please use the separate quantize-then-deploy workflow instead. "
            "Step 1: Quantize and export model. "
            "Step 2: Deploy the exported model."
        )

    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
    def _verify_quantization(self) -> None:
        supported_quantization = [*QUANTIZATION_METHODS]

python/sglang/srt/configs/modelopt_config.py (new file, 30 lines)
@@ -0,0 +1,30 @@
# Configuration for NVIDIA ModelOpt quantization integration
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelOptConfig:
    """Configuration for NVIDIA ModelOpt quantization operations.

    This configuration class holds parameters for ModelOpt quantization,
    checkpoint management, and model export operations.

    Args:
        quant: Quantization method/type (e.g., "fp8", "fp4")
        checkpoint_restore_path: Path to restore ModelOpt checkpoint from
        checkpoint_save_path: Path to save ModelOpt checkpoint to
        export_path: Path to export quantized model in HuggingFace format
        quantize_and_serve: Whether to quantize and serve in one step
    """

    quant: Optional[str] = None
    checkpoint_restore_path: Optional[str] = None
    checkpoint_save_path: Optional[str] = None
    export_path: Optional[str] = None
    quantize_and_serve: bool = False

    def __post_init__(self):
        """Validate configuration after initialization."""
        # Add any validation logic if needed
        pass

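Taken together with the `LoadConfig` changes above, the new dataclass can also be constructed directly; a minimal sketch based on the fields shown in this diff:

```python
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.modelopt_config import ModelOptConfig

# Bundle the ModelOpt options explicitly instead of via the individual
# modelopt_* fields on LoadConfig (both routes end up as a ModelOptConfig)
modelopt_config = ModelOptConfig(
    quant="fp8",
    checkpoint_save_path="./checkpoint.pth",
    export_path="./exported_model",
)
load_config = LoadConfig(modelopt_config=modelopt_config)
```
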
@@ -72,6 +72,7 @@ if TYPE_CHECKING:
BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "fp8": Fp8Config,
    "blockwise_int8": BlockInt8Config,
    "modelopt": ModelOptFp8Config,  # Auto-detect, defaults to FP8
    "modelopt_fp8": ModelOptFp8Config,
    "modelopt_fp4": ModelOptFp4Config,
    "w8a8_int8": W8A8Int8Config,

@@ -161,6 +161,26 @@ class QuantizationConfig(ABC):
        """
        return None

    @classmethod
    def _modelopt_override_quantization_method(
        cls, hf_quant_config, user_quant
    ) -> Optional[str]:
        """Shared ModelOpt quantization method override logic."""
        if hf_quant_config is None:
            return None

        # Check if this is a ModelOpt config
        quant_algo = hf_quant_config.get("quant_algo", "").upper()

        # If user specified generic "modelopt", auto-detect the specific method
        if user_quant == "modelopt":
            if "FP8" in quant_algo:
                return "modelopt_fp8"
            elif "NVFP4" in quant_algo or "FP4" in quant_algo:
                return "modelopt_fp4"

        return None

    @staticmethod
    def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
        """Get a value from the model's quantization config."""

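The shared override resolves the generic `modelopt` method to a concrete one from the checkpoint's `quant_algo`. A standalone sketch of that mapping (mirroring the logic above, not the actual API):

```python
# Re-statement of the auto-detection mapping for exposition
def resolve_modelopt_method(quant_algo: str, user_quant: str):
    quant_algo = quant_algo.upper()
    if user_quant == "modelopt":
        if "FP8" in quant_algo:
            return "modelopt_fp8"
        if "NVFP4" in quant_algo or "FP4" in quant_algo:
            return "modelopt_fp4"
    return None

assert resolve_modelopt_method("FP8", "modelopt") == "modelopt_fp8"
assert resolve_modelopt_method("NVFP4", "modelopt") == "modelopt_fp4"
```
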
@@ -111,6 +111,11 @@ class ModelOptFp8Config(QuantizationConfig):
            "Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change."
        )

    @classmethod
    def override_quantization_method(cls, hf_quant_config, user_quant):
        """Override quantization method based on the model's config."""
        return cls._modelopt_override_quantization_method(hf_quant_config, user_quant)

    @classmethod
    def get_name(cls) -> str:
        return "modelopt_fp8"
@@ -527,6 +532,11 @@ class ModelOptFp4Config(QuantizationConfig):
        self.kv_cache_quant_algo = kv_cache_quant_algo
        self.exclude_modules = exclude_modules

    @classmethod
    def override_quantization_method(cls, hf_quant_config, user_quant):
        """Override quantization method based on the model's config."""
        return cls._modelopt_override_quantization_method(hf_quant_config, user_quant)

    @classmethod
    def get_name(cls) -> str:
        return "modelopt_fp4"
@@ -608,7 +618,16 @@ class ModelOptFp4Config(QuantizationConfig):
            else:
                kv_cache_quant_algo = "auto"

            group_size = ModelOptFp4Config.common_group_size(config)
            group_size = config.get("group_size")
            # If group_size is not at top level, try to extract from config_groups
            if group_size is None:
                config_groups = config.get("config_groups", {})
                if config_groups:
                    # Get group_size from the first group's weights config
                    first_group = next(iter(config_groups.values()), {})
                    weights_config = first_group.get("weights", {})
                    group_size = weights_config.get("group_size")

            exclude_modules = config.get("ignore", [])
        else:
            # Fall back to nested format (hf_quant_config.json - legacy format)
@@ -634,15 +653,15 @@ class ModelOptFp4Config(QuantizationConfig):
            )
            is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method

            if not (group_size and kv_cache_quant_algo) or exclude_modules is None:
            if group_size is None or exclude_modules is None:
                logger.warning(
                    f"group_size: {group_size},"
                    f"kv_cache_quant_algo: {kv_cache_quant_algo},"
                    f"exclude_modules: {exclude_modules}"
                )
                raise ValueError(
                    "NVFP4 quantization requires group size and "
                    "kv_cache_quant_algo specified in the quantization config"
                    "NVFP4 quantization requires group_size and exclude_modules "
                    "specified in the quantization config"
                )
            return cls(
                is_checkpoint_nvfp4_serialized,

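For reference, the flat-format config that this parser now handles carries `group_size` either at the top level or nested under `config_groups`; an illustrative sketch (key names taken from the code, values invented):

```python
# Hypothetical flat quantization config as read by the parser above
flat_config = {
    "config_groups": {
        "group_0": {"weights": {"group_size": 16}},  # fallback location for group_size
    },
    "ignore": ["lm_head"],  # becomes exclude_modules
}
```
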
@@ -828,6 +828,16 @@ class ModelRunner:
        set_cuda_arch()

        # Prepare the model config
        from sglang.srt.configs.modelopt_config import ModelOptConfig

        modelopt_config = ModelOptConfig(
            quant=self.server_args.modelopt_quant,
            checkpoint_restore_path=self.server_args.modelopt_checkpoint_restore_path,
            checkpoint_save_path=self.server_args.modelopt_checkpoint_save_path,
            export_path=self.server_args.modelopt_export_path,
            quantize_and_serve=self.server_args.quantize_and_serve,
        )

        self.load_config = LoadConfig(
            load_format=self.server_args.load_format,
            download_dir=self.server_args.download_dir,
@@ -836,6 +846,7 @@ class ModelRunner:
            remote_instance_weight_loader_seed_instance_ip=self.server_args.remote_instance_weight_loader_seed_instance_ip,
            remote_instance_weight_loader_seed_instance_service_port=self.server_args.remote_instance_weight_loader_seed_instance_service_port,
            remote_instance_weight_loader_send_weights_group_ports=self.server_args.remote_instance_weight_loader_send_weights_group_ports,
            modelopt_config=modelopt_config,
        )
        if self.device == "cpu":
            self.model_config = adjust_config_with_unaligned_cpu_tp(

@@ -538,12 +538,21 @@ class DefaultModelLoader(BaseModelLoader):
            **model_kwargs,
            trust_remote_code=True,
        )
        rank0_log(f"ModelOpt quantization requested: {model_config.modelopt_quant}")
        # Handle both legacy modelopt_quant and unified quantization flags
        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
            # Legacy approach
            quant_choice_str = model_config.modelopt_quant
            rank0_log(f"ModelOpt quantization requested (legacy): {quant_choice_str}")
        else:
            # Unified approach - extract quantization type
            quant_choice_str = model_config._get_modelopt_quant_type()
            rank0_log(
                f"ModelOpt quantization requested (unified): {model_config.quantization} -> {quant_choice_str}"
            )

        quant_choice_str = model_config.modelopt_quant
        if not isinstance(quant_choice_str, str):
            raise TypeError(
                f"modelopt_quant must be a string preset key (e.g., 'fp8'), "
                f"Quantization type must be a string (e.g., 'fp8'), "
                f"got {type(quant_choice_str)}"
            )

@@ -1764,6 +1773,7 @@ class ModelOptModelLoader(DefaultModelLoader):
        quant_cfg,
        quantized_ckpt_restore_path: str | None = None,
        quantized_ckpt_save_path: str | None = None,
        export_path: str | None = None,
    ) -> None:
        """
        Set up ModelOpt quantization for the given model.
@@ -1774,6 +1784,7 @@ class ModelOptModelLoader(DefaultModelLoader):
            quant_cfg: The quantization configuration
            quantized_ckpt_restore_path: Path to restore quantized checkpoint from
            quantized_ckpt_save_path: Path to save quantized checkpoint to
            export_path: Path to export the quantized model in HuggingFace format

        Raises:
            ImportError: If ModelOpt is not available
@@ -1798,6 +1809,9 @@ class ModelOptModelLoader(DefaultModelLoader):
                    rank0_log(
                        f"Restored quantized model from {quantized_ckpt_restore_path}"
                    )

                    # Export model if path provided (even when restoring from checkpoint)
                    self._maybe_export_modelopt(model, export_path)
                    return
                except Exception as e:
                    logger.warning(
@@ -1844,9 +1858,75 @@ class ModelOptModelLoader(DefaultModelLoader):
                    f"Failed to save quantized checkpoint to {quantized_ckpt_save_path}: {e}"
                )

            # Export model if path provided
            self._maybe_export_modelopt(model, export_path)

        except Exception as e:
            raise Exception(f"Failed to set up ModelOpt quantization: {e}") from e

    def _maybe_export_modelopt(self, model, export_path: str | None) -> None:
        """Export model to HuggingFace format if export_path is provided."""
        if export_path:
            try:
                # Get the original model path from the model config
                original_model_path = getattr(self, "_original_model_path", None)
                self._export_modelopt_checkpoint(
                    model, export_path, original_model_path
                )
                rank0_log(
                    f"Quantized model exported to HuggingFace format at {export_path}"
                )
            except Exception as e:
                rank0_log(
                    f"Warning: Failed to export quantized model to {export_path}: {e}"
                )

    def _export_modelopt_checkpoint(
        self,
        model,
        export_path: str,
        model_path: str = None,
        trust_remote_code: bool = True,
    ) -> None:
        """
        Export the quantized model to HuggingFace format using ModelOpt export API.

        Args:
            model: The quantized model to export
            export_path: Directory path to export the model to
            model_path: Path to the original model (for tokenizer export)
            trust_remote_code: Whether to trust remote code for tokenizer loading

        Raises:
            ImportError: If ModelOpt export functionality is not available
            Exception: If export fails
        """
        try:
            from modelopt.torch.export import export_hf_checkpoint
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "ModelOpt export functionality is not available. "
                "Please ensure you have the latest version of modelopt installed."
            ) from e

        # Create export directory if it doesn't exist
        os.makedirs(export_path, exist_ok=True)

        # Export the quantized model
        export_hf_checkpoint(model, export_dir=export_path)

        # Export the tokenizer if model_path is provided
        if model_path:
            try:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_path, trust_remote_code=trust_remote_code
                )
                tokenizer.save_pretrained(export_path)
                rank0_log(f"Tokenizer exported to {export_path}")
            except Exception as e:
                rank0_log(f"Warning: Failed to export tokenizer: {e}")

    def load_model(
        self,
        *,
@@ -1856,28 +1936,52 @@ class ModelOptModelLoader(DefaultModelLoader):

        logger.info("ModelOptModelLoader: Loading base model...")

        # Use shared method from parent class to load base model
        # Store the original model path for tokenizer export
        self._original_model_path = model_config.model_path

        # Check if model is already quantized
        if model_config._is_already_quantized():
            logger.info("Model is already quantized, loading directly...")
            # Use default loading for pre-quantized models
            return super().load_model(
                model_config=model_config, device_config=device_config
            )

        # TODO: Quantize-and-serve mode has been disabled at the ModelConfig level
        # All quantization now uses the standard workflow (quantize + export/save)
        logger.info("Standard quantization mode: Will quantize and export/save")
        return self._standard_quantization_workflow(model_config, device_config)

    def _standard_quantization_workflow(
        self, model_config: ModelConfig, device_config: DeviceConfig
    ) -> nn.Module:
        """Standard quantization workflow: quantize, save checkpoint, export, then return model."""
        # Use shared method from parent class to load base model for quantization
        model = self._load_modelopt_base_model(model_config)

        # Import ModelOpt modules (already done in _load_modelopt_base_model, but needed here for quantization)
        # Import ModelOpt modules
        try:
            import modelopt.torch.quantization as mtq
        except ImportError:
            logger.error(
                "NVIDIA Model Optimizer (modelopt) library not found. "
                "Please install it to use 'modelopt_quant' feature."
                "Please install it to use ModelOpt quantization."
            )
            raise

        quant_choice_str = model_config.modelopt_quant
        # Handle both old modelopt_quant and new unified quantization flags
        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
            # Legacy modelopt_quant flag
            quant_choice_str = model_config.modelopt_quant
        else:
            # Unified quantization flag - extract the type (fp8/fp4)
            quant_choice_str = model_config._get_modelopt_quant_type()

        quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)
        if not quant_cfg_name:
            raise ValueError(
                f"Invalid modelopt_quant choice: '{quant_choice_str}'. "
                f"Available choices in QUANT_CFG_CHOICES: {list(QUANT_CFG_CHOICES.keys())}. "
                "Ensure QUANT_CFG_CHOICES is correctly defined with mappings to "
                "attribute names of config objects in modelopt.torch.quantization."
                f"Invalid quantization choice: '{quant_choice_str}'. "
                f"Available choices: {list(QUANT_CFG_CHOICES.keys())}"
            )

        try:
@@ -1885,20 +1989,27 @@ class ModelOptModelLoader(DefaultModelLoader):
            quant_cfg = getattr(mtq, quant_cfg_name)
        except AttributeError:
            raise AttributeError(
                f"ModelOpt quantization config attribute '{quant_cfg_name}' "
                f"(from choice '{quant_choice_str}') not found in modelopt.torch.quantization module. "
                "Please verify QUANT_CFG_CHOICES and the ModelOpt library."
                f"ModelOpt quantization config '{quant_cfg_name}' not found. "
                "Please verify the ModelOpt library installation."
            )

        logger.info(
            f"Quantizing model with ModelOpt using config attribute: mtq.{quant_cfg_name}"
            f"Quantizing model with ModelOpt using config: mtq.{quant_cfg_name}"
        )

        quantized_ckpt_restore_path = model_config.modelopt_checkpoint_restore_path
        quantized_ckpt_save_path = model_config.modelopt_checkpoint_save_path
        # Get ModelOpt configuration from LoadConfig
        modelopt_config = self.load_config.modelopt_config
        quantized_ckpt_restore_path = (
            modelopt_config.checkpoint_restore_path if modelopt_config else None
        )
        quantized_ckpt_save_path = (
            modelopt_config.checkpoint_save_path if modelopt_config else None
        )
        export_path = modelopt_config.export_path if modelopt_config else None
        tokenizer = AutoTokenizer.from_pretrained(
            model_config.model_path, use_fast=True
        )

        try:
            self._setup_modelopt_quantization(
                model,
@@ -1906,6 +2017,7 @@ class ModelOptModelLoader(DefaultModelLoader):
                quant_cfg,
                quantized_ckpt_restore_path=quantized_ckpt_restore_path,
                quantized_ckpt_save_path=quantized_ckpt_save_path,
                export_path=export_path,
            )
        except Exception as e:
            logger.warning(f"ModelOpt quantization failed: {e}")
@@ -1919,12 +2031,27 @@ def get_model_loader(
) -> BaseModelLoader:
    """Get a model loader based on the load format."""

    if model_config and (
        (hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant)
        or model_config.quantization in ["modelopt_fp8", "modelopt_fp4", "modelopt"]
    ):
        logger.info("Using ModelOptModelLoader due to ModelOpt quantization config.")
        return ModelOptModelLoader(load_config)

    # Use ModelOptModelLoader for unified quantization flags
    if (
        model_config
        and hasattr(model_config, "modelopt_quant")
        and model_config.modelopt_quant
        and hasattr(model_config, "quantization")
        and model_config.quantization in ["modelopt_fp8", "modelopt_fp4"]
    ):
        logger.info("Using ModelOptModelLoader due to 'modelopt_quant' config.")
        if model_config._is_already_quantized():
            logger.info(
                f"Using ModelOptModelLoader for pre-quantized model: {model_config.quantization}"
            )
        else:
            logger.info(
                f"Using ModelOptModelLoader for quantization: {model_config.quantization}"
            )
        return ModelOptModelLoader(load_config)

    if isinstance(load_config.load_format, type):

@@ -83,6 +83,7 @@ QUANTIZATION_CHOICES = [
    "bitsandbytes",
    "gguf",
    "modelopt",
    "modelopt_fp8",
    "modelopt_fp4",
    "petit_nvfp4",
    "w8a8_int8",
@@ -192,6 +193,8 @@ class ServerArgs:
    modelopt_quant: Optional[Union[str, Dict]] = None
    modelopt_checkpoint_restore_path: Optional[str] = None
    modelopt_checkpoint_save_path: Optional[str] = None
    modelopt_export_path: Optional[str] = None
    quantize_and_serve: bool = False
    context_length: Optional[int] = None
    is_embedding: bool = False
    enable_multimodal: Optional[bool] = None
@@ -1743,6 +1746,22 @@ class ServerArgs:
            help="Path to save the ModelOpt quantized checkpoint after quantization. "
            "This allows reusing the quantized model in future runs.",
        )
        parser.add_argument(
            "--modelopt-export-path",
            type=str,
            default=ServerArgs.modelopt_export_path,
            help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
            "The exported model can then be used directly with SGLang for inference. "
            "If not provided, the model will not be exported.",
        )
        parser.add_argument(
            "--quantize-and-serve",
            action="store_true",
            default=ServerArgs.quantize_and_serve,
            help="Quantize the model with ModelOpt and immediately serve it without exporting. "
            "This is useful for development and prototyping. For production, it's recommended "
            "to use separate quantization and deployment steps.",
        )
        parser.add_argument(
            "--kv-cache-dtype",
            type=str,

@@ -2411,6 +2411,29 @@ def retry(
            time.sleep(delay)


def has_hf_quant_config(model_path: str) -> bool:
    """Check if the model path contains hf_quant_config.json file.

    Args:
        model_path: Path to the model, can be local path or remote URL.

    Returns:
        True if hf_quant_config.json exists, False otherwise.
    """
    if is_remote_url(model_path):
        try:
            from huggingface_hub import HfApi

            hf_api = HfApi()
            return hf_api.file_exists(model_path, "hf_quant_config.json")
        except Exception:
            return False
    else:
        import os

        return os.path.exists(os.path.join(model_path, "hf_quant_config.json"))


def flatten_nested_list(nested_list):
    if isinstance(nested_list, list):
        return [

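The helper gives loaders a cheap way to detect pre-quantized checkpoints. An illustrative call (the path is hypothetical):

```python
from sglang.srt.utils import has_hf_quant_config

# Pre-quantized ModelOpt exports carry an hf_quant_config.json next to the weights
if has_hf_quant_config("./quantized_tinyllama_fp8"):
    print("Already quantized; the ModelOpt loader will load it directly")
```
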
@@ -135,6 +135,8 @@ suites = {
        TestFile("test_vision_chunked_prefill.py", 175),
        TestFile("test_vision_openai_server_a.py", 918),
        TestFile("test_vlm_input_format.py", 300),
        TestFile("test_modelopt_loader.py", 30),
        TestFile("test_modelopt_export.py", 30),
    ],
    "per-commit-2-gpu": [
        TestFile("ep/test_moe_ep.py", 140),

test/srt/test_modelopt_export.py (new file, 353 lines)
@@ -0,0 +1,353 @@
|
||||
"""
|
||||
Unit tests for ModelOpt export functionality in SGLang.
|
||||
|
||||
These tests verify the integration of ModelOpt export API with SGLang's model loading
|
||||
and quantization workflow.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.srt.configs.device_config import DeviceConfig
|
||||
from sglang.srt.configs.load_config import LoadConfig
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.model_loader.loader import ModelOptModelLoader
|
||||
|
||||
# Note: PYTHONPATH=python should be set when running tests
|
||||
|
||||
# Check if modelopt is available
|
||||
try:
|
||||
import modelopt
|
||||
|
||||
MODELOPT_AVAILABLE = True
|
||||
except ImportError:
|
||||
MODELOPT_AVAILABLE = False
|
||||
|
||||
|
||||
class TestModelOptExport(unittest.TestCase):
|
||||
"""Test suite for ModelOpt export functionality."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test fixtures."""
|
||||
# Mock distributed functionality to avoid initialization errors
|
||||
self.mock_tp_rank = patch(
|
||||
"sglang.srt.distributed.parallel_state.get_tensor_model_parallel_rank",
|
||||
return_value=0,
|
||||
)
|
||||
self.mock_tp_rank.start()
|
||||
|
||||
self.mock_rank0_log = patch("sglang.srt.model_loader.loader.rank0_log")
|
||||
self.mock_rank0_log.start()
|
||||
|
||||
# Mock logger to avoid issues
|
||||
self.mock_logger = patch("sglang.srt.model_loader.loader.logger")
|
||||
self.mock_logger.start()
|
||||
|
||||
# Mock all distributed functions that might be called
|
||||
self.mock_get_tp_group = patch(
|
||||
"sglang.srt.distributed.parallel_state.get_tp_group"
|
||||
)
|
||||
self.mock_get_tp_group.start()
|
||||
|
||||
# Mock model parallel initialization check
|
||||
self.mock_mp_is_initialized = patch(
|
||||
"sglang.srt.distributed.parallel_state.model_parallel_is_initialized",
|
||||
return_value=True,
|
||||
)
|
||||
self.mock_mp_is_initialized.start()
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
self.export_dir = os.path.join(self.temp_dir, "exported_model")
|
||||
self.checkpoint_dir = os.path.join(self.temp_dir, "checkpoint")
|
||||
|
||||
# Mock model
|
||||
self.mock_model = Mock(spec=torch.nn.Module)
|
||||
self.mock_model.device = torch.device("cuda:0")
|
||||
|
||||
# Mock tokenizer
|
||||
self.mock_tokenizer = Mock()
|
||||
|
||||
# Mock quantization config
|
||||
self.mock_quant_cfg = Mock()
|
||||
|
||||
# Create ModelOptModelLoader instance
|
||||
self.load_config = LoadConfig()
|
||||
self.model_loader = ModelOptModelLoader(self.load_config)
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test fixtures."""
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||
|
||||
# Stop mocks
|
||||
self.mock_tp_rank.stop()
|
||||
self.mock_rank0_log.stop()
|
||||
self.mock_logger.stop()
|
||||
self.mock_get_tp_group.stop()
|
||||
self.mock_mp_is_initialized.stop()
|
||||
|
||||
def _create_mock_export_files(self, export_dir: str):
|
||||
"""Create mock export files for testing validation."""
|
||||
os.makedirs(export_dir, exist_ok=True)
|
||||
|
||||
# Create config.json
|
||||
config = {
|
||||
"model_type": "test_model",
|
||||
"architectures": ["TestModel"],
|
||||
"quantization_config": {
|
||||
"quant_method": "modelopt",
|
||||
"bits": 8,
|
||||
},
|
||||
}
|
||||
with open(os.path.join(export_dir, "config.json"), "w") as f:
|
||||
json.dump(config, f)
|
||||
|
||||
# Create tokenizer_config.json
|
||||
tokenizer_config = {"tokenizer_class": "TestTokenizer"}
|
||||
with open(os.path.join(export_dir, "tokenizer_config.json"), "w") as f:
|
||||
json.dump(tokenizer_config, f)
|
||||
|
||||
# Create model file
|
||||
with open(os.path.join(export_dir, "model.safetensors"), "w") as f:
|
||||
f.write("mock_model_data")
|
||||
|
||||
@unittest.skipIf(not MODELOPT_AVAILABLE, "nvidia-modelopt not available")
|
||||
@patch("sglang.srt.model_loader.loader.os.makedirs")
|
||||
@patch("modelopt.torch.export.export_hf_checkpoint")
|
||||
def test_export_modelopt_checkpoint_success(self, mock_export, mock_makedirs):
|
||||
"""Test successful model export."""
|
||||
# Arrange
|
||||
mock_export.return_value = None
|
||||
mock_makedirs.return_value = None
|
||||
|
||||
# Act
|
||||
self.model_loader._export_modelopt_checkpoint(self.mock_model, self.export_dir)
|
||||
|
||||
# Assert
|
||||
mock_makedirs.assert_called_once_with(self.export_dir, exist_ok=True)
|
||||
mock_export.assert_called_once_with(self.mock_model, export_dir=self.export_dir)
|
||||
|
||||
@unittest.skipIf(not MODELOPT_AVAILABLE, "nvidia-modelopt not available")
|
||||
@patch("modelopt.torch.opt.restore")
|
||||
@patch("modelopt.torch.quantization.utils.is_quantized")
|
||||
def test_setup_quantization_with_export_from_checkpoint(
|
||||
self, mock_is_quantized, mock_restore
|
||||
):
|
||||
"""Test export functionality when restoring from checkpoint."""
|
||||
# Arrange
|
||||
mock_is_quantized.return_value = False
|
||||
mock_restore.return_value = None
|
||||
|
||||
with patch.object(
|
||||
self.model_loader, "_export_modelopt_checkpoint"
|
||||
) as mock_export:
|
||||
# Act
|
||||
self.model_loader._setup_modelopt_quantization(
|
||||
self.mock_model,
|
||||
self.mock_tokenizer,
|
||||
self.mock_quant_cfg,
|
||||
quantized_ckpt_restore_path=self.checkpoint_dir,
|
||||
export_path=self.export_dir,
|
||||
)
|
||||
|
||||
# Assert
|
||||
mock_restore.assert_called_once_with(self.mock_model, self.checkpoint_dir)
|
||||
mock_export.assert_called_once_with(self.mock_model, self.export_dir, None)
|
||||
|
||||
@unittest.skipIf(not MODELOPT_AVAILABLE, "nvidia-modelopt not available")
|
||||
@patch("modelopt.torch.quantization.quantize")
|
||||
@patch("modelopt.torch.quantization.print_quant_summary")
|
||||
@patch("modelopt.torch.quantization.utils.is_quantized")
|
||||
@patch("modelopt.torch.utils.dataset_utils.get_dataset_dataloader")
|
||||
@patch("modelopt.torch.utils.dataset_utils.create_forward_loop")
|
||||
def test_setup_quantization_with_export_after_calibration(
|
||||
self,
|
||||
mock_create_loop,
|
||||
mock_get_dataloader,
|
||||
mock_is_quantized,
|
||||
mock_print_summary,
|
||||
mock_quantize,
|
||||
):
|
||||
"""Test export functionality after calibration-based quantization."""
|
||||
# Arrange
|
||||
mock_is_quantized.return_value = False
|
||||
mock_dataloader = Mock()
|
||||
mock_get_dataloader.return_value = mock_dataloader
|
||||
mock_calibrate_loop = Mock()
|
||||
mock_create_loop.return_value = mock_calibrate_loop
|
||||
mock_quantize.return_value = None
|
||||
mock_print_summary.return_value = None
|
||||
|
||||
with patch.object(
|
||||
self.model_loader, "_export_modelopt_checkpoint"
|
||||
) as mock_export:
|
||||
# Act
|
||||
self.model_loader._setup_modelopt_quantization(
|
||||
self.mock_model,
|
||||
self.mock_tokenizer,
|
||||
self.mock_quant_cfg,
|
||||
export_path=self.export_dir,
|
||||
)
|
||||
|
||||
# Assert
|
||||
mock_quantize.assert_called_once_with(
|
||||
self.mock_model, self.mock_quant_cfg, forward_loop=mock_calibrate_loop
|
||||
)
|
||||
mock_export.assert_called_once_with(self.mock_model, self.export_dir, None)
|
||||
|
||||
@unittest.skipIf(not MODELOPT_AVAILABLE, "nvidia-modelopt not available")
|
||||
def test_setup_quantization_without_export(self):
|
||||
"""Test quantization setup without export path specified."""
|
||||
with patch("modelopt.torch.quantization.utils.is_quantized", return_value=True):
|
||||
# Act
|
||||
with patch.object(
|
||||
self.model_loader, "_export_modelopt_checkpoint"
|
||||
) as mock_export:
|
||||
self.model_loader._setup_modelopt_quantization(
|
||||
self.mock_model,
|
||||
self.mock_tokenizer,
|
||||
self.mock_quant_cfg,
|
||||
export_path=None, # No export path
|
||||
)
|
||||
|
||||
# Assert
|
||||
mock_export.assert_not_called()
|
||||
|
||||
def test_quantize_and_serve_config_validation(self):
|
||||
"""Test that quantize_and_serve is properly disabled."""
|
||||
# Test that quantize-and-serve mode raises NotImplementedError
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
ModelConfig(
|
||||
model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
quantization="modelopt_fp8",
|
||||
quantize_and_serve=True,
|
||||
)
|
||||
|
||||
# Verify the error message contains helpful instructions
|
||||
error_msg = str(context.exception)
|
||||
self.assertIn("disabled due to compatibility issues", error_msg)
|
||||
self.assertIn("separate quantize-then-deploy workflow", error_msg)
|
||||
|
||||
# Test invalid configuration - no quantization
|
||||
with self.assertRaises(ValueError) as context:
|
||||
ModelConfig(
|
||||
model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
quantize_and_serve=True,
|
||||
)
|
||||
self.assertIn("requires ModelOpt quantization", str(context.exception))
|
||||
|
||||
@unittest.skipIf(not MODELOPT_AVAILABLE, "nvidia-modelopt not available")
|
||||
def test_standard_workflow_selection(self):
|
||||
"""Test that standard workflow is selected by default."""
|
||||
with patch(
|
||||
"modelopt.torch.quantization.utils.is_quantized", return_value=False
|
||||
):
|
||||
with patch.object(
|
||||
self.model_loader, "_standard_quantization_workflow"
|
||||
) as mock_standard:
|
||||
with patch.object(self.model_loader, "_load_modelopt_base_model"):
|
||||
mock_standard.return_value = Mock()
|
||||
|
||||
# Create model config without quantize_and_serve
|
||||
model_config = ModelConfig(
|
||||
model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
quantization="modelopt_fp8",
|
||||
quantize_and_serve=False,
|
||||
)
|
||||
device_config = DeviceConfig()
|
||||
|
||||
# Act
|
||||
self.model_loader.load_model(
|
||||
model_config=model_config,
|
||||
device_config=device_config,
|
||||
)
|
||||
|
||||
# Assert
|
||||
mock_standard.assert_called_once_with(model_config, device_config)
|
||||
|
||||
def _get_export_info(self, export_dir: str) -> dict:
|
||||
"""Get information about an exported model."""
|
||||
if not self._validate_export(export_dir):
|
||||
return None
|
||||
|
||||
try:
|
||||
config_path = os.path.join(export_dir, "config.json")
|
||||
with open(config_path, "r") as f:
|
||||
config = json.load(f)
|
||||
|
||||
return {
|
||||
"model_type": config.get("model_type", "unknown"),
|
||||
"architectures": config.get("architectures", []),
|
||||
"quantization_config": config.get("quantization_config", {}),
|
||||
"export_dir": export_dir,
|
||||
}
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
@unittest.skipIf(not MODELOPT_AVAILABLE, "nvidia-modelopt not available")
|
||||
class TestModelOptExportIntegration(unittest.TestCase):
|
||||
"""Integration tests for ModelOpt export with full model loading workflow."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up integration test fixtures."""
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
self.export_dir = os.path.join(self.temp_dir, "exported_model")
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up integration test fixtures."""
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||
|
||||
@patch("sglang.srt.model_loader.loader.get_model_architecture")
|
||||
@patch("transformers.AutoTokenizer.from_pretrained")
|
||||
@patch("transformers.AutoModelForCausalLM.from_pretrained")
|
||||
def test_full_workflow_with_export(self, mock_model, mock_tokenizer, mock_arch):
|
||||
"""Test the complete workflow from model config to export."""
|
||||
        # Arrange
        mock_arch.return_value = ("TestModel", "TestConfig")
        mock_tokenizer.return_value = Mock()
        mock_model.return_value = Mock(spec=torch.nn.Module)

        model_config = ModelConfig(
            model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            modelopt_quant="fp8",
            modelopt_export_path=self.export_dir,
        )

        load_config = LoadConfig()
        device_config = DeviceConfig()

        # Mock the quantization and export process
        with patch.object(
            ModelOptModelLoader, "_setup_modelopt_quantization"
        ) as mock_setup:
            with patch.object(
                ModelOptModelLoader, "_load_modelopt_base_model"
            ) as mock_load_base:
                mock_load_base.return_value = mock_model.return_value

                # Act
                model_loader = ModelOptModelLoader(load_config)
                result = model_loader.load_model(
                    model_config=model_config,
                    device_config=device_config,
                )

                # Assert
                self.assertIsNotNone(result)
                mock_setup.assert_called_once()
                # Verify export_path was passed to setup
                args, kwargs = mock_setup.call_args
                self.assertEqual(kwargs.get("export_path"), self.export_dir)


if __name__ == "__main__":
    unittest.main()

@@ -12,8 +12,17 @@ from unittest.mock import MagicMock, patch

import torch.nn as nn

# Add the sglang path for testing
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../python"))
# Note: PYTHONPATH=python should be set when running tests

# Constants for calibration parameters to avoid hard-coded values
CALIBRATION_BATCH_SIZE = 36
CALIBRATION_NUM_SAMPLES = 512
DEFAULT_DEVICE = "cuda:0"

from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig

@@ -28,18 +37,63 @@ class TestModelOptModelLoader(CustomTestCase):

    def setUp(self):
        """Set up test fixtures."""
        # Mock distributed functionality to avoid initialization errors
        self.mock_tp_rank = patch(
            "sglang.srt.distributed.parallel_state.get_tensor_model_parallel_rank",
            return_value=0,
        )
        self.mock_tp_rank.start()

        self.mock_rank0_log = patch("sglang.srt.model_loader.loader.rank0_log")
        self.mock_rank0_log.start()

        # Mock logger to avoid issues
        self.mock_logger = patch("sglang.srt.model_loader.loader.logger")
        self.mock_logger.start()

        # Mock all distributed functions that might be called
        self.mock_get_tp_group = patch(
            "sglang.srt.distributed.parallel_state.get_tp_group"
        )
        self.mock_get_tp_group.start()

        # Mock model parallel initialization check
        self.mock_mp_is_initialized = patch(
            "sglang.srt.distributed.parallel_state.model_parallel_is_initialized",
            return_value=True,
        )
        self.mock_mp_is_initialized.start()

        self.model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.load_config = LoadConfig()
        self.device_config = DeviceConfig(device="cuda")

-       # Create a basic model config with modelopt_quant
+       # Create a basic model config with unified quantization flag
        self.model_config = ModelConfig(
-           model_path=self.model_path, modelopt_quant="fp8"
+           model_path=self.model_path,
+           quantization="modelopt_fp8",  # Use unified quantization approach
        )

        # Also create a unified quantization config for new tests
        self.unified_model_config = ModelConfig(
            model_path=self.model_path, quantization="modelopt_fp8"
        )

        # Mock base model
        self.mock_base_model = MagicMock(spec=nn.Module)
        self.mock_base_model.eval.return_value = self.mock_base_model
        self.mock_base_model.device = (
            DEFAULT_DEVICE  # Add device attribute for calibration tests
        )

    def tearDown(self):
        """Clean up test fixtures."""
        # Stop mocks
        self.mock_tp_rank.stop()
        self.mock_rank0_log.stop()
        self.mock_logger.stop()
        self.mock_get_tp_group.stop()
        self.mock_mp_is_initialized.stop()

@patch("sglang.srt.model_loader.loader.QUANT_CFG_CHOICES", QUANT_CFG_CHOICES)
|
||||
@patch("sglang.srt.model_loader.loader.logger")
|
||||

@@ -66,7 +120,7 @@ class TestModelOptModelLoader(CustomTestCase):
        model = self.mock_base_model

        # Simulate the quantization config lookup
-       quant_choice_str = model_config.modelopt_quant
+       quant_choice_str = model_config._get_modelopt_quant_type()
        quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)

        if not quant_cfg_name:

@@ -123,6 +177,305 @@
        # Verify we get back the expected model
        self.assertEqual(result_model, self.mock_base_model)

@patch("sglang.srt.model_loader.loader.logger")
|
||||
def test_missing_modelopt_import(self, mock_logger):
|
||||
"""Test error handling when modelopt library is not available."""
|
||||
|
||||
loader = ModelOptModelLoader(self.load_config)
|
||||
|
||||
# Mock the base model loader method
|
||||
with patch.object(
|
||||
loader, "_load_modelopt_base_model", return_value=self.mock_base_model
|
||||
):
|
||||
# Simulate missing modelopt by making import fail
|
||||
original_import = __import__
|
||||
|
||||
def mock_import(name, *args, **kwargs):
|
||||
if name.startswith("modelopt"):
|
||||
raise ImportError("No module named 'modelopt'")
|
||||
# Return default import behavior for other modules
|
||||
return original_import(name, *args, **kwargs)
|
||||
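
            # Patching builtins.__import__ below intercepts every import made
            # while the patch is active; only names starting with "modelopt"
            # raise, so the loader's own imports keep working and the test
            # drives load_model down its missing-dependency error path.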

            with patch("builtins.__import__", side_effect=mock_import):
                # Expect ImportError to be raised and logged
                with self.assertRaises(ImportError):
                    loader.load_model(
                        model_config=self.model_config, device_config=self.device_config
                    )

                # Verify error logging
                mock_logger.error.assert_called_with(
                    "NVIDIA Model Optimizer (modelopt) library not found. "
                    "Please install it to use ModelOpt quantization."
                )

@patch("sglang.srt.model_loader.loader.QUANT_CFG_CHOICES", QUANT_CFG_CHOICES)
|
||||
@patch("sglang.srt.model_loader.loader.AutoTokenizer")
|
||||
@patch("sglang.srt.model_loader.loader.logger")
|
||||
def test_calibration_workflow_integration(self, mock_logger, mock_auto_tokenizer):
|
||||
"""Test end-to-end calibration workflow integration."""
|
||||
|
||||
loader = ModelOptModelLoader(self.load_config)
|
||||
|
||||
# Mock tokenizer
|
||||
mock_tokenizer = MagicMock()
|
||||
mock_tokenizer.padding_side = "right"
|
||||
mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
|
||||
|
||||
# Mock modelopt modules
|
||||
mock_mtq = MagicMock()
|
||||
mock_mto = MagicMock()
|
||||
mock_dataset_utils = MagicMock()
|
||||
|
||||
# Configure quantization config
|
||||
mock_fp8_cfg = MagicMock()
|
||||
mock_mtq.FP8_DEFAULT_CFG = mock_fp8_cfg
|
||||
|
||||
# Configure dataset utilities
|
||||
mock_calib_dataloader = MagicMock()
|
||||
mock_calibrate_loop = MagicMock()
|
||||
mock_dataset_utils.get_dataset_dataloader.return_value = mock_calib_dataloader
|
||||
mock_dataset_utils.create_forward_loop.return_value = mock_calibrate_loop
|
||||
|
||||
# Configure model as not quantized initially
|
||||
mock_is_quantized = MagicMock(return_value=False)
|
||||
|
||||
with patch.object(
|
||||
loader, "_load_modelopt_base_model", return_value=self.mock_base_model
|
||||
):
|
||||
with patch.dict(
|
||||
"sys.modules",
|
||||
{
|
||||
"modelopt": MagicMock(),
|
||||
"modelopt.torch": MagicMock(),
|
||||
"modelopt.torch.opt": mock_mto,
|
||||
"modelopt.torch.quantization": mock_mtq,
|
||||
"modelopt.torch.quantization.utils": MagicMock(
|
||||
is_quantized=mock_is_quantized
|
||||
),
|
||||
"modelopt.torch.utils": MagicMock(),
|
||||
"modelopt.torch.utils.dataset_utils": mock_dataset_utils,
|
||||
},
|
||||
):
|
||||
# Execute the load_model method to test the full workflow
|
||||
result_model = loader.load_model(
|
||||
model_config=self.model_config, device_config=self.device_config
|
||||
)
|
||||
|
||||
# Verify the model loading was successful
|
||||
self.assertEqual(result_model, self.mock_base_model)
|
||||
|
||||
# Verify key calibration components were used
|
||||
# Note: We can't easily verify the exact calls due to dynamic imports,
|
||||
# but we can verify the workflow completed successfully
|
||||
|
||||
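    # The sequence exercised above (as suggested by the mocks): the loader
    # builds a calibration dataloader via dataset_utils.get_dataset_dataloader,
    # wraps it with dataset_utils.create_forward_loop, and feeds the resulting
    # loop to mtq.quantize -- the same call sequence the checkpoint-save test
    # below simulates explicitly.
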
@patch("sglang.srt.model_loader.loader.QUANT_CFG_CHOICES", QUANT_CFG_CHOICES)
|
||||
@patch("sglang.srt.model_loader.loader.AutoTokenizer")
|
||||
@patch("sglang.srt.model_loader.loader.logger")
|
||||
def test_quantized_checkpoint_restore(self, mock_logger, mock_auto_tokenizer):
|
||||
"""Test restoring from a quantized checkpoint."""
|
||||
|
||||
# Create model config with checkpoint restore path
|
||||
config_with_restore = ModelConfig(
|
||||
model_path=self.model_path,
|
||||
quantization="modelopt_fp8",
|
||||
)
|
||||
|
||||
# Create load config with checkpoint restore path
|
||||
load_config_with_restore = LoadConfig(
|
||||
modelopt_checkpoint_restore_path="/path/to/quantized/checkpoint"
|
||||
)
|
||||
|
||||
loader = ModelOptModelLoader(load_config_with_restore)
|
||||
|
||||
# Mock tokenizer
|
||||
mock_tokenizer = MagicMock()
|
||||
mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
|
||||
|
||||
# Mock modelopt modules
|
||||
mock_mtq = MagicMock()
|
||||
mock_mto = MagicMock()
|
||||
|
||||
# Configure quantization config
|
||||
mock_fp8_cfg = MagicMock()
|
||||
mock_mtq.FP8_DEFAULT_CFG = mock_fp8_cfg
|
||||
|
||||
# Configure model as not quantized initially
|
||||
mock_is_quantized = MagicMock(return_value=False)
|
||||
|
||||
with patch.object(
|
||||
loader, "_load_modelopt_base_model", return_value=self.mock_base_model
|
||||
):
|
||||
with patch.dict(
|
||||
"sys.modules",
|
||||
{
|
||||
"modelopt": MagicMock(),
|
||||
"modelopt.torch": MagicMock(),
|
||||
"modelopt.torch.opt": mock_mto,
|
||||
"modelopt.torch.quantization": mock_mtq,
|
||||
"modelopt.torch.quantization.utils": MagicMock(
|
||||
is_quantized=mock_is_quantized
|
||||
),
|
||||
},
|
||||
):
|
||||
with patch.object(loader, "_setup_modelopt_quantization") as mock_setup:
|
||||
# Mock the _setup_modelopt_quantization to simulate checkpoint restore
|
||||
def mock_setup_quantization(
|
||||
model,
|
||||
tokenizer,
|
||||
quant_cfg,
|
||||
quantized_ckpt_restore_path=None,
|
||||
**kwargs,
|
||||
):
|
||||
if quantized_ckpt_restore_path:
|
||||
mock_mto.restore(model, quantized_ckpt_restore_path)
|
||||
print(
|
||||
f"Restored quantized model from {quantized_ckpt_restore_path}"
|
||||
)
|
||||
return
|
||||
|
||||
mock_setup.side_effect = mock_setup_quantization
|
||||
|
||||
# Execute the load_model method
|
||||
result_model = loader.load_model(
|
||||
model_config=config_with_restore,
|
||||
device_config=self.device_config,
|
||||
)
|
||||
|
||||
# Verify the setup was called with restore path
|
||||
mock_setup.assert_called_once()
|
||||
call_args = mock_setup.call_args
|
||||
# Check that the restore path was passed correctly
|
||||
self.assertIn("quantized_ckpt_restore_path", call_args[1])
|
||||
self.assertEqual(
|
||||
call_args[1]["quantized_ckpt_restore_path"],
|
||||
"/path/to/quantized/checkpoint",
|
||||
)
|
||||
|
||||
# Verify restore was called
|
||||
mock_mto.restore.assert_called_once_with(
|
||||
self.mock_base_model, "/path/to/quantized/checkpoint"
|
||||
)
|
||||
|
||||
# Verify we get the expected model back
|
||||
self.assertEqual(result_model, self.mock_base_model)
|
||||
|
||||
@patch("sglang.srt.model_loader.loader.QUANT_CFG_CHOICES", QUANT_CFG_CHOICES)
|
||||
@patch("sglang.srt.model_loader.loader.AutoTokenizer")
|
||||
@patch("sglang.srt.model_loader.loader.logger")
|
||||
def test_quantized_checkpoint_save(self, mock_logger, mock_auto_tokenizer):
|
||||
"""Test saving quantized checkpoint after calibration."""
|
||||
|
||||
# Create model config with checkpoint save path
|
||||
config_with_save = ModelConfig(
|
||||
model_path=self.model_path,
|
||||
quantization="modelopt_fp8",
|
||||
)
|
||||
|
||||
# Create load config with checkpoint save path
|
||||
load_config_with_save = LoadConfig(
|
||||
modelopt_checkpoint_save_path="/path/to/save/checkpoint"
|
||||
)
|
||||
|
||||
loader = ModelOptModelLoader(load_config_with_save)
|
||||
|
||||
# Mock tokenizer
|
||||
mock_tokenizer = MagicMock()
|
||||
mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
|
||||
|
||||
# Mock modelopt modules
|
||||
mock_mtq = MagicMock()
|
||||
mock_mto = MagicMock()
|
||||
mock_dataset_utils = MagicMock()
|
||||
|
||||
# Configure quantization config
|
||||
mock_fp8_cfg = MagicMock()
|
||||
mock_mtq.FP8_DEFAULT_CFG = mock_fp8_cfg
|
||||
|
||||
# Configure model as not quantized initially
|
||||
mock_is_quantized = MagicMock(return_value=False)
|
||||
|
||||
with patch.object(
|
||||
loader, "_load_modelopt_base_model", return_value=self.mock_base_model
|
||||
):
|
||||
with patch.dict(
|
||||
"sys.modules",
|
||||
{
|
||||
"modelopt": MagicMock(),
|
||||
"modelopt.torch": MagicMock(),
|
||||
"modelopt.torch.opt": mock_mto,
|
||||
"modelopt.torch.quantization": mock_mtq,
|
||||
"modelopt.torch.quantization.utils": MagicMock(
|
||||
is_quantized=mock_is_quantized
|
||||
),
|
||||
"modelopt.torch.utils": MagicMock(),
|
||||
"modelopt.torch.utils.dataset_utils": mock_dataset_utils,
|
||||
},
|
||||
):
|
||||
with patch.object(loader, "_setup_modelopt_quantization") as mock_setup:
|
||||
# Mock the _setup_modelopt_quantization to simulate checkpoint save
|
||||
def mock_setup_quantization(
|
||||
model,
|
||||
tokenizer,
|
||||
quant_cfg,
|
||||
quantized_ckpt_save_path=None,
|
||||
**kwargs,
|
||||
):
|
||||
# Simulate calibration and quantization
|
||||
mock_mtq.quantize(model, quant_cfg, forward_loop=MagicMock())
|
||||
mock_mtq.print_quant_summary(model)
|
||||
|
||||
# Save checkpoint if path provided
|
||||
if quantized_ckpt_save_path:
|
||||
mock_mto.save(model, quantized_ckpt_save_path)
|
||||
print(
|
||||
f"Quantized model saved to {quantized_ckpt_save_path}"
|
||||
)
|
||||
|
||||
mock_setup.side_effect = mock_setup_quantization
|
||||
|
||||
# Execute the load_model method
|
||||
result_model = loader.load_model(
|
||||
model_config=config_with_save, device_config=self.device_config
|
||||
)
|
||||
|
||||
# Verify the setup was called with save path
|
||||
mock_setup.assert_called_once()
|
||||
call_args = mock_setup.call_args
|
||||
# Check that the save path was passed correctly
|
||||
self.assertIn("quantized_ckpt_save_path", call_args[1])
|
||||
self.assertEqual(
|
||||
call_args[1]["quantized_ckpt_save_path"],
|
||||
"/path/to/save/checkpoint",
|
||||
)
|
||||
|
||||
# Verify save was called
|
||||
mock_mto.save.assert_called_once_with(
|
||||
self.mock_base_model, "/path/to/save/checkpoint"
|
||||
)
|
||||
|
||||
# Verify we get the expected model back
|
||||
self.assertEqual(result_model, self.mock_base_model)
|
||||
|
||||
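    # Sketch of the flow simulated above (mirroring the mocked calls, not a
    # verbatim copy of the loader's implementation): mtq.quantize(model,
    # quant_cfg, forward_loop=...) calibrates and quantizes the model in
    # place, and mto.save(model, path) persists the quantized state so a
    # later run can mto.restore(model, path) instead of re-calibrating.
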
    def test_unified_quantization_flag_support(self):
        """Test that ModelOptModelLoader supports unified quantization flags."""
        # Test modelopt_fp8
        config_fp8 = ModelConfig(
            model_path=self.model_path, quantization="modelopt_fp8"
        )
        self.assertEqual(config_fp8._get_modelopt_quant_type(), "fp8")

        # Test modelopt_fp4
        config_fp4 = ModelConfig(
            model_path=self.model_path, quantization="modelopt_fp4"
        )
        self.assertEqual(config_fp4._get_modelopt_quant_type(), "nvfp4")

        # Test auto-detection
        config_auto = ModelConfig(model_path=self.model_path, quantization="modelopt")
        # Should default to fp8 when no config is detected
        self.assertEqual(config_auto._get_modelopt_quant_type(), "fp8")
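
    # Summary of the flag mapping exercised above:
    #   "modelopt_fp8" -> "fp8"
    #   "modelopt_fp4" -> "nvfp4"
    #   "modelopt"     -> auto-detected; defaults to "fp8" when the checkpoint
    #                     carries no quantization config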


class TestModelOptLoaderIntegration(CustomTestCase):
    """Integration tests for ModelOptModelLoader with Engine API."""