Revert "Revert "[FEAT] Support GGUF format"" (#2287)

2024-11-30 22:14:48 -08:00
parent 1bfa511b95
commit 4936be8acc
41 changed files with 229 additions and 132 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -20,6 +20,7 @@ import random
 import tempfile
 from typing import List, Optional

+from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.utils import (
    get_amdgpu_memory_capacity,
    get_nvgpu_memory_capacity,
@@ -204,6 +205,12 @@ class ServerArgs:
                "Overlap schedule is disabled."
            )

+        # GGUF
+        if (
+            self.load_format == "auto" or self.load_format == "gguf"
+        ) and check_gguf_file(self.model_path):
+            self.quantization = self.load_format = "gguf"
+
    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        # Model and port args
@@ -243,7 +250,7 @@ class ServerArgs:
            "--load-format",
            type=str,
            default=ServerArgs.load_format,
-            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
+            choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
@@ -253,7 +260,8 @@ class ServerArgs:
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
-            "which is mainly for profiling.",
+            "which is mainly for profiling. "
+            '"gguf" will load the weights in the gguf format.',
        )
        parser.add_argument(
            "--trust-remote-code",
@@ -293,6 +301,7 @@ class ServerArgs:
                "gptq_marlin",
                "awq_marlin",
                "bitsandbytes",
+                "gguf",
            ],
            help="The quantization method.",
        )