Revert "Revert "[FEAT] Support GGUF format"" (#2287)
This commit is contained in:
@@ -20,6 +20,7 @@ import random
|
||||
import tempfile
|
||||
from typing import List, Optional
|
||||
|
||||
from sglang.srt.hf_transformers_utils import check_gguf_file
|
||||
from sglang.srt.utils import (
|
||||
get_amdgpu_memory_capacity,
|
||||
get_nvgpu_memory_capacity,
|
||||
@@ -204,6 +205,12 @@ class ServerArgs:
|
||||
"Overlap schedule is disabled."
|
||||
)
|
||||
|
||||
# GGUF
|
||||
if (
|
||||
self.load_format == "auto" or self.load_format == "gguf"
|
||||
) and check_gguf_file(self.model_path):
|
||||
self.quantization = self.load_format = "gguf"
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: argparse.ArgumentParser):
|
||||
# Model and port args
|
||||
@@ -243,7 +250,7 @@ class ServerArgs:
|
||||
"--load-format",
|
||||
type=str,
|
||||
default=ServerArgs.load_format,
|
||||
choices=["auto", "pt", "safetensors", "npcache", "dummy"],
|
||||
choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
|
||||
help="The format of the model weights to load. "
|
||||
'"auto" will try to load the weights in the safetensors format '
|
||||
"and fall back to the pytorch bin format if safetensors format "
|
||||
@@ -253,7 +260,8 @@ class ServerArgs:
|
||||
'"npcache" will load the weights in pytorch format and store '
|
||||
"a numpy cache to speed up the loading. "
|
||||
'"dummy" will initialize the weights with random values, '
|
||||
"which is mainly for profiling.",
|
||||
"which is mainly for profiling. "
|
||||
'"gguf" will load the weights in the gguf format. ',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust-remote-code",
|
||||
@@ -293,6 +301,7 @@ class ServerArgs:
|
||||
"gptq_marlin",
|
||||
"awq_marlin",
|
||||
"bitsandbytes",
|
||||
"gguf",
|
||||
],
|
||||
help="The quantization method.",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user