Support loading of larger models with on-the-fly quantization (#3061)

This commit is contained in:
Ke Wen
2025-01-22 21:33:17 -08:00
committed by GitHub
parent 8b84e69f25
commit 862bcff833
6 changed files with 116 additions and 14 deletions


@@ -317,6 +317,7 @@ class ServerArgs:
             "dummy",
             "gguf",
             "bitsandbytes",
+            "layered",
         ],
         help="The format of the model weights to load. "
         '"auto" will try to load the weights in the safetensors format '
@@ -330,7 +331,10 @@ class ServerArgs:
         "which is mainly for profiling."
         '"gguf" will load the weights in the gguf format. '
         '"bitsandbytes" will load the weights using bitsandbytes '
-        "quantization.",
+        "quantization."
+        '"layered" loads weights layer by layer so that one can quantize a '
+        "layer before loading another to make the peak memory envelope "
+        "smaller.",
     )
     parser.add_argument(
         "--trust-remote-code",
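
The idea behind the "layered" format is sketched below. This is not SGLang's implementation: `quantize_int8` and `load_layered` are hypothetical names, and symmetric int8 quantization stands in for whatever scheme is actually configured. The point is only the memory shape: at most one layer's full-precision weights are alive at a time, because each layer is quantized before the next one is loaded.

```python
# Hypothetical sketch of layer-by-layer loading with on-the-fly quantization.
# Only ONE layer's full-precision weights exist at any moment, so peak memory
# stays near (quantized model size + one fp layer) instead of the full fp model.
import numpy as np

def quantize_int8(w):
    """Symmetric per-tensor int8 quantization (illustrative only)."""
    scale = float(np.abs(w).max()) / 127.0 or 1.0
    q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
    return q, scale

def load_layered(layer_loaders):
    """Load layers one at a time, quantizing each before fetching the next."""
    model = []
    for load_layer in layer_loaders:
        w = load_layer()                 # full-precision weights for one layer
        model.append(quantize_int8(w))   # fp copy becomes unreachable here
    return model

# Demo: three fake "layers" materialized lazily instead of all at once.
rng = np.random.default_rng(0)
loaders = [lambda r=rng: r.standard_normal((4, 4)).astype(np.float32)
           for _ in range(3)]
model = load_layered(loaders)
print(len(model), model[0][0].dtype)  # prints: 3 int8
```

With a non-layered loader, all three full-precision arrays would be resident before quantization began; here the deferred `layer_loaders` callables ensure each one is created and discarded in turn.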