refactor model loader: initial refactor (#664)

2024-07-20 02:18:22 -07:00
parent 39c57317e1
commit 06487f126e
6 changed files with 100 additions and 15 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -57,6 +57,7 @@ class ServerArgs:
    disable_disk_cache: bool = False
    attention_reduce_in_fp32: bool = False
    enable_p2p_check: bool = False
+    efficient_weight_load: bool = False

    # Distributed args
    nccl_init_addr: Optional[str] = None
@@ -327,6 +328,11 @@ class ServerArgs:
            action="store_true",
            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
        )
+        parser.add_argument(
+            "--efficient-weight-load",
+            action="store_true",
+            help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
+        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):