Improve linear.py to load sharded weights & remove the dependency of Parameters from vllm (#2784)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
2025-01-07 23:29:10 -08:00
parent 694e41925e
commit 8a6906127a
15 changed files with 655 additions and 88 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -205,7 +205,7 @@ class ModelRunner:
        if self.device == "cuda":
            backend = "nccl"
        elif self.device == "xpu":
-            # TODO(liangan1):Just use gloo to bypass the initilization fail
+            # TODO(liangan1): Just use gloo to bypass the initialization failure
            # Need to use xccl for xpu backend in the future
            backend = "gloo"
        elif self.device == "hpu":
@@ -634,7 +634,6 @@ class ModelRunner:
            )

    def init_double_sparsity_channel_config(self, selected_channel):
-
        selected_channel = "." + selected_channel + "_proj"
        self.sorted_channels = []
        # load channel config