[Feature] Support Tensor Parallelism and Weight Slicing for Lora (#4274)

Co-authored-by: ShenAo1111 <1377693092@qq.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-03-18 23:33:07 -04:00
parent 3196999f63
commit 588865f0e0
13 changed files with 528 additions and 103 deletions
--- a/python/sglang/srt/lora/lora.py
+++ b/python/sglang/srt/lora/lora.py
@@ -39,16 +39,9 @@ class LoRALayer(nn.Module):
        super().__init__()
        self.config: LoRAConfig = config
        self.base_hf_config: AutoConfig = base_hf_config
+
+        # LoRA weights kept on CPU; they are loaded from the checkpoint.
        self.weights: Dict[str, torch.Tensor] = {}
-        self.weight_gpu: Dict[str, torch.Tensor] = {}
-
-    def load_to_gpu(self):
-        for name, weight in self.weights.items():
-            self.weight_gpu[name] = weight.to(torch.float16).to("cuda")
-
-    def offload_from_gpu(self):
-        for name, weight in self.weights.items():
-            self.weight_gpu[name] = None


 class LoRAAdapter(nn.Module):
@@ -77,19 +70,6 @@ class LoRAAdapter(nn.Module):
        )

        self.weights: Dict[str, torch.Tensor] = {}
-        self.weights_gpu: Dict[str, torch.Tensor] = {}
-
-    def load_to_gpu(self):
-        for name, weight in self.weights.items():
-            self.weights_gpu[name] = weight.to(torch.float16).to("cuda")
-        for layer in self.layers:
-            layer.load_to_gpu()
-
-    def offload_from_gpu(self):
-        for name, weight in self.weights.items():
-            self.weights_gpu[name] = None
-        for layer in self.layers:
-            layer.offload_from_gpu()

    # initialize the LoRA weights to cpu
    def initialize_weights(self):