Improve linear.py to load sharded weights & remove the dependency of Parameters from vllm (#2784)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-01-07 23:29:10 -08:00
committed by GitHub
parent 694e41925e
commit 8a6906127a
15 changed files with 655 additions and 88 deletions

View File

@@ -205,7 +205,7 @@ class ModelRunner:
if self.device == "cuda":
backend = "nccl"
elif self.device == "xpu":
# TODO(liangan1):Just use gloo to bypass the initilization fail
# TODO(liangan1): Just use gloo to bypass the initialization failure
# Need to use xccl for xpu backend in the future
backend = "gloo"
elif self.device == "hpu":
@@ -634,7 +634,6 @@ class ModelRunner:
)
def init_double_sparsity_channel_config(self, selected_channel):
selected_channel = "." + selected_channel + "_proj"
self.sorted_channels = []
# load channel config