[Feature] Support Tensor Parallelism and Weight Slicing for LoRA (#4274)

Co-authored-by: ShenAo1111 <1377693092@qq.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-03-18 23:33:07 -04:00
parent 3196999f63
commit 588865f0e0
13 changed files with 528 additions and 103 deletions
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -437,6 +437,7 @@ class SRTRunner:
        speculative_eagle_topk: Optional[int] = None,
        speculative_num_draft_tokens: Optional[int] = None,
        disable_overlap_schedule: bool = False,
+        disable_custom_all_reduce: bool = False,
    ):
        self.model_type = model_type
        self.is_generation = model_type == "generation"
@@ -470,6 +471,7 @@ class SRTRunner:
            enable_ep_moe=enable_ep_moe,
            disable_overlap_schedule=disable_overlap_schedule,
            cuda_graph_max_bs=4,
+            disable_custom_all_reduce=disable_custom_all_reduce,
            **spec_kwargs,
        )