[Feature] Support Tensor Parallelism and Weight Slicing for LoRA (#4274)
Co-authored-by: ShenAo1111 <1377693092@qq.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
@@ -437,6 +437,7 @@ class SRTRunner:
|
||||
speculative_eagle_topk: Optional[int] = None,
|
||||
speculative_num_draft_tokens: Optional[int] = None,
|
||||
disable_overlap_schedule: bool = False,
|
||||
disable_custom_all_reduce: bool = False,
|
||||
):
|
||||
self.model_type = model_type
|
||||
self.is_generation = model_type == "generation"
|
||||
@@ -470,6 +471,7 @@ class SRTRunner:
|
||||
enable_ep_moe=enable_ep_moe,
|
||||
disable_overlap_schedule=disable_overlap_schedule,
|
||||
cuda_graph_max_bs=4,
|
||||
disable_custom_all_reduce=disable_custom_all_reduce,
|
||||
**spec_kwargs,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user