[Feature] Support Tensor Parallelism and Weight Slicing for LoRA (#4274)

Co-authored-by: ShenAo1111 <1377693092@qq.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
aoshen524
2025-03-18 23:33:07 -04:00
committed by GitHub
parent 3196999f63
commit 588865f0e0
13 changed files with 528 additions and 103 deletions

View File

@@ -22,7 +22,10 @@ def launch_server(args):
cmd += f"--disable-radix --disable-cuda-graph "
cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
cmd += f"--max-running-requests {args.max_running_requests} "
cmd += f"--lora-backend {args.lora_backend}"
cmd += f"--lora-backend {args.lora_backend} "
cmd += f"--tp-size {args.tp_size} "
if args.disable_custom_all_reduce:
cmd += "--disable-custom-all-reduce"
print(cmd)
os.system(cmd)
@@ -48,6 +51,18 @@ if __name__ == "__main__":
type=str,
default="triton",
)
parser.add_argument(
"--tp-size",
type=int,
default=1,
help="Tensor parallel size for distributed inference",
)
# disable_custom_all_reduce
parser.add_argument(
"--disable-custom-all-reduce",
action="store_true",
help="Disable custom all reduce when device does not support p2p communication",
)
args = parser.parse_args()
launch_server(args)