[Feature] Support Tensor Parallelism and Weight Slicing for Lora (#4274)

Co-authored-by: ShenAo1111 <1377693092@qq.com> Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-03-18 23:33:07 -04:00
parent 3196999f63
commit 588865f0e0
13 changed files with 528 additions and 103 deletions
--- a/benchmark/lora/launch_server.py
+++ b/benchmark/lora/launch_server.py
@@ -22,7 +22,10 @@ def launch_server(args):
    cmd += f"--disable-radix --disable-cuda-graph "
    cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
    cmd += f"--max-running-requests {args.max_running_requests} "
-    cmd += f"--lora-backend {args.lora_backend}"
+    cmd += f"--lora-backend {args.lora_backend} "
+    cmd += f"--tp-size {args.tp_size} "
+    if args.disable_custom_all_reduce:
+        cmd += "--disable-custom-all-reduce"
    print(cmd)
    os.system(cmd)

@@ -48,6 +51,18 @@ if __name__ == "__main__":
        type=str,
        default="triton",
    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size for distributed inference",
+    )
+    # disable_custom_all_reduce
+    parser.add_argument(
+        "--disable-custom-all-reduce",
+        action="store_true",
+        help="Disable custom all reduce when device does not support p2p communication",
+    )
    args = parser.parse_args()

    launch_server(args)