[Feature] Define backends and add Triton backend for LoRA (#3161)

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
2025-02-03 22:09:13 -08:00
parent 7b5a374114
commit 70817a7eae
18 changed files with 1129 additions and 135 deletions
--- a/benchmark/lora/launch_server.py
+++ b/benchmark/lora/launch_server.py
@@ -1,10 +1,10 @@
 import argparse
 import os

-NUM_LORAS = 8
+NUM_LORAS = 4
 LORA_PATH = {
-    "base": "mistralai/Mistral-7B-Instruct-v0.3",
-    "lora": "/home/ying/test_lora",
+    "base": "meta-llama/Llama-2-7b-hf",
+    "lora": "winddude/wizardLM-LlaMA-LoRA-7B",
 }


@@ -21,7 +21,8 @@ def launch_server(args):
            cmd += f"{lora_name}={lora_path} "
    cmd += f"--disable-radix --disable-cuda-graph "
    cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
-    cmd += f"--max-running-requests {args.max_running_requests}"
+    cmd += f"--max-running-requests {args.max_running_requests} "
+    cmd += f"--lora-backend {args.lora_backend}"
    print(cmd)
    os.system(cmd)

@@ -42,6 +43,11 @@ if __name__ == "__main__":
        type=int,
        default=8,
    )
+    parser.add_argument(
+        "--lora-backend",
+        type=str,
+        default="triton",
+    )
    args = parser.parse_args()

    launch_server(args)