[Feature] Define backends and add Triton backend for LoRA (#3161)

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
2025-02-03 22:09:13 -08:00
parent 7b5a374114
commit 70817a7eae
18 changed files with 1129 additions and 135 deletions
--- a/benchmark/lora/lora_bench.py
+++ b/benchmark/lora/lora_bench.py
@@ -183,6 +183,7 @@ async def benchmark(
        api_url=api_url,
        prompt_len=test_prompt_len,
        output_len=test_output_len,
+        lora_name="dummy",  # the lora_name argument will not be used
        extra_request_body=extra_request_body,
    )
    test_output = await request_func(request_func_input=test_input)
@@ -206,6 +207,7 @@ async def benchmark(
            api_url=api_url,
            prompt_len=prompt_len,
            output_len=output_len,
+            lora_name="dummy",
            extra_request_body=extra_request_body,
        )
        tasks.append(
@@ -255,6 +257,9 @@ async def benchmark(
            "Output token throughput (tok/s):", metrics.output_throughput
        )
    )
+    print(
+        "{:<40} {:<10.2f}".format("Total throughput (tok/s):", metrics.total_throughput)
+    )
    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
    print(
        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)