Improve tensor parallel performance (#625)

Co-authored-by: Mingyi <wisclmy0611@gmail.com>
2024-07-15 07:10:51 -07:00
parent 5ac8b80677
commit 6a2941f4d0
10 changed files with 171 additions and 81 deletions
--- a/benchmark/latency_throughput/bench_one.py
+++ b/benchmark/latency_throughput/bench_one.py
@@ -96,8 +96,11 @@ def run_one_batch_size(bs):
        ret = response.json()
    print(ret)

+    input_len = args.input_len if args.input_len else 1
+    output_len = max_new_tokens
+
    output_throughput = bs * max_new_tokens / latency
-    overall_throughput = bs * (args.input_len + max_new_tokens) / latency
+    overall_throughput = bs * (input_len + output_len) / latency
    print(f"latency: {latency:.2f} s")
    print(f"decode throughput: {output_throughput:.2f} token/s")
    print(f"overall throughput: {overall_throughput:.2f} token/s")