Improve tensor parallel performance (#625)

Co-authored-by: Mingyi <wisclmy0611@gmail.com>
2024-07-15 07:10:51 -07:00
parent 5ac8b80677
commit 6a2941f4d0
10 changed files with 171 additions and 81 deletions
--- a/benchmark/latency_throughput/bench_serving.py
+++ b/benchmark/latency_throughput/bench_serving.py
@@ -312,6 +312,9 @@ def main(args: argparse.Namespace):
        np.sum([output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time
    )

+    # latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
+    # print(latencies)
+
    print(f"Total time: {benchmark_time:.2f} s")
    print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
    print(f"Decoding throughput: {decoding_throughput:.2f} token/s")