Format Benchmark Code (#399)

2024-04-28 21:06:22 +08:00
parent 19818b9c2f
commit 95c4e0dfac
41 changed files with 1169 additions and 608 deletions
--- a/benchmark/latency_throughput/bench_throughput.py
+++ b/benchmark/latency_throughput/bench_throughput.py
@@ -15,16 +15,17 @@ On the client side, run:
        --tokenizer <your_model> --dataset <target_dataset> \
        --request-rate <request_rate>
 """
+
 import argparse
 import asyncio
 import json
 import random
 import time
 from typing import AsyncGenerator, List, Tuple
-from tqdm.asyncio import tqdm_asyncio

 import aiohttp
 import numpy as np
+from tqdm.asyncio import tqdm_asyncio
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer

@@ -41,10 +42,7 @@ def sample_requests(
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
-    dataset = [
-        data for data in dataset
-        if len(data["conversations"]) >= 2
-    ]
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [
        (data["conversations"][0]["value"], data["conversations"][1]["value"])
@@ -185,9 +183,17 @@ async def benchmark(
    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len = request
-        task = asyncio.create_task(send_request(backend, api_url, prompt,
-                                                prompt_len, output_len,
-                                                best_of, use_beam_search))
+        task = asyncio.create_task(
+            send_request(
+                backend,
+                api_url,
+                prompt,
+                prompt_len,
+                output_len,
+                best_of,
+                use_beam_search,
+            )
+        )
        tasks.append(task)
    await tqdm_asyncio.gather(*tasks)

@@ -202,8 +208,16 @@ def main(args: argparse.Namespace):
    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    benchmark_start_time = time.perf_counter()
-    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
-                          args.use_beam_search, args.request_rate))
+    asyncio.run(
+        benchmark(
+            args.backend,
+            api_url,
+            input_requests,
+            args.best_of,
+            args.use_beam_search,
+            args.request_rate,
+        )
+    )
    benchmark_end_time = time.perf_counter()
    benchmark_time = benchmark_end_time - benchmark_start_time
    print(f"Total time: {benchmark_time:.2f} s")
@@ -212,43 +226,61 @@ def main(args: argparse.Namespace):
    # Compute the latency statistics.
    avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
    print(f"Average latency: {avg_latency:.2f} s")
-    avg_per_token_latency = np.mean([
-        latency / (prompt_len + output_len)
-        for prompt_len, output_len, latency in REQUEST_LATENCY
-    ])
+    avg_per_token_latency = np.mean(
+        [
+            latency / (prompt_len + output_len)
+            for prompt_len, output_len, latency in REQUEST_LATENCY
+        ]
+    )
    print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean([
-        latency / output_len
-        for _, output_len, latency in REQUEST_LATENCY
-    ])
-    print("Average latency per output token: "
-          f"{avg_per_output_token_latency:.2f} s")
+    avg_per_output_token_latency = np.mean(
+        [latency / output_len for _, output_len, latency in REQUEST_LATENCY]
+    )
+    print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s")


 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Benchmark the online serving throughput.")
-    parser.add_argument("--backend", type=str, default="vllm",
-                        choices=["vllm", "tgi", "srt", "lightllm"])
+        description="Benchmark the online serving throughput."
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=["vllm", "tgi", "srt", "lightllm"],
+    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--dataset", type=str, required=True,
-                        help="Path to the dataset.")
-    parser.add_argument("--tokenizer", type=str, required=True,
-                        help="Name or path of the tokenizer.")
-    parser.add_argument("--best-of", type=int, default=1,
-                        help="Generates `best_of` sequences per prompt and "
-                             "returns the best one.")
+    parser.add_argument(
+        "--dataset", type=str, required=True, help="Path to the dataset."
+    )
+    parser.add_argument(
+        "--tokenizer", type=str, required=True, help="Name or path of the tokenizer."
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and " "returns the best one.",
+    )
    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts", type=int, default=1000,
-                        help="Number of prompts to process.")
-    parser.add_argument("--request-rate", type=float, default=float("inf"),
-                        help="Number of requests per second. If this is inf, "
-                             "then all the requests are sent at time 0. "
-                             "Otherwise, we use Poisson process to synthesize "
-                             "the request arrival times.")
+    parser.add_argument(
+        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize "
+        "the request arrival times.",
+    )
    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code', action='store_true',
-                        help='trust remote code from huggingface')
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="trust remote code from huggingface",
+    )
    args = parser.parse_args()
    main(args)