add profiling to bench_one_batch script (#2821)

2025-01-16 07:24:24 -08:00
parent a2f602b541
commit e00e5385e0
1 changed files with 49 additions and 2 deletions
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark argumen
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
-
+## run with profiling:
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

@@ -77,6 +78,8 @@ class BenchArgs:
    correctness_test: bool = False
    # This is only used for correctness test
    cut_len: int = 4
+    profile: bool = False
+    profile_filename_prefix: str = "profile"

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
@@ -95,6 +98,19 @@ class BenchArgs:
        )
        parser.add_argument("--correctness-test", action="store_true")
        parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
+        parser.add_argument(
+            "--profile-filename-prefix",
+            type=str,
+            default=BenchArgs.profile_filename_prefix,
+            help="Prefix of the profiling file names. The full profiling result file(s) be "
+            '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
+        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
@@ -286,7 +302,16 @@ def synchronize(device):


 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
+    run_name,
+    model_runner,
+    rank_print,
+    reqs,
+    batch_size,
+    input_len,
+    output_len,
+    device,
+    profile,
+    profile_filename_prefix,
 ):
    max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
    if batch_size > max_batch_size:
@@ -308,6 +333,17 @@ def latency_test_run_once(

    tot_latency = 0

+    profiler = None
+    if profile:
+        profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            with_stack=True,
+        )
+        profiler.start()
+
    # Prefill
    synchronize(device)
    tic = time.time()
@@ -338,6 +374,13 @@ def latency_test_run_once(
                f"Decode.  latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
            )

+    if profile:
+        profiler.stop()
+        profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
+        parent_dir = os.path.dirname(os.path.abspath(profile_filename))
+        os.makedirs(parent_dir, exist_ok=True)
+        profiler.export_chrome_trace(profile_filename)
+
    # Record decode timing from 2nd output
    if output_len > 1:
        med_decode_latency = np.median(decode_latencies)
@@ -386,6 +429,8 @@ def latency_test(
        bench_args.input_len[0],
        8,  # shorter decoding to speed up the warmup
        server_args.device,
+        profile=False,
+        profile_filename_prefix="",  # not used
    )

    rank_print("Benchmark ...")
@@ -405,6 +450,8 @@ def latency_test(
            il,
            ol,
            server_args.device,
+            bench_args.profile,
+            bench_args.profile_filename_prefix,
        )
        if ret is not None:
            result_list.append(ret)