From 4edbe0d534debd907f75068bb520a5b9d42a3790 Mon Sep 17 00:00:00 2001
From: yuxingcyx <yuxingchen.math@gmail.com>
Date: Sat, 23 Aug 2025 15:40:15 +0800
Subject: [PATCH] [benchmark] Add benchmark scripts for ceval and boolq 
 (#8946)

Co-authored-by: chenyuxing <2818499974@qq.com>
Co-authored-by: hanqing <huang010706@126.com>
Co-authored-by: Muggle <62579327+trawolf@users.noreply.github.com>
Co-authored-by: ronnie_zheng <zl19940307@163.com>
---
 benchmark/boolq/README.md                  |  19 +++
 benchmark/boolq/bench_sglang.py            | 124 ++++++++++++++++++
 benchmark/boolq/convert_parquet_to_json.py |  28 +++++
 benchmark/boolq/parquet_to_json.sh         |  26 ++++
 benchmark/ceval/README.md                  |  15 +++
 benchmark/ceval/bench_sglang.py            | 138 +++++++++++++++++++++
 6 files changed, 350 insertions(+)
 create mode 100644 benchmark/boolq/README.md
 create mode 100644 benchmark/boolq/bench_sglang.py
 create mode 100644 benchmark/boolq/convert_parquet_to_json.py
 create mode 100755 benchmark/boolq/parquet_to_json.sh
 create mode 100644 benchmark/ceval/README.md
 create mode 100644 benchmark/ceval/bench_sglang.py

diff --git a/benchmark/boolq/README.md b/benchmark/boolq/README.md
new file mode 100644
index 000000000..3704742ee
--- /dev/null
+++ b/benchmark/boolq/README.md
@@ -0,0 +1,19 @@
+## Download data
+```
+git clone https://hf-mirror.com/datasets/google/boolq
+```
+
+## Convert parquet to json
+```
+bash parquet_to_json.sh
+```
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
+```
+
+```
+python3 bench_sglang.py
+```
diff --git a/benchmark/boolq/bench_sglang.py b/benchmark/boolq/bench_sglang.py
new file mode 100644
index 000000000..b3ce3c996
--- /dev/null
+++ b/benchmark/boolq/bench_sglang.py
@@ -0,0 +1,124 @@
+import argparse
+import json
+import time
+
+import numpy as np
+
+from sglang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import read_jsonl
+
+
+def get_example(lines, i, answer):
+    """Build the prompt for ``lines[i]``; append its label when ``answer`` is truthy."""
+    row = lines[i]
+    label = str(row["answer"]) if answer else ""
+    return "Question: " + row["question"] + row["passage"] + "\nAnswer:" + label
+
+
+def few_shot_examples(lines, k):
+    """Concatenate the first ``k`` records of ``lines`` as answered examples.
+
+    Every example is followed by an empty line."""
+    return "".join(get_example(lines, idx, True) + "\n\n" for idx in range(k))
+
+
+def main(args):
+    """Run the few-shot BoolQ benchmark and append a summary to ``args.result_file``.
+
+    Reads ``train_data_path``, ``test_data_path``, ``num_shots``, ``num_questions``,
+    ``parallel``, ``backend`` and ``result_file`` from ``args``. Prints accuracy,
+    latency and output throughput, then appends one JSON line to the result file.
+    """
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+
+    # Read data
+    lines_train = list(read_jsonl(args.train_data_path))
+    lines_test = list(read_jsonl(args.test_data_path))
+
+    # Construct prompts: few-shot prefix from the train file, questions from the test file.
+    few_shots = few_shot_examples(lines_train, args.num_shots)
+
+    # The test file may hold fewer rows than requested, so count what is actually sent.
+    num_requests = len(lines_test[: args.num_questions])
+    questions = [get_example(lines_test, i, False) for i in range(num_requests)]
+    answer = [str(lines_test[i]["answer"]) for i in range(num_requests)]
+    arguments = [{"question": q} for q in questions]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_boolq(s, question):
+        s += few_shots + question
+        s += sgl.gen("answer", max_tokens=5, stop=["\n"])
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_boolq.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    # Strip whitespace so that a completion such as " True" still matches "True".
+    preds = [(state["answer"] or "").strip() for state in states]
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(answer))
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "boolq",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": num_requests,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":  # script entry point: parse CLI options, then run the benchmark
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)  # examples taken from the train file
+    parser.add_argument(
+        "--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json"
+    )
+    parser.add_argument(
+        "--test-data-path",
+        type=str,
+        default="./boolq/data/validation-00000-of-00001.json",
+    )
+    parser.add_argument("--num-questions", type=int, default=200)  # upper bound on test questions
+    args = add_common_sglang_args_and_parse(parser)  # helper imported from sglang.test.test_utils
+    main(args)
diff --git a/benchmark/boolq/convert_parquet_to_json.py b/benchmark/boolq/convert_parquet_to_json.py
new file mode 100644
index 000000000..e3e69cb31
--- /dev/null
+++ b/benchmark/boolq/convert_parquet_to_json.py
@@ -0,0 +1,28 @@
+import sys
+
+import pyarrow.parquet as pq
+
+
+def convert_parquet_to_json(input_file, output_file):
+    """Convert a parquet file into a JSON file holding one record per line.
+
+    Args:
+        input_file: path of the parquet file to read.
+        output_file: path of the JSON file to write.
+    """
+    # parquet -> table -> pandas DataFrame -> JSON text
+    frame = pq.read_table(input_file).to_pandas()
+    serialized = frame.to_json(orient="records", lines=True)
+
+    with open(output_file, "w") as out:
+        out.write(serialized)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage:python convert_parquet_to_json.py <input_file> <output_file>")
+        sys.exit(1)  # stop here; continuing would raise IndexError on sys.argv below
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    convert_parquet_to_json(input_file, output_file)
diff --git a/benchmark/boolq/parquet_to_json.sh b/benchmark/boolq/parquet_to_json.sh
new file mode 100755
index 000000000..9aaf087ff
--- /dev/null
+++ b/benchmark/boolq/parquet_to_json.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Define the input and output directories.
+input_dir="./boolq/data"
+output_dir="./boolq/data"
+
+# Define the files that need to be converted.
+files=(
+        "train-00000-of-00001.parquet"
+        "validation-00000-of-00001.parquet"
+)
+status=0  # set to 1 when any conversion fails
+# For each file above, use the Python script to convert it to JSON.
+for file in "${files[@]}"; do
+    input_file="${input_dir}/${file}"
+    output_file="${output_dir}/${file%.parquet}.json"
+
+    echo "Converting ${input_file} to ${output_file} ..."
+    if python3 convert_parquet_to_json.py "${input_file}" "${output_file}"; then
+        echo "Conversion successful: ${output_file}"
+    else
+        echo "Conversion failed: ${input_file}"
+        status=1
+    fi
+done
+exit "${status}"
diff --git a/benchmark/ceval/README.md b/benchmark/ceval/README.md
new file mode 100644
index 000000000..b822e43c3
--- /dev/null
+++ b/benchmark/ceval/README.md
@@ -0,0 +1,15 @@
+## Download data
+```
+git lfs clone https://huggingface.co/datasets/ceval/ceval-exam
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
+```
+
+```
+python3 bench_sglang.py
+```
diff --git a/benchmark/ceval/bench_sglang.py b/benchmark/ceval/bench_sglang.py
new file mode 100644
index 000000000..32ed0baf2
--- /dev/null
+++ b/benchmark/ceval/bench_sglang.py
@@ -0,0 +1,138 @@
+import argparse
+import json
+import os
+import random
+import re
+import time
+
+import numpy as np
+from datasets import load_dataset
+
+from sglang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+
+choices = ["A", "B", "C", "D"]  # option labels; get_answer_value picks one at random as fallback
+
+
+def get_one_example(line, include_answer):
+    """Format one record as the question followed by its A-D options.
+
+    With ``include_answer`` set, the record's answer line is appended as well.
+    """
+    parts = [line["question"]]
+    parts.extend(f"\n{label}. {line[label]}" for label in ("A", "B", "C", "D"))
+    if include_answer:
+        parts.append(f"\nAnswer: {line['answer']} \n\n")
+    return "".join(parts)
+
+
+def get_few_shot_examples(lines):
+    """Concatenate every record in ``lines`` as an answered example.
+
+    Each formatted example is followed by two extra newlines."""
+    return "".join(get_one_example(row, True) + "\n\n" for row in lines)
+
+
+def get_answer_value(response):
+    """Return the A-D letter announced in ``response``, or a random choice if none is found."""
+    # Both ASCII and full-width colons are accepted after the marker. The letter must not
+    # be followed by an ASCII word character, so Chinese text right after it is allowed.
+    pattern = r"(Answer[:：]|answer[:：]|答案是[:：]?|正确答案是[:：]|答案[:：]|Assistant[:：])\s*([A-D])(?![A-Za-z0-9_])"
+    match = re.search(pattern, response)
+
+    return match.group(2) if match else random.choice(choices)
+
+
+def main(args):
+    """Run the few-shot C-Eval benchmark and append a summary to ``args.result_file``.
+
+    Each sub-directory of ``args.data_path`` except ``.git`` is loaded as one subject.
+    """
+    arguments = []
+    labels = []
+    # Sorted so that ``--num-questions`` selects the same questions on every machine.
+    for subject in sorted(os.listdir(args.data_path)):
+        subject_path = os.path.join(args.data_path, subject)
+        if os.path.isdir(subject_path) and subject != ".git":
+            dataset = load_dataset(args.data_path, name=subject)
+            few_shot_examples = get_few_shot_examples(dataset["dev"])  # few-shot prefix
+            for val_line in dataset["val"]:
+                arguments.append(
+                    {
+                        "examples": few_shot_examples,
+                        "question": get_one_example(val_line, False),
+                    }
+                )
+                labels.append(val_line["answer"])
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_ceval(s, examples, question):
+        s += examples + question + sgl.gen("Answer")
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Clamp to the questions actually loaded; ``--num-questions`` may be unset or too large.
+    num_questions = min(args.num_questions or len(arguments), len(arguments))
+
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_ceval.run_batch(
+        arguments[:num_questions],
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    preds = [get_answer_value(state["Answer"]) for state in states]
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels[:num_questions]))
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("Answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "ceval",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":  # script entry point: parse CLI options, then run the benchmark
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="ceval-exam")  # directory listed by main
+    parser.add_argument("--num-questions", type=int, default=None)  # None -> use every question
+    args = add_common_sglang_args_and_parse(parser)  # helper imported from sglang.test.test_utils
+    main(args)