From 4edbe0d534debd907f75068bb520a5b9d42a3790 Mon Sep 17 00:00:00 2001 From: yuxingcyx Date: Sat, 23 Aug 2025 15:40:15 +0800 Subject: [PATCH] [benchmark] Add benchmark scripts for ceval and boolq (#8946) Co-authored-by: chenyuxing <2818499974@qq.com> Co-authored-by: hanqing Co-authored-by: Muggle <62579327+trawolf@users.noreply.github.com> Co-authored-by: ronnie_zheng --- benchmark/boolq/README.md | 19 +++ benchmark/boolq/bench_sglang.py | 124 ++++++++++++++++++ benchmark/boolq/convert_parquet_to_json.py | 28 +++++ benchmark/boolq/parquet_to_json.sh | 26 ++++ benchmark/ceval/README.md | 15 +++ benchmark/ceval/bench_sglang.py | 138 +++++++++++++++++++++ 6 files changed, 350 insertions(+) create mode 100644 benchmark/boolq/README.md create mode 100644 benchmark/boolq/bench_sglang.py create mode 100644 benchmark/boolq/convert_parquet_to_json.py create mode 100755 benchmark/boolq/parquet_to_json.sh create mode 100644 benchmark/ceval/README.md create mode 100644 benchmark/ceval/bench_sglang.py diff --git a/benchmark/boolq/README.md b/benchmark/boolq/README.md new file mode 100644 index 000000000..3704742ee --- /dev/null +++ b/benchmark/boolq/README.md @@ -0,0 +1,19 @@ +## Download data +``` +git clone https://hf-mirror.com/datasets/google/boolq +``` + +## Convert parquet to json +``` +bash parquet_to_json.sh +``` +## Run benchmark + +### Benchmark sglang +``` +python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000 +``` + +``` +python3 bench_sglang.py +``` diff --git a/benchmark/boolq/bench_sglang.py b/benchmark/boolq/bench_sglang.py new file mode 100644 index 000000000..b3ce3c996 --- /dev/null +++ b/benchmark/boolq/bench_sglang.py @@ -0,0 +1,124 @@ +import argparse +import json +import time + +import numpy as np + +from sglang.api import set_default_backend +from sglang.test.test_utils import ( + add_common_sglang_args_and_parse, + select_sglang_backend, +) +from sglang.utils import read_jsonl + + +def get_example(lines, i, answer): + prompt = "Question: " + lines[i]["question"] + lines[i]["passage"] + "\nAnswer:" + if answer: + prompt += str(lines[i]["answer"]) + return prompt + + +def few_shot_examples(lines, k): + prompts = "" + for i in range(k): + prompts += get_example(lines, i, True) + "\n\n" + return prompts + + +def main(args): + # Select backend + set_default_backend(select_sglang_backend(args)) + + # Read data + train_data_path = args.train_data_path + test_data_path = args.test_data_path + lines_train = list(read_jsonl(train_data_path)) + lines_test = list(read_jsonl(test_data_path)) + + # Construct prompts + num_questions = args.num_questions + num_shots = args.num_shots + few_shots = few_shot_examples(lines_train, num_shots) + + questions = [] + answer = [] + for i in range(len(lines_test[:num_questions])): + questions.append(get_example(lines_test, i, False)) + answer.append(str(lines_test[i]["answer"])) + arguments = [{"question": q} for q in questions] + + ##################################### + ######### SGL Program Begin ######### + ##################################### + + import sglang as sgl + + @sgl.function + def few_shot_boolq(s, question): + s += few_shots + question + s += sgl.gen("answer", max_tokens=5, stop=["\n"]) + + ##################################### + ########## SGL Program End ########## + ##################################### + + # Run requests + tic = time.perf_counter() + states = few_shot_boolq.run_batch( + arguments, + temperature=0, + num_threads=args.parallel, + progress_bar=True, + ) + latency = time.perf_counter() - tic + + preds = [] + for i in range(len(states)): + preds.append(states[i]["answer"]) + + # Compute accuracy + acc = np.mean(np.array(preds) == np.array(answer)) + + # Compute speed + num_output_tokens = sum( + s.get_meta_info("answer")["completion_tokens"] for s in states + ) + output_throughput = num_output_tokens / latency + + # Print results + print(f"Accuracy: {acc:.3f}") + print(f"Latency: {latency:.3f} s") + print(f"Output throughput: {output_throughput:.3f} token/s") + + # Results + with open(args.result_file, "a") as fout: + value = { + "task": "boolq", + "backend": args.backend, + "num_gpus": 1, + "latency": round(latency, 3), + "accuracy": round(acc, 3), + "num_requests": args.num_questions, + "other": { + "num_questions": args.num_questions, + "parallel": args.parallel, + }, + } + fout.write(json.dumps(value) + "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--num-shots", type=int, default=5) + parser.add_argument( + "--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json" + ) + parser.add_argument( + "--test-data-path", + type=str, + default="./boolq/data/validation-00000-of-00001.json", + ) + parser.add_argument("--num-questions", type=int, default=200) + args = add_common_sglang_args_and_parse(parser) + main(args) diff --git a/benchmark/boolq/convert_parquet_to_json.py b/benchmark/boolq/convert_parquet_to_json.py new file mode 100644 index 000000000..e3e69cb31 --- /dev/null +++ b/benchmark/boolq/convert_parquet_to_json.py @@ -0,0 +1,28 @@ +import sys + +import pyarrow.parquet as pq + + +def convert_parquet_to_json(input_file, output_file): + # read parquet file + table = pq.read_table(input_file) + + # turn parquet data to dataframe + df = table.to_pandas() + + # turn dataframe to json form + json_data = df.to_json(orient="records", lines=True) + + # write json to file + with open(output_file, "w") as f: + f.write(json_data) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage:python convert_parquet_to_json.py ") + + input_file = sys.argv[1] + output_file = sys.argv[2] + + convert_parquet_to_json(input_file, output_file) diff --git a/benchmark/boolq/parquet_to_json.sh b/benchmark/boolq/parquet_to_json.sh new file mode 100755 index 000000000..9aaf087ff --- /dev/null +++ b/benchmark/boolq/parquet_to_json.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +#define input and output direction +input_dir="./boolq/data" +output_dir="./boolq/data" + +#define files needed to be handled +files=( + "train-00000-of-00001.parquet" + "validation-00000-of-00001.parquet" +) + +#foe files above, use python script to convert the form +for file in "${files[@]}"; do + input_file="${input_dir}/${file}" + output_file="${output_dir}/${file%.parquet}.json" + + echo "Converting ${input_file} to ${output_file} ..." + python3 convert_parquet_to_json.py "${input_file}" "${output_file}" + + if [ $? -eq 0 ]; then + echo "Conversion successful: ${output_file}" + else + echo "Conversion failed: ${input_file}" + fi +done diff --git a/benchmark/ceval/README.md b/benchmark/ceval/README.md new file mode 100644 index 000000000..b822e43c3 --- /dev/null +++ b/benchmark/ceval/README.md @@ -0,0 +1,15 @@ +## Download data +``` +git lfs clone https://huggingface.co/datasets/ceval/ceval-exam +``` + +## Run benchmark + +### Benchmark sglang +``` +python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000 +``` + +``` +python3 bench_sglang.py +``` diff --git a/benchmark/ceval/bench_sglang.py b/benchmark/ceval/bench_sglang.py new file mode 100644 index 000000000..32ed0baf2 --- /dev/null +++ b/benchmark/ceval/bench_sglang.py @@ -0,0 +1,138 @@ +import argparse +import json +import os +import random +import re +import time + +import numpy as np +from datasets import load_dataset + +from sglang.api import set_default_backend +from sglang.test.test_utils import ( + add_common_sglang_args_and_parse, + select_sglang_backend, +) + +choices = ["A", "B", "C", "D"] + + +def get_one_example(line, include_answer): + res = line["question"] + res += f"\nA. {line['A']}" + res += f"\nB. {line['B']}" + res += f"\nC. {line['C']}" + res += f"\nD. {line['D']}" + + if include_answer: + res += f"\nAnswer: {line['answer']} \n\n" + return res + + +def get_few_shot_examples(lines): + res = "" + for line in lines: + res += get_one_example(line, True) + "\n\n" + return res + + +def get_answer_value(response): + pattern = r"(Answer:|answer:|答案是|答案是:|正确答案是:|答案:|Assistant:)\s*([A-D])(?![\w])" + match = re.search(pattern, response) + + if match: + return match.group(2) + + return random.choice(choices) + + +def main(args): + # Read data && Construct prompts + arguments = [] + labels = [] + examples = "examples:\n" + data_path = args.data_path + for subject in os.listdir(data_path): + subject_path = os.path.join(data_path, subject) + if os.path.isdir(subject_path) and subject != ".git": + dataset = load_dataset(data_path, name=subject) + dev_lines_temp = dataset["dev"] + val_lines_temp = dataset["val"] + few_shot_examples = get_few_shot_examples(dev_lines_temp, subject) + examples += f"{few_shot_examples}" + for val_line in val_lines_temp: + arguments.append( + { + "examples": few_shot_examples, + "question": get_one_example(val_line, False), + } + ) + labels.append(val_line["answer"]) + + ##################################### + ######### SGL Program Begin ######### + ##################################### + + import sglang as sgl + + @sgl.function + def few_shot_ceval(s, examples, question): + s += examples + question + sgl.gen("Answer") + + ##################################### + ########## SGL Program End ########## + ##################################### + + num_questions = args.num_questions if args.num_questions else len(arguments) + + # Select backend + set_default_backend(select_sglang_backend(args)) + + # Run requests + tic = time.perf_counter() + states = few_shot_ceval.run_batch( + arguments[:num_questions], + temperature=0, + num_threads=args.parallel, + progress_bar=True, + ) + latency = time.perf_counter() - tic + + preds = [get_answer_value(states[i]["Answer"]) for i in range(num_questions)] + + # Compute accuracy + acc = np.mean(np.array(preds) == np.array(labels[:num_questions])) + + # Compute speed + num_output_tokens = sum( + s.get_meta_info("Answer")["completion_tokens"] for s in states + ) + output_throughput = num_output_tokens / latency + + # Print results + print(f"Accuracy: {acc:.3f}") + print(f"Latency: {latency:.3f} s") + print(f"Output throughput: {output_throughput:.3f} token/s") + + # Write results + with open(args.result_file, "a") as fout: + value = { + "task": "ceval", + "backend": args.backend, + "num_gpus": 1, + "latency": round(latency, 3), + "accuracy": round(acc, 3), + "num_requests": args.num_questions, + "other": { + "parallel": args.parallel, + }, + } + fout.write(json.dumps(value) + "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data-path", type=str, default="ceval-exam") + parser.add_argument("--num-questions", type=int, default=None) + args = add_common_sglang_args_and_parse(parser) + main(args)