From da19434c2f3cbe4f367f84993da0bcbd84efb6ba Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Wed, 24 Apr 2024 02:23:01 +0800 Subject: [PATCH] Benchmark Updates (#382) --- benchmark/gsm8k/bench_sglang.py | 2 +- benchmark/hellaswag/bench_sglang.py | 2 +- benchmark/json_decode_regex/bench_sglang.py | 2 +- benchmark/json_jump_forward/bench_sglang.py | 4 +- benchmark/line_retrieval/bench_sglang.py | 2 +- benchmark/llm_judge/bench_sglang.py | 2 +- benchmark/long_json_decode/bench_sglang.py | 2 +- benchmark/mtbench/bench_sglang.py | 4 +- benchmark/multi_chain_reasoning/README.md | 2 +- .../multi_chain_reasoning/bench_sglang.py | 2 +- benchmark/multi_document_qa/bench_sglang.py | 2 +- benchmark/multi_turn_chat/bench_sglang.py | 2 +- benchmark/react/bench_sglang.py | 4 +- benchmark/tip_suggestion/.gitignore | 1 + benchmark/tip_suggestion/bench_sglang.py | 2 +- benchmark/tip_suggestion/topic.jsonl | 50 +++++++++++++++++++ .../tree_of_thought_deep/bench_sglang.py | 2 +- benchmark/tree_of_thought_v0/bench_sglang.py | 2 +- 18 files changed, 72 insertions(+), 17 deletions(-) create mode 100644 benchmark/tip_suggestion/.gitignore create mode 100644 benchmark/tip_suggestion/topic.jsonl diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index c5d76af31..d5ca031cf 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -73,7 +73,7 @@ def main(args): # Run requests tic = time.time() states = few_shot_gsm8k.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic preds = [] diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py index 43d06db79..a030d7972 100644 --- a/benchmark/hellaswag/bench_sglang.py +++ b/benchmark/hellaswag/bench_sglang.py @@ -61,7 +61,7 @@ def main(args): # Run requests tic = time.time() rets = few_shot_hellaswag.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))] latency = time.time() - tic diff --git a/benchmark/json_decode_regex/bench_sglang.py b/benchmark/json_decode_regex/bench_sglang.py index c0007e8d1..1d6e1f9cd 100644 --- a/benchmark/json_decode_regex/bench_sglang.py +++ b/benchmark/json_decode_regex/bench_sglang.py @@ -63,7 +63,7 @@ def main(args): # Run requests tic = time.time() - states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel) + states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic # Compute accuracy diff --git a/benchmark/json_jump_forward/bench_sglang.py b/benchmark/json_jump_forward/bench_sglang.py index cc22dd8c3..10cf2699b 100644 --- a/benchmark/json_jump_forward/bench_sglang.py +++ b/benchmark/json_jump_forward/bench_sglang.py @@ -72,7 +72,7 @@ def bench_city_doc(args): arguments, temperature=0, num_threads=args.parallel, - progress_bar=(args.parallel == 1), + progress_bar=True, ) latency = time.time() - tic @@ -96,7 +96,7 @@ def bench_character(args): arguments, temperature=0, num_threads=args.parallel, - progress_bar=(args.parallel == 1), + progress_bar=True, ) latency = time.time() - tic diff --git a/benchmark/line_retrieval/bench_sglang.py b/benchmark/line_retrieval/bench_sglang.py index 5ac56a491..91cbdd750 100644 --- a/benchmark/line_retrieval/bench_sglang.py +++ b/benchmark/line_retrieval/bench_sglang.py @@ -61,7 +61,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents): tic = time.time() states = line_retrieval.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic corrects = [] diff --git a/benchmark/llm_judge/bench_sglang.py b/benchmark/llm_judge/bench_sglang.py index 0cef33700..81cf625fe 100644 --- a/benchmark/llm_judge/bench_sglang.py +++ b/benchmark/llm_judge/bench_sglang.py @@ -54,7 +54,7 @@ def main(args): # Run requests tic = time.time() states = multi_dimension_judge.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic print(f"Latency: {latency:.3f}") diff --git a/benchmark/long_json_decode/bench_sglang.py b/benchmark/long_json_decode/bench_sglang.py index 4cb1e6f87..0879ae04b 100644 --- a/benchmark/long_json_decode/bench_sglang.py +++ b/benchmark/long_json_decode/bench_sglang.py @@ -36,7 +36,7 @@ def main(args): # Run requests tic = time.time() states = json_decode.run_batch( - arguments, temperature=0, num_threads=args.parallel) + arguments, temperature=0, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic # Compute accuracy diff --git a/benchmark/mtbench/bench_sglang.py b/benchmark/mtbench/bench_sglang.py index 7727e03fc..085c92e51 100644 --- a/benchmark/mtbench/bench_sglang.py +++ b/benchmark/mtbench/bench_sglang.py @@ -60,7 +60,9 @@ def main(args): arguments, temperature=0, max_new_tokens=256, - num_threads=args.parallel) + num_threads=args.parallel, + progress_bar=True, + ) answers = [[s["answer_1"], s["answer_2"]] for s in rets] latency = time.time() - tic diff --git a/benchmark/multi_chain_reasoning/README.md b/benchmark/multi_chain_reasoning/README.md index 5859145eb..67f627681 100644 --- a/benchmark/multi_chain_reasoning/README.md +++ b/benchmark/multi_chain_reasoning/README.md @@ -7,7 +7,7 @@ wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_sch ### Benchmark sglang ``` -python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 +python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --schedule-conservativeness 1.3 ``` ``` diff --git a/benchmark/multi_chain_reasoning/bench_sglang.py b/benchmark/multi_chain_reasoning/bench_sglang.py index f10c7b2c3..7f81818b8 100644 --- a/benchmark/multi_chain_reasoning/bench_sglang.py +++ b/benchmark/multi_chain_reasoning/bench_sglang.py @@ -86,7 +86,7 @@ def main(args): # Run requests tic = time.time() states = multi_chain_gsm8k.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic preds = [] diff --git a/benchmark/multi_document_qa/bench_sglang.py b/benchmark/multi_document_qa/bench_sglang.py index 22bbfc5d8..84a0d189e 100644 --- a/benchmark/multi_document_qa/bench_sglang.py +++ b/benchmark/multi_document_qa/bench_sglang.py @@ -43,7 +43,7 @@ def main(args): # Run requests tic = time.time() states = multi_document_qa.run_batch( - arguments, temperature=0, num_threads=args.parallel) + arguments, temperature=0, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic # Compute accuracy diff --git a/benchmark/multi_turn_chat/bench_sglang.py b/benchmark/multi_turn_chat/bench_sglang.py index c1a84d60a..ff21c00e2 100644 --- a/benchmark/multi_turn_chat/bench_sglang.py +++ b/benchmark/multi_turn_chat/bench_sglang.py @@ -29,7 +29,7 @@ def main(args): tic = time.time() states = multi_turns.run_batch( - multi_qas, temperature=0, backend=backend, num_threads=args.parallel + multi_qas, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True ) latency = time.time() - tic diff --git a/benchmark/react/bench_sglang.py b/benchmark/react/bench_sglang.py index 3710736f9..83fd0a5f8 100644 --- a/benchmark/react/bench_sglang.py +++ b/benchmark/react/bench_sglang.py @@ -110,7 +110,9 @@ def main(args): tic = time.time() states = webthink.run_batch(arguments, temperature=0, - num_threads=args.parallel) + num_threads=args.parallel, + progress_bar=True, + ) latency = time.time() - tic # Compute accuracy diff --git a/benchmark/tip_suggestion/.gitignore b/benchmark/tip_suggestion/.gitignore new file mode 100644 index 000000000..f1aeb25d4 --- /dev/null +++ b/benchmark/tip_suggestion/.gitignore @@ -0,0 +1 @@ +!topic.jsonl \ No newline at end of file diff --git a/benchmark/tip_suggestion/bench_sglang.py b/benchmark/tip_suggestion/bench_sglang.py index 5cd0f23cf..f02b7168c 100644 --- a/benchmark/tip_suggestion/bench_sglang.py +++ b/benchmark/tip_suggestion/bench_sglang.py @@ -59,7 +59,7 @@ def main(args): # Run requests tic = time.time() states = suggest_tips.run_batch( - arguments, temperature=0, num_threads=args.parallel) + arguments, temperature=0, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic # Compute accuracy diff --git a/benchmark/tip_suggestion/topic.jsonl b/benchmark/tip_suggestion/topic.jsonl new file mode 100644 index 000000000..1d818089d --- /dev/null +++ b/benchmark/tip_suggestion/topic.jsonl @@ -0,0 +1,50 @@ +{"topic": "organizing a successful charity event", "number": 6} +{"topic": "improving personal credit scores", "number": 7} +{"topic": "staying motivated during job searches", "number": 5} +{"topic": "maintaining a work-life balance", "number": 9} +{"topic": "reducing carbon footprint at home", "number": 8} +{"topic": "starting a book club", "number": 5} +{"topic": "learning to play a musical instrument", "number": 7} +{"topic": "getting into freelance writing", "number": 6} +{"topic": "beginner yoga poses", "number": 8} +{"topic": "preparing for graduate school exams", "number": 5} +{"topic": "exploring minimalist living", "number": 9} +{"topic": "effective grocery shopping", "number": 7} +{"topic": "winter camping", "number": 5} +{"topic": "starting a podcast on a budget", "number": 8} +{"topic": "creating a capsule wardrobe", "number": 6} +{"topic": "improving your writing skills", "number": 7} +{"topic": "learning a new software quickly", "number": 9} +{"topic": "reducing anxiety before public speaking", "number": 5} +{"topic": "planning a solo travel adventure", "number": 8} +{"topic": "beginner skateboarders", "number": 6} +{"topic": "studying abroad", "number": 7} +{"topic": "planting a vegetable garden", "number": 5} +{"topic": "adopting a shelter pet", "number": 9} +{"topic": "learning to cook ethnic cuisines", "number": 8} +{"topic": "effective conflict resolution", "number": 5} +{"topic": "starting a vlog", "number": 7} +{"topic": "keeping a daily journal", "number": 6} +{"topic": "improving sleep hygiene", "number": 8} +{"topic": "beginner mountain climbers", "number": 5} +{"topic": "creating a mobile app", "number": 9} +{"topic": "maintaining a saltwater aquarium", "number": 7} +{"topic": "preparing for a baby's arrival", "number": 6} +{"topic": "writing a fantasy novel", "number": 5} +{"topic": "effective team leadership", "number": 8} +{"topic": "making a documentary film", "number": 9} +{"topic": "learning about historical events", "number": 7} +{"topic": "baking gluten-free treats", "number": 6} +{"topic": "improving mental arithmetic skills", "number": 5} +{"topic": "building a treehouse", "number": 8} +{"topic": "getting started with watercolor painting", "number": 9} +{"topic": "creating a YouTube tutorial series", "number": 7} +{"topic": "landscape photography", "number": 5} +{"topic": "navigating cultural differences", "number": 6} +{"topic": "preparing for a marathon", "number": 8} +{"topic": "building an online business", "number": 9} +{"topic": "learning to dance at home", "number": 5} +{"topic": "self-publishing a book", "number": 7} +{"topic": "starting an urban farm", "number": 6} +{"topic": "improving your memory", "number": 8} +{"topic": "creating a personal brand online", "number": 9} \ No newline at end of file diff --git a/benchmark/tree_of_thought_deep/bench_sglang.py b/benchmark/tree_of_thought_deep/bench_sglang.py index e8b617597..66a5c26c4 100644 --- a/benchmark/tree_of_thought_deep/bench_sglang.py +++ b/benchmark/tree_of_thought_deep/bench_sglang.py @@ -112,7 +112,7 @@ def main(args): # Run requests tic = time.time() states = tree_search.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic answers_text = [] for s in states: diff --git a/benchmark/tree_of_thought_v0/bench_sglang.py b/benchmark/tree_of_thought_v0/bench_sglang.py index 7b717a60c..7e337829d 100644 --- a/benchmark/tree_of_thought_v0/bench_sglang.py +++ b/benchmark/tree_of_thought_v0/bench_sglang.py @@ -102,7 +102,7 @@ def main(args): # Run requests tic = time.time() states = tree_search.run_batch( - arguments, temperature=0, backend=backend, num_threads=args.parallel) + arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True) latency = time.time() - tic answers_text = [] for s in states: