初始化项目,由ModelHub XC社区提供模型
Model: nv-community/AceMath-7B-Instruct Source: Original Platform
This commit is contained in:
79
evaluation/calculate_scores.py
Normal file
79
evaluation/calculate_scores.py
Normal file
@@ -0,0 +1,79 @@
|
||||
|
||||
from grader import is_equal
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def get_gold_list(datapath, dataset_name):
|
||||
|
||||
assert dataset_name in ["gsm8k", "math", "minerva_math", "gaokao2023en", "olympiadbench", "collegemath"]
|
||||
|
||||
gold_list = []
|
||||
with open(datapath, "r") as f:
|
||||
for line in f:
|
||||
item = json.loads(line)
|
||||
|
||||
if dataset_name == "gsm8k":
|
||||
gold = item['answer'].split("#### ")[-1]
|
||||
|
||||
elif dataset_name == "math":
|
||||
gold = item['answer']
|
||||
|
||||
elif dataset_name == "minerva_math":
|
||||
pattern = r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}"
|
||||
pattern_re = re.compile(pattern, re.DOTALL)
|
||||
solution = item['solution']
|
||||
matches = pattern_re.findall(solution)
|
||||
if len(matches) == 0:
|
||||
gold = None
|
||||
else:
|
||||
gold = matches[-1]
|
||||
|
||||
elif dataset_name == "gaokao2023en":
|
||||
gold = re.sub(r'^\$(.*)\$$', r'\1', item['answer'])
|
||||
|
||||
elif dataset_name == "olympiadbench":
|
||||
gold = re.sub(r'^\$(.*)\$$', r'\1', item['final_answer'][0])
|
||||
|
||||
elif dataset_name == "collegemath":
|
||||
gold = re.sub(r'^\$(.*)\$$', r'\1', item['answer'])
|
||||
|
||||
gold_list.append(gold)
|
||||
|
||||
return gold_list
|
||||
|
||||
|
||||
def get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name):
|
||||
|
||||
gold_list = get_gold_list(test_gold_path, dataset_name)
|
||||
|
||||
"""TODO
|
||||
Get the output_list from model_output_path
|
||||
output_list is a list of string (List[str])
|
||||
Each string represents the model's response for a corresponding question in the benchmark
|
||||
Therefore, the length of output_list must match the length of gold_list.
|
||||
|
||||
output_list = ...
|
||||
"""
|
||||
|
||||
correct = 0
|
||||
for output, gold in zip(output_list, gold_list):
|
||||
if is_equal(output, gold, dataset_name):
|
||||
correct += 1
|
||||
|
||||
print("accuracy on %s is %.4f" % (dataset_name, correct / len(gold_list)))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""TODO
|
||||
Download test benchmarks from Qwen2.5-Math
|
||||
https://github.com/QwenLM/Qwen2.5-Math/tree/main/evaluation/data
|
||||
|
||||
Prepare model_output_path and test_gold_path for each dataset
|
||||
"""
|
||||
|
||||
test_gold_path = "PATH_OF_THE_BENCHMARK"
|
||||
model_output_path = "PATH_OF_YOUR_MODEL_OUTPUTS"
|
||||
dataset_name = "DATASET_NAME" # e.g., gsm8k, math, "minerva_math", "gaokao2023en", "olympiadbench", "collegemath"
|
||||
|
||||
get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name)
|
||||
Reference in New Issue
Block a user