初始化项目，由ModelHub XC社区提供模型

Model: nv-community/AceMath-7B-Instruct Source: Original Platform
2026-06-09 23:04:13 +08:00
commit 2b35bbfb85
18 changed files with 152820 additions and 0 deletions
--- a/evaluation/calculate_scores.py
+++ b/evaluation/calculate_scores.py
@@ -0,0 +1,79 @@
+
+from grader import is_equal
+import json
+import re
+
+
+def get_gold_list(datapath, dataset_name):
+    """Return the gold answer of every record in a JSON-lines benchmark file.
+
+    Each non-blank line of ``datapath`` is parsed as one JSON object and the
+    answer is read from a dataset-specific field: ``answer`` (gsm8k: text after
+    the last ``#### ``; math: verbatim; gaokao2023en/collegemath: one wrapping
+    ``$...$`` pair removed), ``final_answer[0]`` (olympiadbench, same ``$``
+    stripping) or the last ``\\boxed{...}`` in ``solution`` (minerva_math;
+    ``None`` when the solution has no box).
+
+    Raises AssertionError for an unsupported ``dataset_name``.
+    """
+    assert dataset_name in ["gsm8k", "math", "minerva_math", "gaokao2023en", "olympiadbench", "collegemath"]
+
+    # Compiled once; the boxed pattern allows two levels of nested braces.
+    boxed_re = re.compile(r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}", re.DOTALL)
+    dollars_re = re.compile(r'^\$(.*)\$$')
+    gold_list = []
+    with open(datapath, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue  # tolerate blank lines, e.g. at the end of the file
+            item = json.loads(line)
+            if dataset_name == "gsm8k":
+                gold = item['answer'].split("#### ")[-1]
+            elif dataset_name == "math":
+                gold = item['answer']
+            elif dataset_name == "minerva_math":
+                matches = boxed_re.findall(item['solution'])
+                gold = matches[-1] if matches else None
+            elif dataset_name == "olympiadbench":
+                gold = dollars_re.sub(r'\1', item['final_answer'][0])
+            elif dataset_name in ("gaokao2023en", "collegemath"):
+                gold = dollars_re.sub(r'\1', item['answer'])
+            # Append per record (inside the loop), not once after it.
+            gold_list.append(gold)
+    return gold_list
+
+
+def get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name, output_list=None):
+    """Print the accuracy of model responses against a benchmark's gold answers.
+
+    ``output_list`` (List[str]) holds one response per benchmark question, in
+    the same order as the gold file. TODO: when it is not passed in, load it
+    from ``model_output_path`` here; until then NotImplementedError is raised.
+    Raises ValueError on a length mismatch or an empty benchmark.
+    """
+    gold_list = get_gold_list(test_gold_path, dataset_name)
+    if output_list is None:
+        raise NotImplementedError("TODO: load output_list from %s" % model_output_path)
+    # zip() would silently drop the unmatched tail and skew the accuracy.
+    if len(output_list) != len(gold_list):
+        raise ValueError("%d outputs for %d gold answers" % (len(output_list), len(gold_list)))
+    if not gold_list:
+        raise ValueError("no gold answers found in %s" % test_gold_path)
+
+    correct = sum(1 for output, gold in zip(output_list, gold_list) if is_equal(output, gold, dataset_name))
+    print("accuracy on %s is %.4f" % (dataset_name, correct / len(gold_list)))
+
+
+if __name__ == "__main__":
+    # TODO: download the test benchmarks published with Qwen2.5-Math:
+    #   https://github.com/QwenLM/Qwen2.5-Math/tree/main/evaluation/data
+    # then fill in the three placeholders below for the dataset to score.
+    #
+    # dataset_name must be one of: gsm8k, math, minerva_math, gaokao2023en,
+    # olympiadbench, collegemath.
+    dataset_name = "DATASET_NAME"
+    # JSON-lines benchmark file holding the gold answers.
+    test_gold_path = "PATH_OF_THE_BENCHMARK"
+    model_output_path = "PATH_OF_YOUR_MODEL_OUTPUTS"
+
+    get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name)