3.2 KiB
3.2 KiB
| 1 | task | metric | value |
|---|---|---|---|
| 2 | arc_challenge | acc_norm,none | 0.3174061433447099 |
| 3 | boolq | acc,none | 0.7437308868501529 |
| 4 | gsm8k | exact_match,flexible-extract | 0.6239575435936315 |
| 5 | hellaswag | acc_norm,none | 0.560744871539534 |
| 6 | mmlu | acc,none | 0.5997721122347244 |
| 7 | mmlu_abstract_algebra | acc,none | 0.43 |
| 8 | mmlu_anatomy | acc,none | 0.6074074074074074 |
| 9 | mmlu_astronomy | acc,none | 0.6973684210526315 |
| 10 | mmlu_business_ethics | acc,none | 0.62 |
| 11 | mmlu_clinical_knowledge | acc,none | 0.6415094339622641 |
| 12 | mmlu_college_biology | acc,none | 0.8263888888888888 |
| 13 | mmlu_college_chemistry | acc,none | 0.53 |
| 14 | mmlu_college_computer_science | acc,none | 0.54 |
| 15 | mmlu_college_mathematics | acc,none | 0.5 |
| 16 | mmlu_college_medicine | acc,none | 0.5953757225433526 |
| 17 | mmlu_college_physics | acc,none | 0.5 |
| 18 | mmlu_computer_security | acc,none | 0.68 |
| 19 | mmlu_conceptual_physics | acc,none | 0.5872340425531914 |
| 20 | mmlu_econometrics | acc,none | 0.35964912280701755 |
| 21 | mmlu_electrical_engineering | acc,none | 0.6413793103448275 |
| 22 | mmlu_elementary_mathematics | acc,none | 0.5317460317460317 |
| 23 | mmlu_formal_logic | acc,none | 0.5 |
| 24 | mmlu_global_facts | acc,none | 0.33 |
| 25 | mmlu_high_school_biology | acc,none | 0.7548387096774194 |
| 26 | mmlu_high_school_chemistry | acc,none | 0.6009852216748769 |
| 27 | mmlu_high_school_computer_science | acc,none | 0.69 |
| 28 | mmlu_high_school_european_history | acc,none | 0.7696969696969697 |
| 29 | mmlu_high_school_geography | acc,none | 0.7272727272727273 |
| 30 | mmlu_high_school_government_and_politics | acc,none | 0.7461139896373057 |
| 31 | mmlu_high_school_macroeconomics | acc,none | 0.6435897435897436 |
| 32 | mmlu_high_school_mathematics | acc,none | 0.45555555555555555 |
| 33 | mmlu_high_school_microeconomics | acc,none | 0.7773109243697479 |
| 34 | mmlu_high_school_physics | acc,none | 0.5165562913907285 |
| 35 | mmlu_high_school_psychology | acc,none | 0.8 |
| 36 | mmlu_high_school_statistics | acc,none | 0.5694444444444444 |
| 37 | mmlu_high_school_us_history | acc,none | 0.7156862745098039 |
| 38 | mmlu_high_school_world_history | acc,none | 0.7974683544303798 |
| 39 | mmlu_human_aging | acc,none | 0.600896860986547 |
| 40 | mmlu_human_sexuality | acc,none | 0.6946564885496184 |
| 41 | mmlu_humanities | acc,none | 0.5300743889479277 |
| 42 | mmlu_international_law | acc,none | 0.7851239669421488 |
| 43 | mmlu_jurisprudence | acc,none | 0.7222222222222222 |
| 44 | mmlu_logical_fallacies | acc,none | 0.6932515337423313 |
| 45 | mmlu_machine_learning | acc,none | 0.42857142857142855 |
| 46 | mmlu_management | acc,none | 0.6893203883495146 |
| 47 | mmlu_marketing | acc,none | 0.8034188034188035 |
| 48 | mmlu_medical_genetics | acc,none | 0.69 |
| 49 | mmlu_miscellaneous | acc,none | 0.6717752234993615 |
| 50 | mmlu_moral_disputes | acc,none | 0.5953757225433526 |
| 51 | mmlu_moral_scenarios | acc,none | 0.2446927374301676 |
| 52 | mmlu_nutrition | acc,none | 0.6764705882352942 |
| 53 | mmlu_other | acc,none | 0.6269713550048278 |
| 54 | mmlu_philosophy | acc,none | 0.6559485530546624 |
| 55 | mmlu_prehistory | acc,none | 0.6265432098765432 |
| 56 | mmlu_professional_accounting | acc,none | 0.4397163120567376 |
| 57 | mmlu_professional_law | acc,none | 0.4745762711864407 |
| 58 | mmlu_professional_medicine | acc,none | 0.6838235294117647 |
| 59 | mmlu_professional_psychology | acc,none | 0.5915032679738562 |
| 60 | mmlu_public_relations | acc,none | 0.6 |
| 61 | mmlu_security_studies | acc,none | 0.7020408163265306 |
| 62 | mmlu_social_sciences | acc,none | 0.6906077348066298 |
| 63 | mmlu_sociology | acc,none | 0.7711442786069652 |
| 64 | mmlu_stem | acc,none | 0.5883285759594037 |
| 65 | mmlu_us_foreign_policy | acc,none | 0.78 |
| 66 | mmlu_virology | acc,none | 0.45180722891566266 |
| 67 | mmlu_world_religions | acc,none | 0.7192982456140351 |
| 68 | piqa | acc_norm,none | 0.6322089227421109 |
| 69 | truthfulqa_mc2 | acc,none | 0.45340473177307805 |
| 70 | winogrande | acc,none | 0.500394632991318 |