task metric value arc_challenge acc_norm,none 0.3174061433447099 boolq acc,none 0.7437308868501529 gsm8k exact_match,flexible-extract 0.6239575435936315 hellaswag acc_norm,none 0.560744871539534 mmlu acc,none 0.5997721122347244 mmlu_abstract_algebra acc,none 0.43 mmlu_anatomy acc,none 0.6074074074074074 mmlu_astronomy acc,none 0.6973684210526315 mmlu_business_ethics acc,none 0.62 mmlu_clinical_knowledge acc,none 0.6415094339622641 mmlu_college_biology acc,none 0.8263888888888888 mmlu_college_chemistry acc,none 0.53 mmlu_college_computer_science acc,none 0.54 mmlu_college_mathematics acc,none 0.5 mmlu_college_medicine acc,none 0.5953757225433526 mmlu_college_physics acc,none 0.5 mmlu_computer_security acc,none 0.68 mmlu_conceptual_physics acc,none 0.5872340425531914 mmlu_econometrics acc,none 0.35964912280701755 mmlu_electrical_engineering acc,none 0.6413793103448275 mmlu_elementary_mathematics acc,none 0.5317460317460317 mmlu_formal_logic acc,none 0.5 mmlu_global_facts acc,none 0.33 mmlu_high_school_biology acc,none 0.7548387096774194 mmlu_high_school_chemistry acc,none 0.6009852216748769 mmlu_high_school_computer_science acc,none 0.69 mmlu_high_school_european_history acc,none 0.7696969696969697 mmlu_high_school_geography acc,none 0.7272727272727273 mmlu_high_school_government_and_politics acc,none 0.7461139896373057 mmlu_high_school_macroeconomics acc,none 0.6435897435897436 mmlu_high_school_mathematics acc,none 0.45555555555555555 mmlu_high_school_microeconomics acc,none 0.7773109243697479 mmlu_high_school_physics acc,none 0.5165562913907285 mmlu_high_school_psychology acc,none 0.8 mmlu_high_school_statistics acc,none 0.5694444444444444 mmlu_high_school_us_history acc,none 0.7156862745098039 mmlu_high_school_world_history acc,none 0.7974683544303798 mmlu_human_aging acc,none 0.600896860986547 mmlu_human_sexuality acc,none 0.6946564885496184 mmlu_humanities acc,none 0.5300743889479277 mmlu_international_law acc,none 0.7851239669421488 mmlu_jurisprudence acc,none 0.7222222222222222 mmlu_logical_fallacies acc,none 0.6932515337423313 mmlu_machine_learning acc,none 0.42857142857142855 mmlu_management acc,none 0.6893203883495146 mmlu_marketing acc,none 0.8034188034188035 mmlu_medical_genetics acc,none 0.69 mmlu_miscellaneous acc,none 0.6717752234993615 mmlu_moral_disputes acc,none 0.5953757225433526 mmlu_moral_scenarios acc,none 0.2446927374301676 mmlu_nutrition acc,none 0.6764705882352942 mmlu_other acc,none 0.6269713550048278 mmlu_philosophy acc,none 0.6559485530546624 mmlu_prehistory acc,none 0.6265432098765432 mmlu_professional_accounting acc,none 0.4397163120567376 mmlu_professional_law acc,none 0.4745762711864407 mmlu_professional_medicine acc,none 0.6838235294117647 mmlu_professional_psychology acc,none 0.5915032679738562 mmlu_public_relations acc,none 0.6 mmlu_security_studies acc,none 0.7020408163265306 mmlu_social_sciences acc,none 0.6906077348066298 mmlu_sociology acc,none 0.7711442786069652 mmlu_stem acc,none 0.5883285759594037 mmlu_us_foreign_policy acc,none 0.78 mmlu_virology acc,none 0.45180722891566266 mmlu_world_religions acc,none 0.7192982456140351 piqa acc_norm,none 0.6322089227421109 truthfulqa_mc2 acc,none 0.45340473177307805 winogrande acc,none 0.500394632991318