evaluations: arc_challenge_poly_pt_acc: 0.32905982905982906 arc_challenge_poly_pt_acc_norm: 0.37435897435897436 arc_challenge_poly_pt_acc_norm_stderr: 0.014154661190814505 arc_challenge_poly_pt_acc_stderr: 0.013742700308521677 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.610702614379085 assin2_rte_acc_stderr,all: 0.006976455542871003 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.5493894012220668 assin2_rte_f1_macro_stderr,all: 0.007313213532417135 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 1.954166666666667 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.06429110147600466 assin2_sts_pearson_stderr,all: 0.013460910678684006 assin_entailment_acc: 0.70675 assin_entailment_acc_stderr: 0.007199067024031941 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.72675 assin_paraphrase_acc_stderr: 0.007046880444991351 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.5388888888888889 belebele_por_Latn_acc_norm: 0.5388888888888889 belebele_por_Latn_acc_norm_stderr: 0.016625417583086437 belebele_por_Latn_acc_stderr: 0.016625417583086437 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.46870653685674546 bluex_acc,exam_id__UNICAMP_2018: 0.46296296296296297 bluex_acc,exam_id__UNICAMP_2019: 0.46 bluex_acc,exam_id__UNICAMP_2020: 0.5636363636363636 bluex_acc,exam_id__UNICAMP_2021_1: 0.3695652173913043 bluex_acc,exam_id__UNICAMP_2021_2: 0.37254901960784315 bluex_acc,exam_id__UNICAMP_2022: 0.5641025641025641 bluex_acc,exam_id__UNICAMP_2023: 0.6046511627906976 bluex_acc,exam_id__UNICAMP_2024: 0.5777777777777777 bluex_acc,exam_id__USP_2018: 0.37037037037037035 bluex_acc,exam_id__USP_2019: 0.475 bluex_acc,exam_id__USP_2020: 0.4642857142857143 bluex_acc,exam_id__USP_2021: 0.34615384615384615 bluex_acc,exam_id__USP_2022: 0.3877551020408163 bluex_acc,exam_id__USP_2023: 0.5227272727272727 bluex_acc,exam_id__USP_2024: 0.5609756097560976 bluex_acc_stderr,all: 0.010751763481122825 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03908140507987583 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.04086284322684593 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03854268045176616 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.041049198501562786 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.03903627297829125 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.046067534061712136 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.043110276875963825 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04235722643358554 bluex_acc_stderr,exam_id__USP_2018: 0.03788178159383037 bluex_acc_stderr,exam_id__USP_2019: 0.04557332896428892 bluex_acc_stderr,exam_id__USP_2020: 0.038341658891473986 bluex_acc_stderr,exam_id__USP_2021: 0.038139467533341624 bluex_acc_stderr,exam_id__USP_2022: 0.040012931518371306 bluex_acc_stderr,exam_id__USP_2023: 0.043422447984322925 bluex_acc_stderr,exam_id__USP_2024: 0.044825815810456374 bluex_alias: bluex calame_pt_acc: 0.5867052023121387 calame_pt_acc_stderr: 0.01081012929476997 calame_pt_alias: calame_pt calame_pt_perplexity: 7.2359833214160085 calame_pt_perplexity_stderr: 0.42482617041660564 enem_challenge_acc,all: 0.5514345696291113 enem_challenge_acc,exam_id__2009: 0.5043478260869565 enem_challenge_acc,exam_id__2010: 0.5726495726495726 enem_challenge_acc,exam_id__2011: 0.6153846153846154 enem_challenge_acc,exam_id__2012: 0.5344827586206896 enem_challenge_acc,exam_id__2013: 0.5833333333333334 enem_challenge_acc,exam_id__2014: 0.5688073394495413 enem_challenge_acc,exam_id__2015: 0.4789915966386555 enem_challenge_acc,exam_id__2016: 0.5371900826446281 enem_challenge_acc,exam_id__2016_2: 0.5284552845528455 enem_challenge_acc,exam_id__2017: 0.5086206896551724 enem_challenge_acc,exam_id__2022: 0.5639097744360902 enem_challenge_acc,exam_id__2023: 0.6148148148148148 enem_challenge_acc_stderr,all: 0.007618623435000518 enem_challenge_acc_stderr,exam_id__2009: 0.026949045711548218 enem_challenge_acc_stderr,exam_id__2010: 0.026429279577735547 enem_challenge_acc_stderr,exam_id__2011: 0.02589822342752356 enem_challenge_acc_stderr,exam_id__2012: 0.02669879193984667 enem_challenge_acc_stderr,exam_id__2013: 0.02740882906544383 enem_challenge_acc_stderr,exam_id__2014: 0.027336979010138398 enem_challenge_acc_stderr,exam_id__2015: 0.026385065690201895 enem_challenge_acc_stderr,exam_id__2016: 0.026088144599797948 enem_challenge_acc_stderr,exam_id__2016_2: 0.02606861159558674 enem_challenge_acc_stderr,exam_id__2017: 0.02684138658677631 enem_challenge_acc_stderr,exam_id__2022: 0.02485425227418429 enem_challenge_acc_stderr,exam_id__2023: 0.024259924778206914 enem_challenge_alias: enem faquad_nli_acc,all: 0.7846153846153846 faquad_nli_acc_stderr,all: 0.011396120309131366 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.4396551724137931 faquad_nli_f1_macro_stderr,all: 0.00357969847290883 global_piqa_completions_por_latn_braz_acc: 0.79 global_piqa_completions_por_latn_braz_acc_bytes: 0.75 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04351941398892446 global_piqa_completions_por_latn_braz_acc_norm: 0.74 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.0440844002276808 global_piqa_completions_por_latn_braz_acc_stderr: 0.040936018074033236 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz hatebr_offensive_acc,all: 0.7964285714285714 hatebr_offensive_acc_stderr,all: 0.007613133172324561 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.7963528271484649 hatebr_offensive_f1_macro_stderr,all: 0.00761763073122653 hellaswag_poly_pt_acc: 0.3776140426915159 hellaswag_poly_pt_acc_norm: 0.48434283237620546 hellaswag_poly_pt_acc_norm_stderr: 0.005202393220555622 hellaswag_poly_pt_acc_stderr: 0.005046614926940191 hellaswag_poly_pt_alias: hellaswag_poly_pt lambada_poly_pt_acc: 0.4513875412381137 lambada_poly_pt_acc_stderr: 0.006932975888368315 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 15.604144991512424 lambada_poly_pt_perplexity_stderr: 0.5333901055397442 mmlu_poly_pt_acc: 0.39680276193335334 mmlu_poly_pt_acc_stderr: 0.0042385372220186505 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.40364464692482915 oab_exams_acc,exam_id__2010-01: 0.4 oab_exams_acc,exam_id__2010-02: 0.4 oab_exams_acc,exam_id__2011-03: 0.3434343434343434 oab_exams_acc,exam_id__2011-04: 0.4125 oab_exams_acc,exam_id__2011-05: 0.425 oab_exams_acc,exam_id__2012-06: 0.375 oab_exams_acc,exam_id__2012-06a: 0.475 oab_exams_acc,exam_id__2012-07: 0.3125 oab_exams_acc,exam_id__2012-08: 0.425 oab_exams_acc,exam_id__2012-09: 0.3246753246753247 oab_exams_acc,exam_id__2013-10: 0.425 oab_exams_acc,exam_id__2013-11: 0.4375 oab_exams_acc,exam_id__2013-12: 0.4375 oab_exams_acc,exam_id__2014-13: 0.425 oab_exams_acc,exam_id__2014-14: 0.4125 oab_exams_acc,exam_id__2014-15: 0.41025641025641024 oab_exams_acc,exam_id__2015-16: 0.35 oab_exams_acc,exam_id__2015-17: 0.48717948717948717 oab_exams_acc,exam_id__2015-18: 0.3375 oab_exams_acc,exam_id__2016-19: 0.44871794871794873 oab_exams_acc,exam_id__2016-20: 0.3875 oab_exams_acc,exam_id__2016-20a: 0.3625 oab_exams_acc,exam_id__2016-21: 0.425 oab_exams_acc,exam_id__2017-22: 0.425 oab_exams_acc,exam_id__2017-23: 0.425 oab_exams_acc,exam_id__2017-24: 0.3875 oab_exams_acc,exam_id__2018-25: 0.4375 oab_exams_acc_stderr,all: 0.006053332133276562 oab_exams_acc_stderr,exam_id__2010-01: 0.030610161516106096 oab_exams_acc_stderr,exam_id__2010-02: 0.028296630437280906 oab_exams_acc_stderr,exam_id__2011-03: 0.027549621066797744 oab_exams_acc_stderr,exam_id__2011-04: 0.031730143003018005 oab_exams_acc_stderr,exam_id__2011-05: 0.03187425289992289 oab_exams_acc_stderr,exam_id__2012-06: 0.031303363618806604 oab_exams_acc_stderr,exam_id__2012-06a: 0.03204492958151872 oab_exams_acc_stderr,exam_id__2012-07: 0.030024899757931458 oab_exams_acc_stderr,exam_id__2012-08: 0.03186228006495766 oab_exams_acc_stderr,exam_id__2012-09: 0.030794179748498546 oab_exams_acc_stderr,exam_id__2013-10: 0.03186391626480471 oab_exams_acc_stderr,exam_id__2013-11: 0.03202492621379354 oab_exams_acc_stderr,exam_id__2013-12: 0.03195252134301974 oab_exams_acc_stderr,exam_id__2014-13: 0.03180998638392341 oab_exams_acc_stderr,exam_id__2014-14: 0.03183014975650971 oab_exams_acc_stderr,exam_id__2014-15: 0.03207677106125773 oab_exams_acc_stderr,exam_id__2015-16: 0.030649958521189145 oab_exams_acc_stderr,exam_id__2015-17: 0.03266206847243709 oab_exams_acc_stderr,exam_id__2015-18: 0.030453791802761587 oab_exams_acc_stderr,exam_id__2016-19: 0.03245421470850268 oab_exams_acc_stderr,exam_id__2016-20: 0.03147299166016239 oab_exams_acc_stderr,exam_id__2016-20a: 0.031016128861774545 oab_exams_acc_stderr,exam_id__2016-21: 0.03173265714315922 oab_exams_acc_stderr,exam_id__2017-22: 0.03191749762684417 oab_exams_acc_stderr,exam_id__2017-23: 0.0320020087581464 oab_exams_acc_stderr,exam_id__2017-24: 0.031539234776498976 oab_exams_acc_stderr,exam_id__2018-25: 0.03202176928915031 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.43478260869565216 portuguese_hate_speech_acc_stderr,all: 0.012048917796997311 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.42838929145469706 portuguese_hate_speech_f1_macro_stderr,all: 0.011986624066160297 tweetsentbr_acc,all: 0.4736318407960199 tweetsentbr_acc_stderr,all: 0.007893686144300348 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.2678826295357763 tweetsentbr_f1_macro_stderr,all: 0.005779014212286808 step: 50000