evaluations: arc_challenge_poly_pt_acc: 0.44529914529914527 arc_challenge_poly_pt_acc_norm: 0.48205128205128206 arc_challenge_poly_pt_acc_norm_stderr: 0.014614459118720773 arc_challenge_poly_pt_acc_stderr: 0.014536106383401307 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.8766339869281046 assin2_rte_acc_stderr,all: 0.004699176594010998 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.8755544782450612 assin2_rte_f1_macro_stderr,all: 0.004739218474976754 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 1.0735661764705884 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.6290850483582386 assin2_sts_pearson_stderr,all: 0.009612669804680212 assin_entailment_acc: 0.708 assin_entailment_acc_stderr: 0.007190057317647597 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.72475 assin_paraphrase_acc_stderr: 0.007062884004258771 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.74 belebele_por_Latn_acc_norm: 0.74 belebele_por_Latn_acc_norm_stderr: 0.014629271097998421 belebele_por_Latn_acc_stderr: 0.014629271097998421 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.5591098748261474 bluex_acc,exam_id__UNICAMP_2018: 0.5370370370370371 bluex_acc,exam_id__UNICAMP_2019: 0.6 bluex_acc,exam_id__UNICAMP_2020: 0.509090909090909 bluex_acc,exam_id__UNICAMP_2021_1: 0.6304347826086957 bluex_acc,exam_id__UNICAMP_2021_2: 0.47058823529411764 bluex_acc,exam_id__UNICAMP_2022: 0.6923076923076923 bluex_acc,exam_id__UNICAMP_2023: 0.6511627906976745 bluex_acc,exam_id__UNICAMP_2024: 0.5555555555555556 bluex_acc,exam_id__USP_2018: 0.42592592592592593 bluex_acc,exam_id__USP_2019: 0.4 bluex_acc,exam_id__USP_2020: 0.5535714285714286 bluex_acc,exam_id__USP_2021: 0.6346153846153846 bluex_acc,exam_id__USP_2022: 0.4897959183673469 bluex_acc,exam_id__USP_2023: 0.6136363636363636 bluex_acc,exam_id__USP_2024: 0.6829268292682927 bluex_acc_stderr,all: 0.01069785624296974 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.039296745462938605 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.04014798243504816 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03888891915912078 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.0411282805992433 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.04024244267609041 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04269098796102326 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.041929332285094205 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04271556020713639 bluex_acc_stderr,exam_id__USP_2018: 0.038960456443585575 bluex_acc_stderr,exam_id__USP_2019: 0.04470992542423865 bluex_acc_stderr,exam_id__USP_2020: 0.03835558472845869 bluex_acc_stderr,exam_id__USP_2021: 0.03851223021094464 bluex_acc_stderr,exam_id__USP_2022: 0.0410194387799713 bluex_acc_stderr,exam_id__USP_2023: 0.04234932088737962 bluex_acc_stderr,exam_id__USP_2024: 0.0418113153523233 bluex_alias: bluex calame_pt_acc: 0.5905587668593449 calame_pt_acc_stderr: 0.010794891914388602 calame_pt_alias: calame_pt calame_pt_perplexity: 7.008747913313241 calame_pt_perplexity_stderr: 0.40940358093832135 enem_challenge_acc,all: 0.6871938418474458 enem_challenge_acc,exam_id__2009: 0.6782608695652174 enem_challenge_acc,exam_id__2010: 0.717948717948718 enem_challenge_acc,exam_id__2011: 0.7521367521367521 enem_challenge_acc,exam_id__2012: 0.7068965517241379 enem_challenge_acc,exam_id__2013: 0.6666666666666666 enem_challenge_acc,exam_id__2014: 0.6972477064220184 enem_challenge_acc,exam_id__2015: 0.7058823529411765 enem_challenge_acc,exam_id__2016: 0.6611570247933884 enem_challenge_acc,exam_id__2016_2: 0.6422764227642277 enem_challenge_acc,exam_id__2017: 0.6896551724137931 enem_challenge_acc,exam_id__2022: 0.631578947368421 enem_challenge_acc,exam_id__2023: 0.7037037037037037 enem_challenge_acc_stderr,all: 0.0070891143834158395 enem_challenge_acc_stderr,exam_id__2009: 0.0251403029631727 enem_challenge_acc_stderr,exam_id__2010: 0.02405435432253117 enem_challenge_acc_stderr,exam_id__2011: 0.023038334357693698 enem_challenge_acc_stderr,exam_id__2012: 0.02443296265724745 enem_challenge_acc_stderr,exam_id__2013: 0.02625818903872996 enem_challenge_acc_stderr,exam_id__2014: 0.02543475203567573 enem_challenge_acc_stderr,exam_id__2015: 0.0241011316238719 enem_challenge_acc_stderr,exam_id__2016: 0.02479881231135445 enem_challenge_acc_stderr,exam_id__2016_2: 0.024985945100694615 enem_challenge_acc_stderr,exam_id__2017: 0.024741511708920926 enem_challenge_acc_stderr,exam_id__2022: 0.024112138519174948 enem_challenge_acc_stderr,exam_id__2023: 0.022685440228473772 enem_challenge_alias: enem faquad_nli_acc,all: 0.7846153846153846 faquad_nli_acc_stderr,all: 0.011396120309131366 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.4396551724137931 faquad_nli_f1_macro_stderr,all: 0.00357969847290883 global_piqa_completions_por_latn_braz_acc: 0.8 global_piqa_completions_por_latn_braz_acc_bytes: 0.77 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.042295258468165065 global_piqa_completions_por_latn_braz_acc_norm: 0.77 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.042295258468165065 global_piqa_completions_por_latn_braz_acc_stderr: 0.04020151261036849 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz hatebr_offensive_acc,all: 0.8064285714285714 hatebr_offensive_acc_stderr,all: 0.0074826455677965455 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.801107069296415 hatebr_offensive_f1_macro_stderr,all: 0.007665138669900729 hellaswag_poly_pt_acc: 0.42539820132192 hellaswag_poly_pt_acc_norm: 0.5624661393433742 hellaswag_poly_pt_acc_norm_stderr: 0.005164166461307016 hellaswag_poly_pt_acc_stderr: 0.005146684217488626 hellaswag_poly_pt_alias: hellaswag_poly_pt lambada_poly_pt_acc: 0.5420143605666602 lambada_poly_pt_acc_stderr: 0.006941341313928234 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 9.820716308685725 lambada_poly_pt_perplexity_stderr: 0.3120846033602529 mmlu_poly_pt_acc: 0.5403782647853498 mmlu_poly_pt_acc_stderr: 0.004317657624183865 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.48291571753986334 oab_exams_acc,exam_id__2010-01: 0.4588235294117647 oab_exams_acc,exam_id__2010-02: 0.51 oab_exams_acc,exam_id__2011-03: 0.46464646464646464 oab_exams_acc,exam_id__2011-04: 0.45 oab_exams_acc,exam_id__2011-05: 0.5 oab_exams_acc,exam_id__2012-06: 0.4625 oab_exams_acc,exam_id__2012-06a: 0.525 oab_exams_acc,exam_id__2012-07: 0.5 oab_exams_acc,exam_id__2012-08: 0.4625 oab_exams_acc,exam_id__2012-09: 0.33766233766233766 oab_exams_acc,exam_id__2013-10: 0.525 oab_exams_acc,exam_id__2013-11: 0.525 oab_exams_acc,exam_id__2013-12: 0.525 oab_exams_acc,exam_id__2014-13: 0.475 oab_exams_acc,exam_id__2014-14: 0.5375 oab_exams_acc,exam_id__2014-15: 0.5641025641025641 oab_exams_acc,exam_id__2015-16: 0.5375 oab_exams_acc,exam_id__2015-17: 0.5384615384615384 oab_exams_acc,exam_id__2015-18: 0.4625 oab_exams_acc,exam_id__2016-19: 0.48717948717948717 oab_exams_acc,exam_id__2016-20: 0.45 oab_exams_acc,exam_id__2016-20a: 0.425 oab_exams_acc,exam_id__2016-21: 0.4625 oab_exams_acc,exam_id__2017-22: 0.45 oab_exams_acc,exam_id__2017-23: 0.45 oab_exams_acc,exam_id__2017-24: 0.5 oab_exams_acc,exam_id__2018-25: 0.45 oab_exams_acc_stderr,all: 0.006164493571290463 oab_exams_acc_stderr,exam_id__2010-01: 0.03120711424338333 oab_exams_acc_stderr,exam_id__2010-02: 0.028912621193308535 oab_exams_acc_stderr,exam_id__2011-03: 0.028826912523627856 oab_exams_acc_stderr,exam_id__2011-04: 0.03204801747078995 oab_exams_acc_stderr,exam_id__2011-05: 0.03224202969176272 oab_exams_acc_stderr,exam_id__2012-06: 0.03222923233485234 oab_exams_acc_stderr,exam_id__2012-06a: 0.03229751885191722 oab_exams_acc_stderr,exam_id__2012-07: 0.03239443199904663 oab_exams_acc_stderr,exam_id__2012-08: 0.032144839789965185 oab_exams_acc_stderr,exam_id__2012-09: 0.03103244684042299 oab_exams_acc_stderr,exam_id__2013-10: 0.032222242709920586 oab_exams_acc_stderr,exam_id__2013-11: 0.032249698626176736 oab_exams_acc_stderr,exam_id__2013-12: 0.03225675063294939 oab_exams_acc_stderr,exam_id__2014-13: 0.03217856982922958 oab_exams_acc_stderr,exam_id__2014-14: 0.032246622088818386 oab_exams_acc_stderr,exam_id__2014-15: 0.032435167155658584 oab_exams_acc_stderr,exam_id__2015-16: 0.03223354880595777 oab_exams_acc_stderr,exam_id__2015-17: 0.032573794785528166 oab_exams_acc_stderr,exam_id__2015-18: 0.032123574402475284 oab_exams_acc_stderr,exam_id__2016-19: 0.03271170717682627 oab_exams_acc_stderr,exam_id__2016-20: 0.03203769414642788 oab_exams_acc_stderr,exam_id__2016-20a: 0.031951776527517205 oab_exams_acc_stderr,exam_id__2016-21: 0.03217984644292296 oab_exams_acc_stderr,exam_id__2017-22: 0.03205629372165545 oab_exams_acc_stderr,exam_id__2017-23: 0.03221345216992268 oab_exams_acc_stderr,exam_id__2017-24: 0.03232207361521986 oab_exams_acc_stderr,exam_id__2018-25: 0.03198727711742204 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.7297297297297297 portuguese_hate_speech_acc_stderr,all: 0.010749375621571917 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.679463244638342 portuguese_hate_speech_f1_macro_stderr,all: 0.01220967447481398 tweetsentbr_acc,all: 0.7014925373134329 tweetsentbr_acc_stderr,all: 0.007246042251471291 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.6540958473356445 tweetsentbr_f1_macro_stderr,all: 0.007812938746547184 step: 100000