evaluations: arc_challenge_poly_pt_acc: 0.5230769230769231 arc_challenge_poly_pt_acc_norm: 0.5777777777777777 arc_challenge_poly_pt_acc_norm_stderr: 0.014445870094078068 arc_challenge_poly_pt_acc_stderr: 0.014608300475750825 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.9252450980392157 assin2_rte_acc_stderr,all: 0.0037560275279046665 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.9251590635527494 assin2_rte_f1_macro_stderr,all: 0.0037610728425282497 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 0.5572916666666665 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.7701197353926412 assin2_sts_pearson_stderr,all: 0.006649667590414615 assin_entailment_acc: 0.704 assin_entailment_acc_stderr: 0.00721865827261647 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.694 assin_paraphrase_acc_stderr: 0.007287268079947193 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.8366666666666667 belebele_por_Latn_acc_norm: 0.8366666666666667 belebele_por_Latn_acc_norm_stderr: 0.012329168844652528 belebele_por_Latn_acc_stderr: 0.012329168844652528 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.6620305980528511 bluex_acc,exam_id__UNICAMP_2018: 0.6481481481481481 bluex_acc,exam_id__UNICAMP_2019: 0.64 bluex_acc,exam_id__UNICAMP_2020: 0.6909090909090909 bluex_acc,exam_id__UNICAMP_2021_1: 0.6521739130434783 bluex_acc,exam_id__UNICAMP_2021_2: 0.5882352941176471 bluex_acc,exam_id__UNICAMP_2022: 0.6666666666666666 bluex_acc,exam_id__UNICAMP_2023: 0.7209302325581395 bluex_acc,exam_id__UNICAMP_2024: 0.6444444444444445 bluex_acc,exam_id__USP_2018: 0.5925925925925926 bluex_acc,exam_id__USP_2019: 0.7 bluex_acc,exam_id__USP_2020: 0.6607142857142857 bluex_acc,exam_id__USP_2021: 0.75 bluex_acc,exam_id__USP_2022: 0.5918367346938775 bluex_acc,exam_id__USP_2023: 0.7045454545454546 bluex_acc,exam_id__USP_2024: 0.7073170731707317 bluex_acc_stderr,all: 0.010157757528559894 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.037594875406546435 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.03953375278949041 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03605420458368598 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.0405315180666698 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.039752636457935614 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04360776726045774 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.039432470230869696 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.041213076472343936 bluex_acc_stderr,exam_id__USP_2018: 0.038650432775192395 bluex_acc_stderr,exam_id__USP_2019: 0.04175277819931915 bluex_acc_stderr,exam_id__USP_2020: 0.03644397183647981 bluex_acc_stderr,exam_id__USP_2021: 0.034773619645811646 bluex_acc_stderr,exam_id__USP_2022: 0.040532053004604704 bluex_acc_stderr,exam_id__USP_2023: 0.039742872820681924 bluex_acc_stderr,exam_id__USP_2024: 0.040951553558739306 bluex_alias: bluex calame_pt_acc: 0.6107899807321773 calame_pt_acc_stderr: 0.0107035762556229 calame_pt_alias: calame_pt calame_pt_perplexity: 5.713055201421455 calame_pt_perplexity_stderr: 0.29495381614560345 enem_challenge_acc,all: 0.7753673897830651 enem_challenge_acc,exam_id__2009: 0.7478260869565218 enem_challenge_acc,exam_id__2010: 0.811965811965812 enem_challenge_acc,exam_id__2011: 0.8461538461538461 enem_challenge_acc,exam_id__2012: 0.8448275862068966 enem_challenge_acc,exam_id__2013: 0.7777777777777778 enem_challenge_acc,exam_id__2014: 0.8073394495412844 enem_challenge_acc,exam_id__2015: 0.8067226890756303 enem_challenge_acc,exam_id__2016: 0.743801652892562 enem_challenge_acc,exam_id__2016_2: 0.7154471544715447 enem_challenge_acc,exam_id__2017: 0.75 enem_challenge_acc,exam_id__2022: 0.6842105263157895 enem_challenge_acc,exam_id__2023: 0.7851851851851852 enem_challenge_acc_stderr,all: 0.006377145135723042 enem_challenge_acc_stderr,exam_id__2009: 0.023447252641875988 enem_challenge_acc_stderr,exam_id__2010: 0.02087704326839612 enem_challenge_acc_stderr,exam_id__2011: 0.01921565112452091 enem_challenge_acc_stderr,exam_id__2012: 0.01944793905815595 enem_challenge_acc_stderr,exam_id__2013: 0.023084030560191867 enem_challenge_acc_stderr,exam_id__2014: 0.021892563584984096 enem_challenge_acc_stderr,exam_id__2015: 0.020893018955083217 enem_challenge_acc_stderr,exam_id__2016: 0.022776450345788787 enem_challenge_acc_stderr,exam_id__2016_2: 0.023503035027562222 enem_challenge_acc_stderr,exam_id__2017: 0.023138027075607918 enem_challenge_acc_stderr,exam_id__2022: 0.02320588990454305 enem_challenge_acc_stderr,exam_id__2023: 0.020404682391600704 enem_challenge_alias: enem faquad_nli_acc,all: 0.7876923076923077 faquad_nli_acc_stderr,all: 0.01133278097111669 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.45449901481427424 faquad_nli_f1_macro_stderr,all: 0.008069363645658589 global_piqa_completions_por_latn_braz_acc: 0.84 global_piqa_completions_por_latn_braz_acc_bytes: 0.83 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.03775251680686369 global_piqa_completions_por_latn_braz_acc_norm: 0.83 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.03775251680686369 global_piqa_completions_por_latn_braz_acc_stderr: 0.03684529491774706 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz hatebr_offensive_acc,all: 0.665 hatebr_offensive_acc_stderr,all: 0.00890653495166499 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.6234605955470173 hatebr_offensive_f1_macro_stderr,all: 0.00951858317428499 hellaswag_poly_pt_acc: 0.4838010618701918 hellaswag_poly_pt_acc_norm: 0.6531585220500596 hellaswag_poly_pt_acc_norm_stderr: 0.004954741713215018 hellaswag_poly_pt_acc_stderr: 0.00520221346811777 hellaswag_poly_pt_alias: hellaswag_poly_pt lambada_poly_pt_acc: 0.6252668348534834 lambada_poly_pt_acc_stderr: 0.006743817908692071 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 6.574656712295472 lambada_poly_pt_perplexity_stderr: 0.18832300331707774 mmlu_poly_pt_acc: 0.6540078054638246 mmlu_poly_pt_acc_stderr: 0.004121199159002156 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.584510250569476 oab_exams_acc,exam_id__2010-01: 0.3764705882352941 oab_exams_acc,exam_id__2010-02: 0.59 oab_exams_acc,exam_id__2011-03: 0.5656565656565656 oab_exams_acc,exam_id__2011-04: 0.5125 oab_exams_acc,exam_id__2011-05: 0.6625 oab_exams_acc,exam_id__2012-06: 0.625 oab_exams_acc,exam_id__2012-06a: 0.7375 oab_exams_acc,exam_id__2012-07: 0.6125 oab_exams_acc,exam_id__2012-08: 0.55 oab_exams_acc,exam_id__2012-09: 0.4805194805194805 oab_exams_acc,exam_id__2013-10: 0.65 oab_exams_acc,exam_id__2013-11: 0.625 oab_exams_acc,exam_id__2013-12: 0.65 oab_exams_acc,exam_id__2014-13: 0.525 oab_exams_acc,exam_id__2014-14: 0.625 oab_exams_acc,exam_id__2014-15: 0.6538461538461539 oab_exams_acc,exam_id__2015-16: 0.5875 oab_exams_acc,exam_id__2015-17: 0.5897435897435898 oab_exams_acc,exam_id__2015-18: 0.575 oab_exams_acc,exam_id__2016-19: 0.5897435897435898 oab_exams_acc,exam_id__2016-20: 0.6125 oab_exams_acc,exam_id__2016-20a: 0.55 oab_exams_acc,exam_id__2016-21: 0.4875 oab_exams_acc,exam_id__2017-22: 0.6375 oab_exams_acc,exam_id__2017-23: 0.525 oab_exams_acc,exam_id__2017-24: 0.6375 oab_exams_acc,exam_id__2018-25: 0.5625 oab_exams_acc_stderr,all: 0.006071412237214423 oab_exams_acc_stderr,exam_id__2010-01: 0.030361740131894334 oab_exams_acc_stderr,exam_id__2010-02: 0.028445183897774385 oab_exams_acc_stderr,exam_id__2011-03: 0.02862790167127372 oab_exams_acc_stderr,exam_id__2011-04: 0.03224493616787287 oab_exams_acc_stderr,exam_id__2011-05: 0.030467009008680036 oab_exams_acc_stderr,exam_id__2012-06: 0.031220571629946996 oab_exams_acc_stderr,exam_id__2012-06a: 0.02843559794708646 oab_exams_acc_stderr,exam_id__2012-07: 0.03139589988285276 oab_exams_acc_stderr,exam_id__2012-08: 0.03226132851591818 oab_exams_acc_stderr,exam_id__2012-09: 0.032830805301195386 oab_exams_acc_stderr,exam_id__2013-10: 0.030611536360793473 oab_exams_acc_stderr,exam_id__2013-11: 0.031310092407276585 oab_exams_acc_stderr,exam_id__2013-12: 0.030692719997990617 oab_exams_acc_stderr,exam_id__2014-13: 0.032358129209763435 oab_exams_acc_stderr,exam_id__2014-14: 0.031382714558388446 oab_exams_acc_stderr,exam_id__2014-15: 0.03109637957099322 oab_exams_acc_stderr,exam_id__2015-16: 0.031957806650269406 oab_exams_acc_stderr,exam_id__2015-17: 0.03208206142728883 oab_exams_acc_stderr,exam_id__2015-18: 0.03182114971496286 oab_exams_acc_stderr,exam_id__2016-19: 0.03228511767428725 oab_exams_acc_stderr,exam_id__2016-20: 0.031372223696958024 oab_exams_acc_stderr,exam_id__2016-20a: 0.0321935167686262 oab_exams_acc_stderr,exam_id__2016-21: 0.03209267502993051 oab_exams_acc_stderr,exam_id__2017-22: 0.03105400471909683 oab_exams_acc_stderr,exam_id__2017-23: 0.03235792164586319 oab_exams_acc_stderr,exam_id__2017-24: 0.031098329350728315 oab_exams_acc_stderr,exam_id__2018-25: 0.03209246971016282 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.6145710928319624 portuguese_hate_speech_acc_stderr,all: 0.011835075822813054 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.6103088177807561 portuguese_hate_speech_f1_macro_stderr,all: 0.011900091760317547 tweetsentbr_acc,all: 0.7298507462686568 tweetsentbr_acc_stderr,all: 0.006986496038388035 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.7027752533485003 tweetsentbr_f1_macro_stderr,all: 0.007392699151541939 step: 50000