evaluations: arc_challenge_poly_pt_acc: 0.47094017094017093 arc_challenge_poly_pt_acc_norm: 0.5025641025641026 arc_challenge_poly_pt_acc_norm_stderr: 0.014623692220064299 arc_challenge_poly_pt_acc_stderr: 0.014599164650716437 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.9178921568627451 assin2_rte_acc_stderr,all: 0.0039410227425137 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.9176110196232412 assin2_rte_f1_macro_stderr,all: 0.003955066154708788 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 0.8036356209150326 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.7335010773330974 assin2_sts_pearson_stderr,all: 0.006813537226494434 assin_entailment_acc: 0.6955 assin_entailment_acc_stderr: 0.007277236909999638 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.677 assin_paraphrase_acc_stderr: 0.007394688658428343 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.7755555555555556 belebele_por_Latn_acc_norm: 0.7755555555555556 belebele_por_Latn_acc_norm_stderr: 0.013914930474237296 belebele_por_Latn_acc_stderr: 0.013914930474237296 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.5285118219749653 bluex_acc,exam_id__UNICAMP_2018: 0.5185185185185185 bluex_acc,exam_id__UNICAMP_2019: 0.44 bluex_acc,exam_id__UNICAMP_2020: 0.509090909090909 bluex_acc,exam_id__UNICAMP_2021_1: 0.4782608695652174 bluex_acc,exam_id__UNICAMP_2021_2: 0.47058823529411764 bluex_acc,exam_id__UNICAMP_2022: 0.6410256410256411 bluex_acc,exam_id__UNICAMP_2023: 0.6046511627906976 bluex_acc,exam_id__UNICAMP_2024: 0.5333333333333333 bluex_acc,exam_id__USP_2018: 0.4074074074074074 bluex_acc,exam_id__USP_2019: 0.5 bluex_acc,exam_id__USP_2020: 0.6071428571428571 bluex_acc,exam_id__USP_2021: 0.5384615384615384 bluex_acc,exam_id__USP_2022: 0.42857142857142855 bluex_acc,exam_id__USP_2023: 0.6818181818181818 bluex_acc,exam_id__USP_2024: 0.6341463414634146 bluex_acc_stderr,all: 0.010703918286665611 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03937226593136202 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0404309855509269 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.0389358045204499 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.04241243284078559 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.04027645736401125 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.044230892510041545 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04301638178039838 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04292586445321273 bluex_acc_stderr,exam_id__USP_2018: 0.0385530737645887 bluex_acc_stderr,exam_id__USP_2019: 0.04559218480567802 bluex_acc_stderr,exam_id__USP_2020: 0.03761017438804557 bluex_acc_stderr,exam_id__USP_2021: 0.04001597061911072 bluex_acc_stderr,exam_id__USP_2022: 0.04070596417180048 bluex_acc_stderr,exam_id__USP_2023: 0.040428408321166086 bluex_acc_stderr,exam_id__USP_2024: 0.04340314503826957 bluex_alias: bluex calame_pt_acc: 0.5438342967244701 calame_pt_acc_stderr: 0.010934163426897668 calame_pt_alias: calame_pt calame_pt_perplexity: 7.8689968011124725 calame_pt_perplexity_stderr: 0.41766365172631903 enem_challenge_acc,all: 0.6270118964310707 enem_challenge_acc,exam_id__2009: 0.6347826086956522 enem_challenge_acc,exam_id__2010: 0.6752136752136753 enem_challenge_acc,exam_id__2011: 0.6923076923076923 enem_challenge_acc,exam_id__2012: 0.6379310344827587 enem_challenge_acc,exam_id__2013: 0.6111111111111112 enem_challenge_acc,exam_id__2014: 0.6330275229357798 enem_challenge_acc,exam_id__2015: 0.5462184873949579 enem_challenge_acc,exam_id__2016: 0.6198347107438017 enem_challenge_acc,exam_id__2016_2: 0.6097560975609756 enem_challenge_acc,exam_id__2017: 0.6120689655172413 enem_challenge_acc,exam_id__2022: 0.5939849624060151 enem_challenge_acc,exam_id__2023: 0.6592592592592592 enem_challenge_acc_stderr,all: 0.0073957026155519546 enem_challenge_acc_stderr,exam_id__2009: 0.025965719340461144 enem_challenge_acc_stderr,exam_id__2010: 0.02502188604536942 enem_challenge_acc_stderr,exam_id__2011: 0.02467006930960307 enem_challenge_acc_stderr,exam_id__2012: 0.025714191658261398 enem_challenge_acc_stderr,exam_id__2013: 0.02702908679710427 enem_challenge_acc_stderr,exam_id__2014: 0.026672592927520465 enem_challenge_acc_stderr,exam_id__2015: 0.02626461947663519 enem_challenge_acc_stderr,exam_id__2016: 0.025564903994126094 enem_challenge_acc_stderr,exam_id__2016_2: 0.025424569351359966 enem_challenge_acc_stderr,exam_id__2017: 0.026108373622566433 enem_challenge_acc_stderr,exam_id__2022: 0.024621347603181173 enem_challenge_acc_stderr,exam_id__2023: 0.023527643017571674 enem_challenge_alias: enem faquad_nli_acc,all: 0.8584615384615385 faquad_nli_acc_stderr,all: 0.009630496129889232 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.7762511973180077 faquad_nli_f1_macro_stderr,all: 0.014629139687819745 global_piqa_completions_por_latn_braz_acc: 0.75 global_piqa_completions_por_latn_braz_acc_bytes: 0.7 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04605661864718383 global_piqa_completions_por_latn_braz_acc_norm: 0.71 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.045604802157206865 global_piqa_completions_por_latn_braz_acc_stderr: 0.04351941398892446 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz gsm8k_pt_alias: gsm8k_pt gsm8k_pt_exact_match,flexible-extract: 0.19710806697108066 gsm8k_pt_exact_match,strict-match: 0.0 gsm8k_pt_exact_match_stderr,flexible-extract: 0.010978635935133022 gsm8k_pt_exact_match_stderr,strict-match: 0.0 hatebr_offensive_acc,all: 0.805 hatebr_offensive_acc_stderr,all: 0.00750545853190643 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.7986300308904273 hatebr_offensive_f1_macro_stderr,all: 0.007718372444189955 hellaswag_poly_pt_acc: 0.43222450969769205 hellaswag_poly_pt_acc_norm: 0.5524975620327229 hellaswag_poly_pt_acc_norm_stderr: 0.00517617669289931 hellaswag_poly_pt_acc_stderr: 0.005156906089879843 hellaswag_poly_pt_alias: hellaswag_poly_pt humaneval_instruct_alias: humaneval_instruct humaneval_instruct_pass@1,create_test: 0.2621951219512195 humaneval_instruct_pass@1_stderr,create_test: 0.03445000289173463 ifeval_pt_alias: ifeval_pt ifeval_pt_inst_level_loose_acc: 0.46511627906976744 ifeval_pt_inst_level_loose_acc_stderr: N/A ifeval_pt_inst_level_strict_acc: 0.4232558139534884 ifeval_pt_inst_level_strict_acc_stderr: N/A ifeval_pt_prompt_level_loose_acc: 0.3433333333333333 ifeval_pt_prompt_level_loose_acc_stderr: 0.027459642357098978 ifeval_pt_prompt_level_strict_acc: 0.30333333333333334 ifeval_pt_prompt_level_strict_acc_stderr: 0.026585019936500975 lambada_poly_pt_acc: 0.5905297884727343 lambada_poly_pt_acc_stderr: 0.006850844880897535 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 9.837299391644223 lambada_poly_pt_perplexity_stderr: 0.3695848037190668 mmlu_poly_pt_acc: 0.5254428099669769 mmlu_poly_pt_acc_stderr: 0.0043261940110685525 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.43416856492027334 oab_exams_acc,exam_id__2010-01: 0.3411764705882353 oab_exams_acc,exam_id__2010-02: 0.46 oab_exams_acc,exam_id__2011-03: 0.3838383838383838 oab_exams_acc,exam_id__2011-04: 0.4 oab_exams_acc,exam_id__2011-05: 0.5 oab_exams_acc,exam_id__2012-06: 0.4125 oab_exams_acc,exam_id__2012-06a: 0.425 oab_exams_acc,exam_id__2012-07: 0.425 oab_exams_acc,exam_id__2012-08: 0.4625 oab_exams_acc,exam_id__2012-09: 0.38961038961038963 oab_exams_acc,exam_id__2013-10: 0.5 oab_exams_acc,exam_id__2013-11: 0.575 oab_exams_acc,exam_id__2013-12: 0.475 oab_exams_acc,exam_id__2014-13: 0.425 oab_exams_acc,exam_id__2014-14: 0.475 oab_exams_acc,exam_id__2014-15: 0.5 oab_exams_acc,exam_id__2015-16: 0.425 oab_exams_acc,exam_id__2015-17: 0.4230769230769231 oab_exams_acc,exam_id__2015-18: 0.4 oab_exams_acc,exam_id__2016-19: 0.47435897435897434 oab_exams_acc,exam_id__2016-20: 0.4625 oab_exams_acc,exam_id__2016-20a: 0.375 oab_exams_acc,exam_id__2016-21: 0.4125 oab_exams_acc,exam_id__2017-22: 0.4375 oab_exams_acc,exam_id__2017-23: 0.3375 oab_exams_acc,exam_id__2017-24: 0.425 oab_exams_acc,exam_id__2018-25: 0.4125 oab_exams_acc_stderr,all: 0.006102155835634948 oab_exams_acc_stderr,exam_id__2010-01: 0.029640791842627794 oab_exams_acc_stderr,exam_id__2010-02: 0.02872289890111472 oab_exams_acc_stderr,exam_id__2011-03: 0.02813693132118459 oab_exams_acc_stderr,exam_id__2011-04: 0.03154240021693644 oab_exams_acc_stderr,exam_id__2011-05: 0.03224120715021378 oab_exams_acc_stderr,exam_id__2012-06: 0.031634579715402525 oab_exams_acc_stderr,exam_id__2012-06a: 0.031920386806255 oab_exams_acc_stderr,exam_id__2012-07: 0.03196335345646489 oab_exams_acc_stderr,exam_id__2012-08: 0.03218895689579674 oab_exams_acc_stderr,exam_id__2012-09: 0.0320702088032968 oab_exams_acc_stderr,exam_id__2013-10: 0.032306001943973275 oab_exams_acc_stderr,exam_id__2013-11: 0.03198130854191419 oab_exams_acc_stderr,exam_id__2013-12: 0.03227420874356609 oab_exams_acc_stderr,exam_id__2014-13: 0.03197697369553699 oab_exams_acc_stderr,exam_id__2014-14: 0.03231427645661526 oab_exams_acc_stderr,exam_id__2014-15: 0.03281148582087051 oab_exams_acc_stderr,exam_id__2015-16: 0.03193036659905079 oab_exams_acc_stderr,exam_id__2015-17: 0.03231882364931395 oab_exams_acc_stderr,exam_id__2015-18: 0.031606839641845616 oab_exams_acc_stderr,exam_id__2016-19: 0.03261356291633692 oab_exams_acc_stderr,exam_id__2016-20: 0.03217012301961201 oab_exams_acc_stderr,exam_id__2016-20a: 0.03124902402233319 oab_exams_acc_stderr,exam_id__2016-21: 0.03172689133529826 oab_exams_acc_stderr,exam_id__2017-22: 0.03202936221429109 oab_exams_acc_stderr,exam_id__2017-23: 0.03052924311679464 oab_exams_acc_stderr,exam_id__2017-24: 0.031892680920180556 oab_exams_acc_stderr,exam_id__2018-25: 0.03181646004259634 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.7708578143360753 portuguese_hate_speech_acc_stderr,all: 0.010179285936061107 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.7383400530746562 portuguese_hate_speech_f1_macro_stderr,all: 0.011391910083076176 tweetsentbr_acc,all: 0.663681592039801 tweetsentbr_acc_stderr,all: 0.007468862153058396 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.57818681435807 tweetsentbr_f1_macro_stderr,all: 0.007796495757272563 step: 69750