evaluations: arc_challenge_poly_pt_acc: 0.30427350427350425 arc_challenge_poly_pt_acc_norm: 0.32735042735042735 arc_challenge_poly_pt_acc_norm_stderr: 0.013724408490743929 arc_challenge_poly_pt_acc_stderr: 0.013456870841978025 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.5 assin2_rte_acc_stderr,all: 0.007138073526203373 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.3333333333333333 assin2_rte_f1_macro_stderr,all: 0.003173025139437921 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 3.059456699346405 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.0276213074062244 assin2_sts_pearson_stderr,all: 0.011665375504884477 assin_entailment_acc: 0.7335 assin_entailment_acc_stderr: 0.006991541883910775 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.71575 assin_paraphrase_acc_stderr: 0.0071327206100355 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.3611111111111111 belebele_por_Latn_acc_norm: 0.3611111111111111 belebele_por_Latn_acc_norm_stderr: 0.016019658270537297 belebele_por_Latn_acc_stderr: 0.016019658270537297 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.34492350486787204 bluex_acc,exam_id__UNICAMP_2018: 0.4074074074074074 bluex_acc,exam_id__UNICAMP_2019: 0.28 bluex_acc,exam_id__UNICAMP_2020: 0.36363636363636365 bluex_acc,exam_id__UNICAMP_2021_1: 0.2391304347826087 bluex_acc,exam_id__UNICAMP_2021_2: 0.23529411764705882 bluex_acc,exam_id__UNICAMP_2022: 0.2564102564102564 bluex_acc,exam_id__UNICAMP_2023: 0.5581395348837209 bluex_acc,exam_id__UNICAMP_2024: 0.4 bluex_acc,exam_id__USP_2018: 0.3148148148148148 bluex_acc,exam_id__USP_2019: 0.3 bluex_acc,exam_id__USP_2020: 0.30357142857142855 bluex_acc,exam_id__USP_2021: 0.36538461538461536 bluex_acc,exam_id__USP_2022: 0.3673469387755102 bluex_acc,exam_id__USP_2023: 0.38636363636363635 bluex_acc,exam_id__USP_2024: 0.4146341463414634 bluex_acc_stderr,all: 0.010242775047627772 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.038534741152469736 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0367961057296462 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.037368794971719924 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.036222183210060134 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.034079376824662555 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04027891922030821 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04379564728920204 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04209744896628822 bluex_acc_stderr,exam_id__USP_2018: 0.03647519527201769 bluex_acc_stderr,exam_id__USP_2019: 0.04181463130976304 bluex_acc_stderr,exam_id__USP_2020: 0.03539764734011866 bluex_acc_stderr,exam_id__USP_2021: 0.03859991685452418 bluex_acc_stderr,exam_id__USP_2022: 0.0397939306907615 bluex_acc_stderr,exam_id__USP_2023: 0.042529993265009204 bluex_acc_stderr,exam_id__USP_2024: 0.04456715995150719 bluex_alias: bluex calame_pt_acc: 0.09489402697495183 calame_pt_acc_stderr: 0.006433689590297553 calame_pt_alias: calame_pt calame_pt_perplexity: 8435.632130447184 calame_pt_perplexity_stderr: 963.9665411064852 enem_challenge_acc,all: 0.3198040587823653 enem_challenge_acc,exam_id__2009: 0.3130434782608696 enem_challenge_acc,exam_id__2010: 0.38461538461538464 enem_challenge_acc,exam_id__2011: 0.29914529914529914 enem_challenge_acc,exam_id__2012: 0.31896551724137934 enem_challenge_acc,exam_id__2013: 0.37037037037037035 enem_challenge_acc,exam_id__2014: 0.3394495412844037 enem_challenge_acc,exam_id__2015: 0.3025210084033613 enem_challenge_acc,exam_id__2016: 0.3140495867768595 enem_challenge_acc,exam_id__2016_2: 0.2764227642276423 enem_challenge_acc,exam_id__2017: 0.25 enem_challenge_acc,exam_id__2022: 0.3233082706766917 enem_challenge_acc,exam_id__2023: 0.34814814814814815 enem_challenge_acc_stderr,all: 0.007148951828535609 enem_challenge_acc_stderr,exam_id__2009: 0.024960208775102345 enem_challenge_acc_stderr,exam_id__2010: 0.025924496341894532 enem_challenge_acc_stderr,exam_id__2011: 0.02449152244622838 enem_challenge_acc_stderr,exam_id__2012: 0.024889831032253663 enem_challenge_acc_stderr,exam_id__2013: 0.02682596737556379 enem_challenge_acc_stderr,exam_id__2014: 0.026210250439613336 enem_challenge_acc_stderr,exam_id__2015: 0.02434000254473757 enem_challenge_acc_stderr,exam_id__2016: 0.024414986986171384 enem_challenge_acc_stderr,exam_id__2016_2: 0.023342214704782527 enem_challenge_acc_stderr,exam_id__2017: 0.02314953920904528 enem_challenge_acc_stderr,exam_id__2022: 0.023367019181975722 enem_challenge_acc_stderr,exam_id__2023: 0.023749430869741607 enem_challenge_alias: enem faquad_nli_acc,all: 0.7846153846153846 faquad_nli_acc_stderr,all: 0.011396120309131366 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.4396551724137931 faquad_nli_f1_macro_stderr,all: 0.00357969847290883 global_piqa_completions_por_latn_braz_acc: 0.7 global_piqa_completions_por_latn_braz_acc_bytes: 0.69 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.046482319871173176 global_piqa_completions_por_latn_braz_acc_norm: 0.68 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.046882617226215076 global_piqa_completions_por_latn_braz_acc_stderr: 0.04605661864718383 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz gsm8k_pt_alias: gsm8k_pt gsm8k_pt_exact_match,flexible-extract: 0.1461187214611872 gsm8k_pt_exact_match,strict-match: 0.0 gsm8k_pt_exact_match_stderr,flexible-extract: 0.009748085159664667 gsm8k_pt_exact_match_stderr,strict-match: 0.0 hatebr_offensive_acc,all: 0.5328571428571428 hatebr_offensive_acc_stderr,all: 0.009452579861644791 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.4054916057180851 hatebr_offensive_f1_macro_stderr,all: 0.007877082274553134 hellaswag_poly_pt_acc: 0.3781558131975295 hellaswag_poly_pt_acc_norm: 0.47209881894029687 hellaswag_poly_pt_acc_norm_stderr: 0.005196835630828483 hellaswag_poly_pt_acc_stderr: 0.005048035343611143 hellaswag_poly_pt_alias: hellaswag_poly_pt humaneval_instruct_alias: humaneval_instruct humaneval_instruct_pass@1,create_test: 0.0 humaneval_instruct_pass@1_stderr,create_test: 0.0 ifeval_pt_alias: ifeval_pt ifeval_pt_inst_level_loose_acc: 0.3930232558139535 ifeval_pt_inst_level_loose_acc_stderr: N/A ifeval_pt_inst_level_strict_acc: 0.3302325581395349 ifeval_pt_inst_level_strict_acc_stderr: N/A ifeval_pt_prompt_level_loose_acc: 0.27666666666666667 ifeval_pt_prompt_level_loose_acc_stderr: 0.025870931391123536 ifeval_pt_prompt_level_strict_acc: 0.21 ifeval_pt_prompt_level_strict_acc_stderr: 0.023555243542102446 lambada_poly_pt_acc: 0.20861633999611875 lambada_poly_pt_acc_stderr: 0.005660825573438258 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 861.6919525698232 lambada_poly_pt_perplexity_stderr: 63.082803555905926 mmlu_poly_pt_acc: 0.3607775442809967 mmlu_poly_pt_acc_stderr: 0.004160492531275285 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.27015945330296126 oab_exams_acc,exam_id__2010-01: 0.24705882352941178 oab_exams_acc,exam_id__2010-02: 0.28 oab_exams_acc,exam_id__2011-03: 0.24242424242424243 oab_exams_acc,exam_id__2011-04: 0.225 oab_exams_acc,exam_id__2011-05: 0.325 oab_exams_acc,exam_id__2012-06: 0.3 oab_exams_acc,exam_id__2012-06a: 0.2125 oab_exams_acc,exam_id__2012-07: 0.3125 oab_exams_acc,exam_id__2012-08: 0.1375 oab_exams_acc,exam_id__2012-09: 0.22077922077922077 oab_exams_acc,exam_id__2013-10: 0.2375 oab_exams_acc,exam_id__2013-11: 0.275 oab_exams_acc,exam_id__2013-12: 0.2375 oab_exams_acc,exam_id__2014-13: 0.2875 oab_exams_acc,exam_id__2014-14: 0.25 oab_exams_acc,exam_id__2014-15: 0.21794871794871795 oab_exams_acc,exam_id__2015-16: 0.2875 oab_exams_acc,exam_id__2015-17: 0.3333333333333333 oab_exams_acc,exam_id__2015-18: 0.275 oab_exams_acc,exam_id__2016-19: 0.2692307692307692 oab_exams_acc,exam_id__2016-20: 0.3625 oab_exams_acc,exam_id__2016-20a: 0.275 oab_exams_acc,exam_id__2016-21: 0.3 oab_exams_acc,exam_id__2017-22: 0.3375 oab_exams_acc,exam_id__2017-23: 0.3 oab_exams_acc,exam_id__2017-24: 0.3 oab_exams_acc,exam_id__2018-25: 0.25 oab_exams_acc_stderr,all: 0.0054708493847973475 oab_exams_acc_stderr,exam_id__2010-01: 0.027072546734021444 oab_exams_acc_stderr,exam_id__2010-02: 0.025924135072639538 oab_exams_acc_stderr,exam_id__2011-03: 0.024845779837305654 oab_exams_acc_stderr,exam_id__2011-04: 0.026955276691079964 oab_exams_acc_stderr,exam_id__2011-05: 0.03024661215761147 oab_exams_acc_stderr,exam_id__2012-06: 0.02951246277176116 oab_exams_acc_stderr,exam_id__2012-06a: 0.026389734380509827 oab_exams_acc_stderr,exam_id__2012-07: 0.029841237684623958 oab_exams_acc_stderr,exam_id__2012-08: 0.0221709705988969 oab_exams_acc_stderr,exam_id__2012-09: 0.027302999732294067 oab_exams_acc_stderr,exam_id__2013-10: 0.0274908309628061 oab_exams_acc_stderr,exam_id__2013-11: 0.02882257223036154 oab_exams_acc_stderr,exam_id__2013-12: 0.027413917931195794 oab_exams_acc_stderr,exam_id__2014-13: 0.02917450516650155 oab_exams_acc_stderr,exam_id__2014-14: 0.027979401755101777 oab_exams_acc_stderr,exam_id__2014-15: 0.027011736935613068 oab_exams_acc_stderr,exam_id__2015-16: 0.029203723510899393 oab_exams_acc_stderr,exam_id__2015-17: 0.030881077024925714 oab_exams_acc_stderr,exam_id__2015-18: 0.0288121979710083 oab_exams_acc_stderr,exam_id__2016-19: 0.029017747344098694 oab_exams_acc_stderr,exam_id__2016-20: 0.030947316988827907 oab_exams_acc_stderr,exam_id__2016-20a: 0.02885616149691637 oab_exams_acc_stderr,exam_id__2016-21: 0.029466001599879478 oab_exams_acc_stderr,exam_id__2017-22: 0.0304433823531397 oab_exams_acc_stderr,exam_id__2017-23: 0.0295029507416334 oab_exams_acc_stderr,exam_id__2017-24: 0.029614631804279756 oab_exams_acc_stderr,exam_id__2018-25: 0.028064299041816978 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.30552291421856637 portuguese_hate_speech_acc_stderr,all: 0.0111342367647186 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.24331655796529078 portuguese_hate_speech_f1_macro_stderr,all: 0.007523479961403228 tweetsentbr_acc,all: 0.34626865671641793 tweetsentbr_acc_stderr,all: 0.007507813204647027 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.24098176805099736 tweetsentbr_f1_macro_stderr,all: 0.005874676993024155 step: 3595