Files
Tucano2-qwen-3.7B-Base/evals.yaml

191 lines
9.5 KiB
YAML
Raw Permalink Normal View History

evaluations:
arc_challenge_poly_pt_acc: 0.5230769230769231
arc_challenge_poly_pt_acc_norm: 0.5777777777777777
arc_challenge_poly_pt_acc_norm_stderr: 0.014445870094078068
arc_challenge_poly_pt_acc_stderr: 0.014608300475750825
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.9252450980392157
assin2_rte_acc_stderr,all: 0.0037560275279046665
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.9251590635527494
assin2_rte_f1_macro_stderr,all: 0.0037610728425282497
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 0.5572916666666665
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.7701197353926412
assin2_sts_pearson_stderr,all: 0.006649667590414615
assin_entailment_acc: 0.704
assin_entailment_acc_stderr: 0.00721865827261647
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.694
assin_paraphrase_acc_stderr: 0.007287268079947193
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.8366666666666667
belebele_por_Latn_acc_norm: 0.8366666666666667
belebele_por_Latn_acc_norm_stderr: 0.012329168844652528
belebele_por_Latn_acc_stderr: 0.012329168844652528
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.6620305980528511
bluex_acc,exam_id__UNICAMP_2018: 0.6481481481481481
bluex_acc,exam_id__UNICAMP_2019: 0.64
bluex_acc,exam_id__UNICAMP_2020: 0.6909090909090909
bluex_acc,exam_id__UNICAMP_2021_1: 0.6521739130434783
bluex_acc,exam_id__UNICAMP_2021_2: 0.5882352941176471
bluex_acc,exam_id__UNICAMP_2022: 0.6666666666666666
bluex_acc,exam_id__UNICAMP_2023: 0.7209302325581395
bluex_acc,exam_id__UNICAMP_2024: 0.6444444444444445
bluex_acc,exam_id__USP_2018: 0.5925925925925926
bluex_acc,exam_id__USP_2019: 0.7
bluex_acc,exam_id__USP_2020: 0.6607142857142857
bluex_acc,exam_id__USP_2021: 0.75
bluex_acc,exam_id__USP_2022: 0.5918367346938775
bluex_acc,exam_id__USP_2023: 0.7045454545454546
bluex_acc,exam_id__USP_2024: 0.7073170731707317
bluex_acc_stderr,all: 0.010157757528559894
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.037594875406546435
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.03953375278949041
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03605420458368598
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.0405315180666698
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.039752636457935614
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04360776726045774
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.039432470230869696
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.041213076472343936
bluex_acc_stderr,exam_id__USP_2018: 0.038650432775192395
bluex_acc_stderr,exam_id__USP_2019: 0.04175277819931915
bluex_acc_stderr,exam_id__USP_2020: 0.03644397183647981
bluex_acc_stderr,exam_id__USP_2021: 0.034773619645811646
bluex_acc_stderr,exam_id__USP_2022: 0.040532053004604704
bluex_acc_stderr,exam_id__USP_2023: 0.039742872820681924
bluex_acc_stderr,exam_id__USP_2024: 0.040951553558739306
bluex_alias: bluex
calame_pt_acc: 0.6107899807321773
calame_pt_acc_stderr: 0.0107035762556229
calame_pt_alias: calame_pt
calame_pt_perplexity: 5.713055201421455
calame_pt_perplexity_stderr: 0.29495381614560345
enem_challenge_acc,all: 0.7753673897830651
enem_challenge_acc,exam_id__2009: 0.7478260869565218
enem_challenge_acc,exam_id__2010: 0.811965811965812
enem_challenge_acc,exam_id__2011: 0.8461538461538461
enem_challenge_acc,exam_id__2012: 0.8448275862068966
enem_challenge_acc,exam_id__2013: 0.7777777777777778
enem_challenge_acc,exam_id__2014: 0.8073394495412844
enem_challenge_acc,exam_id__2015: 0.8067226890756303
enem_challenge_acc,exam_id__2016: 0.743801652892562
enem_challenge_acc,exam_id__2016_2: 0.7154471544715447
enem_challenge_acc,exam_id__2017: 0.75
enem_challenge_acc,exam_id__2022: 0.6842105263157895
enem_challenge_acc,exam_id__2023: 0.7851851851851852
enem_challenge_acc_stderr,all: 0.006377145135723042
enem_challenge_acc_stderr,exam_id__2009: 0.023447252641875988
enem_challenge_acc_stderr,exam_id__2010: 0.02087704326839612
enem_challenge_acc_stderr,exam_id__2011: 0.01921565112452091
enem_challenge_acc_stderr,exam_id__2012: 0.01944793905815595
enem_challenge_acc_stderr,exam_id__2013: 0.023084030560191867
enem_challenge_acc_stderr,exam_id__2014: 0.021892563584984096
enem_challenge_acc_stderr,exam_id__2015: 0.020893018955083217
enem_challenge_acc_stderr,exam_id__2016: 0.022776450345788787
enem_challenge_acc_stderr,exam_id__2016_2: 0.023503035027562222
enem_challenge_acc_stderr,exam_id__2017: 0.023138027075607918
enem_challenge_acc_stderr,exam_id__2022: 0.02320588990454305
enem_challenge_acc_stderr,exam_id__2023: 0.020404682391600704
enem_challenge_alias: enem
faquad_nli_acc,all: 0.7876923076923077
faquad_nli_acc_stderr,all: 0.01133278097111669
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.45449901481427424
faquad_nli_f1_macro_stderr,all: 0.008069363645658589
global_piqa_completions_por_latn_braz_acc: 0.84
global_piqa_completions_por_latn_braz_acc_bytes: 0.83
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.03775251680686369
global_piqa_completions_por_latn_braz_acc_norm: 0.83
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.03775251680686369
global_piqa_completions_por_latn_braz_acc_stderr: 0.03684529491774706
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
hatebr_offensive_acc,all: 0.665
hatebr_offensive_acc_stderr,all: 0.00890653495166499
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.6234605955470173
hatebr_offensive_f1_macro_stderr,all: 0.00951858317428499
hellaswag_poly_pt_acc: 0.4838010618701918
hellaswag_poly_pt_acc_norm: 0.6531585220500596
hellaswag_poly_pt_acc_norm_stderr: 0.004954741713215018
hellaswag_poly_pt_acc_stderr: 0.00520221346811777
hellaswag_poly_pt_alias: hellaswag_poly_pt
lambada_poly_pt_acc: 0.6252668348534834
lambada_poly_pt_acc_stderr: 0.006743817908692071
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 6.574656712295472
lambada_poly_pt_perplexity_stderr: 0.18832300331707774
mmlu_poly_pt_acc: 0.6540078054638246
mmlu_poly_pt_acc_stderr: 0.004121199159002156
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.584510250569476
oab_exams_acc,exam_id__2010-01: 0.3764705882352941
oab_exams_acc,exam_id__2010-02: 0.59
oab_exams_acc,exam_id__2011-03: 0.5656565656565656
oab_exams_acc,exam_id__2011-04: 0.5125
oab_exams_acc,exam_id__2011-05: 0.6625
oab_exams_acc,exam_id__2012-06: 0.625
oab_exams_acc,exam_id__2012-06a: 0.7375
oab_exams_acc,exam_id__2012-07: 0.6125
oab_exams_acc,exam_id__2012-08: 0.55
oab_exams_acc,exam_id__2012-09: 0.4805194805194805
oab_exams_acc,exam_id__2013-10: 0.65
oab_exams_acc,exam_id__2013-11: 0.625
oab_exams_acc,exam_id__2013-12: 0.65
oab_exams_acc,exam_id__2014-13: 0.525
oab_exams_acc,exam_id__2014-14: 0.625
oab_exams_acc,exam_id__2014-15: 0.6538461538461539
oab_exams_acc,exam_id__2015-16: 0.5875
oab_exams_acc,exam_id__2015-17: 0.5897435897435898
oab_exams_acc,exam_id__2015-18: 0.575
oab_exams_acc,exam_id__2016-19: 0.5897435897435898
oab_exams_acc,exam_id__2016-20: 0.6125
oab_exams_acc,exam_id__2016-20a: 0.55
oab_exams_acc,exam_id__2016-21: 0.4875
oab_exams_acc,exam_id__2017-22: 0.6375
oab_exams_acc,exam_id__2017-23: 0.525
oab_exams_acc,exam_id__2017-24: 0.6375
oab_exams_acc,exam_id__2018-25: 0.5625
oab_exams_acc_stderr,all: 0.006071412237214423
oab_exams_acc_stderr,exam_id__2010-01: 0.030361740131894334
oab_exams_acc_stderr,exam_id__2010-02: 0.028445183897774385
oab_exams_acc_stderr,exam_id__2011-03: 0.02862790167127372
oab_exams_acc_stderr,exam_id__2011-04: 0.03224493616787287
oab_exams_acc_stderr,exam_id__2011-05: 0.030467009008680036
oab_exams_acc_stderr,exam_id__2012-06: 0.031220571629946996
oab_exams_acc_stderr,exam_id__2012-06a: 0.02843559794708646
oab_exams_acc_stderr,exam_id__2012-07: 0.03139589988285276
oab_exams_acc_stderr,exam_id__2012-08: 0.03226132851591818
oab_exams_acc_stderr,exam_id__2012-09: 0.032830805301195386
oab_exams_acc_stderr,exam_id__2013-10: 0.030611536360793473
oab_exams_acc_stderr,exam_id__2013-11: 0.031310092407276585
oab_exams_acc_stderr,exam_id__2013-12: 0.030692719997990617
oab_exams_acc_stderr,exam_id__2014-13: 0.032358129209763435
oab_exams_acc_stderr,exam_id__2014-14: 0.031382714558388446
oab_exams_acc_stderr,exam_id__2014-15: 0.03109637957099322
oab_exams_acc_stderr,exam_id__2015-16: 0.031957806650269406
oab_exams_acc_stderr,exam_id__2015-17: 0.03208206142728883
oab_exams_acc_stderr,exam_id__2015-18: 0.03182114971496286
oab_exams_acc_stderr,exam_id__2016-19: 0.03228511767428725
oab_exams_acc_stderr,exam_id__2016-20: 0.031372223696958024
oab_exams_acc_stderr,exam_id__2016-20a: 0.0321935167686262
oab_exams_acc_stderr,exam_id__2016-21: 0.03209267502993051
oab_exams_acc_stderr,exam_id__2017-22: 0.03105400471909683
oab_exams_acc_stderr,exam_id__2017-23: 0.03235792164586319
oab_exams_acc_stderr,exam_id__2017-24: 0.031098329350728315
oab_exams_acc_stderr,exam_id__2018-25: 0.03209246971016282
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.6145710928319624
portuguese_hate_speech_acc_stderr,all: 0.011835075822813054
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.6103088177807561
portuguese_hate_speech_f1_macro_stderr,all: 0.011900091760317547
tweetsentbr_acc,all: 0.7298507462686568
tweetsentbr_acc_stderr,all: 0.006986496038388035
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.7027752533485003
tweetsentbr_f1_macro_stderr,all: 0.007392699151541939
step: 50000