191 lines
9.5 KiB
YAML
191 lines
9.5 KiB
YAML
evaluations:
|
|
arc_challenge_poly_pt_acc: 0.44529914529914527
|
|
arc_challenge_poly_pt_acc_norm: 0.48205128205128206
|
|
arc_challenge_poly_pt_acc_norm_stderr: 0.014614459118720773
|
|
arc_challenge_poly_pt_acc_stderr: 0.014536106383401307
|
|
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
|
|
assin2_rte_acc,all: 0.8766339869281046
|
|
assin2_rte_acc_stderr,all: 0.004699176594010998
|
|
assin2_rte_alias: assin2_rte
|
|
assin2_rte_f1_macro,all: 0.8755544782450612
|
|
assin2_rte_f1_macro_stderr,all: 0.004739218474976754
|
|
assin2_sts_alias: assin2_sts
|
|
assin2_sts_mse,all: 1.0735661764705884
|
|
assin2_sts_mse_stderr,all: N/A
|
|
assin2_sts_pearson,all: 0.6290850483582386
|
|
assin2_sts_pearson_stderr,all: 0.009612669804680212
|
|
assin_entailment_acc: 0.708
|
|
assin_entailment_acc_stderr: 0.007190057317647597
|
|
assin_entailment_alias: assin_entailment
|
|
assin_paraphrase_acc: 0.72475
|
|
assin_paraphrase_acc_stderr: 0.007062884004258771
|
|
assin_paraphrase_alias: assin_paraphrase
|
|
belebele_por_Latn_acc: 0.74
|
|
belebele_por_Latn_acc_norm: 0.74
|
|
belebele_por_Latn_acc_norm_stderr: 0.014629271097998421
|
|
belebele_por_Latn_acc_stderr: 0.014629271097998421
|
|
belebele_por_Latn_alias: belebele_por_Latn
|
|
bluex_acc,all: 0.5591098748261474
|
|
bluex_acc,exam_id__UNICAMP_2018: 0.5370370370370371
|
|
bluex_acc,exam_id__UNICAMP_2019: 0.6
|
|
bluex_acc,exam_id__UNICAMP_2020: 0.509090909090909
|
|
bluex_acc,exam_id__UNICAMP_2021_1: 0.6304347826086957
|
|
bluex_acc,exam_id__UNICAMP_2021_2: 0.47058823529411764
|
|
bluex_acc,exam_id__UNICAMP_2022: 0.6923076923076923
|
|
bluex_acc,exam_id__UNICAMP_2023: 0.6511627906976745
|
|
bluex_acc,exam_id__UNICAMP_2024: 0.5555555555555556
|
|
bluex_acc,exam_id__USP_2018: 0.42592592592592593
|
|
bluex_acc,exam_id__USP_2019: 0.4
|
|
bluex_acc,exam_id__USP_2020: 0.5535714285714286
|
|
bluex_acc,exam_id__USP_2021: 0.6346153846153846
|
|
bluex_acc,exam_id__USP_2022: 0.4897959183673469
|
|
bluex_acc,exam_id__USP_2023: 0.6136363636363636
|
|
bluex_acc,exam_id__USP_2024: 0.6829268292682927
|
|
bluex_acc_stderr,all: 0.01069785624296974
|
|
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.039296745462938605
|
|
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.04014798243504816
|
|
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03888891915912078
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.0411282805992433
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.04024244267609041
|
|
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04269098796102326
|
|
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.041929332285094205
|
|
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04271556020713639
|
|
bluex_acc_stderr,exam_id__USP_2018: 0.038960456443585575
|
|
bluex_acc_stderr,exam_id__USP_2019: 0.04470992542423865
|
|
bluex_acc_stderr,exam_id__USP_2020: 0.03835558472845869
|
|
bluex_acc_stderr,exam_id__USP_2021: 0.03851223021094464
|
|
bluex_acc_stderr,exam_id__USP_2022: 0.0410194387799713
|
|
bluex_acc_stderr,exam_id__USP_2023: 0.04234932088737962
|
|
bluex_acc_stderr,exam_id__USP_2024: 0.0418113153523233
|
|
bluex_alias: bluex
|
|
calame_pt_acc: 0.5905587668593449
|
|
calame_pt_acc_stderr: 0.010794891914388602
|
|
calame_pt_alias: calame_pt
|
|
calame_pt_perplexity: 7.008747913313241
|
|
calame_pt_perplexity_stderr: 0.40940358093832135
|
|
enem_challenge_acc,all: 0.6871938418474458
|
|
enem_challenge_acc,exam_id__2009: 0.6782608695652174
|
|
enem_challenge_acc,exam_id__2010: 0.717948717948718
|
|
enem_challenge_acc,exam_id__2011: 0.7521367521367521
|
|
enem_challenge_acc,exam_id__2012: 0.7068965517241379
|
|
enem_challenge_acc,exam_id__2013: 0.6666666666666666
|
|
enem_challenge_acc,exam_id__2014: 0.6972477064220184
|
|
enem_challenge_acc,exam_id__2015: 0.7058823529411765
|
|
enem_challenge_acc,exam_id__2016: 0.6611570247933884
|
|
enem_challenge_acc,exam_id__2016_2: 0.6422764227642277
|
|
enem_challenge_acc,exam_id__2017: 0.6896551724137931
|
|
enem_challenge_acc,exam_id__2022: 0.631578947368421
|
|
enem_challenge_acc,exam_id__2023: 0.7037037037037037
|
|
enem_challenge_acc_stderr,all: 0.0070891143834158395
|
|
enem_challenge_acc_stderr,exam_id__2009: 0.0251403029631727
|
|
enem_challenge_acc_stderr,exam_id__2010: 0.02405435432253117
|
|
enem_challenge_acc_stderr,exam_id__2011: 0.023038334357693698
|
|
enem_challenge_acc_stderr,exam_id__2012: 0.02443296265724745
|
|
enem_challenge_acc_stderr,exam_id__2013: 0.02625818903872996
|
|
enem_challenge_acc_stderr,exam_id__2014: 0.02543475203567573
|
|
enem_challenge_acc_stderr,exam_id__2015: 0.0241011316238719
|
|
enem_challenge_acc_stderr,exam_id__2016: 0.02479881231135445
|
|
enem_challenge_acc_stderr,exam_id__2016_2: 0.024985945100694615
|
|
enem_challenge_acc_stderr,exam_id__2017: 0.024741511708920926
|
|
enem_challenge_acc_stderr,exam_id__2022: 0.024112138519174948
|
|
enem_challenge_acc_stderr,exam_id__2023: 0.022685440228473772
|
|
enem_challenge_alias: enem
|
|
faquad_nli_acc,all: 0.7846153846153846
|
|
faquad_nli_acc_stderr,all: 0.011396120309131366
|
|
faquad_nli_alias: faquad_nli
|
|
faquad_nli_f1_macro,all: 0.4396551724137931
|
|
faquad_nli_f1_macro_stderr,all: 0.00357969847290883
|
|
global_piqa_completions_por_latn_braz_acc: 0.8
|
|
global_piqa_completions_por_latn_braz_acc_bytes: 0.77
|
|
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.042295258468165065
|
|
global_piqa_completions_por_latn_braz_acc_norm: 0.77
|
|
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.042295258468165065
|
|
global_piqa_completions_por_latn_braz_acc_stderr: 0.04020151261036849
|
|
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
|
|
hatebr_offensive_acc,all: 0.8064285714285714
|
|
hatebr_offensive_acc_stderr,all: 0.0074826455677965455
|
|
hatebr_offensive_alias: hatebr_offensive_binary
|
|
hatebr_offensive_f1_macro,all: 0.801107069296415
|
|
hatebr_offensive_f1_macro_stderr,all: 0.007665138669900729
|
|
hellaswag_poly_pt_acc: 0.42539820132192
|
|
hellaswag_poly_pt_acc_norm: 0.5624661393433742
|
|
hellaswag_poly_pt_acc_norm_stderr: 0.005164166461307016
|
|
hellaswag_poly_pt_acc_stderr: 0.005146684217488626
|
|
hellaswag_poly_pt_alias: hellaswag_poly_pt
|
|
lambada_poly_pt_acc: 0.5420143605666602
|
|
lambada_poly_pt_acc_stderr: 0.006941341313928234
|
|
lambada_poly_pt_alias: lambada_poly_pt
|
|
lambada_poly_pt_perplexity: 9.820716308685725
|
|
lambada_poly_pt_perplexity_stderr: 0.3120846033602529
|
|
mmlu_poly_pt_acc: 0.5403782647853498
|
|
mmlu_poly_pt_acc_stderr: 0.004317657624183865
|
|
mmlu_poly_pt_alias: mmlu_poly_pt
|
|
oab_exams_acc,all: 0.48291571753986334
|
|
oab_exams_acc,exam_id__2010-01: 0.4588235294117647
|
|
oab_exams_acc,exam_id__2010-02: 0.51
|
|
oab_exams_acc,exam_id__2011-03: 0.46464646464646464
|
|
oab_exams_acc,exam_id__2011-04: 0.45
|
|
oab_exams_acc,exam_id__2011-05: 0.5
|
|
oab_exams_acc,exam_id__2012-06: 0.4625
|
|
oab_exams_acc,exam_id__2012-06a: 0.525
|
|
oab_exams_acc,exam_id__2012-07: 0.5
|
|
oab_exams_acc,exam_id__2012-08: 0.4625
|
|
oab_exams_acc,exam_id__2012-09: 0.33766233766233766
|
|
oab_exams_acc,exam_id__2013-10: 0.525
|
|
oab_exams_acc,exam_id__2013-11: 0.525
|
|
oab_exams_acc,exam_id__2013-12: 0.525
|
|
oab_exams_acc,exam_id__2014-13: 0.475
|
|
oab_exams_acc,exam_id__2014-14: 0.5375
|
|
oab_exams_acc,exam_id__2014-15: 0.5641025641025641
|
|
oab_exams_acc,exam_id__2015-16: 0.5375
|
|
oab_exams_acc,exam_id__2015-17: 0.5384615384615384
|
|
oab_exams_acc,exam_id__2015-18: 0.4625
|
|
oab_exams_acc,exam_id__2016-19: 0.48717948717948717
|
|
oab_exams_acc,exam_id__2016-20: 0.45
|
|
oab_exams_acc,exam_id__2016-20a: 0.425
|
|
oab_exams_acc,exam_id__2016-21: 0.4625
|
|
oab_exams_acc,exam_id__2017-22: 0.45
|
|
oab_exams_acc,exam_id__2017-23: 0.45
|
|
oab_exams_acc,exam_id__2017-24: 0.5
|
|
oab_exams_acc,exam_id__2018-25: 0.45
|
|
oab_exams_acc_stderr,all: 0.006164493571290463
|
|
oab_exams_acc_stderr,exam_id__2010-01: 0.03120711424338333
|
|
oab_exams_acc_stderr,exam_id__2010-02: 0.028912621193308535
|
|
oab_exams_acc_stderr,exam_id__2011-03: 0.028826912523627856
|
|
oab_exams_acc_stderr,exam_id__2011-04: 0.03204801747078995
|
|
oab_exams_acc_stderr,exam_id__2011-05: 0.03224202969176272
|
|
oab_exams_acc_stderr,exam_id__2012-06: 0.03222923233485234
|
|
oab_exams_acc_stderr,exam_id__2012-06a: 0.03229751885191722
|
|
oab_exams_acc_stderr,exam_id__2012-07: 0.03239443199904663
|
|
oab_exams_acc_stderr,exam_id__2012-08: 0.032144839789965185
|
|
oab_exams_acc_stderr,exam_id__2012-09: 0.03103244684042299
|
|
oab_exams_acc_stderr,exam_id__2013-10: 0.032222242709920586
|
|
oab_exams_acc_stderr,exam_id__2013-11: 0.032249698626176736
|
|
oab_exams_acc_stderr,exam_id__2013-12: 0.03225675063294939
|
|
oab_exams_acc_stderr,exam_id__2014-13: 0.03217856982922958
|
|
oab_exams_acc_stderr,exam_id__2014-14: 0.032246622088818386
|
|
oab_exams_acc_stderr,exam_id__2014-15: 0.032435167155658584
|
|
oab_exams_acc_stderr,exam_id__2015-16: 0.03223354880595777
|
|
oab_exams_acc_stderr,exam_id__2015-17: 0.032573794785528166
|
|
oab_exams_acc_stderr,exam_id__2015-18: 0.032123574402475284
|
|
oab_exams_acc_stderr,exam_id__2016-19: 0.03271170717682627
|
|
oab_exams_acc_stderr,exam_id__2016-20: 0.03203769414642788
|
|
oab_exams_acc_stderr,exam_id__2016-20a: 0.031951776527517205
|
|
oab_exams_acc_stderr,exam_id__2016-21: 0.03217984644292296
|
|
oab_exams_acc_stderr,exam_id__2017-22: 0.03205629372165545
|
|
oab_exams_acc_stderr,exam_id__2017-23: 0.03221345216992268
|
|
oab_exams_acc_stderr,exam_id__2017-24: 0.03232207361521986
|
|
oab_exams_acc_stderr,exam_id__2018-25: 0.03198727711742204
|
|
oab_exams_alias: oab_exams
|
|
portuguese_hate_speech_acc,all: 0.7297297297297297
|
|
portuguese_hate_speech_acc_stderr,all: 0.010749375621571917
|
|
portuguese_hate_speech_alias: portuguese_hate_speech_binary
|
|
portuguese_hate_speech_f1_macro,all: 0.679463244638342
|
|
portuguese_hate_speech_f1_macro_stderr,all: 0.01220967447481398
|
|
tweetsentbr_acc,all: 0.7014925373134329
|
|
tweetsentbr_acc_stderr,all: 0.007246042251471291
|
|
tweetsentbr_alias: tweetsentbr
|
|
tweetsentbr_f1_macro,all: 0.6540958473356445
|
|
tweetsentbr_f1_macro_stderr,all: 0.007812938746547184
|
|
step: 100000
|