208 lines
10 KiB
YAML
208 lines
10 KiB
YAML
|
|
evaluations:
|
||
|
|
arc_challenge_poly_pt_acc: 0.47094017094017093
|
||
|
|
arc_challenge_poly_pt_acc_norm: 0.5025641025641026
|
||
|
|
arc_challenge_poly_pt_acc_norm_stderr: 0.014623692220064299
|
||
|
|
arc_challenge_poly_pt_acc_stderr: 0.014599164650716437
|
||
|
|
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
|
||
|
|
assin2_rte_acc,all: 0.9178921568627451
|
||
|
|
assin2_rte_acc_stderr,all: 0.0039410227425137
|
||
|
|
assin2_rte_alias: assin2_rte
|
||
|
|
assin2_rte_f1_macro,all: 0.9176110196232412
|
||
|
|
assin2_rte_f1_macro_stderr,all: 0.003955066154708788
|
||
|
|
assin2_sts_alias: assin2_sts
|
||
|
|
assin2_sts_mse,all: 0.8036356209150326
|
||
|
|
assin2_sts_mse_stderr,all: N/A
|
||
|
|
assin2_sts_pearson,all: 0.7335010773330974
|
||
|
|
assin2_sts_pearson_stderr,all: 0.006813537226494434
|
||
|
|
assin_entailment_acc: 0.6955
|
||
|
|
assin_entailment_acc_stderr: 0.007277236909999638
|
||
|
|
assin_entailment_alias: assin_entailment
|
||
|
|
assin_paraphrase_acc: 0.677
|
||
|
|
assin_paraphrase_acc_stderr: 0.007394688658428343
|
||
|
|
assin_paraphrase_alias: assin_paraphrase
|
||
|
|
belebele_por_Latn_acc: 0.7755555555555556
|
||
|
|
belebele_por_Latn_acc_norm: 0.7755555555555556
|
||
|
|
belebele_por_Latn_acc_norm_stderr: 0.013914930474237296
|
||
|
|
belebele_por_Latn_acc_stderr: 0.013914930474237296
|
||
|
|
belebele_por_Latn_alias: belebele_por_Latn
|
||
|
|
bluex_acc,all: 0.5285118219749653
|
||
|
|
bluex_acc,exam_id__UNICAMP_2018: 0.5185185185185185
|
||
|
|
bluex_acc,exam_id__UNICAMP_2019: 0.44
|
||
|
|
bluex_acc,exam_id__UNICAMP_2020: 0.509090909090909
|
||
|
|
bluex_acc,exam_id__UNICAMP_2021_1: 0.4782608695652174
|
||
|
|
bluex_acc,exam_id__UNICAMP_2021_2: 0.47058823529411764
|
||
|
|
bluex_acc,exam_id__UNICAMP_2022: 0.6410256410256411
|
||
|
|
bluex_acc,exam_id__UNICAMP_2023: 0.6046511627906976
|
||
|
|
bluex_acc,exam_id__UNICAMP_2024: 0.5333333333333333
|
||
|
|
bluex_acc,exam_id__USP_2018: 0.4074074074074074
|
||
|
|
bluex_acc,exam_id__USP_2019: 0.5
|
||
|
|
bluex_acc,exam_id__USP_2020: 0.6071428571428571
|
||
|
|
bluex_acc,exam_id__USP_2021: 0.5384615384615384
|
||
|
|
bluex_acc,exam_id__USP_2022: 0.42857142857142855
|
||
|
|
bluex_acc,exam_id__USP_2023: 0.6818181818181818
|
||
|
|
bluex_acc,exam_id__USP_2024: 0.6341463414634146
|
||
|
|
bluex_acc_stderr,all: 0.010703918286665611
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03937226593136202
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0404309855509269
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.0389358045204499
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.04241243284078559
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.04027645736401125
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.044230892510041545
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04301638178039838
|
||
|
|
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04292586445321273
|
||
|
|
bluex_acc_stderr,exam_id__USP_2018: 0.0385530737645887
|
||
|
|
bluex_acc_stderr,exam_id__USP_2019: 0.04559218480567802
|
||
|
|
bluex_acc_stderr,exam_id__USP_2020: 0.03761017438804557
|
||
|
|
bluex_acc_stderr,exam_id__USP_2021: 0.04001597061911072
|
||
|
|
bluex_acc_stderr,exam_id__USP_2022: 0.04070596417180048
|
||
|
|
bluex_acc_stderr,exam_id__USP_2023: 0.040428408321166086
|
||
|
|
bluex_acc_stderr,exam_id__USP_2024: 0.04340314503826957
|
||
|
|
bluex_alias: bluex
|
||
|
|
calame_pt_acc: 0.5438342967244701
|
||
|
|
calame_pt_acc_stderr: 0.010934163426897668
|
||
|
|
calame_pt_alias: calame_pt
|
||
|
|
calame_pt_perplexity: 7.8689968011124725
|
||
|
|
calame_pt_perplexity_stderr: 0.41766365172631903
|
||
|
|
enem_challenge_acc,all: 0.6270118964310707
|
||
|
|
enem_challenge_acc,exam_id__2009: 0.6347826086956522
|
||
|
|
enem_challenge_acc,exam_id__2010: 0.6752136752136753
|
||
|
|
enem_challenge_acc,exam_id__2011: 0.6923076923076923
|
||
|
|
enem_challenge_acc,exam_id__2012: 0.6379310344827587
|
||
|
|
enem_challenge_acc,exam_id__2013: 0.6111111111111112
|
||
|
|
enem_challenge_acc,exam_id__2014: 0.6330275229357798
|
||
|
|
enem_challenge_acc,exam_id__2015: 0.5462184873949579
|
||
|
|
enem_challenge_acc,exam_id__2016: 0.6198347107438017
|
||
|
|
enem_challenge_acc,exam_id__2016_2: 0.6097560975609756
|
||
|
|
enem_challenge_acc,exam_id__2017: 0.6120689655172413
|
||
|
|
enem_challenge_acc,exam_id__2022: 0.5939849624060151
|
||
|
|
enem_challenge_acc,exam_id__2023: 0.6592592592592592
|
||
|
|
enem_challenge_acc_stderr,all: 0.0073957026155519546
|
||
|
|
enem_challenge_acc_stderr,exam_id__2009: 0.025965719340461144
|
||
|
|
enem_challenge_acc_stderr,exam_id__2010: 0.02502188604536942
|
||
|
|
enem_challenge_acc_stderr,exam_id__2011: 0.02467006930960307
|
||
|
|
enem_challenge_acc_stderr,exam_id__2012: 0.025714191658261398
|
||
|
|
enem_challenge_acc_stderr,exam_id__2013: 0.02702908679710427
|
||
|
|
enem_challenge_acc_stderr,exam_id__2014: 0.026672592927520465
|
||
|
|
enem_challenge_acc_stderr,exam_id__2015: 0.02626461947663519
|
||
|
|
enem_challenge_acc_stderr,exam_id__2016: 0.025564903994126094
|
||
|
|
enem_challenge_acc_stderr,exam_id__2016_2: 0.025424569351359966
|
||
|
|
enem_challenge_acc_stderr,exam_id__2017: 0.026108373622566433
|
||
|
|
enem_challenge_acc_stderr,exam_id__2022: 0.024621347603181173
|
||
|
|
enem_challenge_acc_stderr,exam_id__2023: 0.023527643017571674
|
||
|
|
enem_challenge_alias: enem
|
||
|
|
faquad_nli_acc,all: 0.8584615384615385
|
||
|
|
faquad_nli_acc_stderr,all: 0.009630496129889232
|
||
|
|
faquad_nli_alias: faquad_nli
|
||
|
|
faquad_nli_f1_macro,all: 0.7762511973180077
|
||
|
|
faquad_nli_f1_macro_stderr,all: 0.014629139687819745
|
||
|
|
global_piqa_completions_por_latn_braz_acc: 0.75
|
||
|
|
global_piqa_completions_por_latn_braz_acc_bytes: 0.7
|
||
|
|
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04605661864718383
|
||
|
|
global_piqa_completions_por_latn_braz_acc_norm: 0.71
|
||
|
|
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.045604802157206865
|
||
|
|
global_piqa_completions_por_latn_braz_acc_stderr: 0.04351941398892446
|
||
|
|
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
|
||
|
|
gsm8k_pt_alias: gsm8k_pt
|
||
|
|
gsm8k_pt_exact_match,flexible-extract: 0.19710806697108066
|
||
|
|
gsm8k_pt_exact_match,strict-match: 0.0
|
||
|
|
gsm8k_pt_exact_match_stderr,flexible-extract: 0.010978635935133022
|
||
|
|
gsm8k_pt_exact_match_stderr,strict-match: 0.0
|
||
|
|
hatebr_offensive_acc,all: 0.805
|
||
|
|
hatebr_offensive_acc_stderr,all: 0.00750545853190643
|
||
|
|
hatebr_offensive_alias: hatebr_offensive_binary
|
||
|
|
hatebr_offensive_f1_macro,all: 0.7986300308904273
|
||
|
|
hatebr_offensive_f1_macro_stderr,all: 0.007718372444189955
|
||
|
|
hellaswag_poly_pt_acc: 0.43222450969769205
|
||
|
|
hellaswag_poly_pt_acc_norm: 0.5524975620327229
|
||
|
|
hellaswag_poly_pt_acc_norm_stderr: 0.00517617669289931
|
||
|
|
hellaswag_poly_pt_acc_stderr: 0.005156906089879843
|
||
|
|
hellaswag_poly_pt_alias: hellaswag_poly_pt
|
||
|
|
humaneval_instruct_alias: humaneval_instruct
|
||
|
|
humaneval_instruct_pass@1,create_test: 0.2621951219512195
|
||
|
|
humaneval_instruct_pass@1_stderr,create_test: 0.03445000289173463
|
||
|
|
ifeval_pt_alias: ifeval_pt
|
||
|
|
ifeval_pt_inst_level_loose_acc: 0.46511627906976744
|
||
|
|
ifeval_pt_inst_level_loose_acc_stderr: N/A
|
||
|
|
ifeval_pt_inst_level_strict_acc: 0.4232558139534884
|
||
|
|
ifeval_pt_inst_level_strict_acc_stderr: N/A
|
||
|
|
ifeval_pt_prompt_level_loose_acc: 0.3433333333333333
|
||
|
|
ifeval_pt_prompt_level_loose_acc_stderr: 0.027459642357098978
|
||
|
|
ifeval_pt_prompt_level_strict_acc: 0.30333333333333334
|
||
|
|
ifeval_pt_prompt_level_strict_acc_stderr: 0.026585019936500975
|
||
|
|
lambada_poly_pt_acc: 0.5905297884727343
|
||
|
|
lambada_poly_pt_acc_stderr: 0.006850844880897535
|
||
|
|
lambada_poly_pt_alias: lambada_poly_pt
|
||
|
|
lambada_poly_pt_perplexity: 9.837299391644223
|
||
|
|
lambada_poly_pt_perplexity_stderr: 0.3695848037190668
|
||
|
|
mmlu_poly_pt_acc: 0.5254428099669769
|
||
|
|
mmlu_poly_pt_acc_stderr: 0.0043261940110685525
|
||
|
|
mmlu_poly_pt_alias: mmlu_poly_pt
|
||
|
|
oab_exams_acc,all: 0.43416856492027334
|
||
|
|
oab_exams_acc,exam_id__2010-01: 0.3411764705882353
|
||
|
|
oab_exams_acc,exam_id__2010-02: 0.46
|
||
|
|
oab_exams_acc,exam_id__2011-03: 0.3838383838383838
|
||
|
|
oab_exams_acc,exam_id__2011-04: 0.4
|
||
|
|
oab_exams_acc,exam_id__2011-05: 0.5
|
||
|
|
oab_exams_acc,exam_id__2012-06: 0.4125
|
||
|
|
oab_exams_acc,exam_id__2012-06a: 0.425
|
||
|
|
oab_exams_acc,exam_id__2012-07: 0.425
|
||
|
|
oab_exams_acc,exam_id__2012-08: 0.4625
|
||
|
|
oab_exams_acc,exam_id__2012-09: 0.38961038961038963
|
||
|
|
oab_exams_acc,exam_id__2013-10: 0.5
|
||
|
|
oab_exams_acc,exam_id__2013-11: 0.575
|
||
|
|
oab_exams_acc,exam_id__2013-12: 0.475
|
||
|
|
oab_exams_acc,exam_id__2014-13: 0.425
|
||
|
|
oab_exams_acc,exam_id__2014-14: 0.475
|
||
|
|
oab_exams_acc,exam_id__2014-15: 0.5
|
||
|
|
oab_exams_acc,exam_id__2015-16: 0.425
|
||
|
|
oab_exams_acc,exam_id__2015-17: 0.4230769230769231
|
||
|
|
oab_exams_acc,exam_id__2015-18: 0.4
|
||
|
|
oab_exams_acc,exam_id__2016-19: 0.47435897435897434
|
||
|
|
oab_exams_acc,exam_id__2016-20: 0.4625
|
||
|
|
oab_exams_acc,exam_id__2016-20a: 0.375
|
||
|
|
oab_exams_acc,exam_id__2016-21: 0.4125
|
||
|
|
oab_exams_acc,exam_id__2017-22: 0.4375
|
||
|
|
oab_exams_acc,exam_id__2017-23: 0.3375
|
||
|
|
oab_exams_acc,exam_id__2017-24: 0.425
|
||
|
|
oab_exams_acc,exam_id__2018-25: 0.4125
|
||
|
|
oab_exams_acc_stderr,all: 0.006102155835634948
|
||
|
|
oab_exams_acc_stderr,exam_id__2010-01: 0.029640791842627794
|
||
|
|
oab_exams_acc_stderr,exam_id__2010-02: 0.02872289890111472
|
||
|
|
oab_exams_acc_stderr,exam_id__2011-03: 0.02813693132118459
|
||
|
|
oab_exams_acc_stderr,exam_id__2011-04: 0.03154240021693644
|
||
|
|
oab_exams_acc_stderr,exam_id__2011-05: 0.03224120715021378
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-06: 0.031634579715402525
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-06a: 0.031920386806255
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-07: 0.03196335345646489
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-08: 0.03218895689579674
|
||
|
|
oab_exams_acc_stderr,exam_id__2012-09: 0.0320702088032968
|
||
|
|
oab_exams_acc_stderr,exam_id__2013-10: 0.032306001943973275
|
||
|
|
oab_exams_acc_stderr,exam_id__2013-11: 0.03198130854191419
|
||
|
|
oab_exams_acc_stderr,exam_id__2013-12: 0.03227420874356609
|
||
|
|
oab_exams_acc_stderr,exam_id__2014-13: 0.03197697369553699
|
||
|
|
oab_exams_acc_stderr,exam_id__2014-14: 0.03231427645661526
|
||
|
|
oab_exams_acc_stderr,exam_id__2014-15: 0.03281148582087051
|
||
|
|
oab_exams_acc_stderr,exam_id__2015-16: 0.03193036659905079
|
||
|
|
oab_exams_acc_stderr,exam_id__2015-17: 0.03231882364931395
|
||
|
|
oab_exams_acc_stderr,exam_id__2015-18: 0.031606839641845616
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-19: 0.03261356291633692
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-20: 0.03217012301961201
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-20a: 0.03124902402233319
|
||
|
|
oab_exams_acc_stderr,exam_id__2016-21: 0.03172689133529826
|
||
|
|
oab_exams_acc_stderr,exam_id__2017-22: 0.03202936221429109
|
||
|
|
oab_exams_acc_stderr,exam_id__2017-23: 0.03052924311679464
|
||
|
|
oab_exams_acc_stderr,exam_id__2017-24: 0.031892680920180556
|
||
|
|
oab_exams_acc_stderr,exam_id__2018-25: 0.03181646004259634
|
||
|
|
oab_exams_alias: oab_exams
|
||
|
|
portuguese_hate_speech_acc,all: 0.7708578143360753
|
||
|
|
portuguese_hate_speech_acc_stderr,all: 0.010179285936061107
|
||
|
|
portuguese_hate_speech_alias: portuguese_hate_speech_binary
|
||
|
|
portuguese_hate_speech_f1_macro,all: 0.7383400530746562
|
||
|
|
portuguese_hate_speech_f1_macro_stderr,all: 0.011391910083076176
|
||
|
|
tweetsentbr_acc,all: 0.663681592039801
|
||
|
|
tweetsentbr_acc_stderr,all: 0.007468862153058396
|
||
|
|
tweetsentbr_alias: tweetsentbr
|
||
|
|
tweetsentbr_f1_macro,all: 0.57818681435807
|
||
|
|
tweetsentbr_f1_macro_stderr,all: 0.007796495757272563
|
||
|
|
step: 69750
|