Files

208 lines
10 KiB
YAML
Raw Permalink Normal View History

evaluations:
arc_challenge_poly_pt_acc: 0.47094017094017093
arc_challenge_poly_pt_acc_norm: 0.5025641025641026
arc_challenge_poly_pt_acc_norm_stderr: 0.014623692220064299
arc_challenge_poly_pt_acc_stderr: 0.014599164650716437
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.9178921568627451
assin2_rte_acc_stderr,all: 0.0039410227425137
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.9176110196232412
assin2_rte_f1_macro_stderr,all: 0.003955066154708788
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 0.8036356209150326
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.7335010773330974
assin2_sts_pearson_stderr,all: 0.006813537226494434
assin_entailment_acc: 0.6955
assin_entailment_acc_stderr: 0.007277236909999638
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.677
assin_paraphrase_acc_stderr: 0.007394688658428343
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.7755555555555556
belebele_por_Latn_acc_norm: 0.7755555555555556
belebele_por_Latn_acc_norm_stderr: 0.013914930474237296
belebele_por_Latn_acc_stderr: 0.013914930474237296
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.5285118219749653
bluex_acc,exam_id__UNICAMP_2018: 0.5185185185185185
bluex_acc,exam_id__UNICAMP_2019: 0.44
bluex_acc,exam_id__UNICAMP_2020: 0.509090909090909
bluex_acc,exam_id__UNICAMP_2021_1: 0.4782608695652174
bluex_acc,exam_id__UNICAMP_2021_2: 0.47058823529411764
bluex_acc,exam_id__UNICAMP_2022: 0.6410256410256411
bluex_acc,exam_id__UNICAMP_2023: 0.6046511627906976
bluex_acc,exam_id__UNICAMP_2024: 0.5333333333333333
bluex_acc,exam_id__USP_2018: 0.4074074074074074
bluex_acc,exam_id__USP_2019: 0.5
bluex_acc,exam_id__USP_2020: 0.6071428571428571
bluex_acc,exam_id__USP_2021: 0.5384615384615384
bluex_acc,exam_id__USP_2022: 0.42857142857142855
bluex_acc,exam_id__USP_2023: 0.6818181818181818
bluex_acc,exam_id__USP_2024: 0.6341463414634146
bluex_acc_stderr,all: 0.010703918286665611
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03937226593136202
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0404309855509269
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.0389358045204499
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.04241243284078559
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.04027645736401125
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.044230892510041545
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04301638178039838
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04292586445321273
bluex_acc_stderr,exam_id__USP_2018: 0.0385530737645887
bluex_acc_stderr,exam_id__USP_2019: 0.04559218480567802
bluex_acc_stderr,exam_id__USP_2020: 0.03761017438804557
bluex_acc_stderr,exam_id__USP_2021: 0.04001597061911072
bluex_acc_stderr,exam_id__USP_2022: 0.04070596417180048
bluex_acc_stderr,exam_id__USP_2023: 0.040428408321166086
bluex_acc_stderr,exam_id__USP_2024: 0.04340314503826957
bluex_alias: bluex
calame_pt_acc: 0.5438342967244701
calame_pt_acc_stderr: 0.010934163426897668
calame_pt_alias: calame_pt
calame_pt_perplexity: 7.8689968011124725
calame_pt_perplexity_stderr: 0.41766365172631903
enem_challenge_acc,all: 0.6270118964310707
enem_challenge_acc,exam_id__2009: 0.6347826086956522
enem_challenge_acc,exam_id__2010: 0.6752136752136753
enem_challenge_acc,exam_id__2011: 0.6923076923076923
enem_challenge_acc,exam_id__2012: 0.6379310344827587
enem_challenge_acc,exam_id__2013: 0.6111111111111112
enem_challenge_acc,exam_id__2014: 0.6330275229357798
enem_challenge_acc,exam_id__2015: 0.5462184873949579
enem_challenge_acc,exam_id__2016: 0.6198347107438017
enem_challenge_acc,exam_id__2016_2: 0.6097560975609756
enem_challenge_acc,exam_id__2017: 0.6120689655172413
enem_challenge_acc,exam_id__2022: 0.5939849624060151
enem_challenge_acc,exam_id__2023: 0.6592592592592592
enem_challenge_acc_stderr,all: 0.0073957026155519546
enem_challenge_acc_stderr,exam_id__2009: 0.025965719340461144
enem_challenge_acc_stderr,exam_id__2010: 0.02502188604536942
enem_challenge_acc_stderr,exam_id__2011: 0.02467006930960307
enem_challenge_acc_stderr,exam_id__2012: 0.025714191658261398
enem_challenge_acc_stderr,exam_id__2013: 0.02702908679710427
enem_challenge_acc_stderr,exam_id__2014: 0.026672592927520465
enem_challenge_acc_stderr,exam_id__2015: 0.02626461947663519
enem_challenge_acc_stderr,exam_id__2016: 0.025564903994126094
enem_challenge_acc_stderr,exam_id__2016_2: 0.025424569351359966
enem_challenge_acc_stderr,exam_id__2017: 0.026108373622566433
enem_challenge_acc_stderr,exam_id__2022: 0.024621347603181173
enem_challenge_acc_stderr,exam_id__2023: 0.023527643017571674
enem_challenge_alias: enem
faquad_nli_acc,all: 0.8584615384615385
faquad_nli_acc_stderr,all: 0.009630496129889232
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.7762511973180077
faquad_nli_f1_macro_stderr,all: 0.014629139687819745
global_piqa_completions_por_latn_braz_acc: 0.75
global_piqa_completions_por_latn_braz_acc_bytes: 0.7
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04605661864718383
global_piqa_completions_por_latn_braz_acc_norm: 0.71
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.045604802157206865
global_piqa_completions_por_latn_braz_acc_stderr: 0.04351941398892446
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
gsm8k_pt_alias: gsm8k_pt
gsm8k_pt_exact_match,flexible-extract: 0.19710806697108066
gsm8k_pt_exact_match,strict-match: 0.0
gsm8k_pt_exact_match_stderr,flexible-extract: 0.010978635935133022
gsm8k_pt_exact_match_stderr,strict-match: 0.0
hatebr_offensive_acc,all: 0.805
hatebr_offensive_acc_stderr,all: 0.00750545853190643
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.7986300308904273
hatebr_offensive_f1_macro_stderr,all: 0.007718372444189955
hellaswag_poly_pt_acc: 0.43222450969769205
hellaswag_poly_pt_acc_norm: 0.5524975620327229
hellaswag_poly_pt_acc_norm_stderr: 0.00517617669289931
hellaswag_poly_pt_acc_stderr: 0.005156906089879843
hellaswag_poly_pt_alias: hellaswag_poly_pt
humaneval_instruct_alias: humaneval_instruct
humaneval_instruct_pass@1,create_test: 0.2621951219512195
humaneval_instruct_pass@1_stderr,create_test: 0.03445000289173463
ifeval_pt_alias: ifeval_pt
ifeval_pt_inst_level_loose_acc: 0.46511627906976744
ifeval_pt_inst_level_loose_acc_stderr: N/A
ifeval_pt_inst_level_strict_acc: 0.4232558139534884
ifeval_pt_inst_level_strict_acc_stderr: N/A
ifeval_pt_prompt_level_loose_acc: 0.3433333333333333
ifeval_pt_prompt_level_loose_acc_stderr: 0.027459642357098978
ifeval_pt_prompt_level_strict_acc: 0.30333333333333334
ifeval_pt_prompt_level_strict_acc_stderr: 0.026585019936500975
lambada_poly_pt_acc: 0.5905297884727343
lambada_poly_pt_acc_stderr: 0.006850844880897535
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 9.837299391644223
lambada_poly_pt_perplexity_stderr: 0.3695848037190668
mmlu_poly_pt_acc: 0.5254428099669769
mmlu_poly_pt_acc_stderr: 0.0043261940110685525
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.43416856492027334
oab_exams_acc,exam_id__2010-01: 0.3411764705882353
oab_exams_acc,exam_id__2010-02: 0.46
oab_exams_acc,exam_id__2011-03: 0.3838383838383838
oab_exams_acc,exam_id__2011-04: 0.4
oab_exams_acc,exam_id__2011-05: 0.5
oab_exams_acc,exam_id__2012-06: 0.4125
oab_exams_acc,exam_id__2012-06a: 0.425
oab_exams_acc,exam_id__2012-07: 0.425
oab_exams_acc,exam_id__2012-08: 0.4625
oab_exams_acc,exam_id__2012-09: 0.38961038961038963
oab_exams_acc,exam_id__2013-10: 0.5
oab_exams_acc,exam_id__2013-11: 0.575
oab_exams_acc,exam_id__2013-12: 0.475
oab_exams_acc,exam_id__2014-13: 0.425
oab_exams_acc,exam_id__2014-14: 0.475
oab_exams_acc,exam_id__2014-15: 0.5
oab_exams_acc,exam_id__2015-16: 0.425
oab_exams_acc,exam_id__2015-17: 0.4230769230769231
oab_exams_acc,exam_id__2015-18: 0.4
oab_exams_acc,exam_id__2016-19: 0.47435897435897434
oab_exams_acc,exam_id__2016-20: 0.4625
oab_exams_acc,exam_id__2016-20a: 0.375
oab_exams_acc,exam_id__2016-21: 0.4125
oab_exams_acc,exam_id__2017-22: 0.4375
oab_exams_acc,exam_id__2017-23: 0.3375
oab_exams_acc,exam_id__2017-24: 0.425
oab_exams_acc,exam_id__2018-25: 0.4125
oab_exams_acc_stderr,all: 0.006102155835634948
oab_exams_acc_stderr,exam_id__2010-01: 0.029640791842627794
oab_exams_acc_stderr,exam_id__2010-02: 0.02872289890111472
oab_exams_acc_stderr,exam_id__2011-03: 0.02813693132118459
oab_exams_acc_stderr,exam_id__2011-04: 0.03154240021693644
oab_exams_acc_stderr,exam_id__2011-05: 0.03224120715021378
oab_exams_acc_stderr,exam_id__2012-06: 0.031634579715402525
oab_exams_acc_stderr,exam_id__2012-06a: 0.031920386806255
oab_exams_acc_stderr,exam_id__2012-07: 0.03196335345646489
oab_exams_acc_stderr,exam_id__2012-08: 0.03218895689579674
oab_exams_acc_stderr,exam_id__2012-09: 0.0320702088032968
oab_exams_acc_stderr,exam_id__2013-10: 0.032306001943973275
oab_exams_acc_stderr,exam_id__2013-11: 0.03198130854191419
oab_exams_acc_stderr,exam_id__2013-12: 0.03227420874356609
oab_exams_acc_stderr,exam_id__2014-13: 0.03197697369553699
oab_exams_acc_stderr,exam_id__2014-14: 0.03231427645661526
oab_exams_acc_stderr,exam_id__2014-15: 0.03281148582087051
oab_exams_acc_stderr,exam_id__2015-16: 0.03193036659905079
oab_exams_acc_stderr,exam_id__2015-17: 0.03231882364931395
oab_exams_acc_stderr,exam_id__2015-18: 0.031606839641845616
oab_exams_acc_stderr,exam_id__2016-19: 0.03261356291633692
oab_exams_acc_stderr,exam_id__2016-20: 0.03217012301961201
oab_exams_acc_stderr,exam_id__2016-20a: 0.03124902402233319
oab_exams_acc_stderr,exam_id__2016-21: 0.03172689133529826
oab_exams_acc_stderr,exam_id__2017-22: 0.03202936221429109
oab_exams_acc_stderr,exam_id__2017-23: 0.03052924311679464
oab_exams_acc_stderr,exam_id__2017-24: 0.031892680920180556
oab_exams_acc_stderr,exam_id__2018-25: 0.03181646004259634
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.7708578143360753
portuguese_hate_speech_acc_stderr,all: 0.010179285936061107
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.7383400530746562
portuguese_hate_speech_f1_macro_stderr,all: 0.011391910083076176
tweetsentbr_acc,all: 0.663681592039801
tweetsentbr_acc_stderr,all: 0.007468862153058396
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.57818681435807
tweetsentbr_f1_macro_stderr,all: 0.007796495757272563
step: 69750