1303 lines
35 KiB
JSON
1303 lines
35 KiB
JSON
{
|
|
"results": {
|
|
"assin2_rte": {
|
|
"f1_macro,all": 0.3333333333333333,
|
|
"f1_macro_stderr,all": 0.0031730251394380704,
|
|
"acc,all": 0.5,
|
|
"acc_stderr,all": 0.007138073526203421,
|
|
"alias": "assin2_rte"
|
|
},
|
|
"assin2_sts": {
|
|
"pearson,all": 0.07790738376477561,
|
|
"pearson_stderr,all": 0.014716684813439029,
|
|
"mse,all": 2.066646241830065,
|
|
"mse_stderr,all": "N/A",
|
|
"alias": "assin2_sts"
|
|
},
|
|
"bluex": {
|
|
"acc,all": 0.20584144645340752,
|
|
"acc_stderr,all": 0.008708200533939503,
|
|
"acc,exam_id__USP_2024": 0.24390243902439024,
|
|
"acc_stderr,exam_id__USP_2024": 0.038765983941525854,
|
|
"acc,exam_id__UNICAMP_2018": 0.2222222222222222,
|
|
"acc_stderr,exam_id__UNICAMP_2018": 0.032540841899570724,
|
|
"acc,exam_id__USP_2018": 0.16666666666666666,
|
|
"acc_stderr,exam_id__USP_2018": 0.029320906143906797,
|
|
"acc,exam_id__USP_2022": 0.20408163265306123,
|
|
"acc_stderr,exam_id__USP_2022": 0.03318725036103681,
|
|
"acc,exam_id__USP_2021": 0.1346153846153846,
|
|
"acc_stderr,exam_id__USP_2021": 0.027253785013691273,
|
|
"acc,exam_id__UNICAMP_2021_2": 0.27450980392156865,
|
|
"acc_stderr,exam_id__UNICAMP_2021_2": 0.0360141604917446,
|
|
"acc,exam_id__UNICAMP_2021_1": 0.2826086956521739,
|
|
"acc_stderr,exam_id__UNICAMP_2021_1": 0.038353605844743385,
|
|
"acc,exam_id__USP_2020": 0.08928571428571429,
|
|
"acc_stderr,exam_id__USP_2020": 0.021921367122397676,
|
|
"acc,exam_id__UNICAMP_2022": 0.3076923076923077,
|
|
"acc_stderr,exam_id__UNICAMP_2022": 0.042727899536711536,
|
|
"acc,exam_id__UNICAMP_2023": 0.16279069767441862,
|
|
"acc_stderr,exam_id__UNICAMP_2023": 0.03238294764474062,
|
|
"acc,exam_id__UNICAMP_2024": 0.3111111111111111,
|
|
"acc_stderr,exam_id__UNICAMP_2024": 0.0398333923513593,
|
|
"acc,exam_id__UNICAMP_2020": 0.18181818181818182,
|
|
"acc_stderr,exam_id__UNICAMP_2020": 0.02992924246353086,
|
|
"acc,exam_id__USP_2023": 0.18181818181818182,
|
|
"acc_stderr,exam_id__USP_2023": 0.03345273573325805,
|
|
"acc,exam_id__UNICAMP_2019": 0.2,
|
|
"acc_stderr,exam_id__UNICAMP_2019": 0.03269771596389771,
|
|
"acc,exam_id__USP_2019": 0.175,
|
|
"acc_stderr,exam_id__USP_2019": 0.0348121283420538,
|
|
"alias": "bluex"
|
|
},
|
|
"enem_challenge": {
|
|
"alias": "enem",
|
|
"acc,all": 0.2092372288313506,
|
|
"acc_stderr,all": 0.006229253057208555,
|
|
"acc,exam_id__2016_2": 0.21138211382113822,
|
|
"acc_stderr,exam_id__2016_2": 0.02123691588211098,
|
|
"acc,exam_id__2023": 0.1925925925925926,
|
|
"acc_stderr,exam_id__2023": 0.019583861187616968,
|
|
"acc,exam_id__2013": 0.2037037037037037,
|
|
"acc_stderr,exam_id__2013": 0.02238599079325693,
|
|
"acc,exam_id__2012": 0.12931034482758622,
|
|
"acc_stderr,exam_id__2012": 0.01803175905554286,
|
|
"acc,exam_id__2009": 0.23478260869565218,
|
|
"acc_stderr,exam_id__2009": 0.022814803582640184,
|
|
"acc,exam_id__2022": 0.17293233082706766,
|
|
"acc_stderr,exam_id__2022": 0.01889827424607104,
|
|
"acc,exam_id__2015": 0.31932773109243695,
|
|
"acc_stderr,exam_id__2015": 0.02472561230954832,
|
|
"acc,exam_id__2014": 0.21100917431192662,
|
|
"acc_stderr,exam_id__2014": 0.022513127008089658,
|
|
"acc,exam_id__2016": 0.18181818181818182,
|
|
"acc_stderr,exam_id__2016": 0.020248897347876847,
|
|
"acc,exam_id__2010": 0.23076923076923078,
|
|
"acc_stderr,exam_id__2010": 0.02246971773699712,
|
|
"acc,exam_id__2017": 0.27586206896551724,
|
|
"acc_stderr,exam_id__2017": 0.023994074423977864,
|
|
"acc,exam_id__2011": 0.15384615384615385,
|
|
"acc_stderr,exam_id__2011": 0.019264278502154123
|
|
},
|
|
"faquad_nli": {
|
|
"f1_macro,all": 0.4396551724137931,
|
|
"f1_macro_stderr,all": 0.0035796984729087084,
|
|
"acc,all": 0.7846153846153846,
|
|
"acc_stderr,all": 0.011396120309131327,
|
|
"alias": "faquad_nli"
|
|
},
|
|
"hatebr_offensive": {
|
|
"alias": "hatebr_offensive_binary",
|
|
"f1_macro,all": 0.43054708155379295,
|
|
"f1_macro_stderr,all": 0.009093679844467082,
|
|
"acc,all": 0.4742857142857143,
|
|
"acc_stderr,all": 0.009437507998400261
|
|
},
|
|
"oab_exams": {
|
|
"acc,all": 0.25968109339407747,
|
|
"acc_stderr,all": 0.005403181658894358,
|
|
"acc,exam_id__2017-22": 0.2375,
|
|
"acc_stderr,exam_id__2017-22": 0.027511429390216682,
|
|
"acc,exam_id__2016-20a": 0.2,
|
|
"acc_stderr,exam_id__2016-20a": 0.02584175311098727,
|
|
"acc,exam_id__2011-04": 0.2875,
|
|
"acc_stderr,exam_id__2011-04": 0.02919454405528515,
|
|
"acc,exam_id__2013-12": 0.2875,
|
|
"acc_stderr,exam_id__2013-12": 0.029277381115049662,
|
|
"acc,exam_id__2013-11": 0.325,
|
|
"acc_stderr,exam_id__2013-11": 0.030286419424458838,
|
|
"acc,exam_id__2010-02": 0.32,
|
|
"acc_stderr,exam_id__2010-02": 0.026888774775418785,
|
|
"acc,exam_id__2012-07": 0.2375,
|
|
"acc_stderr,exam_id__2012-07": 0.02737558649609428,
|
|
"acc,exam_id__2016-21": 0.25,
|
|
"acc_stderr,exam_id__2016-21": 0.027994547544285982,
|
|
"acc,exam_id__2015-17": 0.24358974358974358,
|
|
"acc_stderr,exam_id__2015-17": 0.02793267139214751,
|
|
"acc,exam_id__2015-18": 0.2625,
|
|
"acc_stderr,exam_id__2015-18": 0.028396710161944567,
|
|
"acc,exam_id__2014-14": 0.225,
|
|
"acc_stderr,exam_id__2014-14": 0.026939185801353988,
|
|
"acc,exam_id__2015-16": 0.3375,
|
|
"acc_stderr,exam_id__2015-16": 0.030631826713546063,
|
|
"acc,exam_id__2012-09": 0.23376623376623376,
|
|
"acc_stderr,exam_id__2012-09": 0.02787359925121907,
|
|
"acc,exam_id__2011-03": 0.26262626262626265,
|
|
"acc_stderr,exam_id__2011-03": 0.025505720074946718,
|
|
"acc,exam_id__2016-19": 0.2692307692307692,
|
|
"acc_stderr,exam_id__2016-19": 0.028948751914583667,
|
|
"acc,exam_id__2012-06a": 0.2375,
|
|
"acc_stderr,exam_id__2012-06a": 0.027440075549438697,
|
|
"acc,exam_id__2011-05": 0.2625,
|
|
"acc_stderr,exam_id__2011-05": 0.02835789202564455,
|
|
"acc,exam_id__2013-10": 0.2125,
|
|
"acc_stderr,exam_id__2013-10": 0.026367641247603036,
|
|
"acc,exam_id__2017-24": 0.25,
|
|
"acc_stderr,exam_id__2017-24": 0.028053164455460838,
|
|
"acc,exam_id__2016-20": 0.2625,
|
|
"acc_stderr,exam_id__2016-20": 0.0283327789711091,
|
|
"acc,exam_id__2012-08": 0.275,
|
|
"acc_stderr,exam_id__2012-08": 0.02893752928626648,
|
|
"acc,exam_id__2014-13": 0.2,
|
|
"acc_stderr,exam_id__2014-13": 0.025784866156114444,
|
|
"acc,exam_id__2018-25": 0.25,
|
|
"acc_stderr,exam_id__2018-25": 0.027961840366717016,
|
|
"acc,exam_id__2017-23": 0.3,
|
|
"acc_stderr,exam_id__2017-23": 0.02953160157687412,
|
|
"acc,exam_id__2014-15": 0.2564102564102564,
|
|
"acc_stderr,exam_id__2014-15": 0.028456647275964232,
|
|
"acc,exam_id__2012-06": 0.275,
|
|
"acc_stderr,exam_id__2012-06": 0.028790584320040398,
|
|
"acc,exam_id__2010-01": 0.23529411764705882,
|
|
"acc_stderr,exam_id__2010-01": 0.026470002521428834,
|
|
"alias": "oab_exams"
|
|
},
|
|
"portuguese_hate_speech": {
|
|
"alias": "portuguese_hate_speech_binary",
|
|
"f1_macro,all": 0.35895915678524376,
|
|
"f1_macro_stderr,all": 0.011204300451999685,
|
|
"acc,all": 0.381903642773208,
|
|
"acc_stderr,all": 0.011741654959752653
|
|
},
|
|
"tweetsentbr": {
|
|
"f1_macro,all": 0.2114730555936155,
|
|
"f1_macro_stderr,all": 0.0056538657419370805,
|
|
"acc,all": 0.3169154228855721,
|
|
"acc_stderr,all": 0.007343236186351586,
|
|
"alias": "tweetsentbr"
|
|
}
|
|
},
|
|
"configs": {
|
|
"assin2_rte": {
|
|
"task": "assin2_rte",
|
|
"group": [
|
|
"pt_benchmark",
|
|
"assin2"
|
|
],
|
|
"dataset_path": "assin2",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:",
|
|
"doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}",
|
|
"description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
1,
|
|
3251,
|
|
2,
|
|
3252,
|
|
3,
|
|
4,
|
|
5,
|
|
6,
|
|
3253,
|
|
7,
|
|
3254,
|
|
3255,
|
|
3256,
|
|
8,
|
|
9,
|
|
10,
|
|
3257,
|
|
11,
|
|
3258,
|
|
12,
|
|
13,
|
|
14,
|
|
15,
|
|
3259,
|
|
3260,
|
|
3261,
|
|
3262,
|
|
3263,
|
|
16,
|
|
17,
|
|
3264,
|
|
18,
|
|
3265,
|
|
3266,
|
|
3267,
|
|
19,
|
|
20,
|
|
3268,
|
|
3269,
|
|
21,
|
|
3270,
|
|
3271,
|
|
22,
|
|
3272,
|
|
3273,
|
|
23,
|
|
3274,
|
|
24,
|
|
25,
|
|
3275
|
|
],
|
|
"id_column": "sentence_pair_id"
|
|
}
|
|
},
|
|
"num_fewshot": 15,
|
|
"metric_list": [
|
|
{
|
|
"metric": "f1_macro",
|
|
"aggregation": "f1_macro",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "find_similar_label",
|
|
"labels": [
|
|
"Sim",
|
|
"Não"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.1
|
|
}
|
|
},
|
|
"assin2_sts": {
|
|
"task": "assin2_sts",
|
|
"group": [
|
|
"pt_benchmark",
|
|
"assin2"
|
|
],
|
|
"dataset_path": "assin2",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:",
|
|
"doc_to_target": "<function assin2_float_to_pt_str at 0x14c26cd9a5c0>",
|
|
"description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
1,
|
|
3251,
|
|
2,
|
|
3252,
|
|
3,
|
|
4,
|
|
5,
|
|
6,
|
|
3253,
|
|
7,
|
|
3254,
|
|
3255,
|
|
3256,
|
|
8,
|
|
9,
|
|
10,
|
|
3257,
|
|
11,
|
|
3258,
|
|
12,
|
|
13,
|
|
14,
|
|
15,
|
|
3259,
|
|
3260,
|
|
3261,
|
|
3262,
|
|
3263,
|
|
16,
|
|
17,
|
|
3264,
|
|
18,
|
|
3265,
|
|
3266,
|
|
3267,
|
|
19,
|
|
20,
|
|
3268,
|
|
3269,
|
|
21,
|
|
3270,
|
|
3271,
|
|
22,
|
|
3272,
|
|
3273,
|
|
23,
|
|
3274,
|
|
24,
|
|
25,
|
|
3275
|
|
],
|
|
"id_column": "sentence_pair_id"
|
|
}
|
|
},
|
|
"num_fewshot": 10,
|
|
"metric_list": [
|
|
{
|
|
"metric": "pearson",
|
|
"aggregation": "pearsonr",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "mse",
|
|
"aggregation": "mean_squared_error",
|
|
"higher_is_better": false
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "number_filter",
|
|
"type": "float",
|
|
"range_min": 1.0,
|
|
"range_max": 5.0,
|
|
"on_outside_range": "clip",
|
|
"fallback": 5.0
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.1
|
|
}
|
|
},
|
|
"bluex": {
|
|
"task": "bluex",
|
|
"group": [
|
|
"pt_benchmark",
|
|
"vestibular"
|
|
],
|
|
"dataset_path": "eduagarcia-temp/BLUEX_without_images",
|
|
"test_split": "train",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "<function enem_doc_to_text at 0x14c26cd99b20>",
|
|
"doc_to_target": "{{answerKey}}",
|
|
"description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
"USP_2018_3",
|
|
"UNICAMP_2018_2",
|
|
"USP_2018_35",
|
|
"UNICAMP_2018_16",
|
|
"USP_2018_89"
|
|
],
|
|
"id_column": "id",
|
|
"exclude_from_task": true
|
|
}
|
|
},
|
|
"num_fewshot": 3,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "normalize_spaces"
|
|
},
|
|
{
|
|
"function": "remove_accents"
|
|
},
|
|
{
|
|
"function": "find_choices",
|
|
"choices": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D",
|
|
"E"
|
|
],
|
|
"regex_patterns": [
|
|
"(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
|
|
"\\b([ABCDE])\\.",
|
|
"\\b([ABCDE]) ?[.):-]",
|
|
"\\b([ABCDE])$",
|
|
"\\b([ABCDE])\\b"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
],
|
|
"group_by": {
|
|
"column": "exam_id"
|
|
}
|
|
}
|
|
],
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "<function enem_doc_to_text at 0x14c26cd99e40>",
|
|
"metadata": {
|
|
"version": 1.1
|
|
}
|
|
},
|
|
"enem_challenge": {
|
|
"task": "enem_challenge",
|
|
"task_alias": "enem",
|
|
"group": [
|
|
"pt_benchmark",
|
|
"vestibular"
|
|
],
|
|
"dataset_path": "eduagarcia/enem_challenge",
|
|
"test_split": "train",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "<function enem_doc_to_text at 0x14c26cd9a020>",
|
|
"doc_to_target": "{{answerKey}}",
|
|
"description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
"2022_21",
|
|
"2022_88",
|
|
"2022_143"
|
|
],
|
|
"id_column": "id",
|
|
"exclude_from_task": true
|
|
}
|
|
},
|
|
"num_fewshot": 3,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "normalize_spaces"
|
|
},
|
|
{
|
|
"function": "remove_accents"
|
|
},
|
|
{
|
|
"function": "find_choices",
|
|
"choices": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D",
|
|
"E"
|
|
],
|
|
"regex_patterns": [
|
|
"(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
|
|
"\\b([ABCDE])\\.",
|
|
"\\b([ABCDE]) ?[.):-]",
|
|
"\\b([ABCDE])$",
|
|
"\\b([ABCDE])\\b"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
],
|
|
"group_by": {
|
|
"column": "exam_id"
|
|
}
|
|
}
|
|
],
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "<function enem_doc_to_text at 0x14c26cd9a200>",
|
|
"metadata": {
|
|
"version": 1.1
|
|
}
|
|
},
|
|
"faquad_nli": {
|
|
"task": "faquad_nli",
|
|
"group": [
|
|
"pt_benchmark"
|
|
],
|
|
"dataset_path": "ruanchaves/faquad-nli",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?",
|
|
"doc_to_target": "{{['Não', 'Sim'][label]}}",
|
|
"description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n",
|
|
"sampler_config": {
|
|
"fewshot_indices": [
|
|
1893,
|
|
949,
|
|
663,
|
|
105,
|
|
1169,
|
|
2910,
|
|
2227,
|
|
2813,
|
|
974,
|
|
558,
|
|
1503,
|
|
1958,
|
|
2918,
|
|
601,
|
|
1560,
|
|
984,
|
|
2388,
|
|
995,
|
|
2233,
|
|
1982,
|
|
165,
|
|
2788,
|
|
1312,
|
|
2285,
|
|
522,
|
|
1113,
|
|
1670,
|
|
323,
|
|
236,
|
|
1263,
|
|
1562,
|
|
2519,
|
|
1049,
|
|
432,
|
|
1167,
|
|
1394,
|
|
2022,
|
|
2551,
|
|
2194,
|
|
2187,
|
|
2282,
|
|
2816,
|
|
108,
|
|
301,
|
|
1185,
|
|
1315,
|
|
1420,
|
|
2436,
|
|
2322,
|
|
766
|
|
]
|
|
}
|
|
},
|
|
"num_fewshot": 15,
|
|
"metric_list": [
|
|
{
|
|
"metric": "f1_macro",
|
|
"aggregation": "f1_macro",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "find_similar_label",
|
|
"labels": [
|
|
"Sim",
|
|
"Não"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.1
|
|
}
|
|
},
|
|
"hatebr_offensive": {
|
|
"task": "hatebr_offensive",
|
|
"task_alias": "hatebr_offensive_binary",
|
|
"group": [
|
|
"pt_benchmark"
|
|
],
|
|
"dataset_path": "eduagarcia/portuguese_benchmark",
|
|
"dataset_name": "HateBR_offensive_binary",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:",
|
|
"doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
|
|
"description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
48,
|
|
44,
|
|
36,
|
|
20,
|
|
3511,
|
|
88,
|
|
3555,
|
|
16,
|
|
56,
|
|
3535,
|
|
60,
|
|
40,
|
|
3527,
|
|
4,
|
|
76,
|
|
3579,
|
|
3523,
|
|
3551,
|
|
68,
|
|
3503,
|
|
84,
|
|
3539,
|
|
64,
|
|
3599,
|
|
80,
|
|
3563,
|
|
3559,
|
|
3543,
|
|
3547,
|
|
3587,
|
|
3595,
|
|
3575,
|
|
3567,
|
|
3591,
|
|
24,
|
|
96,
|
|
92,
|
|
3507,
|
|
52,
|
|
72,
|
|
8,
|
|
3571,
|
|
3515,
|
|
3519,
|
|
3531,
|
|
28,
|
|
32,
|
|
0,
|
|
12,
|
|
3583
|
|
],
|
|
"id_column": "idx"
|
|
}
|
|
},
|
|
"num_fewshot": 25,
|
|
"metric_list": [
|
|
{
|
|
"metric": "f1_macro",
|
|
"aggregation": "f1_macro",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "find_similar_label",
|
|
"labels": [
|
|
"Sim",
|
|
"Não"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0
|
|
}
|
|
},
|
|
"oab_exams": {
|
|
"task": "oab_exams",
|
|
"group": [
|
|
"legal_benchmark",
|
|
"pt_benchmark"
|
|
],
|
|
"dataset_path": "eduagarcia/oab_exams",
|
|
"test_split": "train",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "<function doc_to_text at 0x14c26cd9ad40>",
|
|
"doc_to_target": "{{answerKey}}",
|
|
"description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
"2010-01_1",
|
|
"2010-01_11",
|
|
"2010-01_13",
|
|
"2010-01_23",
|
|
"2010-01_26",
|
|
"2010-01_28",
|
|
"2010-01_38",
|
|
"2010-01_48",
|
|
"2010-01_58",
|
|
"2010-01_68",
|
|
"2010-01_76",
|
|
"2010-01_83",
|
|
"2010-01_85",
|
|
"2010-01_91",
|
|
"2010-01_99"
|
|
],
|
|
"id_column": "id",
|
|
"exclude_from_task": true
|
|
}
|
|
},
|
|
"num_fewshot": 3,
|
|
"metric_list": [
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "normalize_spaces"
|
|
},
|
|
{
|
|
"function": "remove_accents"
|
|
},
|
|
{
|
|
"function": "find_choices",
|
|
"choices": [
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D"
|
|
],
|
|
"regex_patterns": [
|
|
"(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCD])\\b",
|
|
"\\b([ABCD])\\.",
|
|
"\\b([ABCD]) ?[.):-]",
|
|
"\\b([ABCD])$",
|
|
"\\b([ABCD])\\b"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
],
|
|
"group_by": {
|
|
"column": "exam_id"
|
|
}
|
|
}
|
|
],
|
|
"should_decontaminate": true,
|
|
"doc_to_decontamination_query": "<function doc_to_text at 0x14c26cd9afc0>",
|
|
"metadata": {
|
|
"version": 1.5
|
|
}
|
|
},
|
|
"portuguese_hate_speech": {
|
|
"task": "portuguese_hate_speech",
|
|
"task_alias": "portuguese_hate_speech_binary",
|
|
"group": [
|
|
"pt_benchmark"
|
|
],
|
|
"dataset_path": "eduagarcia/portuguese_benchmark",
|
|
"dataset_name": "Portuguese_Hate_Speech_binary",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:",
|
|
"doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
|
|
"description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "id_sampler",
|
|
"sampler_config": {
|
|
"id_list": [
|
|
52,
|
|
50,
|
|
39,
|
|
28,
|
|
3,
|
|
105,
|
|
22,
|
|
25,
|
|
60,
|
|
11,
|
|
66,
|
|
41,
|
|
9,
|
|
4,
|
|
91,
|
|
42,
|
|
7,
|
|
20,
|
|
76,
|
|
1,
|
|
104,
|
|
13,
|
|
67,
|
|
54,
|
|
97,
|
|
27,
|
|
24,
|
|
14,
|
|
16,
|
|
48,
|
|
53,
|
|
40,
|
|
34,
|
|
49,
|
|
32,
|
|
119,
|
|
114,
|
|
2,
|
|
58,
|
|
83,
|
|
18,
|
|
36,
|
|
5,
|
|
6,
|
|
10,
|
|
35,
|
|
38,
|
|
0,
|
|
21,
|
|
46
|
|
],
|
|
"id_column": "idx"
|
|
}
|
|
},
|
|
"num_fewshot": 25,
|
|
"metric_list": [
|
|
{
|
|
"metric": "f1_macro",
|
|
"aggregation": "f1_macro",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "find_similar_label",
|
|
"labels": [
|
|
"Sim",
|
|
"Não"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0
|
|
}
|
|
},
|
|
"tweetsentbr": {
|
|
"task": "tweetsentbr",
|
|
"group": [
|
|
"pt_benchmark"
|
|
],
|
|
"dataset_path": "eduagarcia/tweetsentbr_fewshot",
|
|
"test_split": "test",
|
|
"fewshot_split": "train",
|
|
"doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:",
|
|
"doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}",
|
|
"description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n",
|
|
"target_delimiter": " ",
|
|
"fewshot_delimiter": "\n\n",
|
|
"fewshot_config": {
|
|
"sampler": "first_n"
|
|
},
|
|
"num_fewshot": 25,
|
|
"metric_list": [
|
|
{
|
|
"metric": "f1_macro",
|
|
"aggregation": "f1_macro",
|
|
"higher_is_better": true
|
|
},
|
|
{
|
|
"metric": "acc",
|
|
"aggregation": "acc",
|
|
"higher_is_better": true
|
|
}
|
|
],
|
|
"output_type": "generate_until",
|
|
"generation_kwargs": {
|
|
"max_gen_toks": 32,
|
|
"do_sample": false,
|
|
"temperature": 0.0,
|
|
"top_k": null,
|
|
"top_p": null,
|
|
"until": [
|
|
"\n\n"
|
|
]
|
|
},
|
|
"repeats": 1,
|
|
"filter_list": [
|
|
{
|
|
"name": "all",
|
|
"filter": [
|
|
{
|
|
"function": "find_similar_label",
|
|
"labels": [
|
|
"Positivo",
|
|
"Neutro",
|
|
"Negativo"
|
|
]
|
|
},
|
|
{
|
|
"function": "take_first"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"should_decontaminate": false,
|
|
"metadata": {
|
|
"version": 1.0
|
|
}
|
|
}
|
|
},
|
|
"versions": {
|
|
"assin2_rte": 1.1,
|
|
"assin2_sts": 1.1,
|
|
"bluex": 1.1,
|
|
"enem_challenge": 1.1,
|
|
"faquad_nli": 1.1,
|
|
"hatebr_offensive": 1.0,
|
|
"oab_exams": 1.5,
|
|
"portuguese_hate_speech": 1.0,
|
|
"tweetsentbr": 1.0
|
|
},
|
|
"n-shot": {
|
|
"assin2_rte": 15,
|
|
"assin2_sts": 10,
|
|
"bluex": 3,
|
|
"enem_challenge": 3,
|
|
"faquad_nli": 15,
|
|
"hatebr_offensive": 25,
|
|
"oab_exams": 3,
|
|
"portuguese_hate_speech": 25,
|
|
"tweetsentbr": 25
|
|
},
|
|
"model_meta": {
|
|
"truncated": 0,
|
|
"non_truncated": 14150,
|
|
"padded": 0,
|
|
"non_padded": 14150,
|
|
"fewshots_truncated": 0,
|
|
"has_chat_template": true,
|
|
"chat_type": "user_assistant",
|
|
"n_gpus": 1,
|
|
"accelerate_num_process": null,
|
|
"model_sha": "None",
|
|
"model_dtype": "torch.bfloat16",
|
|
"model_memory_footprint": 4889264960,
|
|
"model_num_parameters": 2444628480,
|
|
"model_is_loaded_in_4bit": null,
|
|
"model_is_loaded_in_8bit": null,
|
|
"model_is_quantized": null,
|
|
"model_device": "cuda:0",
|
|
"batch_size": 16,
|
|
"max_length": 4096,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32
|
|
},
|
|
"task_model_meta": {
|
|
"assin2_rte": {
|
|
"sample_size": 2448,
|
|
"truncated": 0,
|
|
"non_truncated": 2448,
|
|
"padded": 0,
|
|
"non_padded": 2448,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1061.423202614379,
|
|
"min_seq_length": 1046,
|
|
"max_seq_length": 1100,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 15.0,
|
|
"mean_effective_fewshot_size": 15.0
|
|
},
|
|
"assin2_sts": {
|
|
"sample_size": 2448,
|
|
"truncated": 0,
|
|
"non_truncated": 2448,
|
|
"padded": 0,
|
|
"non_padded": 2448,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 747.4232026143791,
|
|
"min_seq_length": 732,
|
|
"max_seq_length": 786,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 10.0,
|
|
"mean_effective_fewshot_size": 10.0
|
|
},
|
|
"bluex": {
|
|
"sample_size": 719,
|
|
"truncated": 0,
|
|
"non_truncated": 719,
|
|
"padded": 0,
|
|
"non_padded": 719,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1198.817802503477,
|
|
"min_seq_length": 932,
|
|
"max_seq_length": 1829,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 3.0,
|
|
"mean_effective_fewshot_size": 3.0
|
|
},
|
|
"enem_challenge": {
|
|
"sample_size": 1429,
|
|
"truncated": 0,
|
|
"non_truncated": 1429,
|
|
"padded": 0,
|
|
"non_padded": 1429,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1035.4177746675998,
|
|
"min_seq_length": 857,
|
|
"max_seq_length": 2512,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 3.0,
|
|
"mean_effective_fewshot_size": 3.0
|
|
},
|
|
"faquad_nli": {
|
|
"sample_size": 650,
|
|
"truncated": 0,
|
|
"non_truncated": 650,
|
|
"padded": 0,
|
|
"non_padded": 650,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1083.1338461538462,
|
|
"min_seq_length": 1051,
|
|
"max_seq_length": 1149,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 15.0,
|
|
"mean_effective_fewshot_size": 15.0
|
|
},
|
|
"hatebr_offensive": {
|
|
"sample_size": 1400,
|
|
"truncated": 0,
|
|
"non_truncated": 1400,
|
|
"padded": 0,
|
|
"non_padded": 1400,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1090.4407142857142,
|
|
"min_seq_length": 1075,
|
|
"max_seq_length": 1284,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 25.0,
|
|
"mean_effective_fewshot_size": 25.0
|
|
},
|
|
"oab_exams": {
|
|
"sample_size": 2195,
|
|
"truncated": 0,
|
|
"non_truncated": 2195,
|
|
"padded": 0,
|
|
"non_padded": 2195,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 863.024145785877,
|
|
"min_seq_length": 690,
|
|
"max_seq_length": 1139,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 3.0,
|
|
"mean_effective_fewshot_size": 3.0
|
|
},
|
|
"portuguese_hate_speech": {
|
|
"sample_size": 851,
|
|
"truncated": 0,
|
|
"non_truncated": 851,
|
|
"padded": 0,
|
|
"non_padded": 851,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1442.021151586369,
|
|
"min_seq_length": 1415,
|
|
"max_seq_length": 1478,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 25.0,
|
|
"mean_effective_fewshot_size": 25.0
|
|
},
|
|
"tweetsentbr": {
|
|
"sample_size": 2010,
|
|
"truncated": 0,
|
|
"non_truncated": 2010,
|
|
"padded": 0,
|
|
"non_padded": 2010,
|
|
"fewshots_truncated": 0,
|
|
"mean_seq_length": 1370.4194029850746,
|
|
"min_seq_length": 1353,
|
|
"max_seq_length": 1427,
|
|
"max_ctx_length": 4064,
|
|
"max_gen_toks": 32,
|
|
"mean_original_fewshots_size": 25.0,
|
|
"mean_effective_fewshot_size": 25.0
|
|
}
|
|
},
|
|
"config": {
|
|
"model": "huggingface",
|
|
"model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17782345/step_42164",
|
|
"batch_size": "auto",
|
|
"batch_sizes": [],
|
|
"device": "cuda:0",
|
|
"use_cache": null,
|
|
"limit": null,
|
|
"bootstrap_iters": 100000,
|
|
"gen_kwargs": null
|
|
},
|
|
"git_hash": null
|
|
} |