Tucano-2b4-Instruct/results-pt-sft.json

{
  "results": {
    "assin2_rte": {
      "f1_macro,all": 0.3333333333333333,
      "f1_macro_stderr,all": 0.0031730251394380704,
      "acc,all": 0.5,
      "acc_stderr,all": 0.007138073526203421,
      "alias": "assin2_rte"
    },
    "assin2_sts": {
      "pearson,all": 0.07790738376477561,
      "pearson_stderr,all": 0.014716684813439029,
      "mse,all": 2.066646241830065,
      "mse_stderr,all": "N/A",
      "alias": "assin2_sts"
    },
    "bluex": {
      "acc,all": 0.20584144645340752,
      "acc_stderr,all": 0.008708200533939503,
      "acc,exam_id__USP_2024": 0.24390243902439024,
      "acc_stderr,exam_id__USP_2024": 0.038765983941525854,
      "acc,exam_id__UNICAMP_2018": 0.2222222222222222,
      "acc_stderr,exam_id__UNICAMP_2018": 0.032540841899570724,
      "acc,exam_id__USP_2018": 0.16666666666666666,
      "acc_stderr,exam_id__USP_2018": 0.029320906143906797,
      "acc,exam_id__USP_2022": 0.20408163265306123,
      "acc_stderr,exam_id__USP_2022": 0.03318725036103681,
      "acc,exam_id__USP_2021": 0.1346153846153846,
      "acc_stderr,exam_id__USP_2021": 0.027253785013691273,
      "acc,exam_id__UNICAMP_2021_2": 0.27450980392156865,
      "acc_stderr,exam_id__UNICAMP_2021_2": 0.0360141604917446,
      "acc,exam_id__UNICAMP_2021_1": 0.2826086956521739,
      "acc_stderr,exam_id__UNICAMP_2021_1": 0.038353605844743385,
      "acc,exam_id__USP_2020": 0.08928571428571429,
      "acc_stderr,exam_id__USP_2020": 0.021921367122397676,
      "acc,exam_id__UNICAMP_2022": 0.3076923076923077,
      "acc_stderr,exam_id__UNICAMP_2022": 0.042727899536711536,
      "acc,exam_id__UNICAMP_2023": 0.16279069767441862,
      "acc_stderr,exam_id__UNICAMP_2023": 0.03238294764474062,
      "acc,exam_id__UNICAMP_2024": 0.3111111111111111,
      "acc_stderr,exam_id__UNICAMP_2024": 0.0398333923513593,
      "acc,exam_id__UNICAMP_2020": 0.18181818181818182,
      "acc_stderr,exam_id__UNICAMP_2020": 0.02992924246353086,
      "acc,exam_id__USP_2023": 0.18181818181818182,
      "acc_stderr,exam_id__USP_2023": 0.03345273573325805,
      "acc,exam_id__UNICAMP_2019": 0.2,
      "acc_stderr,exam_id__UNICAMP_2019": 0.03269771596389771,
      "acc,exam_id__USP_2019": 0.175,
      "acc_stderr,exam_id__USP_2019": 0.0348121283420538,
      "alias": "bluex"
    },
    "enem_challenge": {
      "alias": "enem",
      "acc,all": 0.2092372288313506,
      "acc_stderr,all": 0.006229253057208555,
      "acc,exam_id__2016_2": 0.21138211382113822,
      "acc_stderr,exam_id__2016_2": 0.02123691588211098,
      "acc,exam_id__2023": 0.1925925925925926,
      "acc_stderr,exam_id__2023": 0.019583861187616968,
      "acc,exam_id__2013": 0.2037037037037037,
      "acc_stderr,exam_id__2013": 0.02238599079325693,
      "acc,exam_id__2012": 0.12931034482758622,
      "acc_stderr,exam_id__2012": 0.01803175905554286,
      "acc,exam_id__2009": 0.23478260869565218,
      "acc_stderr,exam_id__2009": 0.022814803582640184,
      "acc,exam_id__2022": 0.17293233082706766,
      "acc_stderr,exam_id__2022": 0.01889827424607104,
      "acc,exam_id__2015": 0.31932773109243695,
      "acc_stderr,exam_id__2015": 0.02472561230954832,
      "acc,exam_id__2014": 0.21100917431192662,
      "acc_stderr,exam_id__2014": 0.022513127008089658,
      "acc,exam_id__2016": 0.18181818181818182,
      "acc_stderr,exam_id__2016": 0.020248897347876847,
      "acc,exam_id__2010": 0.23076923076923078,
      "acc_stderr,exam_id__2010": 0.02246971773699712,
      "acc,exam_id__2017": 0.27586206896551724,
      "acc_stderr,exam_id__2017": 0.023994074423977864,
      "acc,exam_id__2011": 0.15384615384615385,
      "acc_stderr,exam_id__2011": 0.019264278502154123
    },
    "faquad_nli": {
      "f1_macro,all": 0.4396551724137931,
      "f1_macro_stderr,all": 0.0035796984729087084,
      "acc,all": 0.7846153846153846,
      "acc_stderr,all": 0.011396120309131327,
      "alias": "faquad_nli"
    },
    "hatebr_offensive": {
      "alias": "hatebr_offensive_binary",
      "f1_macro,all": 0.43054708155379295,
      "f1_macro_stderr,all": 0.009093679844467082,
      "acc,all": 0.4742857142857143,
      "acc_stderr,all": 0.009437507998400261
    },
    "oab_exams": {
      "acc,all": 0.25968109339407747,
      "acc_stderr,all": 0.005403181658894358,
      "acc,exam_id__2017-22": 0.2375,
      "acc_stderr,exam_id__2017-22": 0.027511429390216682,
      "acc,exam_id__2016-20a": 0.2,
      "acc_stderr,exam_id__2016-20a": 0.02584175311098727,
      "acc,exam_id__2011-04": 0.2875,
      "acc_stderr,exam_id__2011-04": 0.02919454405528515,
      "acc,exam_id__2013-12": 0.2875,
      "acc_stderr,exam_id__2013-12": 0.029277381115049662,
      "acc,exam_id__2013-11": 0.325,
      "acc_stderr,exam_id__2013-11": 0.030286419424458838,
      "acc,exam_id__2010-02": 0.32,
      "acc_stderr,exam_id__2010-02": 0.026888774775418785,
      "acc,exam_id__2012-07": 0.2375,
      "acc_stderr,exam_id__2012-07": 0.02737558649609428,
      "acc,exam_id__2016-21": 0.25,
      "acc_stderr,exam_id__2016-21": 0.027994547544285982,
      "acc,exam_id__2015-17": 0.24358974358974358,
      "acc_stderr,exam_id__2015-17": 0.02793267139214751,
      "acc,exam_id__2015-18": 0.2625,
      "acc_stderr,exam_id__2015-18": 0.028396710161944567,
      "acc,exam_id__2014-14": 0.225,
      "acc_stderr,exam_id__2014-14": 0.026939185801353988,
      "acc,exam_id__2015-16": 0.3375,
      "acc_stderr,exam_id__2015-16": 0.030631826713546063,
      "acc,exam_id__2012-09": 0.23376623376623376,
      "acc_stderr,exam_id__2012-09": 0.02787359925121907,
      "acc,exam_id__2011-03": 0.26262626262626265,
      "acc_stderr,exam_id__2011-03": 0.025505720074946718,
      "acc,exam_id__2016-19": 0.2692307692307692,
      "acc_stderr,exam_id__2016-19": 0.028948751914583667,
      "acc,exam_id__2012-06a": 0.2375,
      "acc_stderr,exam_id__2012-06a": 0.027440075549438697,
      "acc,exam_id__2011-05": 0.2625,
      "acc_stderr,exam_id__2011-05": 0.02835789202564455,
      "acc,exam_id__2013-10": 0.2125,
      "acc_stderr,exam_id__2013-10": 0.026367641247603036,
      "acc,exam_id__2017-24": 0.25,
      "acc_stderr,exam_id__2017-24": 0.028053164455460838,
      "acc,exam_id__2016-20": 0.2625,
      "acc_stderr,exam_id__2016-20": 0.0283327789711091,
      "acc,exam_id__2012-08": 0.275,
      "acc_stderr,exam_id__2012-08": 0.02893752928626648,
      "acc,exam_id__2014-13": 0.2,
      "acc_stderr,exam_id__2014-13": 0.025784866156114444,
      "acc,exam_id__2018-25": 0.25,
      "acc_stderr,exam_id__2018-25": 0.027961840366717016,
      "acc,exam_id__2017-23": 0.3,
      "acc_stderr,exam_id__2017-23": 0.02953160157687412,
      "acc,exam_id__2014-15": 0.2564102564102564,
      "acc_stderr,exam_id__2014-15": 0.028456647275964232,
      "acc,exam_id__2012-06": 0.275,
      "acc_stderr,exam_id__2012-06": 0.028790584320040398,
      "acc,exam_id__2010-01": 0.23529411764705882,
      "acc_stderr,exam_id__2010-01": 0.026470002521428834,
      "alias": "oab_exams"
    },
    "portuguese_hate_speech": {
      "alias": "portuguese_hate_speech_binary",
      "f1_macro,all": 0.35895915678524376,
      "f1_macro_stderr,all": 0.011204300451999685,
      "acc,all": 0.381903642773208,
      "acc_stderr,all": 0.011741654959752653
    },
    "tweetsentbr": {
      "f1_macro,all": 0.2114730555936155,
      "f1_macro_stderr,all": 0.0056538657419370805,
      "acc,all": 0.3169154228855721,
      "acc_stderr,all": 0.007343236186351586,
      "alias": "tweetsentbr"
    }
  },
  "configs": {
    "assin2_rte": {
      "task": "assin2_rte",
      "group": [
        "pt_benchmark",
        "assin2"
      ],
      "dataset_path": "assin2",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:",
      "doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}",
      "description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            1,
            3251,
            2,
            3252,
            3,
            4,
            5,
            6,
            3253,
            7,
            3254,
            3255,
            3256,
            8,
            9,
            10,
            3257,
            11,
            3258,
            12,
            13,
            14,
            15,
            3259,
            3260,
            3261,
            3262,
            3263,
            16,
            17,
            3264,
            18,
            3265,
            3266,
            3267,
            19,
            20,
            3268,
            3269,
            21,
            3270,
            3271,
            22,
            3272,
            3273,
            23,
            3274,
            24,
            25,
            3275
          ],
          "id_column": "sentence_pair_id"
        }
      },
      "num_fewshot": 15,
      "metric_list": [
        {
          "metric": "f1_macro",
          "aggregation": "f1_macro",
          "higher_is_better": true
        },
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "find_similar_label",
              "labels": [
                "Sim",
                "Não"
              ]
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 1.1
      }
    },
    "assin2_sts": {
      "task": "assin2_sts",
      "group": [
        "pt_benchmark",
        "assin2"
      ],
      "dataset_path": "assin2",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:",
      "doc_to_target": "<function assin2_float_to_pt_str at 0x14c26cd9a5c0>",
      "description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            1,
            3251,
            2,
            3252,
            3,
            4,
            5,
            6,
            3253,
            7,
            3254,
            3255,
            3256,
            8,
            9,
            10,
            3257,
            11,
            3258,
            12,
            13,
            14,
            15,
            3259,
            3260,
            3261,
            3262,
            3263,
            16,
            17,
            3264,
            18,
            3265,
            3266,
            3267,
            19,
            20,
            3268,
            3269,
            21,
            3270,
            3271,
            22,
            3272,
            3273,
            23,
            3274,
            24,
            25,
            3275
          ],
          "id_column": "sentence_pair_id"
        }
      },
      "num_fewshot": 10,
      "metric_list": [
        {
          "metric": "pearson",
          "aggregation": "pearsonr",
          "higher_is_better": true
        },
        {
          "metric": "mse",
          "aggregation": "mean_squared_error",
          "higher_is_better": false
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "number_filter",
              "type": "float",
              "range_min": 1.0,
              "range_max": 5.0,
              "on_outside_range": "clip",
              "fallback": 5.0
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 1.1
      }
    },
    "bluex": {
      "task": "bluex",
      "group": [
        "pt_benchmark",
        "vestibular"
      ],
      "dataset_path": "eduagarcia-temp/BLUEX_without_images",
      "test_split": "train",
      "fewshot_split": "train",
      "doc_to_text": "<function enem_doc_to_text at 0x14c26cd99b20>",
      "doc_to_target": "{{answerKey}}",
      "description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            "USP_2018_3",
            "UNICAMP_2018_2",
            "USP_2018_35",
            "UNICAMP_2018_16",
            "USP_2018_89"
          ],
          "id_column": "id",
          "exclude_from_task": true
        }
      },
      "num_fewshot": 3,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "normalize_spaces"
            },
            {
              "function": "remove_accents"
            },
            {
              "function": "find_choices",
              "choices": [
                "A",
                "B",
                "C",
                "D",
                "E"
              ],
              "regex_patterns": [
                "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
                "\\b([ABCDE])\\.",
                "\\b([ABCDE]) ?[.):-]",
                "\\b([ABCDE])$",
                "\\b([ABCDE])\\b"
              ]
            },
            {
              "function": "take_first"
            }
          ],
          "group_by": {
            "column": "exam_id"
          }
        }
      ],
      "should_decontaminate": true,
      "doc_to_decontamination_query": "<function enem_doc_to_text at 0x14c26cd99e40>",
      "metadata": {
        "version": 1.1
      }
    },
    "enem_challenge": {
      "task": "enem_challenge",
      "task_alias": "enem",
      "group": [
        "pt_benchmark",
        "vestibular"
      ],
      "dataset_path": "eduagarcia/enem_challenge",
      "test_split": "train",
      "fewshot_split": "train",
      "doc_to_text": "<function enem_doc_to_text at 0x14c26cd9a020>",
      "doc_to_target": "{{answerKey}}",
      "description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            "2022_21",
            "2022_88",
            "2022_143"
          ],
          "id_column": "id",
          "exclude_from_task": true
        }
      },
      "num_fewshot": 3,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "normalize_spaces"
            },
            {
              "function": "remove_accents"
            },
            {
              "function": "find_choices",
              "choices": [
                "A",
                "B",
                "C",
                "D",
                "E"
              ],
              "regex_patterns": [
                "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
                "\\b([ABCDE])\\.",
                "\\b([ABCDE]) ?[.):-]",
                "\\b([ABCDE])$",
                "\\b([ABCDE])\\b"
              ]
            },
            {
              "function": "take_first"
            }
          ],
          "group_by": {
            "column": "exam_id"
          }
        }
      ],
      "should_decontaminate": true,
      "doc_to_decontamination_query": "<function enem_doc_to_text at 0x14c26cd9a200>",
      "metadata": {
        "version": 1.1
      }
    },
    "faquad_nli": {
      "task": "faquad_nli",
      "group": [
        "pt_benchmark"
      ],
      "dataset_path": "ruanchaves/faquad-nli",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?",
      "doc_to_target": "{{['Não', 'Sim'][label]}}",
      "description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "first_n",
        "sampler_config": {
          "fewshot_indices": [
            1893,
            949,
            663,
            105,
            1169,
            2910,
            2227,
            2813,
            974,
            558,
            1503,
            1958,
            2918,
            601,
            1560,
            984,
            2388,
            995,
            2233,
            1982,
            165,
            2788,
            1312,
            2285,
            522,
            1113,
            1670,
            323,
            236,
            1263,
            1562,
            2519,
            1049,
            432,
            1167,
            1394,
            2022,
            2551,
            2194,
            2187,
            2282,
            2816,
            108,
            301,
            1185,
            1315,
            1420,
            2436,
            2322,
            766
          ]
        }
      },
      "num_fewshot": 15,
      "metric_list": [
        {
          "metric": "f1_macro",
          "aggregation": "f1_macro",
          "higher_is_better": true
        },
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "find_similar_label",
              "labels": [
                "Sim",
                "Não"
              ]
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 1.1
      }
    },
    "hatebr_offensive": {
      "task": "hatebr_offensive",
      "task_alias": "hatebr_offensive_binary",
      "group": [
        "pt_benchmark"
      ],
      "dataset_path": "eduagarcia/portuguese_benchmark",
      "dataset_name": "HateBR_offensive_binary",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:",
      "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
      "description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            48,
            44,
            36,
            20,
            3511,
            88,
            3555,
            16,
            56,
            3535,
            60,
            40,
            3527,
            4,
            76,
            3579,
            3523,
            3551,
            68,
            3503,
            84,
            3539,
            64,
            3599,
            80,
            3563,
            3559,
            3543,
            3547,
            3587,
            3595,
            3575,
            3567,
            3591,
            24,
            96,
            92,
            3507,
            52,
            72,
            8,
            3571,
            3515,
            3519,
            3531,
            28,
            32,
            0,
            12,
            3583
          ],
          "id_column": "idx"
        }
      },
      "num_fewshot": 25,
      "metric_list": [
        {
          "metric": "f1_macro",
          "aggregation": "f1_macro",
          "higher_is_better": true
        },
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "find_similar_label",
              "labels": [
                "Sim",
                "Não"
              ]
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    },
    "oab_exams": {
      "task": "oab_exams",
      "group": [
        "legal_benchmark",
        "pt_benchmark"
      ],
      "dataset_path": "eduagarcia/oab_exams",
      "test_split": "train",
      "fewshot_split": "train",
      "doc_to_text": "<function doc_to_text at 0x14c26cd9ad40>",
      "doc_to_target": "{{answerKey}}",
      "description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            "2010-01_1",
            "2010-01_11",
            "2010-01_13",
            "2010-01_23",
            "2010-01_26",
            "2010-01_28",
            "2010-01_38",
            "2010-01_48",
            "2010-01_58",
            "2010-01_68",
            "2010-01_76",
            "2010-01_83",
            "2010-01_85",
            "2010-01_91",
            "2010-01_99"
          ],
          "id_column": "id",
          "exclude_from_task": true
        }
      },
      "num_fewshot": 3,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "normalize_spaces"
            },
            {
              "function": "remove_accents"
            },
            {
              "function": "find_choices",
              "choices": [
                "A",
                "B",
                "C",
                "D"
              ],
              "regex_patterns": [
                "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCD])\\b",
                "\\b([ABCD])\\.",
                "\\b([ABCD]) ?[.):-]",
                "\\b([ABCD])$",
                "\\b([ABCD])\\b"
              ]
            },
            {
              "function": "take_first"
            }
          ],
          "group_by": {
            "column": "exam_id"
          }
        }
      ],
      "should_decontaminate": true,
      "doc_to_decontamination_query": "<function doc_to_text at 0x14c26cd9afc0>",
      "metadata": {
        "version": 1.5
      }
    },
    "portuguese_hate_speech": {
      "task": "portuguese_hate_speech",
      "task_alias": "portuguese_hate_speech_binary",
      "group": [
        "pt_benchmark"
      ],
      "dataset_path": "eduagarcia/portuguese_benchmark",
      "dataset_name": "Portuguese_Hate_Speech_binary",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:",
      "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
      "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "id_sampler",
        "sampler_config": {
          "id_list": [
            52,
            50,
            39,
            28,
            3,
            105,
            22,
            25,
            60,
            11,
            66,
            41,
            9,
            4,
            91,
            42,
            7,
            20,
            76,
            1,
            104,
            13,
            67,
            54,
            97,
            27,
            24,
            14,
            16,
            48,
            53,
            40,
            34,
            49,
            32,
            119,
            114,
            2,
            58,
            83,
            18,
            36,
            5,
            6,
            10,
            35,
            38,
            0,
            21,
            46
          ],
          "id_column": "idx"
        }
      },
      "num_fewshot": 25,
      "metric_list": [
        {
          "metric": "f1_macro",
          "aggregation": "f1_macro",
          "higher_is_better": true
        },
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "find_similar_label",
              "labels": [
                "Sim",
                "Não"
              ]
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    },
    "tweetsentbr": {
      "task": "tweetsentbr",
      "group": [
        "pt_benchmark"
      ],
      "dataset_path": "eduagarcia/tweetsentbr_fewshot",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:",
      "doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}",
      "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "first_n"
      },
      "num_fewshot": 25,
      "metric_list": [
        {
          "metric": "f1_macro",
          "aggregation": "f1_macro",
          "higher_is_better": true
        },
        {
          "metric": "acc",
          "aggregation": "acc",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_gen_toks": 32,
        "do_sample": false,
        "temperature": 0.0,
        "top_k": null,
        "top_p": null,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "all",
          "filter": [
            {
              "function": "find_similar_label",
              "labels": [
                "Positivo",
                "Neutro",
                "Negativo"
              ]
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    }
  },
  "versions": {
    "assin2_rte": 1.1,
    "assin2_sts": 1.1,
    "bluex": 1.1,
    "enem_challenge": 1.1,
    "faquad_nli": 1.1,
    "hatebr_offensive": 1.0,
    "oab_exams": 1.5,
    "portuguese_hate_speech": 1.0,
    "tweetsentbr": 1.0
  },
  "n-shot": {
    "assin2_rte": 15,
    "assin2_sts": 10,
    "bluex": 3,
    "enem_challenge": 3,
    "faquad_nli": 15,
    "hatebr_offensive": 25,
    "oab_exams": 3,
    "portuguese_hate_speech": 25,
    "tweetsentbr": 25
  },
  "model_meta": {
    "truncated": 0,
    "non_truncated": 14150,
    "padded": 0,
    "non_padded": 14150,
    "fewshots_truncated": 0,
    "has_chat_template": true,
    "chat_type": "user_assistant",
    "n_gpus": 1,
    "accelerate_num_process": null,
    "model_sha": "None",
    "model_dtype": "torch.bfloat16",
    "model_memory_footprint": 4889264960,
    "model_num_parameters": 2444628480,
    "model_is_loaded_in_4bit": null,
    "model_is_loaded_in_8bit": null,
    "model_is_quantized": null,
    "model_device": "cuda:0",
    "batch_size": 16,
    "max_length": 4096,
    "max_ctx_length": 4064,
    "max_gen_toks": 32
  },
  "task_model_meta": {
    "assin2_rte": {
      "sample_size": 2448,
      "truncated": 0,
      "non_truncated": 2448,
      "padded": 0,
      "non_padded": 2448,
      "fewshots_truncated": 0,
      "mean_seq_length": 1061.423202614379,
      "min_seq_length": 1046,
      "max_seq_length": 1100,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 15.0,
      "mean_effective_fewshot_size": 15.0
    },
    "assin2_sts": {
      "sample_size": 2448,
      "truncated": 0,
      "non_truncated": 2448,
      "padded": 0,
      "non_padded": 2448,
      "fewshots_truncated": 0,
      "mean_seq_length": 747.4232026143791,
      "min_seq_length": 732,
      "max_seq_length": 786,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 10.0,
      "mean_effective_fewshot_size": 10.0
    },
    "bluex": {
      "sample_size": 719,
      "truncated": 0,
      "non_truncated": 719,
      "padded": 0,
      "non_padded": 719,
      "fewshots_truncated": 0,
      "mean_seq_length": 1198.817802503477,
      "min_seq_length": 932,
      "max_seq_length": 1829,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 3.0,
      "mean_effective_fewshot_size": 3.0
    },
    "enem_challenge": {
      "sample_size": 1429,
      "truncated": 0,
      "non_truncated": 1429,
      "padded": 0,
      "non_padded": 1429,
      "fewshots_truncated": 0,
      "mean_seq_length": 1035.4177746675998,
      "min_seq_length": 857,
      "max_seq_length": 2512,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 3.0,
      "mean_effective_fewshot_size": 3.0
    },
    "faquad_nli": {
      "sample_size": 650,
      "truncated": 0,
      "non_truncated": 650,
      "padded": 0,
      "non_padded": 650,
      "fewshots_truncated": 0,
      "mean_seq_length": 1083.1338461538462,
      "min_seq_length": 1051,
      "max_seq_length": 1149,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 15.0,
      "mean_effective_fewshot_size": 15.0
    },
    "hatebr_offensive": {
      "sample_size": 1400,
      "truncated": 0,
      "non_truncated": 1400,
      "padded": 0,
      "non_padded": 1400,
      "fewshots_truncated": 0,
      "mean_seq_length": 1090.4407142857142,
      "min_seq_length": 1075,
      "max_seq_length": 1284,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 25.0,
      "mean_effective_fewshot_size": 25.0
    },
    "oab_exams": {
      "sample_size": 2195,
      "truncated": 0,
      "non_truncated": 2195,
      "padded": 0,
      "non_padded": 2195,
      "fewshots_truncated": 0,
      "mean_seq_length": 863.024145785877,
      "min_seq_length": 690,
      "max_seq_length": 1139,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 3.0,
      "mean_effective_fewshot_size": 3.0
    },
    "portuguese_hate_speech": {
      "sample_size": 851,
      "truncated": 0,
      "non_truncated": 851,
      "padded": 0,
      "non_padded": 851,
      "fewshots_truncated": 0,
      "mean_seq_length": 1442.021151586369,
      "min_seq_length": 1415,
      "max_seq_length": 1478,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 25.0,
      "mean_effective_fewshot_size": 25.0
    },
    "tweetsentbr": {
      "sample_size": 2010,
      "truncated": 0,
      "non_truncated": 2010,
      "padded": 0,
      "non_padded": 2010,
      "fewshots_truncated": 0,
      "mean_seq_length": 1370.4194029850746,
      "min_seq_length": 1353,
      "max_seq_length": 1427,
      "max_ctx_length": 4064,
      "max_gen_toks": 32,
      "mean_original_fewshots_size": 25.0,
      "mean_effective_fewshot_size": 25.0
    }
  },
  "config": {
    "model": "huggingface",
    "model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17782345/step_42164",
    "batch_size": "auto",
    "batch_sizes": [],
    "device": "cuda:0",
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": null
}