From 78a6661ff1838a52daeaf3ff3889f8596873821d Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 15 Jun 2026 07:40:14 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: bigscience/bloomz-7b1-p3 Source: Original Platform --- .gitattributes | 75 ++ README.md | 882 ++++++++++++++++++ config.json | 31 + configuration.json | 1 + .../ar/Answer_Given_options/results.json | 9 + .../ar/Choose_Story_Ending/results.json | 9 + .../ar/Generate_Ending/results.json | 9 + .../ar/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../es/Answer_Given_options/results.json | 9 + .../es/Choose_Story_Ending/results.json | 9 + .../es/Generate_Ending/results.json | 9 + .../es/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../eu/Answer_Given_options/results.json | 9 + .../eu/Choose_Story_Ending/results.json | 9 + .../eu/Generate_Ending/results.json | 9 + .../eu/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../hi/Answer_Given_options/results.json | 9 + .../hi/Choose_Story_Ending/results.json | 9 + .../hi/Generate_Ending/results.json | 9 + .../hi/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../id/Answer_Given_options/results.json | 9 + .../id/Choose_Story_Ending/results.json | 9 + .../id/Generate_Ending/results.json | 9 + .../id/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../zh/Answer_Given_options/results.json | 9 + .../zh/Choose_Story_Ending/results.json | 9 + .../zh/Generate_Ending/results.json | 9 + .../zh/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../en/Replace/results.json | 9 + .../en/True_or_False/results.json | 9 + .../en/does_underscore_refer_to/results.json | 9 + .../en/stand_for/results.json | 9 + .../en/underscore_refer_to/results.json | 
9 + .../fr/Replace/results.json | 9 + .../fr/True_or_False/results.json | 9 + .../fr/does_underscore_refer_to/results.json | 9 + .../fr/stand_for/results.json | 9 + .../fr/underscore_refer_to/results.json | 9 + .../pt/Replace/results.json | 9 + .../pt/True_or_False/results.json | 9 + .../pt/does_underscore_refer_to/results.json | 9 + .../pt/stand_for/results.json | 9 + .../pt/underscore_refer_to/results.json | 9 + .../zh/Replace/results.json | 9 + .../zh/True_or_False/results.json | 9 + .../zh/does_underscore_refer_to/results.json | 9 + .../zh/stand_for/results.json | 9 + .../zh/underscore_refer_to/results.json | 9 + .../anli/dev_r1/GPT-3_style/results.json | 9 + .../anli/dev_r1/MNLI_crowdsource/results.json | 9 + .../anli/dev_r1/can_we_infer/results.json | 9 + .../results.json | 9 + .../dev_r1/justified_in_saying/results.json | 9 + .../anli/dev_r2/GPT-3_style/results.json | 9 + .../anli/dev_r2/MNLI_crowdsource/results.json | 9 + .../anli/dev_r2/can_we_infer/results.json | 9 + .../results.json | 9 + .../dev_r2/justified_in_saying/results.json | 9 + .../anli/dev_r3/GPT-3_style/results.json | 9 + .../anli/dev_r3/MNLI_crowdsource/results.json | 9 + .../anli/dev_r3/can_we_infer/results.json | 9 + .../results.json | 9 + .../dev_r3/justified_in_saying/results.json | 9 + .../evaluation_l1/merged.csv | 194 ++++ .../evaluation_l1/merged.json | 1 + .../2016/Answer_Given_options/results.json | 9 + .../2016/Choose_Story_Ending/results.json | 9 + .../2016/Generate_Ending/results.json | 9 + .../2016/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../super_glue/cb/GPT-3_style/results.json | 9 + .../cb/MNLI_crowdsource/results.json | 9 + .../super_glue/cb/can_we_infer/results.json | 9 + .../results.json | 9 + .../cb/justified_in_saying/results.json | 9 + .../results.json | 9 + .../copa/C1_or_C2?_premise/results.json | 9 + .../super_glue/copa/best_option/results.json | 9 + .../super_glue/copa/cause_effect/results.json | 9 + .../copa/i_am_hesitating/results.json | 9 
+ .../copa/plausible_alternatives/results.json | 9 + .../super_glue/rte/GPT-3_style/results.json | 9 + .../rte/MNLI_crowdsource/results.json | 9 + .../rte/does_it_follow_that/results.json | 9 + .../rte/guaranteed_true/results.json | 9 + .../super_glue/rte/should_assume/results.json | 9 + .../winogrande_xl/Replace/results.json | 9 + .../winogrande_xl/True_or_False/results.json | 9 + .../does_underscore_refer_to/results.json | 9 + .../winogrande_xl/stand_for/results.json | 9 + .../underscore_refer_to/results.json | 9 + .../xcopa/id/C1_or_C2?_premise/results.json | 9 + .../xcopa/id/best_option/results.json | 9 + .../xcopa/id/cause_effect/results.json | 9 + .../xcopa/id/i_am_hesitating/results.json | 9 + .../id/plausible_alternatives/results.json | 9 + .../xcopa/sw/C1_or_C2?_premise/results.json | 9 + .../xcopa/sw/best_option/results.json | 9 + .../xcopa/sw/cause_effect/results.json | 9 + .../xcopa/sw/i_am_hesitating/results.json | 9 + .../sw/plausible_alternatives/results.json | 9 + .../xcopa/ta/C1_or_C2?_premise/results.json | 9 + .../xcopa/ta/best_option/results.json | 9 + .../xcopa/ta/cause_effect/results.json | 9 + .../xcopa/ta/i_am_hesitating/results.json | 9 + .../ta/plausible_alternatives/results.json | 9 + .../xcopa/vi/C1_or_C2?_premise/results.json | 9 + .../xcopa/vi/best_option/results.json | 9 + .../xcopa/vi/cause_effect/results.json | 9 + .../xcopa/vi/i_am_hesitating/results.json | 9 + .../vi/plausible_alternatives/results.json | 9 + .../results.json | 9 + .../xcopa/zh/C1_or_C2?_premise/results.json | 9 + .../xcopa/zh/best_option/results.json | 9 + .../xcopa/zh/cause_effect/results.json | 9 + .../xcopa/zh/i_am_hesitating/results.json | 9 + .../zh/plausible_alternatives/results.json | 9 + .../xnli/ar/GPT-3_style/results.json | 9 + .../xnli/ar/MNLI_crowdsource/results.json | 9 + .../xnli/ar/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/ar/justified_in_saying/results.json | 9 + .../xnli/en/GPT-3_style/results.json | 9 + 
.../xnli/en/MNLI_crowdsource/results.json | 9 + .../xnli/en/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/en/justified_in_saying/results.json | 9 + .../xnli/es/GPT-3_style/results.json | 9 + .../xnli/es/MNLI_crowdsource/results.json | 9 + .../xnli/es/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/es/justified_in_saying/results.json | 9 + .../xnli/fr/GPT-3_style/results.json | 9 + .../xnli/fr/MNLI_crowdsource/results.json | 9 + .../xnli/fr/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/fr/justified_in_saying/results.json | 9 + .../xnli/hi/GPT-3_style/results.json | 9 + .../xnli/hi/MNLI_crowdsource/results.json | 9 + .../xnli/hi/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/hi/justified_in_saying/results.json | 9 + .../xnli/sw/GPT-3_style/results.json | 9 + .../xnli/sw/MNLI_crowdsource/results.json | 9 + .../xnli/sw/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/sw/justified_in_saying/results.json | 9 + .../xnli/ur/GPT-3_style/results.json | 9 + .../xnli/ur/MNLI_crowdsource/results.json | 9 + .../xnli/ur/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/ur/justified_in_saying/results.json | 9 + .../xnli/vi/GPT-3_style/results.json | 9 + .../xnli/vi/MNLI_crowdsource/results.json | 9 + .../xnli/vi/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/vi/justified_in_saying/results.json | 9 + .../xnli/zh/GPT-3_style/results.json | 9 + .../xnli/zh/MNLI_crowdsource/results.json | 9 + .../xnli/zh/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/zh/justified_in_saying/results.json | 9 + .../my/Answer_Given_options/results.json | 9 + .../my/Choose_Story_Ending/results.json | 9 + .../my/Generate_Ending/results.json | 9 + .../my/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../ru/Answer_Given_options/results.json | 9 + .../ru/Choose_Story_Ending/results.json | 9 + .../ru/Generate_Ending/results.json | 9 + 
.../ru/Novel_Correct_Ending/results.json | 9 + .../results.json | 9 + .../jp/Replace/results.json | 9 + .../jp/True_or_False/results.json | 9 + .../jp/does_underscore_refer_to/results.json | 9 + .../jp/stand_for/results.json | 9 + .../jp/underscore_refer_to/results.json | 9 + .../ru/Replace/results.json | 9 + .../ru/True_or_False/results.json | 9 + .../ru/does_underscore_refer_to/results.json | 9 + .../ru/stand_for/results.json | 9 + .../ru/underscore_refer_to/results.json | 9 + .../xcopa/et/C1_or_C2?_premise/results.json | 9 + .../xcopa/et/best_option/results.json | 9 + .../xcopa/et/cause_effect/results.json | 9 + .../xcopa/et/i_am_hesitating/results.json | 9 + .../et/plausible_alternatives/results.json | 9 + .../xcopa/ht/C1_or_C2?_premise/results.json | 9 + .../xcopa/ht/best_option/results.json | 9 + .../xcopa/ht/cause_effect/results.json | 9 + .../xcopa/ht/i_am_hesitating/results.json | 9 + .../ht/plausible_alternatives/results.json | 9 + .../xcopa/it/C1_or_C2?_premise/results.json | 9 + .../xcopa/it/best_option/results.json | 9 + .../xcopa/it/cause_effect/results.json | 9 + .../xcopa/it/i_am_hesitating/results.json | 9 + .../it/plausible_alternatives/results.json | 9 + .../xcopa/qu/C1_or_C2?_premise/results.json | 9 + .../xcopa/qu/best_option/results.json | 9 + .../xcopa/qu/cause_effect/results.json | 9 + .../xcopa/qu/i_am_hesitating/results.json | 9 + .../qu/plausible_alternatives/results.json | 9 + .../xcopa/tr/C1_or_C2?_premise/results.json | 9 + .../xcopa/tr/best_option/results.json | 9 + .../xcopa/tr/cause_effect/results.json | 9 + .../xcopa/tr/i_am_hesitating/results.json | 9 + .../tr/plausible_alternatives/results.json | 9 + .../xnli/bg/GPT-3_style/results.json | 9 + .../xnli/bg/MNLI_crowdsource/results.json | 9 + .../xnli/bg/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/bg/justified_in_saying/results.json | 9 + .../xnli/de/GPT-3_style/results.json | 9 + .../xnli/de/MNLI_crowdsource/results.json | 9 + 
.../xnli/de/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/de/justified_in_saying/results.json | 9 + .../xnli/el/GPT-3_style/results.json | 9 + .../xnli/el/MNLI_crowdsource/results.json | 9 + .../xnli/el/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/el/justified_in_saying/results.json | 9 + .../xnli/ru/GPT-3_style/results.json | 9 + .../xnli/ru/MNLI_crowdsource/results.json | 9 + .../xnli/ru/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/ru/justified_in_saying/results.json | 9 + .../xnli/th/GPT-3_style/results.json | 9 + .../xnli/th/MNLI_crowdsource/results.json | 9 + .../xnli/th/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/th/justified_in_saying/results.json | 9 + .../xnli/tr/GPT-3_style/results.json | 9 + .../xnli/tr/MNLI_crowdsource/results.json | 9 + .../xnli/tr/can_we_infer/results.json | 9 + .../results.json | 9 + .../xnli/tr/justified_in_saying/results.json | 9 + ...ed=1234.timestamp=2022-09-09T23:48:40.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:40.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:40.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + 
...ed=1234.timestamp=2022-09-09T23:48:38.json | 1 + .../en/prompt_body_title_to_star/results.json | 9 + .../en/prompt_review_to_star/results.json | 9 + .../en/prompt_title_to_star/results.json | 9 + .../es/prompt_body_title_to_star/results.json | 9 + .../es/prompt_review_to_star/results.json | 9 + .../es/prompt_title_to_star/results.json | 9 + .../fr/prompt_body_title_to_star/results.json | 9 + .../fr/prompt_review_to_star/results.json | 9 + .../fr/prompt_title_to_star/results.json | 9 + .../zh/prompt_body_title_to_star/results.json | 9 + .../zh/prompt_review_to_star/results.json | 9 + .../zh/prompt_title_to_star/results.json | 9 + .../results.json | 9 + .../aqua_rat/raw/answer_quiz/results.json | 9 + .../raw/select_the_best_option/results.json | 9 + .../art/choose_hypothesis/results.json | 9 + .../choose_hypothesis_believable/results.json | 9 + .../art/choose_hypothesis_desc/results.json | 9 + .../art/choose_hypothesis_likely/results.json | 9 + .../choose_hypothesis_options/results.json | 9 + .../direct_to_which_department/results.json | 9 + .../banking77/help_page_topic/results.json | 9 + .../rephrase_as_banking_term/results.json | 9 + .../classify/results.json | 9 + .../multi-choice/results.json | 9 + .../premise_context_first/results.json | 9 + .../grammatical_between_1_2/results.json | 9 + .../grammatical_between_A_B/results.json | 9 + .../grammatical_which_one_1_2/results.json | 9 + .../single_sentence_bad_yes_no/results.json | 9 + .../single_sentence_good_yes_no/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../third_evidence_claim_pair/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../most_suitable_answer/results.json | 9 + .../question_answering/results.json | 9 + .../conv_ai_3/ambiguous/results.json | 9 + .../clarification_needed/results.json | 9 + .../conv_ai_3/directly_answer/results.json | 9 + 
.../conv_ai_3/score_give_number/results.json | 9 + .../conv_ai_3/score_how_much/results.json | 9 + .../best_deal/results.json | 9 + .../good_deal_for_seller/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../answer_with_class_label/results.json | 9 + .../results.json | 9 + .../reply_with_emoation_label/results.json | 9 + ...d=1234.timestamp=2022-09-09T23:48:40.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:40.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:40.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:43.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:43.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:43.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:43.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:43.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + ...d=1234.timestamp=2022-09-09T23:48:38.jsonl | 3 + .../bullish_neutral_bearish/results.json | 9 + .../complementary_industries/results.json | 9 + .../sentences_allagree/sentiment/results.json | 9 + .../share_price_option/results.json | 9 + .../word_comes_to_mind/results.json | 9 + .../results.json | 9 + .../glue/cola/Make_sense_yes_no/results.json | 9 + .../Previous_sentence_acceptable/results.json | 9 + .../glue/cola/editing/results.json | 9 + .../glue/cola/is_this_correct/results.json | 9 + .../following_positive_negative/results.json | 9 + .../glue/sst2/happy_or_mad/results.json | 9 + 
.../sst2/positive_negative_after/results.json | 9 + .../glue/sst2/review/results.json | 9 + .../glue/sst2/said/results.json | 9 + .../multiple_choice_a_and_q_en/results.json | 9 + .../results.json | 9 + .../multiple_choice_q_and_a_en/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../multiple_choice_a_and_q_en/results.json | 9 + .../results.json | 9 + .../multiple_choice_q_and_a_en/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../is_same_event_editor_asks/results.json | 9 + .../results.json | 9 + .../hlgd/is_same_event_refer/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../consume_with_caution/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../sa_spaeng/express_sentiment/results.json | 9 + .../sa_spaeng/negation_template/results.json | 9 + .../results.json | 9 + .../sentiment_trying_to_express/results.json | 9 + .../sa_spaeng/the_author_seem/results.json | 9 + .../math_qa/choose_correct_og/results.json | 9 + .../first_choice_then_problem/results.json | 9 + .../math_qa/gre_problem/results.json | 9 + .../math_qa/pick_the_correct/results.json | 9 + .../math_qa/problem_set_type/results.json | 9 + .../evaluation_val/merged.csv | 207 ++++ .../evaluation_val/merged.json | 1 + .../Evidences_+_review/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../mwsc/in-the-sentence/results.json | 9 + .../mwsc/is-correct/results.json | 9 + .../mwsc/options-or/results.json | 9 + .../mwsc/what-think/results.json | 9 + .../onestop_english/ara_context/results.json | 9 + .../onestop_english/assess/results.json | 9 + .../results.json | 9 + .../onestop_english/esl_context/results.json | 9 + .../esl_variation/results.json | 9 + .../results.json | 9 + .../most_appropriate_sentiment/results.json | 9 + .../results.json | 9 + .../results.json | 9 
+ .../question_answer_format/results.json | 9 + .../results.json | 9 + .../Question_Answering_(Short)/results.json | 9 + .../results.json | 9 + .../most_suitable_answer/results.json | 9 + .../question_answering/results.json | 9 + .../question_to_answer_index/results.json | 9 + .../scicite/Classify_intent/results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../results.json | 9 + .../scicite/can_describe/results.json | 9 + .../is-he-talking-about/results.json | 9 + .../make-sense-rand/results.json | 9 + .../which-answer-1st-vs-random/results.json | 9 + .../would-make-sense-qu-rand/results.json | 9 + ...ed=1234.timestamp=2022-09-09T23:48:40.json | 132 +++ ...ed=1234.timestamp=2022-09-09T23:48:40.json | 132 +++ ...ed=1234.timestamp=2022-09-09T23:48:40.json | 132 +++ ...ed=1234.timestamp=2022-09-09T23:48:43.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:43.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + ...ed=1234.timestamp=2022-09-09T23:48:38.json | 24 + .../categorize_query/results.json | 9 + .../categorize_query_brief/results.json | 9 + .../intent_query/results.json | 9 + .../query_intent/results.json | 9 + .../voice_intent/results.json | 9 + ...ed=1234.timestamp=2022-09-10T11:48:47.json | 1 + 
...ed=1234.timestamp=2022-09-10T11:48:47.json | 1 + ...d=1234.timestamp=2022-09-10T11:48:47.jsonl | 3 + ...d=1234.timestamp=2022-09-10T11:48:47.jsonl | 3 + ...ed=1234.timestamp=2022-09-10T11:48:47.json | 24 + ...ed=1234.timestamp=2022-09-10T11:48:47.json | 24 + .../xnli/ar/GPT-3_style_arht/results.json | 9 + .../ar/MNLI_crowdsource_arht/results.json | 9 + .../xnli/ar/can_we_infer_arht/results.json | 9 + .../results.json | 9 + .../ar/justified_in_saying_arht/results.json | 9 + .../xnli/es/GPT-3_style_esht/results.json | 9 + .../es/MNLI_crowdsource_esht/results.json | 9 + .../xnli/es/can_we_infer_esht/results.json | 9 + .../results.json | 9 + .../es/justified_in_saying_esht/results.json | 9 + .../xnli/fr/GPT-3_style_frht/results.json | 9 + .../fr/MNLI_crowdsource_frht/results.json | 9 + .../xnli/fr/can_we_infer_frht/results.json | 9 + .../results.json | 9 + .../fr/justified_in_saying_frht/results.json | 9 + .../xnli/hi/GPT-3_style_hiht/results.json | 9 + .../hi/MNLI_crowdsource_hiht/results.json | 9 + .../xnli/hi/can_we_infer_hiht/results.json | 9 + .../results.json | 9 + .../hi/justified_in_saying_hiht/results.json | 9 + .../evaluation_xnliht/xnli/merged.csv | 50 + .../evaluation_xnliht/xnli/merged.json | 1 + .../xnli/sw/GPT-3_style_swht/results.json | 9 + .../sw/MNLI_crowdsource_swht/results.json | 9 + .../xnli/sw/can_we_infer_swht/results.json | 9 + .../results.json | 9 + .../sw/justified_in_saying_swht/results.json | 9 + .../xnli/ur/GPT-3_style_urht/results.json | 9 + .../ur/MNLI_crowdsource_urht/results.json | 9 + .../xnli/ur/can_we_infer_urht/results.json | 9 + .../results.json | 9 + .../ur/justified_in_saying_urht/results.json | 9 + .../xnli/vi/GPT-3_style_viht/results.json | 9 + .../vi/MNLI_crowdsource_viht/results.json | 9 + .../xnli/vi/can_we_infer_viht/results.json | 9 + .../results.json | 9 + .../vi/justified_in_saying_viht/results.json | 9 + .../xnli/zh/GPT-3_style_zhht/results.json | 9 + .../zh/MNLI_crowdsource_zhht/results.json | 9 + 
.../xnli/zh/can_we_infer_zhht/results.json | 9 + .../results.json | 9 + .../zh/justified_in_saying_zhht/results.json | 9 + .../xnli/ar/GPT-3_style_armt/results.json | 9 + .../ar/MNLI_crowdsource_armt/results.json | 9 + .../xnli/ar/can_we_infer_armt/results.json | 9 + .../results.json | 9 + .../ar/justified_in_saying_armt/results.json | 9 + .../xnli/es/GPT-3_style_esmt/results.json | 9 + .../es/MNLI_crowdsource_esmt/results.json | 9 + .../xnli/es/can_we_infer_esmt/results.json | 9 + .../results.json | 9 + .../es/justified_in_saying_esmt/results.json | 9 + .../xnli/fr/GPT-3_style_frmt/results.json | 9 + .../fr/MNLI_crowdsource_frmt/results.json | 9 + .../xnli/fr/can_we_infer_frmt/results.json | 9 + .../results.json | 9 + .../fr/justified_in_saying_frmt/results.json | 9 + .../xnli/hi/GPT-3_style_himt/results.json | 9 + .../hi/MNLI_crowdsource_himt/results.json | 9 + .../xnli/hi/can_we_infer_himt/results.json | 9 + .../results.json | 9 + .../hi/justified_in_saying_himt/results.json | 9 + .../evaluation_xnlimt/xnli/merged.csv | 50 + .../evaluation_xnlimt/xnli/merged.json | 1 + .../xnli/sw/GPT-3_style_swmt/results.json | 9 + .../sw/MNLI_crowdsource_swmt/results.json | 9 + .../xnli/sw/can_we_infer_swmt/results.json | 9 + .../results.json | 9 + .../sw/justified_in_saying_swmt/results.json | 9 + .../xnli/ur/GPT-3_style_urmt/results.json | 9 + .../ur/MNLI_crowdsource_urmt/results.json | 9 + .../xnli/ur/can_we_infer_urmt/results.json | 9 + .../results.json | 9 + .../ur/justified_in_saying_urmt/results.json | 9 + .../xnli/vi/GPT-3_style_vimt/results.json | 9 + .../vi/MNLI_crowdsource_vimt/results.json | 9 + .../xnli/vi/can_we_infer_vimt/results.json | 9 + .../results.json | 9 + .../vi/justified_in_saying_vimt/results.json | 9 + .../xnli/zh/GPT-3_style_zhmt/results.json | 9 + .../zh/MNLI_crowdsource_zhmt/results.json | 9 + .../xnli/zh/can_we_infer_zhmt/results.json | 9 + .../results.json | 9 + .../zh/justified_in_saying_zhmt/results.json | 9 + 
.../zh/Answer_Given_options_zhht/results.json | 9 + .../zh/Choose_Story_Ending_zhht/results.json | 9 + .../zh/Generate_Ending_zhht/results.json | 9 + .../zh/Novel_Correct_Ending_zhht/results.json | 9 + .../results.json | 9 + .../zh/Replace_zhht/results.json | 9 + .../zh/True_or_False_zhht/results.json | 9 + .../results.json | 9 + .../zh/stand_for_zhht/results.json | 9 + .../zh/underscore_refer_to_zhht/results.json | 9 + .../evaluation_xwinostorycopaht/merged.csv | 20 + .../evaluation_xwinostorycopaht/merged.json | 1 + .../zh/C1_or_C2?_premise_zhht/results.json | 9 + .../xcopa/zh/best_option_zhht/results.json | 9 + .../xcopa/zh/cause_effect_zhht/results.json | 9 + .../zh/i_am_hesitating_zhht/results.json | 9 + .../plausible_alternatives_zhht/results.json | 9 + .../ar/Answer_Given_options_armt/results.json | 9 + .../ar/Choose_Story_Ending_armt/results.json | 9 + .../ar/Generate_Ending_armt/results.json | 9 + .../ar/Novel_Correct_Ending_armt/results.json | 9 + .../results.json | 9 + .../es/Answer_Given_options_esmt/results.json | 9 + .../es/Choose_Story_Ending_esmt/results.json | 9 + .../es/Generate_Ending_esmt/results.json | 9 + .../es/Novel_Correct_Ending_esmt/results.json | 9 + .../results.json | 9 + .../eu/Answer_Given_options_eumt/results.json | 9 + .../eu/Choose_Story_Ending_eumt/results.json | 9 + .../eu/Generate_Ending_eumt/results.json | 9 + .../eu/Novel_Correct_Ending_eumt/results.json | 9 + .../results.json | 9 + .../hi/Answer_Given_options_himt/results.json | 9 + .../hi/Choose_Story_Ending_himt/results.json | 9 + .../hi/Generate_Ending_himt/results.json | 9 + .../hi/Novel_Correct_Ending_himt/results.json | 9 + .../results.json | 9 + .../id/Answer_Given_options_idmt/results.json | 9 + .../id/Choose_Story_Ending_idmt/results.json | 9 + .../id/Generate_Ending_idmt/results.json | 9 + .../id/Novel_Correct_Ending_idmt/results.json | 9 + .../results.json | 9 + .../zh/Answer_Given_options_zhmt/results.json | 9 + .../zh/Choose_Story_Ending_zhmt/results.json | 9 + 
.../zh/Generate_Ending_zhmt/results.json | 9 + .../zh/Novel_Correct_Ending_zhmt/results.json | 9 + .../results.json | 9 + .../fr/Replace_frmt/results.json | 9 + .../fr/True_or_False_frmt/results.json | 9 + .../results.json | 9 + .../fr/stand_for_frmt/results.json | 9 + .../fr/underscore_refer_to_frmt/results.json | 9 + .../pt/Replace_ptmt/results.json | 9 + .../pt/True_or_False_ptmt/results.json | 9 + .../results.json | 9 + .../pt/stand_for_ptmt/results.json | 9 + .../pt/underscore_refer_to_ptmt/results.json | 9 + .../zh/Replace_zhmt/results.json | 9 + .../zh/True_or_False_zhmt/results.json | 9 + .../results.json | 9 + .../zh/stand_for_zhmt/results.json | 9 + .../zh/underscore_refer_to_zhmt/results.json | 9 + .../evaluation_xwinostorycopamt/merged.csv | 86 ++ .../evaluation_xwinostorycopamt/merged.json | 1 + .../id/C1_or_C2?_premise_idmt/results.json | 9 + .../xcopa/id/best_option_idmt/results.json | 9 + .../xcopa/id/cause_effect_idmt/results.json | 9 + .../id/i_am_hesitating_idmt/results.json | 9 + .../plausible_alternatives_idmt/results.json | 9 + .../sw/C1_or_C2?_premise_swmt/results.json | 9 + .../xcopa/sw/best_option_swmt/results.json | 9 + .../xcopa/sw/cause_effect_swmt/results.json | 9 + .../sw/i_am_hesitating_swmt/results.json | 9 + .../plausible_alternatives_swmt/results.json | 9 + .../ta/C1_or_C2?_premise_tamt/results.json | 9 + .../xcopa/ta/best_option_tamt/results.json | 9 + .../xcopa/ta/cause_effect_tamt/results.json | 9 + .../ta/i_am_hesitating_tamt/results.json | 9 + .../plausible_alternatives_tamt/results.json | 9 + .../vi/C1_or_C2?_premise_vimt/results.json | 9 + .../xcopa/vi/best_option_vimt/results.json | 9 + .../xcopa/vi/cause_effect_vimt/results.json | 9 + .../vi/i_am_hesitating_vimt/results.json | 9 + .../plausible_alternatives_vimt/results.json | 9 + .../zh/C1_or_C2?_premise_zhmt/results.json | 9 + .../xcopa/zh/best_option_zhmt/results.json | 9 + .../xcopa/zh/cause_effect_zhmt/results.json | 9 + .../zh/i_am_hesitating_zhmt/results.json | 9 + 
.../plausible_alternatives_zhmt/results.json | 9 + model.safetensors | 3 + pytorch_model.bin | 3 + special_tokens_map.json | 1 + tokenizer.json | 3 + tokenizer_config.json | 1 + 634 files changed, 7477 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json 
create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/merged.csv create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l1/merged.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise,_so_because…/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/does_it_follow_that/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/guaranteed_true/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/should_assume/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/best_option/results.json 
create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise,_so_because…/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/GPT-3_style/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/GPT-3_style/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Answer_Given_options/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Answer_Given_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Choose_Story_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Generate_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Novel_Correct_Ending/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Story_Continuation_and_Options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/True_or_False/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/does_underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/Replace/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/True_or_False/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/does_underscore_refer_to/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/stand_for/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/underscore_refer_to/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/cause_effect/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/C1_or_C2?_premise/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/cause_effect/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/i_am_hesitating/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/plausible_alternatives/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/can_we_infer/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/GPT-3_style/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/MNLI_crowdsource/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/can_we_infer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/guaranteed_possible_impossible/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/justified_in_saying/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_body_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_review_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_body_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_review_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_body_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_review_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_title_to_star/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_body_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_review_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_title_to_star/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/Answer_questions_from_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/answer_quiz/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/select_the_best_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_believable/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_desc/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_likely/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/banking77/direct_to_which_department/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/banking77/help_page_topic/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/banking77/rephrase_as_banking_term/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/classify/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/multi-choice/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/premise_context_first/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_1_2/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_A_B/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_which_one_1_2/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_bad_yes_no/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_good_yes_no/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/claim_and_all_supporting_evidences/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/fifth_evidence_and_claim_itemization/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/first_evidence_and_claim_itemization/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/second_evidence_and_claim_itemization/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/third_evidence_claim_pair/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_after_sentence_and_choices/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_before_sentence_and_choices/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/interrogative_instruction_after_sentence_and_choices/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/answer_given_question_without_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/most_suitable_answer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/question_answering/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/ambiguous/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/clarification_needed/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/directly_answer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_give_number/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_how_much/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/best_deal/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price_implicit/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_question_with_emotion_label/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_with_class_label/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/emotion/choose_the_best_emotion_label/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/emotion/reply_with_emoation_label/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/bullish_neutral_bearish/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/complementary_industries/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/sentiment/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/share_price_option/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/word_comes_to_mind/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Following_sentence_acceptable/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Make_sense_yes_no/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Previous_sentence_acceptable/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/editing/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/is_this_correct/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/following_positive_negative/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/happy_or_mad/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/positive_negative_after/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/review/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/said/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_with_context_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_with_context_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_with_context_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_with_context_en/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_explanation_classification/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_after_reading_I_believe/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_tell_me/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_editor_asks/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_interrogative_talk/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_refer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_related/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_talk/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_does_it_follow_a_hyperpartisan_argumentation/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_it_exhibits_extreme_one_sidedness/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consume_with_caution/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/extreme_left_wing_or_right_wing/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/follows_hyperpartisan_argumentation/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/liar/Given_statement_guess_category/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/express_sentiment/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/negation_template/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/original_poster_expressed_sentiment/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/sentiment_trying_to_express/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/the_author_seem/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/math_qa/choose_correct_og/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/math_qa/first_choice_then_problem/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/math_qa/gre_problem/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/math_qa/pick_the_correct/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/math_qa/problem_set_type/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/merged.csv create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/merged.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_+_review/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_sentiment_classification/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Standard_binary_sentiment_analysis/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence-question-first/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/mwsc/is-correct/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/mwsc/options-or/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/mwsc/what-think/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/ara_context/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/assess/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/determine_reading_level_from_the_first_three_sentences/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_context/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_variation/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/guess_sentiment_without_options_variation_1/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/most_appropriate_sentiment/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_1/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_2/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/question_answer_format/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Long_Answer_to_Final_Decision/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Question_Answering_(Short)/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/answer_given_question_without_options/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/most_suitable_answer/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_answering/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_to_answer_index/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(choices_first)/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(select_choice)/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_w_section_(select_choice)/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/scicite/can_describe/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/is-he-talking-about/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/make-sense-rand/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/which-answer-1st-vs-random/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/would-make-sense-qu-rand/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query_brief/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/intent_query/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/query_intent/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/voice_intent/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.csv create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.csv create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Generate_Ending_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/Replace_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/True_or_False_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/stand_for_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/underscore_refer_to_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.csv create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/C1_or_C2?_premise_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/best_option_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/cause_effect_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/i_am_hesitating_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/plausible_alternatives_zhht/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.csv create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/best_option_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/cause_effect_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/i_am_hesitating_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/plausible_alternatives_idmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/best_option_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/cause_effect_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/best_option_tamt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/cause_effect_tamt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/best_option_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/cause_effect_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/best_option_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/cause_effect_zhmt/results.json create mode 100644 
evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json create mode 100644 evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json create mode 100644 model.safetensors create mode 100644 pytorch_model.bin create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..889f5b0 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,75 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text + +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text + +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text + +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text + +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs 
diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text + +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text 
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs 
merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text 
+evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl filter=lfs diff=lfs merge=lfs -text +evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl filter=lfs diff=lfs merge=lfs -text + +pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b81ad6c --- /dev/null +++ b/README.md @@ -0,0 +1,882 @@ +--- +datasets: +- Muennighoff/P3 +license: bigscience-bloom-rail-1.0 +language: +- ak +- ar +- as +- bm +- bn +- ca +- code +- en +- es +- eu +- fon +- fr +- gu +- hi +- id +- ig +- ki +- kn +- lg +- ln +- ml +- mr +- ne +- nso +- ny +- or +- pa +- pt +- rn +- rw +- sn +- st +- sw +- ta +- te +- tn +- ts +- tum +- tw +- ur +- vi +- wo +- xh +- yo +- zh +- zu +programming_language: +- C +- C++ +- C# +- Go +- Java +- JavaScript +- Lua +- PHP +- Python +- Ruby +- Rust +- Scala +- TypeScript +pipeline_tag: text-generation +widget: +- text: "一个传奇的开端,一个不灭的神话,这不仅仅是一部电影,而是作为一个走进新时代的标签,永远彪炳史册。Would you rate the previous review as positive, neutral or negative?" + example_title: "zh-en sentiment" +- text: "一个传奇的开端,一个不灭的神话,这不仅仅是一部电影,而是作为一个走进新时代的标签,永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评?" + example_title: "zh-zh sentiment" +- text: "Suggest at least five related search terms to \"Mạng neural nhân tạo\"."
+ example_title: "vi-en query" +- text: "Proposez au moins cinq mots clés concernant «Réseau de neurones artificiels»." + example_title: "fr-fr query" +- text: "Explain in a sentence in Telugu what is backpropagation in neural networks." + example_title: "te-en qa" +- text: "Why is the sky blue?" + example_title: "en-en qa" +- text: "Write a fairy tale about a troll saving a princess from a dangerous dragon. The fairy tale is a masterpiece that has achieved praise worldwide and its moral is \"Heroes Come in All Shapes and Sizes\". Story (in Spanish):" + example_title: "es-en fable" +- text: "Write a fable about wood elves living in a forest that is suddenly invaded by ogres. The fable is a masterpiece that has achieved praise worldwide and its moral is \"Violence is the last refuge of the incompetent\". Fable (in Hindi):" + example_title: "hi-en fable" +model-index: +- name: bloomz-7b1-p3 + results: + - task: + type: Coreference resolution + dataset: + type: winogrande + name: Winogrande XL (xl) + config: xl + split: validation + revision: a80f460359d1e9a67c006011c94de42a8759430c + metrics: + - type: Accuracy + value: 54.06 + - task: + type: Coreference resolution + dataset: + type: Muennighoff/xwinograd + name: XWinograd (en) + config: en + split: test + revision: 9dd5ea5505fad86b7bedad667955577815300cee + metrics: + - type: Accuracy + value: 53.72 + - task: + type: Coreference resolution + dataset: + type: Muennighoff/xwinograd + name: XWinograd (fr) + config: fr + split: test + revision: 9dd5ea5505fad86b7bedad667955577815300cee + metrics: + - type: Accuracy + value: 55.42 + - task: + type: Coreference resolution + dataset: + type: Muennighoff/xwinograd + name: XWinograd (jp) + config: jp + split: test + revision: 9dd5ea5505fad86b7bedad667955577815300cee + metrics: + - type: Accuracy + value: 51.93 + - task: + type: Coreference resolution + dataset: + type: Muennighoff/xwinograd + name: XWinograd (pt) + config: pt + split: test + revision: 
9dd5ea5505fad86b7bedad667955577815300cee + metrics: + - type: Accuracy + value: 53.99 + - task: + type: Coreference resolution + dataset: + type: Muennighoff/xwinograd + name: XWinograd (ru) + config: ru + split: test + revision: 9dd5ea5505fad86b7bedad667955577815300cee + metrics: + - type: Accuracy + value: 53.97 + - task: + type: Coreference resolution + dataset: + type: Muennighoff/xwinograd + name: XWinograd (zh) + config: zh + split: test + revision: 9dd5ea5505fad86b7bedad667955577815300cee + metrics: + - type: Accuracy + value: 52.98 + - task: + type: Natural language inference + dataset: + type: anli + name: ANLI (r1) + config: r1 + split: validation + revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094 + metrics: + - type: Accuracy + value: 35.1 + - task: + type: Natural language inference + dataset: + type: anli + name: ANLI (r2) + config: r2 + split: validation + revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094 + metrics: + - type: Accuracy + value: 35.4 + - task: + type: Natural language inference + dataset: + type: anli + name: ANLI (r3) + config: r3 + split: validation + revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094 + metrics: + - type: Accuracy + value: 37.58 + - task: + type: Natural language inference + dataset: + type: super_glue + name: SuperGLUE (cb) + config: cb + split: validation + revision: 9e12063561e7e6c79099feb6d5a493142584e9e2 + metrics: + - type: Accuracy + value: 62.5 + - task: + type: Natural language inference + dataset: + type: super_glue + name: SuperGLUE (rte) + config: rte + split: validation + revision: 9e12063561e7e6c79099feb6d5a493142584e9e2 + metrics: + - type: Accuracy + value: 78.7 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (ar) + config: ar + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 50.64 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (bg) + config: bg + split: validation + 
revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 43.98 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (de) + config: de + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 47.03 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (el) + config: el + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 41.89 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (en) + config: en + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 55.9 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (es) + config: es + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 53.73 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (fr) + config: fr + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 53.37 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (hi) + config: hi + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 49.84 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (ru) + config: ru + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 46.55 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (sw) + config: sw + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 43.49 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (th) + config: th + split: validation + revision: 
a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 43.17 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (tr) + config: tr + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 40.44 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (ur) + config: ur + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 45.18 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (vi) + config: vi + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 51.97 + - task: + type: Natural language inference + dataset: + type: xnli + name: XNLI (zh) + config: zh + split: validation + revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16 + metrics: + - type: Accuracy + value: 52.29 + - task: + type: Program synthesis + dataset: + type: openai_humaneval + name: HumanEval + config: None + split: test + revision: e8dc562f5de170c54b5481011dd9f4fa04845771 + metrics: + - type: Pass@1 + value: 1.55 + - type: Pass@10 + value: 4.12 + - type: Pass@100 + value: 9.60 + - task: + type: Sentence completion + dataset: + type: story_cloze + name: StoryCloze (2016) + config: "2016" + split: validation + revision: e724c6f8cdf7c7a2fb229d862226e15b023ee4db + metrics: + - type: Accuracy + value: 87.07 + - task: + type: Sentence completion + dataset: + type: super_glue + name: SuperGLUE (copa) + config: copa + split: validation + revision: 9e12063561e7e6c79099feb6d5a493142584e9e2 + metrics: + - type: Accuracy + value: 81.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (et) + config: et + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 57.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (ht) + config: ht 
+ split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 56.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (id) + config: id + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 70.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (it) + config: it + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 60.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (qu) + config: qu + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 54.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (sw) + config: sw + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 62.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (ta) + config: ta + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 71.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (th) + config: th + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 63.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (tr) + config: tr + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 58.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (vi) + config: vi + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - type: Accuracy + value: 67.0 + - task: + type: Sentence completion + dataset: + type: xcopa + name: XCOPA (zh) + config: zh + split: validation + revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187 + metrics: + - 
type: Accuracy + value: 79.0 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (ar) + config: ar + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 78.69 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (es) + config: es + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 82.93 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (eu) + config: eu + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 70.42 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (hi) + config: hi + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 72.2 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (id) + config: id + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 77.1 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (my) + config: my + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 51.49 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (ru) + config: ru + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 66.45 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (sw) + config: sw + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 60.82 + - task: + type: Sentence completion + dataset: + type: 
Muennighoff/xstory_cloze + name: XStoryCloze (te) + config: te + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 63.14 + - task: + type: Sentence completion + dataset: + type: Muennighoff/xstory_cloze + name: XStoryCloze (zh) + config: zh + split: validation + revision: 8bb76e594b68147f1a430e86829d07189622b90d + metrics: + - type: Accuracy + value: 80.34 +--- + +![xmtf](https://github.com/bigscience-workshop/xmtf/blob/master/xmtf_banner.png?raw=true) + +# Table of Contents + +1. [Model Summary](#model-summary) +2. [Use](#use) +3. [Limitations](#limitations) +4. [Training](#training) +5. [Evaluation](#evaluation) +7. [Citation](#citation) + +# Model Summary + +> We present BLOOMZ & mT0, a family of models capable of following human instructions in dozens of languages zero-shot. We finetune BLOOM & mT5 pretrained multilingual language models on our crosslingual task mixture (xP3) and find the resulting models capable of crosslingual generalization to unseen tasks & languages. + +- **Repository:** [bigscience-workshop/xmtf](https://github.com/bigscience-workshop/xmtf) +- **Paper:** [Crosslingual Generalization through Multitask Finetuning](https://arxiv.org/abs/2211.01786) +- **Point of Contact:** [Niklas Muennighoff](mailto:niklas@hf.co) +- **Languages:** Refer to [bloom](https://huggingface.co/bigscience/bloom) for pretraining & [xP3](https://huggingface.co/datasets/bigscience/xP3) for finetuning language proportions. It understands both pretraining & finetuning languages. +- **BLOOMZ & mT0 Model Family:** + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Multitask finetuned on xP3. Recommended for prompting in English. +
Parameters300M580M1.2B3.7B13B560M1.1B1.7B3B7.1B176B
Finetuned Modelmt0-smallmt0-basemt0-largemt0-xlmt0-xxlbloomz-560mbloomz-1b1bloomz-1b7bloomz-3bbloomz-7b1bloomz
Multitask finetuned on xP3mt. Recommended for prompting in non-English.
Finetuned Modelmt0-xxl-mtbloomz-7b1-mtbloomz-mt
Multitask finetuned on P3. Released for research purposes only. Strictly inferior to above models!
Finetuned Modelmt0-xxl-p3bloomz-7b1-p3bloomz-p3
Original pretrained checkpoints. Not recommended.
Pretrained Modelmt5-smallmt5-basemt5-largemt5-xlmt5-xxlbloom-560mbloom-1b1bloom-1b7bloom-3bbloom-7b1bloom
+
+ + +# Use + +## Intended use + +We recommend using the model to perform tasks expressed in natural language. For example, given the prompt "*Translate to English: Je t’aime.*", the model will most likely answer "*I love you.*". Some prompt ideas from our paper: +- 一个传奇的开端,一个不灭的神话,这不仅仅是一部电影,而是作为一个走进新时代的标签,永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评? +- Suggest at least five related search terms to "Mạng neural nhân tạo". +- Write a fairy tale about a troll saving a princess from a dangerous dragon. The fairy tale is a masterpiece that has achieved praise worldwide and its moral is "Heroes Come in All Shapes and Sizes". Story (in Spanish): +- Explain in a sentence in Telugu what is backpropagation in neural networks. + +**Feel free to share your generations in the Community tab!** + +## How to use + +### CPU + +
+ Click to expand + +```python +# pip install -q transformers +from transformers import AutoModelForCausalLM, AutoTokenizer + +checkpoint = "bigscience/bloomz-7b1-p3" + +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +model = AutoModelForCausalLM.from_pretrained(checkpoint) + +inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt") +outputs = model.generate(inputs) +print(tokenizer.decode(outputs[0])) +``` + +
+ +### GPU + +
+ Click to expand + +```python +# pip install -q transformers accelerate +from transformers import AutoModelForCausalLM, AutoTokenizer + +checkpoint = "bigscience/bloomz-7b1-p3" + +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto") + +inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda") +outputs = model.generate(inputs) +print(tokenizer.decode(outputs[0])) +``` + +
+ +### GPU in 8bit + +
+ Click to expand + +```python +# pip install -q transformers accelerate bitsandbytes +from transformers import AutoModelForCausalLM, AutoTokenizer + +checkpoint = "bigscience/bloomz-7b1-p3" + +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", load_in_8bit=True) + +inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda") +outputs = model.generate(inputs) +print(tokenizer.decode(outputs[0])) +``` + +
+ + +### + +# Limitations + +**Prompt Engineering:** The performance may vary depending on the prompt. For BLOOMZ models, we recommend making it very clear when the input stops to avoid the model trying to continue it. For example, the prompt "*Translate to English: Je t'aime*" without the full stop (.) at the end, may result in the model trying to continue the French sentence. Better prompts are e.g. "*Translate to English: Je t'aime.*", "*Translate to English: Je t'aime. Translation:*", or "*What is "Je t'aime." in English?*", where it is clear for the model when it should answer. Further, we recommend providing the model as much context as possible. For example, if you want it to answer in Telugu, then tell the model, e.g. "*Explain in a sentence in Telugu what is backpropagation in neural networks.*". + +# Training + +## Model + +- **Architecture:** Same as [bloom-7b1](https://huggingface.co/bigscience/bloom-7b1), also refer to the `config.json` file +- **Finetuning steps:** 1000 +- **Finetuning tokens:** 4.19 billion +- **Finetuning layout:** 1x pipeline parallel, 1x tensor parallel, 64x data parallel +- **Precision:** float16 + +## Hardware + +- **CPUs:** AMD CPUs with 512GB memory per node +- **GPUs:** 64 A100 80GB GPUs with 8 GPUs per node (8 nodes) using NVLink 4 inter-gpu connects, 4 OmniPath links +- **Communication:** NCCL-communications network with a fully dedicated subnet + +## Software + +- **Orchestration:** [Megatron-DeepSpeed](https://github.com/bigscience-workshop/Megatron-DeepSpeed) +- **Optimizer & parallelism:** [DeepSpeed](https://github.com/microsoft/DeepSpeed) +- **Neural networks:** [PyTorch](https://github.com/pytorch/pytorch) (pytorch-1.11 w/ CUDA-11.5) +- **FP16 if applicable:** [apex](https://github.com/NVIDIA/apex) + +# Evaluation + +We refer to Table 7 from our [paper](https://arxiv.org/abs/2211.01786) & [bigscience/evaluation-results](https://huggingface.co/datasets/bigscience/evaluation-results) for zero-shot results on unseen tasks. 
The sidebar reports zero-shot performance of the best prompt per dataset config. + +# Citation +```bibtex +@misc{muennighoff2022crosslingual, + title={Crosslingual Generalization through Multitask Finetuning}, + author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel}, + year={2022}, + eprint={2211.01786}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..ecdf6f3 --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "BloomForCausalLM" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "bias_dropout_fusion": true, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_dropout": 0.0, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "masked_softmax_fusion": true, + "model_type": "bloom", + "n_embed": 4096, + "n_inner": null, + "n_layer": 30, + "num_attention_heads": 32, + "offset_alibi": 100, + "pad_token_id": 3, + "pretraining_tp": 4, + "seq_length": 2048, + "skip_bias_add": true, + "skip_bias_add_qkv": false, + "slow_but_exact": false, + "transformers_version": "4.21.0.dev0", + "unk_token_id": 0, + "use_cache": true, + "vocab_size": 250880 +} \ No newline at end of file diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json new file mode 100644 index 0000000..e2cc660 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.7518199867637326 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json new file mode 100644 index 0000000..3e7ca99 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.7749834546657842 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json new file mode 100644 index 0000000..2e3bfec --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.586366644606221 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..89b2123 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": 
"Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.7518199867637326 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..ad2a566 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.7438782263401721 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation 
and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json new file mode 100644 index 0000000..8e2b6a0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.7835870284579749 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json new file mode 100644 index 0000000..b21261a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.8292521508934481 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', 
dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json new file mode 100644 index 0000000..d627a57 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.6399735274652548 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..4863dda --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.7935142289874255 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..aedf670 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.7888815354070152 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json new file mode 100644 index 0000000..03a1769 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.7041694242223693 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json new file mode 100644 index 0000000..2d22896 --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.6823295830575777 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json new file mode 100644 index 0000000..f0ea4c7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.5625413633355394 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..70fca52 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.6671078755790867 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..4f3d928 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Story Continuation and Options", + "evaluation": { + 
"accuracy": 0.671740569159497 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json new file mode 100644 index 0000000..a914062 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.6915949702183984 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json new file mode 100644 index 0000000..754ae0f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.7220383851753805 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json new file mode 100644 index 0000000..137692b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.5883520847121112 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..6af3dda --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.6743878226340172 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json new file mode 100644 index 
0000000..03c24ba --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.6816677696889477 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json new file mode 100644 index 0000000..cdaeb11 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.7445400397088021 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json new file mode 100644 index 0000000..57d4ced --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.771012574454004 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json new file mode 100644 index 0000000..0c3d1f4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Generate Ending", + "evaluation": 
{ + "accuracy": 0.6029119788219722 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..3cc334b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.7485109199205824 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..1a5452a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.7438782263401721 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json new file mode 100644 index 0000000..3a32637 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.7610853739245532 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', 
max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json new file mode 100644 index 0000000..9ac4013 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.7961614824619457 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json new file mode 100644 index 0000000..714502d --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.6214427531436135 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..c6050a9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.7696889477167439 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..3ac7cba --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.7670416942422237 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json new file mode 100644 index 0000000..d901331 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "en", + "template_name": "Replace", + "evaluation": { + "accuracy": 0.5225806451612903 + }, + "arguments": 
"Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json new file mode 100644 index 0000000..e6413da --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "en", + "template_name": "True or False", + "evaluation": { + "accuracy": 0.48946236559139783 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json new file mode 100644 index 0000000..4f712db --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "en", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.5281720430107527 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json new file mode 100644 index 0000000..2807c66 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "en", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.5062365591397849 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json new file mode 100644 index 0000000..e5700e7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "en", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.5372043010752688 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json new file mode 100644 index 0000000..700aab3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + 
"template_name": "Replace", + "evaluation": { + "accuracy": 0.5060240963855421 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json new file mode 100644 index 0000000..01b16ac --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "True or False", + "evaluation": { + "accuracy": 0.5421686746987951 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json new file mode 100644 index 0000000..2d8f520 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.5542168674698795 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json new file mode 100644 index 0000000..0de7d69 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.4819277108433735 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json new file mode 100644 index 0000000..929c3e2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.5301204819277109 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json new file mode 100644 index 0000000..38c979c --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "Replace", + "evaluation": { + "accuracy": 0.5133079847908745 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json new file mode 100644 index 0000000..0d06d41 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "True or False", + "evaluation": { + "accuracy": 0.4714828897338403 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', 
tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json new file mode 100644 index 0000000..eacd9b4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.5209125475285171 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json new file mode 100644 index 0000000..01a8bc8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.5019011406844106 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json new file mode 100644 index 0000000..19ab3a4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.5399239543726235 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json new file mode 100644 index 0000000..525cbfa --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "Replace", + "evaluation": { + "accuracy": 0.5257936507936508 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json new file mode 100644 index 0000000..e9f960b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "True or False", + "evaluation": { + "accuracy": 0.5297619047619048 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', 
tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json new file mode 100644 index 0000000..b04431e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.5218253968253969 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json new file mode 100644 index 0000000..962d07e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.4444444444444444 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json new file mode 100644 index 0000000..eded0ef --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.5198412698412699 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/GPT-3_style/results.json new file mode 100644 index 0000000..fcfda62 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/GPT-3_style/results.json @@ -0,0 
+1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r1", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.351 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json new file mode 100644 index 0000000..c9050cf --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r1", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.334 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/can_we_infer/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/can_we_infer/results.json new file mode 100644 index 0000000..b5d4cf6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r1", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.351 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..261d4ea --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r1", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.288 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/justified_in_saying/results.json new file mode 100644 index 0000000..6cac7bc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r1", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.345 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/GPT-3_style/results.json new file mode 100644 index 0000000..0a3bdab --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r2", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.339 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json new file mode 100644 index 0000000..d1a41e1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r2", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.335 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/can_we_infer/results.json new file mode 100644 index 0000000..58c9782 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r2", + 
"template_name": "can we infer", + "evaluation": { + "accuracy": 0.354 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..bc78b9e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r2", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.297 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/justified_in_saying/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/justified_in_saying/results.json new file mode 100644 index 0000000..3bdc511 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r2", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.345 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/GPT-3_style/results.json new file mode 100644 index 0000000..79b144f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r3", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.37583333333333335 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, 
template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json new file mode 100644 index 0000000..120299a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r3", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.3408333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/can_we_infer/results.json new file mode 100644 index 0000000..c7830cb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r3", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.36333333333333334 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..008580c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r3", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.31083333333333335 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/justified_in_saying/results.json new file mode 100644 index 0000000..cb20fb0 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "anli", + "dataset_config_name": "dev_r3", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.34 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/merged.csv b/evaluation_bloomz-7b1-p3/evaluation_l1/merged.csv new file mode 100644 index 0000000..bada4f2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/merged.csv @@ -0,0 +1,194 @@ +dataset,prompt,metric,value +anli_dev_r1,GPT-3 style,accuracy,0.351 +anli_dev_r1,MNLI crowdsource,accuracy,0.334 +anli_dev_r1,can we infer,accuracy,0.351 +anli_dev_r1,guaranteed/possible/impossible,accuracy,0.288 +anli_dev_r1,justified in saying,accuracy,0.345 +anli_dev_r1,median,accuracy,0.345 +anli_dev_r2,GPT-3 style,accuracy,0.339 +anli_dev_r2,MNLI crowdsource,accuracy,0.335 +anli_dev_r2,can we infer,accuracy,0.354 +anli_dev_r2,guaranteed/possible/impossible,accuracy,0.297 +anli_dev_r2,justified in saying,accuracy,0.345 +anli_dev_r2,median,accuracy,0.339 +anli_dev_r3,GPT-3 style,accuracy,0.37583333333333335 +anli_dev_r3,MNLI crowdsource,accuracy,0.3408333333333333 +anli_dev_r3,can we infer,accuracy,0.36333333333333334 +anli_dev_r3,guaranteed/possible/impossible,accuracy,0.31083333333333335 +anli_dev_r3,justified in saying,accuracy,0.34 
+anli_dev_r3,median,accuracy,0.3408333333333333 +story_cloze_2016,Answer Given options,accuracy,0.8305718866916088 +story_cloze_2016,Choose Story Ending,accuracy,0.8706574024585783 +story_cloze_2016,Generate Ending,accuracy,0.7183324425440941 +story_cloze_2016,Novel Correct Ending,accuracy,0.848743987172635 +story_cloze_2016,Story Continuation and Options,accuracy,0.8466060929983966 +story_cloze_2016,median,accuracy,0.8466060929983966 +super_glue_cb,GPT-3 style,accuracy,0.625 +super_glue_cb,MNLI crowdsource,accuracy,0.08928571428571429 +super_glue_cb,can we infer,accuracy,0.5892857142857143 +super_glue_cb,guaranteed/possible/impossible,accuracy,0.5 +super_glue_cb,justified in saying,accuracy,0.5357142857142857 +super_glue_cb,median,accuracy,0.5357142857142857 +super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66 +super_glue_copa,best_option,accuracy,0.67 +super_glue_copa,cause_effect,accuracy,0.78 +super_glue_copa,i_am_hesitating,accuracy,0.8 +super_glue_copa,plausible_alternatives,accuracy,0.81 +super_glue_copa,median,accuracy,0.78 +super_glue_rte,GPT-3 style,accuracy,0.7870036101083032 +super_glue_rte,MNLI crowdsource,accuracy,0.7220216606498195 +super_glue_rte,does it follow that,accuracy,0.6678700361010831 +super_glue_rte,guaranteed true,accuracy,0.6714801444043321 +super_glue_rte,should assume,accuracy,0.6678700361010831 +super_glue_rte,median,accuracy,0.6714801444043321 +winogrande_winogrande_xl,Replace,accuracy,0.5406471981057617 +winogrande_winogrande_xl,True or False,accuracy,0.5074980268350434 +winogrande_winogrande_xl,does underscore refer to,accuracy,0.5177584846093133 +winogrande_winogrande_xl,stand for,accuracy,0.510655090765588 +winogrande_winogrande_xl,underscore refer to,accuracy,0.5256511444356748 +winogrande_winogrande_xl,median,accuracy,0.5177584846093133 +xcopa_id,"C1 or C2? 
premise, so/because…",accuracy,0.47 +xcopa_id,best_option,accuracy,0.51 +xcopa_id,cause_effect,accuracy,0.65 +xcopa_id,i_am_hesitating,accuracy,0.66 +xcopa_id,plausible_alternatives,accuracy,0.67 +xcopa_id,median,accuracy,0.65 +xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.58 +xcopa_sw,best_option,accuracy,0.57 +xcopa_sw,cause_effect,accuracy,0.46 +xcopa_sw,i_am_hesitating,accuracy,0.48 +xcopa_sw,plausible_alternatives,accuracy,0.45 +xcopa_sw,median,accuracy,0.48 +xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57 +xcopa_ta,best_option,accuracy,0.67 +xcopa_ta,cause_effect,accuracy,0.71 +xcopa_ta,i_am_hesitating,accuracy,0.71 +xcopa_ta,plausible_alternatives,accuracy,0.69 +xcopa_ta,median,accuracy,0.69 +xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55 +xcopa_vi,best_option,accuracy,0.61 +xcopa_vi,cause_effect,accuracy,0.67 +xcopa_vi,i_am_hesitating,accuracy,0.66 +xcopa_vi,plausible_alternatives,accuracy,0.65 +xcopa_vi,median,accuracy,0.65 +xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.62 +xcopa_zh,best_option,accuracy,0.61 +xcopa_zh,cause_effect,accuracy,0.77 +xcopa_zh,i_am_hesitating,accuracy,0.72 +xcopa_zh,plausible_alternatives,accuracy,0.74 +xcopa_zh,median,accuracy,0.72 +xnli_ar,GPT-3 style,accuracy,0.5040160642570282 +xnli_ar,MNLI crowdsource,accuracy,0.39879518072289155 +xnli_ar,can we infer,accuracy,0.506425702811245 +xnli_ar,guaranteed/possible/impossible,accuracy,0.4799196787148594 +xnli_ar,justified in saying,accuracy,0.41526104417670684 +xnli_ar,median,accuracy,0.4799196787148594 +xnli_en,GPT-3 style,accuracy,0.5590361445783133 +xnli_en,MNLI crowdsource,accuracy,0.342570281124498 +xnli_en,can we infer,accuracy,0.5449799196787148 +xnli_en,guaranteed/possible/impossible,accuracy,0.41164658634538154 +xnli_en,justified in saying,accuracy,0.4634538152610442 +xnli_en,median,accuracy,0.4634538152610442 +xnli_es,GPT-3 style,accuracy,0.5373493975903615 +xnli_es,MNLI crowdsource,accuracy,0.40441767068273093 +xnli_es,can we 
infer,accuracy,0.5277108433734939 +xnli_es,guaranteed/possible/impossible,accuracy,0.44216867469879517 +xnli_es,justified in saying,accuracy,0.4534136546184739 +xnli_es,median,accuracy,0.4534136546184739 +xnli_fr,GPT-3 style,accuracy,0.5248995983935743 +xnli_fr,MNLI crowdsource,accuracy,0.3895582329317269 +xnli_fr,can we infer,accuracy,0.5337349397590362 +xnli_fr,guaranteed/possible/impossible,accuracy,0.42971887550200805 +xnli_fr,justified in saying,accuracy,0.4738955823293173 +xnli_fr,median,accuracy,0.4738955823293173 +xnli_hi,GPT-3 style,accuracy,0.4983935742971888 +xnli_hi,MNLI crowdsource,accuracy,0.38714859437751004 +xnli_hi,can we infer,accuracy,0.45542168674698796 +xnli_hi,guaranteed/possible/impossible,accuracy,0.41405622489959837 +xnli_hi,justified in saying,accuracy,0.38795180722891565 +xnli_hi,median,accuracy,0.41405622489959837 +xnli_sw,GPT-3 style,accuracy,0.43493975903614457 +xnli_sw,MNLI crowdsource,accuracy,0.363855421686747 +xnli_sw,can we infer,accuracy,0.42891566265060244 +xnli_sw,guaranteed/possible/impossible,accuracy,0.3457831325301205 +xnli_sw,justified in saying,accuracy,0.3650602409638554 +xnli_sw,median,accuracy,0.3650602409638554 +xnli_ur,GPT-3 style,accuracy,0.43493975903614457 +xnli_ur,MNLI crowdsource,accuracy,0.3895582329317269 +xnli_ur,can we infer,accuracy,0.45180722891566266 +xnli_ur,guaranteed/possible/impossible,accuracy,0.40120481927710844 +xnli_ur,justified in saying,accuracy,0.37630522088353413 +xnli_ur,median,accuracy,0.40120481927710844 +xnli_vi,GPT-3 style,accuracy,0.5196787148594377 +xnli_vi,MNLI crowdsource,accuracy,0.38112449799196785 +xnli_vi,can we infer,accuracy,0.5080321285140562 +xnli_vi,guaranteed/possible/impossible,accuracy,0.38393574297188754 +xnli_vi,justified in saying,accuracy,0.43614457831325304 +xnli_vi,median,accuracy,0.43614457831325304 +xnli_zh,GPT-3 style,accuracy,0.5052208835341365 +xnli_zh,MNLI crowdsource,accuracy,0.4 +xnli_zh,can we infer,accuracy,0.5228915662650603 
+xnli_zh,guaranteed/possible/impossible,accuracy,0.4738955823293173 +xnli_zh,justified in saying,accuracy,0.45863453815261046 +xnli_zh,median,accuracy,0.4738955823293173 +xstory_cloze_ar,Answer Given options,accuracy,0.7518199867637326 +xstory_cloze_ar,Choose Story Ending,accuracy,0.7749834546657842 +xstory_cloze_ar,Generate Ending,accuracy,0.586366644606221 +xstory_cloze_ar,Novel Correct Ending,accuracy,0.7518199867637326 +xstory_cloze_ar,Story Continuation and Options,accuracy,0.7438782263401721 +xstory_cloze_ar,median,accuracy,0.7518199867637326 +xstory_cloze_es,Answer Given options,accuracy,0.7835870284579749 +xstory_cloze_es,Choose Story Ending,accuracy,0.8292521508934481 +xstory_cloze_es,Generate Ending,accuracy,0.6399735274652548 +xstory_cloze_es,Novel Correct Ending,accuracy,0.7935142289874255 +xstory_cloze_es,Story Continuation and Options,accuracy,0.7888815354070152 +xstory_cloze_es,median,accuracy,0.7888815354070152 +xstory_cloze_eu,Answer Given options,accuracy,0.7041694242223693 +xstory_cloze_eu,Choose Story Ending,accuracy,0.6823295830575777 +xstory_cloze_eu,Generate Ending,accuracy,0.5625413633355394 +xstory_cloze_eu,Novel Correct Ending,accuracy,0.6671078755790867 +xstory_cloze_eu,Story Continuation and Options,accuracy,0.671740569159497 +xstory_cloze_eu,median,accuracy,0.671740569159497 +xstory_cloze_hi,Answer Given options,accuracy,0.6915949702183984 +xstory_cloze_hi,Choose Story Ending,accuracy,0.7220383851753805 +xstory_cloze_hi,Generate Ending,accuracy,0.5883520847121112 +xstory_cloze_hi,Novel Correct Ending,accuracy,0.6743878226340172 +xstory_cloze_hi,Story Continuation and Options,accuracy,0.6816677696889477 +xstory_cloze_hi,median,accuracy,0.6816677696889477 +xstory_cloze_id,Answer Given options,accuracy,0.7445400397088021 +xstory_cloze_id,Choose Story Ending,accuracy,0.771012574454004 +xstory_cloze_id,Generate Ending,accuracy,0.6029119788219722 +xstory_cloze_id,Novel Correct Ending,accuracy,0.7485109199205824 +xstory_cloze_id,Story 
Continuation and Options,accuracy,0.7438782263401721 +xstory_cloze_id,median,accuracy,0.7445400397088021 +xstory_cloze_zh,Answer Given options,accuracy,0.7610853739245532 +xstory_cloze_zh,Choose Story Ending,accuracy,0.7961614824619457 +xstory_cloze_zh,Generate Ending,accuracy,0.6214427531436135 +xstory_cloze_zh,Novel Correct Ending,accuracy,0.7696889477167439 +xstory_cloze_zh,Story Continuation and Options,accuracy,0.7670416942422237 +xstory_cloze_zh,median,accuracy,0.7670416942422237 +xwinograd_en,Replace,accuracy,0.5225806451612903 +xwinograd_en,True or False,accuracy,0.48946236559139783 +xwinograd_en,does underscore refer to,accuracy,0.5281720430107527 +xwinograd_en,stand for,accuracy,0.5062365591397849 +xwinograd_en,underscore refer to,accuracy,0.5372043010752688 +xwinograd_en,median,accuracy,0.5225806451612903 +xwinograd_fr,Replace,accuracy,0.5060240963855421 +xwinograd_fr,True or False,accuracy,0.5421686746987951 +xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795 +xwinograd_fr,stand for,accuracy,0.4819277108433735 +xwinograd_fr,underscore refer to,accuracy,0.5301204819277109 +xwinograd_fr,median,accuracy,0.5301204819277109 +xwinograd_pt,Replace,accuracy,0.5133079847908745 +xwinograd_pt,True or False,accuracy,0.4714828897338403 +xwinograd_pt,does underscore refer to,accuracy,0.5209125475285171 +xwinograd_pt,stand for,accuracy,0.5019011406844106 +xwinograd_pt,underscore refer to,accuracy,0.5399239543726235 +xwinograd_pt,median,accuracy,0.5133079847908745 +xwinograd_zh,Replace,accuracy,0.5257936507936508 +xwinograd_zh,True or False,accuracy,0.5297619047619048 +xwinograd_zh,does underscore refer to,accuracy,0.5218253968253969 +xwinograd_zh,stand for,accuracy,0.4444444444444444 +xwinograd_zh,underscore refer to,accuracy,0.5198412698412699 +xwinograd_zh,median,accuracy,0.5218253968253969 +multiple,average,multiple,0.5631550819200618 diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/merged.json b/evaluation_bloomz-7b1-p3/evaluation_l1/merged.json 
new file mode 100644 index 0000000..73592fb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/merged.json @@ -0,0 +1 @@ +{"Muennighoff/xstory_cloze_ar": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7518199867637326}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7749834546657842}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.586366644606221}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7518199867637326}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', 
template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7438782263401721}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7835870284579749}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8292521508934481}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, 
dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6399735274652548}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7935142289874255}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7888815354070152}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7041694242223693}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6823295830575777}, 
"template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5625413633355394}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6671078755790867}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.671740569159497}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6915949702183984}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7220383851753805}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5883520847121112}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6743878226340172}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6816677696889477}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7445400397088021}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.771012574454004}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6029119788219722}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7485109199205824}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, 
dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7438782263401721}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7610853739245532}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7961614824619457}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6214427531436135}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": 
"Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7696889477167439}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7670416942422237}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xwinograd_en": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5225806451612903}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.48946236559139783}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5281720430107527}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5062365591397849}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5372043010752688}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_fr": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5060240963855421}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5421686746987951}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5542168674698795}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4819277108433735}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5301204819277109}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_pt": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5133079847908745}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4714828897338403}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5209125475285171}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5019011406844106}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5399239543726235}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_zh": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5257936507936508}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5297619047619048}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5218253968253969}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4444444444444444}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5198412698412699}, "template_name": "underscore refer to"}}, "anli_dev_r1": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.351}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.334}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.351}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": 
"dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.288}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.345}, "template_name": "justified in saying"}}, "anli_dev_r2": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.339}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.335}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.354}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.297}, "template_name": "guaranteed/possible/impossible"}, 
"justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.345}, "template_name": "justified in saying"}}, "anli_dev_r3": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.37583333333333335}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.3408333333333333}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.36333333333333334}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.31083333333333335}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', 
max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.34}, "template_name": "justified in saying"}}, "story_cloze_2016": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8305718866916088}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story 
Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8706574024585783}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.7183324425440941}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.848743987172635}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8466060929983966}, "template_name": "Story Continuation and Options"}}, "super_glue_cb": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.625}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.08928571428571429}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.5892857142857143}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.5}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.5357142857142857}, "template_name": "justified in saying"}}, "super_glue_copa": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.66}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.67}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.78}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.81}, "template_name": "plausible_alternatives"}}, "super_glue_rte": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7870036101083032}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7220216606498195}, "template_name": "MNLI crowdsource"}, "does it follow that": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.6678700361010831}, "template_name": "does it follow that"}, "guaranteed true": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.6714801444043321}, "template_name": "guaranteed true"}, "should assume": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.6678700361010831}, "template_name": "should assume"}}, "winogrande_winogrande_xl": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5406471981057617}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5074980268350434}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5177584846093133}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.510655090765588}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5256511444356748}, "template_name": "underscore refer to"}}, "xcopa_id": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.47}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.51}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.65}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "plausible_alternatives"}}, "xcopa_sw": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.58}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.46}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.48}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.45}, "template_name": "plausible_alternatives"}}, "xcopa_ta": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.71}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.71}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.69}, "template_name": "plausible_alternatives"}}, "xcopa_vi": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.55}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.65}, "template_name": "plausible_alternatives"}}, "xcopa_zh": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.74}, "template_name": "plausible_alternatives"}}, "xnli_ar": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5040160642570282}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39879518072289155}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.506425702811245}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4799196787148594}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41526104417670684}, "template_name": "justified in saying"}}, "xnli_en": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5590361445783133}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.342570281124498}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5449799196787148}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41164658634538154}, "template_name": 
"guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4634538152610442}, "template_name": "justified in saying"}}, "xnli_es": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5373493975903615}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40441767068273093}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5277108433734939}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44216867469879517}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', 
debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4534136546184739}, "template_name": "justified in saying"}}, "xnli_fr": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5248995983935743}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3895582329317269}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5337349397590362}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42971887550200805}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4738955823293173}, "template_name": "justified in saying"}}, "xnli_hi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4983935742971888}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38714859437751004}, "template_name": "MNLI 
crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45542168674698796}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41405622489959837}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38795180722891565}, "template_name": "justified in saying"}}, "xnli_sw": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43493975903614457}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.363855421686747}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', 
max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42891566265060244}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3457831325301205}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3650602409638554}, "template_name": "justified in saying"}}, "xnli_ur": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43493975903614457}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3895582329317269}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45180722891566266}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40120481927710844}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 
0.37630522088353413}, "template_name": "justified in saying"}}, "xnli_vi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5196787148594377}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38112449799196785}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5080321285140562}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38393574297188754}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43614457831325304}, "template_name": "justified in saying"}}, "xnli_zh": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', 
dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5052208835341365}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5228915662650603}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4738955823293173}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45863453815261046}, "template_name": "justified in saying"}}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json new file mode 100644 index 0000000..4ebb419 
--- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "story_cloze", + "dataset_config_name": "2016", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.8305718866916088 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json new file mode 100644 index 0000000..239df22 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "story_cloze", + "dataset_config_name": "2016", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.8706574024585783 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, 
template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Generate_Ending/results.json new file mode 100644 index 0000000..a517dac --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "story_cloze", + "dataset_config_name": "2016", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.7183324425440941 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..b4a064d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "story_cloze", + "dataset_config_name": "2016", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.848743987172635 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..b7e2e17 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "story_cloze", + "dataset_config_name": "2016", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.8466060929983966 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/GPT-3_style/results.json new file mode 100644 index 0000000..e368bc1 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "cb", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.625 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json new file mode 100644 index 0000000..0909b3f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "cb", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.08928571428571429 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at 
end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/can_we_infer/results.json new file mode 100644 index 0000000..5880cfe --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "cb", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.5892857142857143 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..7c32a08 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "cb", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.5 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/justified_in_saying/results.json new file mode 100644 index 0000000..696c541 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "cb", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.5357142857142857 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise,_so_because…/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise,_so_because…/results.json new file mode 100644 index 0000000..148f561 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise,_so_because…/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": 
"copa", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.66 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise, so/because\u2026', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..15549fc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "copa", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.66 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? 
premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/best_option/results.json new file mode 100644 index 0000000..5242428 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "copa", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.67 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/cause_effect/results.json new file mode 100644 index 0000000..7192fff --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "copa", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.78 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/i_am_hesitating/results.json new file mode 100644 index 0000000..d0e3a19 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "copa", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.8 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/plausible_alternatives/results.json new file mode 100644 index 0000000..cc2dfef --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "copa", + "template_name": "plausible_alternatives", + "evaluation": { 
+ "accuracy": 0.81 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/GPT-3_style/results.json new file mode 100644 index 0000000..095cdd7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "rte", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.7870036101083032 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json new file mode 100644 index 
0000000..75e2990 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "rte", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.7220216606498195 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/does_it_follow_that/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/does_it_follow_that/results.json new file mode 100644 index 0000000..e068c68 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/does_it_follow_that/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "rte", + "template_name": "does it follow that", + "evaluation": { + "accuracy": 0.6678700361010831 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does 
it follow that', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/guaranteed_true/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/guaranteed_true/results.json new file mode 100644 index 0000000..4ef34fb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/guaranteed_true/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "rte", + "template_name": "guaranteed true", + "evaluation": { + "accuracy": 0.6714801444043321 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/should_assume/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/should_assume/results.json new file mode 100644 index 0000000..de77a06 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/should_assume/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "super_glue", + "dataset_config_name": "rte", + "template_name": "should assume", + "evaluation": { + "accuracy": 0.6678700361010831 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/Replace/results.json new file mode 100644 index 0000000..433c3d4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/Replace/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "winogrande", + "dataset_config_name": "winogrande_xl", + "template_name": "Replace", + "evaluation": { + "accuracy": 0.5406471981057617 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json new file mode 100644 index 0000000..32959bc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "winogrande", + "dataset_config_name": "winogrande_xl", + "template_name": "True or 
False", + "evaluation": { + "accuracy": 0.5074980268350434 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json new file mode 100644 index 0000000..3a6ef32 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "winogrande", + "dataset_config_name": "winogrande_xl", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.5177584846093133 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json new file mode 100644 index 0000000..63c12a3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "winogrande", + "dataset_config_name": "winogrande_xl", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.510655090765588 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json new file mode 100644 index 0000000..6786c5e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "winogrande", + "dataset_config_name": "winogrande_xl", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.5256511444356748 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..9208a33 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.47 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? 
premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/best_option/results.json new file mode 100644 index 0000000..382cd4d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.51 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/cause_effect/results.json new file mode 100644 index 0000000..46602d5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.65 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/i_am_hesitating/results.json new file mode 100644 index 0000000..277f463 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.66 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/plausible_alternatives/results.json new file mode 100644 index 0000000..6b417c2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.67 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..714d204 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.58 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? 
premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/best_option/results.json new file mode 100644 index 0000000..e60d66f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.57 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/cause_effect/results.json new file mode 100644 index 0000000..249cf33 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.46 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/i_am_hesitating/results.json new file mode 100644 index 0000000..2376018 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.48 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/plausible_alternatives/results.json new file mode 100644 index 0000000..580d208 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/sw/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.45 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..5141a84 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.57 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? 
premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/best_option/results.json new file mode 100644 index 0000000..4fa6c37 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.67 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/cause_effect/results.json new file mode 100644 index 0000000..3ebf18e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.71 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/i_am_hesitating/results.json new file mode 100644 index 0000000..b9b65f8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.71 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/plausible_alternatives/results.json new file mode 100644 index 0000000..e6f2cd5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/ta/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.69 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..753aa77 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? 
premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/best_option/results.json new file mode 100644 index 0000000..05e9431 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.61 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/cause_effect/results.json new file mode 100644 index 0000000..27cde31 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.67 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/i_am_hesitating/results.json new file mode 100644 index 0000000..8f9775f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.66 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/plausible_alternatives/results.json new file mode 100644 index 0000000..7e101b9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/vi/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.65 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise,_so_because…/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise,_so_because…/results.json new file mode 100644 index 0000000..43b4984 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise,_so_because…/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.62 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? 
premise, so/because\u2026', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..d3344b0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "C1 or C2? premise, so/because\u2026", + "evaluation": { + "accuracy": 0.62 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? 
premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/best_option/results.json new file mode 100644 index 0000000..3a559a8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.61 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/cause_effect/results.json new file mode 100644 index 0000000..e0fba68 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.77 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/i_am_hesitating/results.json new file mode 100644 index 0000000..4314536 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.72 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/plausible_alternatives/results.json new file mode 100644 index 0000000..144b85d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/zh/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.74 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/GPT-3_style/results.json new file mode 100644 index 0000000..601c995 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.5040160642570282 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json new file mode 100644 index 0000000..668d0c1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "MNLI 
crowdsource", + "evaluation": { + "accuracy": 0.39879518072289155 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/can_we_infer/results.json new file mode 100644 index 0000000..f78fef7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.506425702811245 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json new file mode 
100644 index 0000000..510018d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.4799196787148594 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/justified_in_saying/results.json new file mode 100644 index 0000000..b6a33bc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ar/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.41526104417670684 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', 
template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/GPT-3_style/results.json new file mode 100644 index 0000000..43ac7fc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "en", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.5590361445783133 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/MNLI_crowdsource/results.json new file mode 100644 index 0000000..fda72d5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "en", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.342570281124498 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/can_we_infer/results.json new file mode 100644 index 0000000..9d865db --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "en", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.5449799196787148 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..74a991d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "en", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.41164658634538154 + 
}, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/justified_in_saying/results.json new file mode 100644 index 0000000..d7082a0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/en/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "en", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.4634538152610442 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/GPT-3_style/results.json new file mode 100644 index 0000000..ad7fb0b --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.5373493975903615 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/MNLI_crowdsource/results.json new file mode 100644 index 0000000..5a501d5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.40441767068273093 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/can_we_infer/results.json new file mode 100644 index 0000000..a7314e1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.5277108433734939 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..d18bdad --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.44216867469879517 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/justified_in_saying/results.json new file mode 100644 index 0000000..d3422e6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/es/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.4534136546184739 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/GPT-3_style/results.json new file mode 100644 index 0000000..a857973 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.5248995983935743 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json new file mode 100644 index 0000000..d52331c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.3895582329317269 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/can_we_infer/results.json new file mode 100644 index 0000000..dd2b3eb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "can we 
infer", + "evaluation": { + "accuracy": 0.5337349397590362 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..468e885 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.42971887550200805 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/justified_in_saying/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/justified_in_saying/results.json new file mode 100644 index 0000000..630d89c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/fr/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.4738955823293173 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/GPT-3_style/results.json new file mode 100644 index 0000000..026f085 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.4983935742971888 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json new file mode 100644 index 0000000..7170a7b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.38714859437751004 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/can_we_infer/results.json new file mode 100644 index 0000000..4d29da4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.45542168674698796 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..60b7867 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.41405622489959837 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/justified_in_saying/results.json new file mode 100644 index 0000000..24544d5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/hi/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "justified in saying", + 
"evaluation": { + "accuracy": 0.38795180722891565 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/GPT-3_style/results.json new file mode 100644 index 0000000..704a54f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.43493975903614457 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json new file mode 100644 index 0000000..1aaf83c --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.363855421686747 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/can_we_infer/results.json new file mode 100644 index 0000000..063bcc3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.42891566265060244 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..a66ca54 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.3457831325301205 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/justified_in_saying/results.json new file mode 100644 index 0000000..65f1c06 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/sw/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.3650602409638554 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/GPT-3_style/results.json new file mode 100644 index 0000000..1997695 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.43493975903614457 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json new file mode 100644 index 0000000..c44a34a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.3895582329317269 + }, + "arguments": "Namespace(config_name=None, 
dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/can_we_infer/results.json new file mode 100644 index 0000000..4506835 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.45180722891566266 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..b48d0d1 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.40120481927710844 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/justified_in_saying/results.json new file mode 100644 index 0000000..80cac46 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/ur/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.37630522088353413 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/GPT-3_style/results.json new file mode 100644 index 0000000..3fe5669 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.5196787148594377 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json new file mode 100644 index 0000000..5aac5e7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.38112449799196785 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/can_we_infer/results.json new file mode 100644 index 0000000..da6257d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.5080321285140562 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..39b0bda --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.38393574297188754 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, 
dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/justified_in_saying/results.json new file mode 100644 index 0000000..4936fb1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/vi/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.43614457831325304 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/GPT-3_style/results.json new file mode 100644 index 0000000..21dfb29 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": 
"xnli", + "dataset_config_name": "zh", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.5052208835341365 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json new file mode 100644 index 0000000..0f1aaf7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.4 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/can_we_infer/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/can_we_infer/results.json new file mode 100644 index 0000000..29fd58d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.5228915662650603 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..639dd78 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.4738955823293173 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/justified_in_saying/results.json new file mode 100644 index 0000000..51d5846 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xnli/zh/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.45863453815261046 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Answer_Given_options/results.json new file mode 100644 index 0000000..2fa1366 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "my", + "template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.5056254136333554 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='my', 
dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Choose_Story_Ending/results.json new file mode 100644 index 0000000..ed1a670 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "my", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.5069490403706155 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='my', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Generate_Ending/results.json 
new file mode 100644 index 0000000..d599c8d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "my", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.4784910655195235 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='my', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..bfefdb1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "my", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.5102581072137657 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='my', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..08300d7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/my/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "my", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.5062872270019855 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='my', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Answer_Given_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Answer_Given_options/results.json new file mode 100644 index 0000000..b2f5de0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Answer_Given_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ru", + 
"template_name": "Answer Given options", + "evaluation": { + "accuracy": 0.6406353408338848 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Choose_Story_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Choose_Story_Ending/results.json new file mode 100644 index 0000000..aed3c1e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Choose_Story_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ru", + "template_name": "Choose Story Ending", + "evaluation": { + "accuracy": 0.6644606221045665 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Generate_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Generate_Ending/results.json new file mode 100644 index 0000000..ced50bd --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Generate_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ru", + "template_name": "Generate Ending", + "evaluation": { + "accuracy": 0.514890800794176 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Novel_Correct_Ending/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Novel_Correct_Ending/results.json new file mode 100644 index 0000000..d7d783c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Novel_Correct_Ending/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ru", + "template_name": "Novel Correct Ending", + "evaluation": { + "accuracy": 0.6393117140966248 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Story_Continuation_and_Options/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Story_Continuation_and_Options/results.json new file mode 100644 index 0000000..f011df8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xstory_cloze/ru/Story_Continuation_and_Options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ru", + "template_name": "Story Continuation and Options", + "evaluation": { + "accuracy": 0.6545334215751158 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/Replace/results.json new file mode 100644 index 0000000..49325a6 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/Replace/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "jp", + "template_name": "Replace", + "evaluation": { + "accuracy": 0.5130344108446299 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='jp', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/True_or_False/results.json new file mode 100644 index 0000000..392a934 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "jp", + "template_name": "True or False", + "evaluation": { + "accuracy": 0.5036496350364964 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='jp', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/does_underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/does_underscore_refer_to/results.json new file mode 100644 index 0000000..d87d24f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "jp", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.5192909280500522 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='jp', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/stand_for/results.json new file mode 100644 index 0000000..e5a907e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "jp", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.49635036496350365 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='jp', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/underscore_refer_to/results.json new file mode 100644 index 0000000..9ab4f10 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/jp/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "jp", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.4994786235662148 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='jp', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/Replace/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/Replace/results.json new file mode 100644 index 0000000..a681d4b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/Replace/results.json 
@@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "ru", + "template_name": "Replace", + "evaluation": { + "accuracy": 0.49206349206349204 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/True_or_False/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/True_or_False/results.json new file mode 100644 index 0000000..ef9b2a3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/True_or_False/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "ru", + "template_name": "True or False", + "evaluation": { + "accuracy": 0.4793650793650794 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/does_underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/does_underscore_refer_to/results.json new file mode 100644 index 0000000..0cfffb5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/does_underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "ru", + "template_name": "does underscore refer to", + "evaluation": { + "accuracy": 0.4857142857142857 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/stand_for/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/stand_for/results.json new file mode 100644 index 0000000..9dcf656 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/stand_for/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "ru", + "template_name": "stand for", + "evaluation": { + "accuracy": 0.4888888888888889 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', 
nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/underscore_refer_to/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/underscore_refer_to/results.json new file mode 100644 index 0000000..758a2e5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/Muennighoff_xwinograd/ru/underscore_refer_to/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "ru", + "template_name": "underscore refer to", + "evaluation": { + "accuracy": 0.4857142857142857 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..743e974 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "et", + "template_name": "C1 or C2? 
premise, so/because\u2026", + "evaluation": { + "accuracy": 0.47 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='et', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/best_option/results.json new file mode 100644 index 0000000..50ee24d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "et", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='et', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/cause_effect/results.json new file mode 100644 index 0000000..88e21e3 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "et", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.49 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='et', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/i_am_hesitating/results.json new file mode 100644 index 0000000..66ad24d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "et", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.57 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='et', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/plausible_alternatives/results.json new file mode 100644 index 0000000..d7a95b8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/et/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "et", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='et', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..90ff583 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ht", + "template_name": "C1 or C2? 
premise, so/because\u2026", + "evaluation": { + "accuracy": 0.51 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ht', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/best_option/results.json new file mode 100644 index 0000000..60d6cf6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ht", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.47 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ht', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/cause_effect/results.json new file mode 100644 index 0000000..97837e8 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ht", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ht', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/i_am_hesitating/results.json new file mode 100644 index 0000000..2567e74 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ht", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.51 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ht', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/plausible_alternatives/results.json new file mode 100644 index 0000000..a729197 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/ht/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ht", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ht', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..d2850ee --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "it", + "template_name": "C1 or C2? 
premise, so/because\u2026", + "evaluation": { + "accuracy": 0.57 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='it', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/best_option/results.json new file mode 100644 index 0000000..07549f5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "it", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='it', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/cause_effect/results.json new file mode 100644 index 0000000..396c033 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "it", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.54 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='it', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/i_am_hesitating/results.json new file mode 100644 index 0000000..46df899 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "it", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.57 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='it', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/plausible_alternatives/results.json new file mode 100644 index 0000000..c0e5ea4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/it/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "it", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.6 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='it', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..a51e6d2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "qu", + "template_name": "C1 or C2? 
premise, so/because\u2026", + "evaluation": { + "accuracy": 0.47 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='qu', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/best_option/results.json new file mode 100644 index 0000000..d1ef876 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "qu", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='qu', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/cause_effect/results.json new file mode 100644 index 0000000..28d4308 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "qu", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.5 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='qu', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/i_am_hesitating/results.json new file mode 100644 index 0000000..6c6ea45 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "qu", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.48 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='qu', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/plausible_alternatives/results.json new file mode 100644 index 0000000..8cdca1c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/qu/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "qu", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.54 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='qu', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/C1_or_C2?_premise/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/C1_or_C2?_premise/results.json new file mode 100644 index 0000000..7451e86 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/C1_or_C2?_premise/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "tr", + "template_name": "C1 or C2? 
premise, so/because\u2026", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/best_option/results.json new file mode 100644 index 0000000..fe7ad7f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/best_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "tr", + "template_name": "best_option", + "evaluation": { + "accuracy": 0.48 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/cause_effect/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/cause_effect/results.json new file mode 100644 index 0000000..1284231 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/cause_effect/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "tr", + "template_name": "cause_effect", + "evaluation": { + "accuracy": 0.53 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/i_am_hesitating/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/i_am_hesitating/results.json new file mode 100644 index 0000000..69e4de4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/i_am_hesitating/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "tr", + "template_name": "i_am_hesitating", + "evaluation": { + "accuracy": 0.54 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/plausible_alternatives/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/plausible_alternatives/results.json new file mode 100644 index 0000000..dce2013 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xcopa/tr/plausible_alternatives/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "tr", + "template_name": "plausible_alternatives", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/GPT-3_style/results.json new file mode 100644 index 0000000..3dcfb82 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "bg", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.43775100401606426 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='bg', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, 
prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/MNLI_crowdsource/results.json new file mode 100644 index 0000000..a662fc6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "bg", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.36666666666666664 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='bg', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/can_we_infer/results.json new file mode 100644 index 0000000..27aadc3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "bg", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.4397590361445783 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='bg', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..a8966b9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "bg", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.41646586345381525 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='bg', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/justified_in_saying/results.json new file mode 100644 index 0000000..756dbc8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/bg/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + 
"dataset_name": "xnli", + "dataset_config_name": "bg", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.4108433734939759 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='bg', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/GPT-3_style/results.json new file mode 100644 index 0000000..76cbb91 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "de", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.470281124497992 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='de', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/MNLI_crowdsource/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/MNLI_crowdsource/results.json new file mode 100644 index 0000000..1accdf1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "de", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.3586345381526104 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='de', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/can_we_infer/results.json new file mode 100644 index 0000000..ef2e3c9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "de", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.44016064257028115 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='de', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', 
template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..50b78cb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "de", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.3538152610441767 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='de', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/justified_in_saying/results.json new file mode 100644 index 0000000..55a8940 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/de/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "de", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.41847389558232934 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='de', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/GPT-3_style/results.json new file mode 100644 index 0000000..ac4efe3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "el", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.41887550200803214 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='el', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/MNLI_crowdsource/results.json new file mode 100644 index 0000000..748a428 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "el", + "template_name": "MNLI crowdsource", + 
"evaluation": { + "accuracy": 0.3598393574297189 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='el', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/can_we_infer/results.json new file mode 100644 index 0000000..2e3d597 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "el", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.4108433734939759 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='el', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..7293712 --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "el", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.40682730923694777 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='el', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/justified_in_saying/results.json new file mode 100644 index 0000000..f1887c4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/el/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "el", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.3823293172690763 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='el', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/GPT-3_style/results.json new file mode 100644 index 0000000..e33ac69 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ru", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.46546184738955826 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/MNLI_crowdsource/results.json new file mode 100644 index 0000000..6f3d789 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ru", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.3819277108433735 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/can_we_infer/results.json new file mode 100644 index 0000000..b5d4131 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ru", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.4614457831325301 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..8d1aa37 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ru", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.42208835341365464 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='xnli', debug=False, dtype='float16', 
max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/justified_in_saying/results.json new file mode 100644 index 0000000..7266f29 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/ru/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ru", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.4389558232931727 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ru', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/GPT-3_style/results.json new file mode 100644 index 0000000..4ec3de9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "th", 
+ "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.41646586345381525 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='th', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/MNLI_crowdsource/results.json new file mode 100644 index 0000000..30f0cf4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "th", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.3224899598393574 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='th', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/can_we_infer/results.json new file mode 100644 index 
0000000..a5f8041 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "th", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.43172690763052207 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='th', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..983ee3c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "th", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.42730923694779116 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='th', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', 
tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/justified_in_saying/results.json new file mode 100644 index 0000000..dde1f52 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/th/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "th", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.40401606425702813 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='th', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/GPT-3_style/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/GPT-3_style/results.json new file mode 100644 index 0000000..72afbab --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/GPT-3_style/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "tr", + "template_name": "GPT-3 style", + "evaluation": { + "accuracy": 0.40240963855421685 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/MNLI_crowdsource/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/MNLI_crowdsource/results.json new file mode 100644 index 0000000..6ccab90 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/MNLI_crowdsource/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "tr", + "template_name": "MNLI crowdsource", + "evaluation": { + "accuracy": 0.351004016064257 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/can_we_infer/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/can_we_infer/results.json new file mode 100644 index 0000000..3e0b433 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/can_we_infer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "tr", + "template_name": "can we infer", + "evaluation": { + "accuracy": 0.40441767068273093 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', 
dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/guaranteed_possible_impossible/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/guaranteed_possible_impossible/results.json new file mode 100644 index 0000000..92190e9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/guaranteed_possible_impossible/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "tr", + "template_name": "guaranteed/possible/impossible", + "evaluation": { + "accuracy": 0.3678714859437751 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/justified_in_saying/results.json b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/justified_in_saying/results.json new file mode 100644 index 0000000..2567b54 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_l2/xnli/tr/justified_in_saying/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "tr", + "template_name": "justified in saying", + "evaluation": { + "accuracy": 0.38313253012048193 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='tr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3', nospace=False, output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/bloomz-7b1-p3/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json new file mode 100644 index 0000000..2b9597c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json @@ -0,0 +1 @@ +{"results": [{"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "bleu": 2.6830705121606706, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "bleu_stderr": 0.14257713719805254}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rouge1_precision": 0.19994210865731296, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026547454621461738}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rouge1_recall": 0.22882499765155356, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030837265632016487}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rouge1_fmeasure": 0.19665942356583802, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00245186970283176}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rouge2_precision": 0.05189155779128239, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge2_precision_stderr": 0.0016867882237885771}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rouge2_recall": 0.06074745104675877, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018807564961523813}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rouge2_fmeasure": 0.0515077211753521, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015699354248315028}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rougeL_precision": 0.15286714092332523, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021813478665272707}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rougeL_recall": 0.17277746632777954, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rougeL_recall_stderr": 0.002444939812221139}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rougeL_fmeasure": 0.14897381009906005, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019744371501868186}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rougeLsum_precision": 0.15829244217518917, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002241835408305656}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rougeLsum_recall": 0.17998021570910885, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002565278973604084}, {"task_name": "mlsum_es", "prompt_name": "layman_summ_es", "rougeLsum_fmeasure": 0.15454418245332874, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "6fc70031-95ab-40fa-9cc7-e6eda42a4833", "prompt_jinja": "My college roommate asked me what this Spanish article meant:\n {{text}}\nSo I recapped it in layman''s terms in Spanish: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020344070829227297}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json new file mode 100644 index 0000000..16aed74 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json @@ -0,0 +1 @@ +{"results": [{"task_name": "mlsum_es", "prompt_name": "palm_prompt", "bleu": 3.341310161344892, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12383760876849086}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rouge1_precision": 0.2238936517609025, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a 
news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002299862104308459}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rouge1_recall": 0.31290976115097796, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003157461966656448}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rouge1_fmeasure": 0.23872886986952627, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021217578248352883}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rouge2_precision": 0.06209477646349353, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001551591733457607}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rouge2_recall": 0.09261450488619867, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it 
in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002338770523626696}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rouge2_fmeasure": 0.06770985280514573, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016286057939871985}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rougeL_precision": 0.16744902972078152, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001881884121187265}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rougeL_recall": 0.23426964040901505, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025983311247854634}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rougeL_fmeasure": 0.17833059997868725, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n 
===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017648367718678965}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rougeLsum_precision": 0.1755707446810662, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019188012583382194}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rougeLsum_recall": 0.24827775226125046, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027727938693488186}, {"task_name": "mlsum_es", "prompt_name": "palm_prompt", "rougeLsum_fmeasure": 0.1876134227034203, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "e3c60771-5e99-49b1-b477-c2b69f645d59", "prompt_jinja": "I will first show a news article and then provide a summary of it in Spanish:\nArticle: {{text}}\n ===\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018064283175187946}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json new file mode 100644 index 0000000..92458c2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json @@ -0,0 +1 @@ +{"results": [{"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "bleu": 2.2245794650879462, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07743700029169612}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rouge1_precision": 0.18419910608261986, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002301564923577535}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rouge1_recall": 0.33528109600140793, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", 
"prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034321373331462294}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rouge1_fmeasure": 0.21126423815884174, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021771710222460634}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rouge2_precision": 0.051112897675373886, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014393881241720322}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rouge2_recall": 0.09793074579590116, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024100392963833633}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rouge2_fmeasure": 0.05913033007358818, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", 
"prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014890254374386052}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rougeL_precision": 0.13714024915254835, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018260510947169805}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rougeL_recall": 0.2521886801730905, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027817174072391373}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rougeL_fmeasure": 0.15711042852214044, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017251643310554304}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rougeLsum_precision": 0.14518990658432604, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above 
document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001900911801455617}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rougeLsum_recall": 0.26906405645015485, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030454611701701234}, {"task_name": "mlsum_es", "prompt_name": "summarise_this_in_es_few_sentences", "rougeLsum_fmeasure": 0.1670307582655529, "fixed_answer_choice_list": null, "dataset_path": "GEM/mlsum", "dataset_name": "es", "subset": "", "prompt_id": "5e644239-d989-4531-b2ff-44b0e4310df6", "prompt_jinja": "{{text}}\n===\nGiven the above document, write few sentences in Spanish to summarize: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018288687884684008}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..8423b4c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "a_good_translation-en-fr-source+target", "bleu": 2.125573406419127, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "a3a87505-e423-4c03-9a22-a3da4ccbeae5", "prompt_jinja": "Given the following source text in English: {{translation[\"en\"]}} , a good French translation is:\n||| {{translation[\"fr\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09981676122698169}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..2e63dab --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "a_good_translation-en-fr-target", "bleu": 1.5697853682886957, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "474c20a1-a2ea-4ff4-b4c8-7f9c6466ff20", "prompt_jinja": "Given the following passage: {{translation[\"en\"]}} , a good French translation is: ||| {{translation[\"fr\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10176333685236229}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..bba2db4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"wmt14_fr_en", "prompt_name": "a_good_translation-fr-en-source+target", "bleu": 30.388346190168132, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "43dc1b77-e8ea-4dc8-8a12-0abc3b0dbba0", "prompt_jinja": "Given the following source text in French: {{translation[\"fr\"]}} , a good English translation is: ||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.28706919566129924}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..d85e2f8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "a_good_translation-fr-en-target", "bleu": 22.361703612398195, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "762c0878-c8fc-43ec-839f-d5d8435a94f6", "prompt_jinja": "Given the following 
passage: {{translation[\"fr\"]}} , a good English translation is:\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.43872418791072576}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..113c5e1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "gpt3-en-fr", "bleu": 0.37928468482204986, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "fc3b96b0-de5e-4ff4-b7bb-cda348ff7fcf", "prompt_jinja": "Q: What is the French translation of {{translation[\"en\"]}} A: ||| {{translation[\"fr\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03833854862936989}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..3f19630 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "gpt3-fr-en", "bleu": 17.167001660570335, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "gc3b96b0-de5e-4ff4-b7bb-cda348ff7fcf", "prompt_jinja": "Q: What is the English translation of {{translation[\"fr\"]}} A: ||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3999014258297822}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end 
of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..400734a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "version-en-fr-target", "bleu": 4.788559958687529, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "c80e443a-0ba4-4c5d-be98-998e050a202d", "prompt_jinja": "If the original version says: {{translation[\"en\"]}}; then the French version should say:\n||| {{translation[\"fr\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12647149552786194}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..36d011c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "version-fr-en-target", "bleu": 23.925613843737143, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "9fe6b44b-2dc6-4557-8201-14d6ea7668ff", "prompt_jinja": "If the original version says: {{translation[\"fr\"]}}; then the English version should say:\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2692548707999714}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..6b0977e --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "xglm-en-fr-target", "bleu": 2.186171298454336, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", "subset": null, "prompt_id": "2fc841fb-b872-4cc6-9a88-735d6bb7e2e3", "prompt_jinja": "{{translation[\"en\"]}} = French:\n||| {{translation[\"fr\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09641163271059554}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..dac33f0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_fr_en", "prompt_name": "xglm-fr-en-target", "bleu": 14.10190003658709, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "fr-en", 
"subset": null, "prompt_id": "957b8554-a00a-4652-b080-e9ee3ccae381", "prompt_jinja": "{{translation[\"fr\"]}} = English:\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1974741324240151}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..504d57a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "a_good_translation-en-hi-source+target", "bleu": 0.18051438917625368, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "d1e354a7-8fa3-415a-9bb7-755e1ae21813", "prompt_jinja": "Given the following source text in English: {{translation[\"en\"]}} , a good Hindi translation is:\n||| {{translation[\"hi\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03338441915097909}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..62f8f38 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "a_good_translation-en-hi-target", "bleu": 0.1812629246502659, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "42379c42-04c5-4ea9-99ca-f43f1b1cfc1b", "prompt_jinja": "Given the following passage: {{translation[\"en\"]}} , a good Hindi translation is: ||| {{translation[\"hi\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04198901460363051}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, 
"batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..7b04098 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "a_good_translation-hi-en-source+target", "bleu": 16.056644593701627, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "03664fac-67ef-414d-8e4a-504ad4d7a8a0", "prompt_jinja": "Given the following source text in Hindi: {{translation[\"hi\"]}} , a good English translation is: ||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2809620281933667}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..98e87e7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "a_good_translation-hi-en-target", "bleu": 15.032491079468809, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "fbd2d598-80e9-4ce6-b85e-fb269aa82580", "prompt_jinja": "Given the following passage: {{translation[\"hi\"]}} , a good English translation is:\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2628594862835867}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..e4e1ac8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "version-en-hi-target", "bleu": 0.1858574511075315, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "b5952cac-9388-4901-98ed-c45cccfed5de", "prompt_jinja": "If the original version says: {{translation[\"en\"]}}; then the Hindi version should say:\n||| {{translation[\"hi\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029122685049572238}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..72a2976 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "version-hi-en-target", "bleu": 15.167071858881462, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "02ec7175-a97c-4c0f-982f-1cc8c4c050d1", "prompt_jinja": "If the original version says: {{translation[\"hi\"]}}; then the English version should say:\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2573529636593602}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..08fe928 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "xglm-en-hi-target", "bleu": 0.002225608801197892, 
"fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "25daf37e-e684-419e-a250-bdeeb82a7df6", "prompt_jinja": "{{translation[\"en\"]}} = Hindi:\n||| {{translation[\"hi\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0005988947090265846}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..23adec1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "xglm-hi-en-target", "bleu": 3.675518735361532, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "9b430f52-31a1-4b7b-9600-59069a706b2c", "prompt_jinja": "{{translation[\"hi\"]}} = English:\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17101231729659816}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_body_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_body_title_to_star/results.json new file mode 100644 index 0000000..a7d0b93 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_body_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "en", + "template_name": "prompt_body_title_to_star", + "evaluation": { + "accuracy": 0.6176 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_review_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_review_to_star/results.json new file mode 100644 index 0000000..9d9049c --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_review_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "en", + "template_name": "prompt_review_to_star", + "evaluation": { + "accuracy": 0.5592 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_title_to_star/results.json new file mode 100644 index 0000000..6392919 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/en/prompt_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "en", + "template_name": "prompt_title_to_star", + "evaluation": { + "accuracy": 0.3922 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_body_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_body_title_to_star/results.json new file mode 100644 index 0000000..7bbda9d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_body_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "es", + "template_name": "prompt_body_title_to_star", + "evaluation": { + "accuracy": 0.5526 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_review_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_review_to_star/results.json new file mode 100644 index 0000000..65d627e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_review_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "es", + "template_name": "prompt_review_to_star", + "evaluation": { + "accuracy": 0.5296 + }, + "arguments": 
"Namespace(config_name=None, dataset_config_name='es', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_title_to_star/results.json new file mode 100644 index 0000000..fe68080 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/es/prompt_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "es", + "template_name": "prompt_title_to_star", + "evaluation": { + "accuracy": 0.3646 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_body_title_to_star/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_body_title_to_star/results.json new file mode 100644 index 0000000..d329622 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_body_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "fr", + "template_name": "prompt_body_title_to_star", + "evaluation": { + "accuracy": 0.5332 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_review_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_review_to_star/results.json new file mode 100644 index 0000000..1a08600 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_review_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "fr", + "template_name": "prompt_review_to_star", + "evaluation": { + "accuracy": 0.5182 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_title_to_star/results.json new file mode 100644 index 0000000..6946caa --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/fr/prompt_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "fr", + "template_name": "prompt_title_to_star", + "evaluation": { + "accuracy": 0.3644 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_body_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_body_title_to_star/results.json new file mode 100644 index 0000000..6e5349d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_body_title_to_star/results.json @@ -0,0 
+1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "zh", + "template_name": "prompt_body_title_to_star", + "evaluation": { + "accuracy": 0.5174 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_review_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_review_to_star/results.json new file mode 100644 index 0000000..fbd93ba --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_review_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "zh", + "template_name": "prompt_review_to_star", + "evaluation": { + "accuracy": 0.5006 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', 
tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_title_to_star/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_title_to_star/results.json new file mode 100644 index 0000000..8a39ebc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/amazon_reviews_multi/zh/prompt_title_to_star/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "amazon_reviews_multi", + "dataset_config_name": "zh", + "template_name": "prompt_title_to_star", + "evaluation": { + "accuracy": 0.3874 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/Answer_questions_from_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/Answer_questions_from_options/results.json new file mode 100644 index 0000000..3e9ce2e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/Answer_questions_from_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "aqua_rat", + "dataset_config_name": "raw", + "template_name": "Answer questions from options", + "evaluation": { + "accuracy": 0.24015748031496062 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='raw', dataset_name='aqua_rat', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer questions from options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/answer_quiz/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/answer_quiz/results.json new file mode 100644 index 0000000..7f2ee8f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/answer_quiz/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "aqua_rat", + "dataset_config_name": "raw", + "template_name": "answer_quiz", + "evaluation": { + "accuracy": 0.22440944881889763 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='raw', dataset_name='aqua_rat', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='answer_quiz', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/select_the_best_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/select_the_best_option/results.json new file mode 100644 index 0000000..5554f52 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/aqua_rat/raw/select_the_best_option/results.json @@ -0,0 
+1,9 @@ +{ + "dataset_name": "aqua_rat", + "dataset_config_name": "raw", + "template_name": "select_the_best_option", + "evaluation": { + "accuracy": 0.2559055118110236 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='raw', dataset_name='aqua_rat', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='select_the_best_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis/results.json new file mode 100644 index 0000000..9a4636e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "art", + "dataset_config_name": null, + "template_name": "choose_hypothesis", + "evaluation": { + "accuracy": 0.5926892950391645 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_believable/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_believable/results.json new file mode 100644 index 0000000..a15729f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_believable/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "art", + "dataset_config_name": null, + "template_name": "choose_hypothesis_believable", + "evaluation": { + "accuracy": 0.5711488250652742 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_believable', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_desc/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_desc/results.json new file mode 100644 index 0000000..1f484cb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_desc/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "art", + "dataset_config_name": null, + "template_name": "choose_hypothesis_desc", + "evaluation": { + "accuracy": 0.5169712793733682 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_desc', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_likely/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_likely/results.json new file mode 100644 index 0000000..da9088e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_likely/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "art", + "dataset_config_name": null, + "template_name": "choose_hypothesis_likely", + "evaluation": { + "accuracy": 0.5300261096605744 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_likely', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_options/results.json new file mode 100644 index 0000000..52cc598 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/art/choose_hypothesis_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "art", + "dataset_config_name": null, + "template_name": "choose_hypothesis_options", + 
"evaluation": { + "accuracy": 0.5672323759791122 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/banking77/direct_to_which_department/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/banking77/direct_to_which_department/results.json new file mode 100644 index 0000000..56386b9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/banking77/direct_to_which_department/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "banking77", + "dataset_config_name": null, + "template_name": "direct_to_which_department", + "evaluation": { + "accuracy": 0.16753246753246753 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='banking77', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='direct_to_which_department', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/banking77/help_page_topic/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/banking77/help_page_topic/results.json new file mode 100644 index 0000000..3a983f2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/banking77/help_page_topic/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "banking77", + "dataset_config_name": null, + "template_name": "help_page_topic", + "evaluation": { + "accuracy": 0.26785714285714285 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='banking77', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='help_page_topic', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/banking77/rephrase_as_banking_term/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/banking77/rephrase_as_banking_term/results.json new file mode 100644 index 0000000..9e302c2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/banking77/rephrase_as_banking_term/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "banking77", + "dataset_config_name": null, + "template_name": "rephrase_as_banking_term", + "evaluation": { + "accuracy": 0.274025974025974 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='banking77', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='rephrase_as_banking_term', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/classify/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/classify/results.json new file mode 100644 index 0000000..29c83eb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/classify/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blbooksgenre", + "dataset_config_name": "title_genre_classifiction", + "template_name": "classify", + "evaluation": { + "accuracy": 0.25057603686635943 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='title_genre_classifiction', dataset_name='blbooksgenre', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='classify', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/multi-choice/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/multi-choice/results.json new file mode 100644 index 0000000..2b7915d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/multi-choice/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blbooksgenre", + "dataset_config_name": "title_genre_classifiction", + "template_name": 
"multi-choice", + "evaluation": { + "accuracy": 0.25057603686635943 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='title_genre_classifiction', dataset_name='blbooksgenre', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='multi-choice', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/premise_context_first/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/premise_context_first/results.json new file mode 100644 index 0000000..1e5f746 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blbooksgenre/title_genre_classifiction/premise_context_first/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blbooksgenre", + "dataset_config_name": "title_genre_classifiction", + "template_name": "premise_context_first", + "evaluation": { + "accuracy": 0.7321428571428571 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='title_genre_classifiction', dataset_name='blbooksgenre', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='premise_context_first', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_1_2/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_1_2/results.json new file mode 100644 index 0000000..45a50bf --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_1_2/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blimp", + "dataset_config_name": "adjunct_island", + "template_name": "grammatical_between_1_2", + "evaluation": { + "accuracy": 0.512 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='grammatical_between_1_2', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_A_B/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_A_B/results.json new file mode 100644 index 0000000..e848bf6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_between_A_B/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blimp", + "dataset_config_name": "adjunct_island", + "template_name": "grammatical_between_A_B", + "evaluation": { + "accuracy": 0.464 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='grammatical_between_A_B', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_which_one_1_2/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_which_one_1_2/results.json new file mode 100644 index 0000000..9688389 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/grammatical_which_one_1_2/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blimp", + "dataset_config_name": "adjunct_island", + "template_name": "grammatical_which_one_1_2", + "evaluation": { + "accuracy": 0.512 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='grammatical_which_one_1_2', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_bad_yes_no/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_bad_yes_no/results.json new file mode 100644 index 0000000..3b8aeb9 --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_bad_yes_no/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blimp", + "dataset_config_name": "adjunct_island", + "template_name": "single_sentence_bad_yes_no", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='single_sentence_bad_yes_no', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_good_yes_no/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_good_yes_no/results.json new file mode 100644 index 0000000..9eebdb0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/blimp/adjunct_island/single_sentence_good_yes_no/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "blimp", + "dataset_config_name": "adjunct_island", + "template_name": "single_sentence_good_yes_no", + "evaluation": { + "accuracy": 0.493 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, 
split='train', target_max_length=256, template_config_name=None, template_name='single_sentence_good_yes_no', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/claim_and_all_supporting_evidences/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/claim_and_all_supporting_evidences/results.json new file mode 100644 index 0000000..eced852 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/claim_and_all_supporting_evidences/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "climate_fever", + "dataset_config_name": null, + "template_name": "claim_and_all_supporting_evidences", + "evaluation": { + "accuracy": 0.3166123778501629 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='claim_and_all_supporting_evidences', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/fifth_evidence_and_claim_itemization/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/fifth_evidence_and_claim_itemization/results.json new file mode 100644 index 0000000..c747882 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/fifth_evidence_and_claim_itemization/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "climate_fever", + "dataset_config_name": null, + "template_name": "fifth_evidence_and_claim_itemization", + "evaluation": { + 
"accuracy": 0.4749185667752443 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='fifth_evidence_and_claim_itemization', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/first_evidence_and_claim_itemization/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/first_evidence_and_claim_itemization/results.json new file mode 100644 index 0000000..7bf0547 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/first_evidence_and_claim_itemization/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "climate_fever", + "dataset_config_name": null, + "template_name": "first_evidence_and_claim_itemization", + "evaluation": { + "accuracy": 0.22996742671009773 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='first_evidence_and_claim_itemization', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/second_evidence_and_claim_itemization/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/second_evidence_and_claim_itemization/results.json new file mode 100644 index 0000000..c83d48a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/second_evidence_and_claim_itemization/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "climate_fever", + "dataset_config_name": null, + "template_name": "second_evidence_and_claim_itemization", + "evaluation": { + "accuracy": 0.24625407166123778 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='second_evidence_and_claim_itemization', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/third_evidence_claim_pair/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/third_evidence_claim_pair/results.json new file mode 100644 index 0000000..d8efe54 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/climate_fever/third_evidence_claim_pair/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "climate_fever", + "dataset_config_name": null, + "template_name": "third_evidence_claim_pair", + "evaluation": { + "accuracy": 0.24234527687296417 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='third_evidence_claim_pair', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_after_sentence_and_choices/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_after_sentence_and_choices/results.json new file mode 100644 index 0000000..9177f2c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_after_sentence_and_choices/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "codah", + "dataset_config_name": "codah", + "template_name": "affirmative_instruction_after_sentence_and_choices", + "evaluation": { + "accuracy": 0.6693083573487032 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='codah', dataset_name='codah', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='affirmative_instruction_after_sentence_and_choices', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_before_sentence_and_choices/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_before_sentence_and_choices/results.json new file mode 100644 index 0000000..cf53706 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/affirmative_instruction_before_sentence_and_choices/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "codah", + "dataset_config_name": "codah", + "template_name": "affirmative_instruction_before_sentence_and_choices", + "evaluation": { + "accuracy": 0.6509365994236311 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='codah', dataset_name='codah', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='affirmative_instruction_before_sentence_and_choices', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/interrogative_instruction_after_sentence_and_choices/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/interrogative_instruction_after_sentence_and_choices/results.json new file mode 100644 index 0000000..5132525 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/codah/codah/interrogative_instruction_after_sentence_and_choices/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "codah", + "dataset_config_name": "codah", + "template_name": "interrogative_instruction_after_sentence_and_choices", + "evaluation": { + "accuracy": 0.6761527377521613 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='codah', dataset_name='codah', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='interrogative_instruction_after_sentence_and_choices', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/answer_given_question_without_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/answer_given_question_without_options/results.json new file mode 100644 index 0000000..eabd150 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/answer_given_question_without_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "commonsense_qa", + "dataset_config_name": null, + "template_name": "answer_given_question_without_options", + "evaluation": { + "accuracy": 0.6388206388206388 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='commonsense_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='answer_given_question_without_options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/most_suitable_answer/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/most_suitable_answer/results.json new file mode 100644 index 0000000..fde7d4d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/most_suitable_answer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "commonsense_qa", + "dataset_config_name": null, + "template_name": "most_suitable_answer", + "evaluation": { + "accuracy": 0.7313677313677314 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='commonsense_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='most_suitable_answer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/question_answering/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/question_answering/results.json new file mode 100644 index 0000000..d8f996d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/commonsense_qa/question_answering/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "commonsense_qa", + "dataset_config_name": null, + "template_name": "question_answering", + "evaluation": { + "accuracy": 0.7158067158067158 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='commonsense_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_answering', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/ambiguous/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/ambiguous/results.json new file mode 100644 index 0000000..a4d91de --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/ambiguous/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "conv_ai_3", + "dataset_config_name": null, + "template_name": "ambiguous", + "evaluation": { + "accuracy": 0.39040207522697795 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='ambiguous', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/clarification_needed/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/clarification_needed/results.json new file mode 100644 index 0000000..9dcfa51 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/clarification_needed/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "conv_ai_3", + "dataset_config_name": null, + "template_name": "clarification_needed", + "evaluation": { + "accuracy": 0.39040207522697795 
+ }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='clarification_needed', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/directly_answer/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/directly_answer/results.json new file mode 100644 index 0000000..3950d23 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/directly_answer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "conv_ai_3", + "dataset_config_name": null, + "template_name": "directly_answer", + "evaluation": { + "accuracy": 0.6095979247730221 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='directly_answer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_give_number/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_give_number/results.json new file mode 100644 index 
0000000..ec2be53 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_give_number/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "conv_ai_3", + "dataset_config_name": null, + "template_name": "score_give_number", + "evaluation": { + "accuracy": 0.057933419801124084 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='score_give_number', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_how_much/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_how_much/results.json new file mode 100644 index 0000000..972240a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/conv_ai_3/score_how_much/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "conv_ai_3", + "dataset_config_name": null, + "template_name": "score_how_much", + "evaluation": { + "accuracy": 0.010376134889753566 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, 
template_name='score_how_much', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/best_deal/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/best_deal/results.json new file mode 100644 index 0000000..e3b1d4e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/best_deal/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "craigslist_bargains", + "dataset_config_name": null, + "template_name": "best deal", + "evaluation": { + "accuracy": 0.5192629815745393 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best deal', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller/results.json new file mode 100644 index 0000000..614eaeb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "craigslist_bargains", + "dataset_config_name": null, + "template_name": "good deal for seller", + "evaluation": { + "accuracy": 0.2529313232830821 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='good deal for seller', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price/results.json new file mode 100644 index 0000000..c66f022 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "craigslist_bargains", + "dataset_config_name": null, + "template_name": "good deal for seller no list price", + "evaluation": { + "accuracy": 0.09715242881072027 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='good deal for seller no list price', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price_implicit/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price_implicit/results.json new file mode 100644 index 0000000..9ffa539 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/craigslist_bargains/good_deal_for_seller_no_list_price_implicit/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "craigslist_bargains", + "dataset_config_name": null, + "template_name": "good deal for seller no list price implicit", + "evaluation": { + "accuracy": 0.24623115577889448 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='good deal for seller no list price implicit', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_question_with_emotion_label/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_question_with_emotion_label/results.json new file mode 100644 index 0000000..2df8ac9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_question_with_emotion_label/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "emotion", + "dataset_config_name": null, + "template_name": "answer_question_with_emotion_label", + "evaluation": { + "accuracy": 0.3375 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='answer_question_with_emotion_label', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_with_class_label/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_with_class_label/results.json new file mode 100644 index 0000000..8fe296e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/answer_with_class_label/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "emotion", + "dataset_config_name": null, + "template_name": "answer_with_class_label", + "evaluation": { + "accuracy": 0.214 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='answer_with_class_label', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/emotion/choose_the_best_emotion_label/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/choose_the_best_emotion_label/results.json new file mode 100644 index 0000000..490aea4 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/choose_the_best_emotion_label/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "emotion", + "dataset_config_name": null, + "template_name": "choose_the_best_emotion_label", + "evaluation": { + "accuracy": 0.312 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='choose_the_best_emotion_label', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/emotion/reply_with_emoation_label/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/reply_with_emoation_label/results.json new file mode 100644 index 0000000..174739f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/emotion/reply_with_emoation_label/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "emotion", + "dataset_config_name": null, + "template_name": "reply_with_emoation_label", + "evaluation": { + "accuracy": 0.4495 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, 
template_name='reply_with_emoation_label', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl new file mode 100644 index 0000000..94a4509 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd372ab32cfb273ee9a6e526c9d99cefa19711eb95cfa5f5b6f2c1287312e0d2 +size 8972520 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl new file mode 100644 index 0000000..9cdd962 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95244bede560e3967e010b62bdc07bcccb7286ba9ec91f0ba7319010de183ecc +size 9028288 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl new file mode 100644 index 0000000..0d7272e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce82a11301357ee05915d24a130a33931deb425ad8baa4189fe628b588100854 +size 9272950 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl new file mode 100644 index 0000000..c0dc974 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1faa39e152afdb8123e501b0b08280e7be04fad2b944bbff9aed8b063c0156d +size 3114901 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl new file mode 100644 index 0000000..f929b06 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fe88f3b501cf933a7d15898f3d0b307725981ada99a2d501a034e7e17ca98dd +size 2973455 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..9cda426 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab6fd8d0a6fcb0fc245cf1ac3b57c03781fad7c39b6f77975c0a5d53a21649a0 +size 3111377 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..f164d66 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca69af50902f2ecd90e2b41201c3712e27fce9c6b7158bd65f46d3c4b5b1eba7 +size 3034252 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl new file mode 100644 index 0000000..b581a72 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15956ffeb8e1ec731e2900fd68204d3936d31807e64a29d0712f3357f56957cc +size 2613841 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl 
b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..a2ee6a2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d9d602d726bb0821c710b7b32b63c2813c0116d03965059be24fd5923ae79d +size 2722728 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl new file mode 100644 index 0000000..0641ed7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a48cc4625e95ae013239f8c20fc859392d9a2ed2a346150b2fdd6063c2e870a0 +size 3020414 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..17498fd --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c18300f469840d070cc8d39fae6ab0df82bde890c5eef10b6e6767792381478 +size 3013171 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl new file mode 100644 index 0000000..6c4341d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cae8ada7278a7273ac2ddebe6f4f3b2a909f4b02f43ea0c4927c432eb2ede13 +size 2473909 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..4c0d5ae --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:562f7049dd5e17a2eabc42030da658f4c3176ccd75bac3297055aea791c86587 +size 2579931 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..272d38a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe075dbd2400c25596e0f07943d2ef01c0cb29728ea2fea9074bde069e83a76 +size 2951839 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..ef56dc3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96ffbc6423c60e7ff34c31e28f3ea95ea7e9e79c71c2bf6cd0a094ac8592d1a +size 2820700 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..55c2e25 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0df7c80e772bfeff18b8e494ac8600ea0b1f1d1b9e05a2cd7b306495d6ec4e0 +size 2950072 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..3a75b2f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769fd47551c4c4cd8692f52ae39e1982fe04c8c84da0412501dc351a166d130f +size 2870274 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..8cb1d2f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d495b77461e9eca518a4cb42f37eba56ad258c201ada004636a64a0409c69f66 +size 2863552 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..df75129 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6cdba90d5b9da0850021ed44f26a0f849022b96d9dc85afa3ae5262eb08ee7 +size 2865803 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl 
b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..b4c72c6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3871387b7d5f67a59022d57cb99409a187770143a22713fde2d44809364827 +size 2378711 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl new file mode 100644 index 0000000..373e575 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4ebb076e4c2fabd5a2ea2105f9fa3d589c7a76e41e169aa02e776cbd92f08c +size 2446695 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/bullish_neutral_bearish/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/bullish_neutral_bearish/results.json new file mode 100644 index 0000000..0241fa3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/bullish_neutral_bearish/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "financial_phrasebank", + 
"dataset_config_name": "sentences_allagree", + "template_name": "bullish_neutral_bearish", + "evaluation": { + "accuracy": 0.3878091872791519 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='bullish_neutral_bearish', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/complementary_industries/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/complementary_industries/results.json new file mode 100644 index 0000000..5bb6fbd --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/complementary_industries/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "financial_phrasebank", + "dataset_config_name": "sentences_allagree", + "template_name": "complementary_industries", + "evaluation": { + "accuracy": 0.10114840989399293 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, 
template_config_name=None, template_name='complementary_industries', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/sentiment/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/sentiment/results.json new file mode 100644 index 0000000..a69e009 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/sentiment/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "financial_phrasebank", + "dataset_config_name": "sentences_allagree", + "template_name": "sentiment", + "evaluation": { + "accuracy": 0.35644876325088337 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='sentiment', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/share_price_option/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/share_price_option/results.json new file mode 100644 index 0000000..b159767 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/share_price_option/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "financial_phrasebank", + "dataset_config_name": "sentences_allagree", + "template_name": "share_price_option", + "evaluation": { + "accuracy": 
0.3670494699646643 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='share_price_option', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/word_comes_to_mind/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/word_comes_to_mind/results.json new file mode 100644 index 0000000..1303fcc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/financial_phrasebank/sentences_allagree/word_comes_to_mind/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "financial_phrasebank", + "dataset_config_name": "sentences_allagree", + "template_name": "word_comes_to_mind", + "evaluation": { + "accuracy": 0.08259717314487633 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='word_comes_to_mind', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Following_sentence_acceptable/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Following_sentence_acceptable/results.json new file mode 100644 index 0000000..6bb1ba8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Following_sentence_acceptable/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "cola", + "template_name": "Following sentence acceptable", + "evaluation": { + "accuracy": 0.37583892617449666 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Following sentence acceptable', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Make_sense_yes_no/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Make_sense_yes_no/results.json new file mode 100644 index 0000000..2735d67 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Make_sense_yes_no/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "cola", + "template_name": "Make sense yes no", + "evaluation": { + "accuracy": 0.33940556088207097 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Make sense yes no', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Previous_sentence_acceptable/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Previous_sentence_acceptable/results.json new file mode 100644 index 0000000..7e6b131 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/Previous_sentence_acceptable/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "cola", + "template_name": "Previous sentence acceptable", + "evaluation": { + "accuracy": 0.31255992329817833 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Previous sentence acceptable', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/editing/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/editing/results.json new file mode 100644 index 0000000..567df7d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/editing/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "cola", + "template_name": "editing", + "evaluation": { + 
"accuracy": 0.3844678811121764 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='editing', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/is_this_correct/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/is_this_correct/results.json new file mode 100644 index 0000000..8216bcc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/cola/is_this_correct/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "cola", + "template_name": "is_this_correct", + "evaluation": { + "accuracy": 0.37775647171620325 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_this_correct', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/following_positive_negative/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/following_positive_negative/results.json new file mode 100644 index 0000000..4691a46 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/following_positive_negative/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "sst2", + "template_name": "following positive negative", + "evaluation": { + "accuracy": 0.9426605504587156 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='following positive negative', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/happy_or_mad/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/happy_or_mad/results.json new file mode 100644 index 0000000..5ed8054 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/happy_or_mad/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "sst2", + "template_name": "happy or mad", + "evaluation": { + "accuracy": 0.8279816513761468 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='happy or mad', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/positive_negative_after/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/positive_negative_after/results.json new file mode 100644 index 0000000..646094e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/positive_negative_after/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "sst2", + "template_name": "positive negative after", + "evaluation": { + "accuracy": 0.9472477064220184 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='positive negative after', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/review/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/review/results.json new file mode 100644 index 0000000..7b772f3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/review/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "sst2", + "template_name": "review", + "evaluation": { + "accuracy": 0.9254587155963303 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='review', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/said/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/said/results.json new file mode 100644 index 0000000..d01a848 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/glue/sst2/said/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "glue", + "dataset_config_name": "sst2", + "template_name": "said", + "evaluation": { + "accuracy": 0.9059633027522935 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='said', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_en/results.json new file mode 100644 index 0000000..364cb5a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": 
"en", + "template_name": "multiple_choice_a_and_q_en", + "evaluation": { + "accuracy": 0.29428989751098095 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_with_context_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_with_context_en/results.json new file mode 100644 index 0000000..523ce4b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_a_and_q_with_context_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "en", + "template_name": "multiple_choice_a_and_q_with_context_en", + "evaluation": { + "accuracy": 0.29502196193265007 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ 
No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_en/results.json new file mode 100644 index 0000000..3d8ec5c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "en", + "template_name": "multiple_choice_q_and_a_en", + "evaluation": { + "accuracy": 0.3938506588579795 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_en/results.json new file mode 100644 index 0000000..d90b979 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "en", + "template_name": "multiple_choice_q_and_a_index_en", + "evaluation": { + "accuracy": 0.30307467057101023 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_with_context_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_with_context_en/results.json new file mode 100644 index 0000000..c71ca00 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/en/multiple_choice_q_and_a_index_with_context_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "en", + "template_name": "multiple_choice_q_and_a_index_with_context_en", + "evaluation": { + "accuracy": 0.30234260614934116 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_en/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_en/results.json new file mode 100644 index 0000000..b58bf7c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "es", + "template_name": "multiple_choice_a_and_q_en", + "evaluation": { + "accuracy": 0.2730600292825769 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_with_context_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_with_context_en/results.json new file mode 100644 index 0000000..f2cc1f2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_a_and_q_with_context_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "es", + "template_name": "multiple_choice_a_and_q_with_context_en", + "evaluation": { + "accuracy": 0.27232796486090777 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_en/results.json new file mode 100644 index 0000000..055d65c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "es", + "template_name": "multiple_choice_q_and_a_en", + "evaluation": { + "accuracy": 0.36530014641288433 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_en/results.json new file mode 100644 index 0000000..3dd8471 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_en/results.json @@ -0,0 +1,9 @@ +{ + 
"dataset_name": "head_qa", + "dataset_config_name": "es", + "template_name": "multiple_choice_q_and_a_index_en", + "evaluation": { + "accuracy": 0.3074670571010249 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_with_context_en/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_with_context_en/results.json new file mode 100644 index 0000000..acdf8f2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/head_qa/es/multiple_choice_q_and_a_index_with_context_en/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "head_qa", + "dataset_config_name": "es", + "template_name": "multiple_choice_q_and_a_index_with_context_en", + "evaluation": { + "accuracy": 0.3089311859443631 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', 
template_name='multiple_choice_q_and_a_index_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_explanation_classification/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_explanation_classification/results.json new file mode 100644 index 0000000..4865a13 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_explanation_classification/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "health_fact", + "dataset_config_name": null, + "template_name": "claim_explanation_classification", + "evaluation": { + "accuracy": 0.5591836734693878 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='health_fact', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='claim_explanation_classification', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_after_reading_I_believe/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_after_reading_I_believe/results.json new file mode 100644 index 0000000..8f656b5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_after_reading_I_believe/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "health_fact", + "dataset_config_name": null, + "template_name": "claim_veracity_classification_after_reading_I_believe", + "evaluation": { + 
"accuracy": 0.34938775510204084 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='health_fact', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='claim_veracity_classification_after_reading_I_believe', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_tell_me/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_tell_me/results.json new file mode 100644 index 0000000..9b481b8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/health_fact/claim_veracity_classification_tell_me/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "health_fact", + "dataset_config_name": null, + "template_name": "claim_veracity_classification_tell_me", + "evaluation": { + "accuracy": 0.48244897959183675 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='health_fact', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='claim_veracity_classification_tell_me', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_editor_asks/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_editor_asks/results.json new file mode 100644 index 0000000..72b4c63 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_editor_asks/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hlgd", + "dataset_config_name": null, + "template_name": "is_same_event_editor_asks", + "evaluation": { + "accuracy": 0.6926051232479459 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_editor_asks', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_interrogative_talk/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_interrogative_talk/results.json new file mode 100644 index 0000000..ecaa5e3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_interrogative_talk/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hlgd", + "dataset_config_name": null, + "template_name": "is_same_event_interrogative_talk", + "evaluation": { + "accuracy": 0.6582890285161914 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_interrogative_talk', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_refer/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_refer/results.json new file mode 100644 index 0000000..63d0b15 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_refer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hlgd", + "dataset_config_name": null, + "template_name": "is_same_event_refer", + "evaluation": { + "accuracy": 0.7858869018849686 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_refer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_related/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_related/results.json new file mode 100644 index 0000000..00a6d47 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_related/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hlgd", + "dataset_config_name": null, + 
"template_name": "is_same_event_with_time_interrogative_related", + "evaluation": { + "accuracy": 0.7839536007733204 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_with_time_interrogative_related', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_talk/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_talk/results.json new file mode 100644 index 0000000..4a7d287 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hlgd/is_same_event_with_time_interrogative_talk/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hlgd", + "dataset_config_name": null, + "template_name": "is_same_event_with_time_interrogative_talk", + "evaluation": { + "accuracy": 0.7786370227162881 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_with_time_interrogative_talk', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_does_it_follow_a_hyperpartisan_argumentation/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_does_it_follow_a_hyperpartisan_argumentation/results.json new file mode 100644 index 0000000..d14f797 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_does_it_follow_a_hyperpartisan_argumentation/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hyperpartisan_news_detection", + "dataset_config_name": "byarticle", + "template_name": "consider_does_it_follow_a_hyperpartisan_argumentation", + "evaluation": { + "accuracy": 0.6232558139534884 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='consider_does_it_follow_a_hyperpartisan_argumentation', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_it_exhibits_extreme_one_sidedness/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_it_exhibits_extreme_one_sidedness/results.json new file mode 100644 index 0000000..eb45b1e --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consider_it_exhibits_extreme_one_sidedness/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hyperpartisan_news_detection", + "dataset_config_name": "byarticle", + "template_name": "consider_it_exhibits_extreme_one_sidedness", + "evaluation": { + "accuracy": 0.6310077519379845 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='consider_it_exhibits_extreme_one_sidedness', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consume_with_caution/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consume_with_caution/results.json new file mode 100644 index 0000000..de077f9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/consume_with_caution/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hyperpartisan_news_detection", + "dataset_config_name": "byarticle", + "template_name": "consume_with_caution", + "evaluation": { + "accuracy": 0.6294573643410852 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='consume_with_caution', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/extreme_left_wing_or_right_wing/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/extreme_left_wing_or_right_wing/results.json new file mode 100644 index 0000000..994f3ac --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/extreme_left_wing_or_right_wing/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hyperpartisan_news_detection", + "dataset_config_name": "byarticle", + "template_name": "extreme_left_wing_or_right_wing", + "evaluation": { + "accuracy": 0.6077519379844961 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='extreme_left_wing_or_right_wing', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/follows_hyperpartisan_argumentation/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/follows_hyperpartisan_argumentation/results.json new file mode 100644 index 0000000..8425932 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/hyperpartisan_news_detection/byarticle/follows_hyperpartisan_argumentation/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "hyperpartisan_news_detection", + "dataset_config_name": "byarticle", + "template_name": "follows_hyperpartisan_argumentation", + "evaluation": { + "accuracy": 0.627906976744186 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='follows_hyperpartisan_argumentation', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/liar/Given_statement_guess_category/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/liar/Given_statement_guess_category/results.json new file mode 100644 index 0000000..4be112a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/liar/Given_statement_guess_category/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "liar", + "dataset_config_name": null, + "template_name": "Given statement guess category", + "evaluation": { + "accuracy": 0.19314641744548286 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='liar', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Given statement guess category', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/express_sentiment/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/express_sentiment/results.json new file mode 100644 index 0000000..8bcd55e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/express_sentiment/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "lince", + "dataset_config_name": "sa_spaeng", + "template_name": "express sentiment", + "evaluation": { + "accuracy": 0.5696611081226466 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='express sentiment', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/negation_template/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/negation_template/results.json new file mode 100644 index 0000000..75c828a --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/negation_template/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "lince", + "dataset_config_name": "sa_spaeng", + "template_name": "negation template", + "evaluation": { + "accuracy": 0.3851533082302313 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='negation template', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/original_poster_expressed_sentiment/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/original_poster_expressed_sentiment/results.json new file mode 100644 index 0000000..a74c4dd --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/original_poster_expressed_sentiment/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "lince", + "dataset_config_name": "sa_spaeng", + "template_name": "original poster expressed sentiment", + "evaluation": { + "accuracy": 0.5841850457235073 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, 
split='validation', target_max_length=256, template_config_name=None, template_name='original poster expressed sentiment', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/sentiment_trying_to_express/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/sentiment_trying_to_express/results.json new file mode 100644 index 0000000..9699162 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/sentiment_trying_to_express/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "lince", + "dataset_config_name": "sa_spaeng", + "template_name": "sentiment trying to express", + "evaluation": { + "accuracy": 0.5809575040344271 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='sentiment trying to express', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/the_author_seem/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/the_author_seem/results.json new file mode 100644 index 0000000..c4c7fcf --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/lince/sa_spaeng/the_author_seem/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "lince", + "dataset_config_name": "sa_spaeng", + "template_name": "the author seem", + "evaluation": { + "accuracy": 0.5771920387305003 + }, + "arguments": "Namespace(config_name=None, 
dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='the author seem', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/choose_correct_og/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/choose_correct_og/results.json new file mode 100644 index 0000000..4ff52e1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/choose_correct_og/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "math_qa", + "dataset_config_name": null, + "template_name": "choose_correct_og", + "evaluation": { + "accuracy": 0.23484087102177553 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='choose_correct_og', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/first_choice_then_problem/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/first_choice_then_problem/results.json new file mode 100644 index 0000000..5b8996f --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/first_choice_then_problem/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "math_qa", + "dataset_config_name": null, + "template_name": "first_choice_then_problem", + "evaluation": { + "accuracy": 0.2254606365159129 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='first_choice_then_problem', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/gre_problem/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/gre_problem/results.json new file mode 100644 index 0000000..495e542 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/gre_problem/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "math_qa", + "dataset_config_name": null, + "template_name": "gre_problem", + "evaluation": { + "accuracy": 0.21943048576214405 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='gre_problem', tokenizer_name=None, use_slow_tokenizer=False)" +} \ 
No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/pick_the_correct/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/pick_the_correct/results.json new file mode 100644 index 0000000..1c7799a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/pick_the_correct/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "math_qa", + "dataset_config_name": null, + "template_name": "pick_the_correct", + "evaluation": { + "accuracy": 0.2338358458961474 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='pick_the_correct', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/problem_set_type/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/problem_set_type/results.json new file mode 100644 index 0000000..9232cea --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/math_qa/problem_set_type/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "math_qa", + "dataset_config_name": null, + "template_name": "problem_set_type", + "evaluation": { + "accuracy": 0.29246231155778896 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='problem_set_type', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/merged.csv b/evaluation_bloomz-7b1-p3/evaluation_val/merged.csv new file mode 100644 index 0000000..ea5e3bd --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/merged.csv @@ -0,0 +1,207 @@ +dataset,prompt,metric,value +amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.6176 +amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.5592 +amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.3922 +amazon_reviews_multi_en,median,accuracy,0.5592 +amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.5526 +amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.5296 +amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.3646 +amazon_reviews_multi_es,median,accuracy,0.5296 +amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.5332 +amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.5182 +amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3644 +amazon_reviews_multi_fr,median,accuracy,0.5182 +amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.5174 +amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.5006 +amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3874 +amazon_reviews_multi_zh,median,accuracy,0.5006 +aqua_rat_raw,Answer questions from options,accuracy,0.24015748031496062 +aqua_rat_raw,answer_quiz,accuracy,0.22440944881889763 +aqua_rat_raw,select_the_best_option,accuracy,0.2559055118110236 +aqua_rat_raw,median,accuracy,0.24015748031496062 +art_None,choose_hypothesis,accuracy,0.5926892950391645 +art_None,choose_hypothesis_believable,accuracy,0.5711488250652742 
+art_None,choose_hypothesis_desc,accuracy,0.5169712793733682 +art_None,choose_hypothesis_likely,accuracy,0.5300261096605744 +art_None,choose_hypothesis_options,accuracy,0.5672323759791122 +art_None,median,accuracy,0.5672323759791122 +banking77_None,direct_to_which_department,accuracy,0.16753246753246753 +banking77_None,help_page_topic,accuracy,0.26785714285714285 +banking77_None,rephrase_as_banking_term,accuracy,0.274025974025974 +banking77_None,median,accuracy,0.26785714285714285 +blbooksgenre_title_genre_classifiction,classify,accuracy,0.25057603686635943 +blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.25057603686635943 +blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7321428571428571 +blbooksgenre_title_genre_classifiction,median,accuracy,0.25057603686635943 +blimp_adjunct_island,grammatical_between_1_2,accuracy,0.512 +blimp_adjunct_island,grammatical_between_A_B,accuracy,0.464 +blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.512 +blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.52 +blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.493 +blimp_adjunct_island,median,accuracy,0.512 +climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.3166123778501629 +climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.4749185667752443 +climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.22996742671009773 +climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.24625407166123778 +climate_fever_None,third_evidence_claim_pair,accuracy,0.24234527687296417 +climate_fever_None,median,accuracy,0.24625407166123778 +codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.6693083573487032 +codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.6509365994236311 +codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.6761527377521613 +codah_codah,median,accuracy,0.6693083573487032 
+commonsense_qa_None,answer_given_question_without_options,accuracy,0.6388206388206388 +commonsense_qa_None,most_suitable_answer,accuracy,0.7313677313677314 +commonsense_qa_None,question_answering,accuracy,0.7158067158067158 +commonsense_qa_None,median,accuracy,0.7158067158067158 +conv_ai_3_None,ambiguous,accuracy,0.39040207522697795 +conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795 +conv_ai_3_None,directly_answer,accuracy,0.6095979247730221 +conv_ai_3_None,score_give_number,accuracy,0.057933419801124084 +conv_ai_3_None,score_how_much,accuracy,0.010376134889753566 +conv_ai_3_None,median,accuracy,0.39040207522697795 +craigslist_bargains_None,best deal,accuracy,0.5192629815745393 +craigslist_bargains_None,good deal for seller,accuracy,0.2529313232830821 +craigslist_bargains_None,good deal for seller no list price,accuracy,0.09715242881072027 +craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.24623115577889448 +craigslist_bargains_None,median,accuracy,0.2495812395309883 +emotion_None,answer_question_with_emotion_label,accuracy,0.3375 +emotion_None,answer_with_class_label,accuracy,0.214 +emotion_None,choose_the_best_emotion_label,accuracy,0.312 +emotion_None,reply_with_emoation_label,accuracy,0.4495 +emotion_None,median,accuracy,0.32475 +financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3878091872791519 +financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.10114840989399293 +financial_phrasebank_sentences_allagree,sentiment,accuracy,0.35644876325088337 +financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.3670494699646643 +financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.08259717314487633 +financial_phrasebank_sentences_allagree,median,accuracy,0.35644876325088337 +glue_cola,Following sentence acceptable,accuracy,0.37583892617449666 +glue_cola,Make sense yes no,accuracy,0.33940556088207097 +glue_cola,Previous sentence 
acceptable,accuracy,0.31255992329817833 +glue_cola,editing,accuracy,0.3844678811121764 +glue_cola,is_this_correct,accuracy,0.37775647171620325 +glue_cola,median,accuracy,0.37583892617449666 +glue_sst2,following positive negative,accuracy,0.9426605504587156 +glue_sst2,happy or mad,accuracy,0.8279816513761468 +glue_sst2,positive negative after,accuracy,0.9472477064220184 +glue_sst2,review,accuracy,0.9254587155963303 +glue_sst2,said,accuracy,0.9059633027522935 +glue_sst2,median,accuracy,0.9254587155963303 +head_qa_en,multiple_choice_a_and_q_en,accuracy,0.29428989751098095 +head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.29502196193265007 +head_qa_en,multiple_choice_q_and_a_en,accuracy,0.3938506588579795 +head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.30307467057101023 +head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.30234260614934116 +head_qa_en,median,accuracy,0.30234260614934116 +head_qa_es,multiple_choice_a_and_q_en,accuracy,0.2730600292825769 +head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.27232796486090777 +head_qa_es,multiple_choice_q_and_a_en,accuracy,0.36530014641288433 +head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.3074670571010249 +head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3089311859443631 +head_qa_es,median,accuracy,0.3074670571010249 +health_fact_None,claim_explanation_classification,accuracy,0.5591836734693878 +health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.34938775510204084 +health_fact_None,claim_veracity_classification_tell_me,accuracy,0.48244897959183675 +health_fact_None,median,accuracy,0.48244897959183675 +hlgd_None,is_same_event_editor_asks,accuracy,0.6926051232479459 +hlgd_None,is_same_event_interrogative_talk,accuracy,0.6582890285161914 +hlgd_None,is_same_event_refer,accuracy,0.7858869018849686 +hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.7839536007733204 
+hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.7786370227162881 +hlgd_None,median,accuracy,0.7786370227162881 +hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6232558139534884 +hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845 +hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6294573643410852 +hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6077519379844961 +hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.627906976744186 +hyperpartisan_news_detection_byarticle,median,accuracy,0.627906976744186 +liar_None,Given statement guess category,accuracy,0.19314641744548286 +liar_None,median,accuracy,0.19314641744548286 +lince_sa_spaeng,express sentiment,accuracy,0.5696611081226466 +lince_sa_spaeng,negation template,accuracy,0.3851533082302313 +lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5841850457235073 +lince_sa_spaeng,sentiment trying to express,accuracy,0.5809575040344271 +lince_sa_spaeng,the author seem,accuracy,0.5771920387305003 +lince_sa_spaeng,median,accuracy,0.5771920387305003 +math_qa_None,choose_correct_og,accuracy,0.23484087102177553 +math_qa_None,first_choice_then_problem,accuracy,0.2254606365159129 +math_qa_None,gre_problem,accuracy,0.21943048576214405 +math_qa_None,pick_the_correct,accuracy,0.2338358458961474 +math_qa_None,problem_set_type,accuracy,0.29246231155778896 +math_qa_None,median,accuracy,0.2338358458961474 +mlsum_es,layman_summ_es,bleu,0.026830705121606707 +mlsum_es,palm_prompt,bleu,0.033413101613448924 +mlsum_es,summarise_this_in_es_few_sentences,bleu,0.02224579465087946 +mlsum_es,median,bleu,0.026830705121606707 +movie_rationales_None,Evidences + review,accuracy,0.97 +movie_rationales_None,Evidences sentiment classification,accuracy,1.0 +movie_rationales_None,Standard binary sentiment 
analysis,accuracy,0.95 +movie_rationales_None,median,accuracy,0.97 +mwsc_None,in-the-sentence,accuracy,0.6219512195121951 +mwsc_None,in-the-sentence-question-first,accuracy,0.5853658536585366 +mwsc_None,is-correct,accuracy,0.5365853658536586 +mwsc_None,options-or,accuracy,0.6097560975609756 +mwsc_None,what-think,accuracy,0.6097560975609756 +mwsc_None,median,accuracy,0.6097560975609756 +onestop_english_None,ara_context,accuracy,0.3333333333333333 +onestop_english_None,assess,accuracy,0.3333333333333333 +onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.5696649029982364 +onestop_english_None,esl_context,accuracy,0.3333333333333333 +onestop_english_None,esl_variation,accuracy,0.3333333333333333 +onestop_english_None,median,accuracy,0.3333333333333333 +poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.22857142857142856 +poem_sentiment_None,most_appropriate_sentiment,accuracy,0.2571428571428571 +poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.2571428571428571 +poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.21904761904761905 +poem_sentiment_None,question_answer_format,accuracy,0.24761904761904763 +poem_sentiment_None,median,accuracy,0.24761904761904763 +pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.598 +pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.581 +pubmed_qa_pqa_labeled,median,accuracy,0.5894999999999999 +riddle_sense_None,answer_given_question_without_options,accuracy,0.4534769833496572 +riddle_sense_None,most_suitable_answer,accuracy,0.4348677766895201 +riddle_sense_None,question_answering,accuracy,0.4407443682664055 +riddle_sense_None,question_to_answer_index,accuracy,0.3878550440744368 +riddle_sense_None,median,accuracy,0.43780607247796277 +scicite_None,Classify intent,accuracy,0.15065502183406113 +scicite_None,Classify intent (choices first),accuracy,0.1331877729257642 +scicite_None,Classify intent (select 
choice),accuracy,0.2652838427947598 +scicite_None,Classify intent w/section (select choice),accuracy,0.3537117903930131 +scicite_None,can_describe,accuracy,0.15283842794759825 +scicite_None,median,accuracy,0.15283842794759825 +selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9121019108280255 +selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9171974522292994 +selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.7503184713375797 +selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.8993630573248408 +selqa_answer_selection_analysis,median,accuracy,0.9057324840764331 +snips_built_in_intents_None,categorize_query,accuracy,0.47865853658536583 +snips_built_in_intents_None,categorize_query_brief,accuracy,0.375 +snips_built_in_intents_None,intent_query,accuracy,0.31402439024390244 +snips_built_in_intents_None,query_intent,accuracy,0.7012195121951219 +snips_built_in_intents_None,voice_intent,accuracy,0.6128048780487805 +snips_built_in_intents_None,median,accuracy,0.47865853658536583 +wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.02125573406419127 +wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.015697853682886957 +wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.0037928468482204985 +wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.047885599586875285 +wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.021861712984543362 +wmt14_fr_en_en-fr,median,bleu,0.02125573406419127 +wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.3038834619016813 +wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.22361703612398195 +wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.17167001660570336 +wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.23925613843737142 +wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.1410190003658709 +wmt14_fr_en_fr-en,median,bleu,0.22361703612398195 +wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.0018051438917625368 +wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0018126292465026588 
+wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.00010782650615890081 +wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.0018585745110753149 +wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,2.225608801197892e-05 +wmt14_hi_en_en-hi,median,bleu,0.0018051438917625368 +wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.16056644593701627 +wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.1503249107946881 +wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,0.05607403962346587 +wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.15167071858881462 +wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.03675518735361532 +wmt14_hi_en_hi-en,median,bleu,0.1503249107946881 +multiple,average,multiple,0.42128315936464156 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/merged.json b/evaluation_bloomz-7b1-p3/evaluation_val/merged.json new file mode 100644 index 0000000..d8bc874 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/merged.json @@ -0,0 +1 @@ +{"amazon_reviews_multi_en": {"prompt_body_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.6176}, "template_name": "prompt_body_title_to_star"}, "prompt_review_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5592}, "template_name": "prompt_review_to_star"}, "prompt_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.3922}, "template_name": "prompt_title_to_star"}}, "amazon_reviews_multi_es": {"prompt_body_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5526}, "template_name": "prompt_body_title_to_star"}, "prompt_review_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5296}, "template_name": "prompt_review_to_star"}, "prompt_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.3646}, "template_name": "prompt_title_to_star"}}, "amazon_reviews_multi_fr": {"prompt_body_title_to_star": {"arguments": 
"Namespace(config_name=None, dataset_config_name='fr', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5332}, "template_name": "prompt_body_title_to_star"}, "prompt_review_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5182}, "template_name": "prompt_review_to_star"}, "prompt_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.3644}, "template_name": "prompt_title_to_star"}}, "amazon_reviews_multi_zh": {"prompt_body_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_body_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5174}, "template_name": "prompt_body_title_to_star"}, "prompt_review_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_review_to_star', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.5006}, "template_name": "prompt_review_to_star"}, "prompt_title_to_star": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='amazon_reviews_multi', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='prompt_title_to_star', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "amazon_reviews_multi", "evaluation": {"accuracy": 0.3874}, "template_name": "prompt_title_to_star"}}, "aqua_rat_raw": {"Answer questions from options": {"arguments": "Namespace(config_name=None, dataset_config_name='raw', dataset_name='aqua_rat', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer questions from options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "raw", "dataset_name": "aqua_rat", "evaluation": {"accuracy": 0.24015748031496062}, "template_name": "Answer questions from options"}, "answer_quiz": {"arguments": "Namespace(config_name=None, dataset_config_name='raw', dataset_name='aqua_rat', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='answer_quiz', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "raw", "dataset_name": "aqua_rat", "evaluation": {"accuracy": 0.22440944881889763}, "template_name": "answer_quiz"}, "select_the_best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='raw', dataset_name='aqua_rat', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='select_the_best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "raw", "dataset_name": "aqua_rat", "evaluation": {"accuracy": 0.2559055118110236}, "template_name": "select_the_best_option"}}, "art_None": {"choose_hypothesis": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, 
template_name='choose_hypothesis', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "art", "evaluation": {"accuracy": 0.5926892950391645}, "template_name": "choose_hypothesis"}, "choose_hypothesis_believable": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_believable', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "art", "evaluation": {"accuracy": 0.5711488250652742}, "template_name": "choose_hypothesis_believable"}, "choose_hypothesis_desc": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_desc', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "art", "evaluation": {"accuracy": 0.5169712793733682}, "template_name": "choose_hypothesis_desc"}, "choose_hypothesis_likely": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_likely', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "art", "evaluation": {"accuracy": 0.5300261096605744}, "template_name": "choose_hypothesis_likely"}, "choose_hypothesis_options": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='art', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='choose_hypothesis_options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "art", "evaluation": {"accuracy": 0.5672323759791122}, "template_name": "choose_hypothesis_options"}}, "banking77_None": {"direct_to_which_department": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='banking77', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, 
template_config_name=None, template_name='direct_to_which_department', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "banking77", "evaluation": {"accuracy": 0.16753246753246753}, "template_name": "direct_to_which_department"}, "help_page_topic": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='banking77', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='help_page_topic', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "banking77", "evaluation": {"accuracy": 0.26785714285714285}, "template_name": "help_page_topic"}, "rephrase_as_banking_term": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='banking77', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='rephrase_as_banking_term', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "banking77", "evaluation": {"accuracy": 0.274025974025974}, "template_name": "rephrase_as_banking_term"}}, "blbooksgenre_title_genre_classifiction": {"classify": {"arguments": "Namespace(config_name=None, dataset_config_name='title_genre_classifiction', 
dataset_name='blbooksgenre', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='classify', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "title_genre_classifiction", "dataset_name": "blbooksgenre", "evaluation": {"accuracy": 0.25057603686635943}, "template_name": "classify"}, "multi-choice": {"arguments": "Namespace(config_name=None, dataset_config_name='title_genre_classifiction', dataset_name='blbooksgenre', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='multi-choice', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "title_genre_classifiction", "dataset_name": "blbooksgenre", "evaluation": {"accuracy": 0.25057603686635943}, "template_name": "multi-choice"}, "premise_context_first": {"arguments": "Namespace(config_name=None, dataset_config_name='title_genre_classifiction', dataset_name='blbooksgenre', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='premise_context_first', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "title_genre_classifiction", "dataset_name": "blbooksgenre", "evaluation": {"accuracy": 0.7321428571428571}, "template_name": "premise_context_first"}}, "blimp_adjunct_island": {"grammatical_between_1_2": {"arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='grammatical_between_1_2', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "adjunct_island", "dataset_name": "blimp", "evaluation": {"accuracy": 0.512}, "template_name": "grammatical_between_1_2"}, "grammatical_between_A_B": {"arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='grammatical_between_A_B', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "adjunct_island", "dataset_name": "blimp", "evaluation": {"accuracy": 0.464}, "template_name": "grammatical_between_A_B"}, 
"grammatical_which_one_1_2": {"arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='grammatical_which_one_1_2', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "adjunct_island", "dataset_name": "blimp", "evaluation": {"accuracy": 0.512}, "template_name": "grammatical_which_one_1_2"}, "single_sentence_bad_yes_no": {"arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='single_sentence_bad_yes_no', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "adjunct_island", "dataset_name": "blimp", "evaluation": {"accuracy": 0.52}, "template_name": "single_sentence_bad_yes_no"}, "single_sentence_good_yes_no": {"arguments": "Namespace(config_name=None, dataset_config_name='adjunct_island', dataset_name='blimp', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='single_sentence_good_yes_no', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "adjunct_island", "dataset_name": "blimp", "evaluation": {"accuracy": 0.493}, "template_name": "single_sentence_good_yes_no"}}, "climate_fever_None": {"claim_and_all_supporting_evidences": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='claim_and_all_supporting_evidences', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "climate_fever", "evaluation": {"accuracy": 0.3166123778501629}, "template_name": "claim_and_all_supporting_evidences"}, "fifth_evidence_and_claim_itemization": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='fifth_evidence_and_claim_itemization', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "climate_fever", "evaluation": {"accuracy": 0.4749185667752443}, "template_name": "fifth_evidence_and_claim_itemization"}, "first_evidence_and_claim_itemization": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='first_evidence_and_claim_itemization', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "climate_fever", "evaluation": {"accuracy": 0.22996742671009773}, "template_name": "first_evidence_and_claim_itemization"}, "second_evidence_and_claim_itemization": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='second_evidence_and_claim_itemization', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "climate_fever", "evaluation": {"accuracy": 0.24625407166123778}, "template_name": "second_evidence_and_claim_itemization"}, "third_evidence_claim_pair": {"arguments": "Namespace(config_name=None, 
dataset_config_name=None, dataset_name='climate_fever', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='third_evidence_claim_pair', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "climate_fever", "evaluation": {"accuracy": 0.24234527687296417}, "template_name": "third_evidence_claim_pair"}}, "codah_codah": {"affirmative_instruction_after_sentence_and_choices": {"arguments": "Namespace(config_name=None, dataset_config_name='codah', dataset_name='codah', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='affirmative_instruction_after_sentence_and_choices', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "codah", "dataset_name": "codah", "evaluation": {"accuracy": 0.6693083573487032}, "template_name": "affirmative_instruction_after_sentence_and_choices"}, "affirmative_instruction_before_sentence_and_choices": {"arguments": "Namespace(config_name=None, dataset_config_name='codah', dataset_name='codah', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='affirmative_instruction_before_sentence_and_choices', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "codah", "dataset_name": "codah", "evaluation": {"accuracy": 0.6509365994236311}, "template_name": "affirmative_instruction_before_sentence_and_choices"}, "interrogative_instruction_after_sentence_and_choices": {"arguments": "Namespace(config_name=None, dataset_config_name='codah', dataset_name='codah', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='interrogative_instruction_after_sentence_and_choices', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "codah", "dataset_name": "codah", "evaluation": {"accuracy": 0.6761527377521613}, "template_name": "interrogative_instruction_after_sentence_and_choices"}}, "commonsense_qa_None": {"answer_given_question_without_options": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='commonsense_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name=None, template_name='answer_given_question_without_options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "commonsense_qa", "evaluation": {"accuracy": 0.6388206388206388}, "template_name": "answer_given_question_without_options"}, "most_suitable_answer": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='commonsense_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='most_suitable_answer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "commonsense_qa", "evaluation": {"accuracy": 0.7313677313677314}, "template_name": "most_suitable_answer"}, "question_answering": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='commonsense_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_answering', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "commonsense_qa", "evaluation": {"accuracy": 0.7158067158067158}, "template_name": "question_answering"}}, "conv_ai_3_None": {"ambiguous": {"arguments": "Namespace(config_name=None, 
dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='ambiguous', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "conv_ai_3", "evaluation": {"accuracy": 0.39040207522697795}, "template_name": "ambiguous"}, "clarification_needed": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='clarification_needed', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "conv_ai_3", "evaluation": {"accuracy": 0.39040207522697795}, "template_name": "clarification_needed"}, "directly_answer": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, 
split='validation', target_max_length=256, template_config_name=None, template_name='directly_answer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "conv_ai_3", "evaluation": {"accuracy": 0.6095979247730221}, "template_name": "directly_answer"}, "score_give_number": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='score_give_number', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "conv_ai_3", "evaluation": {"accuracy": 0.057933419801124084}, "template_name": "score_give_number"}, "score_how_much": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='conv_ai_3', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='score_how_much', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "conv_ai_3", "evaluation": {"accuracy": 0.010376134889753566}, "template_name": "score_how_much"}}, "craigslist_bargains_None": {"best deal": {"arguments": "Namespace(config_name=None, dataset_config_name=None, 
dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best deal', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "craigslist_bargains", "evaluation": {"accuracy": 0.5192629815745393}, "template_name": "best deal"}, "good deal for seller": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='good deal for seller', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "craigslist_bargains", "evaluation": {"accuracy": 0.2529313232830821}, "template_name": "good deal for seller"}, "good deal for seller no list price": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='good deal for seller no list price', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "craigslist_bargains", "evaluation": {"accuracy": 0.09715242881072027}, "template_name": "good deal for seller no list price"}, "good deal for seller no list price implicit": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='craigslist_bargains', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='good deal for seller no list price implicit', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "craigslist_bargains", "evaluation": {"accuracy": 0.24623115577889448}, "template_name": "good deal for seller no list price implicit"}}, "emotion_None": {"answer_question_with_emotion_label": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='answer_question_with_emotion_label', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, 
"dataset_name": "emotion", "evaluation": {"accuracy": 0.3375}, "template_name": "answer_question_with_emotion_label"}, "answer_with_class_label": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='answer_with_class_label', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "emotion", "evaluation": {"accuracy": 0.214}, "template_name": "answer_with_class_label"}, "choose_the_best_emotion_label": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='choose_the_best_emotion_label', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "emotion", "evaluation": {"accuracy": 0.312}, "template_name": "choose_the_best_emotion_label"}, "reply_with_emoation_label": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='emotion', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='reply_with_emoation_label', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "emotion", "evaluation": {"accuracy": 0.4495}, "template_name": "reply_with_emoation_label"}}, "financial_phrasebank_sentences_allagree": {"bullish_neutral_bearish": {"arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='bullish_neutral_bearish', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sentences_allagree", "dataset_name": "financial_phrasebank", "evaluation": {"accuracy": 0.3878091872791519}, "template_name": "bullish_neutral_bearish"}, "complementary_industries": {"arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, 
template_name='complementary_industries', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sentences_allagree", "dataset_name": "financial_phrasebank", "evaluation": {"accuracy": 0.10114840989399293}, "template_name": "complementary_industries"}, "sentiment": {"arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='sentiment', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sentences_allagree", "dataset_name": "financial_phrasebank", "evaluation": {"accuracy": 0.35644876325088337}, "template_name": "sentiment"}, "share_price_option": {"arguments": "Namespace(config_name=None, dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='share_price_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sentences_allagree", "dataset_name": "financial_phrasebank", "evaluation": {"accuracy": 0.3670494699646643}, "template_name": "share_price_option"}, "word_comes_to_mind": {"arguments": "Namespace(config_name=None, 
dataset_config_name='sentences_allagree', dataset_name='financial_phrasebank', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='word_comes_to_mind', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sentences_allagree", "dataset_name": "financial_phrasebank", "evaluation": {"accuracy": 0.08259717314487633}, "template_name": "word_comes_to_mind"}}, "glue_cola": {"Following sentence acceptable": {"arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Following sentence acceptable', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cola", "dataset_name": "glue", "evaluation": {"accuracy": 0.37583892617449666}, "template_name": "Following sentence acceptable"}, "Make sense yes no": {"arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Make sense yes no', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cola", "dataset_name": "glue", "evaluation": {"accuracy": 0.33940556088207097}, "template_name": "Make sense yes no"}, "Previous sentence acceptable": {"arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Previous sentence acceptable', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cola", "dataset_name": "glue", "evaluation": {"accuracy": 0.31255992329817833}, "template_name": "Previous sentence acceptable"}, "editing": {"arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='editing', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cola", "dataset_name": "glue", "evaluation": 
{"accuracy": 0.3844678811121764}, "template_name": "editing"}, "is_this_correct": {"arguments": "Namespace(config_name=None, dataset_config_name='cola', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_this_correct', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cola", "dataset_name": "glue", "evaluation": {"accuracy": 0.37775647171620325}, "template_name": "is_this_correct"}}, "glue_sst2": {"following positive negative": {"arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='following positive negative', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sst2", "dataset_name": "glue", "evaluation": {"accuracy": 0.9426605504587156}, "template_name": "following positive negative"}, "happy or mad": {"arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='happy or mad', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sst2", "dataset_name": "glue", "evaluation": {"accuracy": 0.8279816513761468}, "template_name": "happy or mad"}, "positive negative after": {"arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='positive negative after', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sst2", "dataset_name": "glue", "evaluation": {"accuracy": 0.9472477064220184}, "template_name": "positive negative after"}, "review": {"arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='review', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sst2", "dataset_name": "glue", "evaluation": {"accuracy": 0.9254587155963303}, 
"template_name": "review"}, "said": {"arguments": "Namespace(config_name=None, dataset_config_name='sst2', dataset_name='glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='said', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sst2", "dataset_name": "glue", "evaluation": {"accuracy": 0.9059633027522935}, "template_name": "said"}}, "head_qa_en": {"multiple_choice_a_and_q_en": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.29428989751098095}, "template_name": "multiple_choice_a_and_q_en"}, "multiple_choice_a_and_q_with_context_en": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.29502196193265007}, "template_name": "multiple_choice_a_and_q_with_context_en"}, "multiple_choice_q_and_a_en": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.3938506588579795}, "template_name": "multiple_choice_q_and_a_en"}, "multiple_choice_q_and_a_index_en": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_en', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.30307467057101023}, "template_name": "multiple_choice_q_and_a_index_en"}, "multiple_choice_q_and_a_index_with_context_en": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.30234260614934116}, "template_name": "multiple_choice_q_and_a_index_with_context_en"}}, "head_qa_es": {"multiple_choice_a_and_q_en": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.2730600292825769}, "template_name": "multiple_choice_a_and_q_en"}, "multiple_choice_a_and_q_with_context_en": {"arguments": "Namespace(config_name=None, dataset_config_name='es', 
dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_a_and_q_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.27232796486090777}, "template_name": "multiple_choice_a_and_q_with_context_en"}, "multiple_choice_q_and_a_en": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.36530014641288433}, "template_name": "multiple_choice_q_and_a_en"}, "multiple_choice_q_and_a_index_en": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.3074670571010249}, "template_name": "multiple_choice_q_and_a_index_en"}, "multiple_choice_q_and_a_index_with_context_en": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='head_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='multiple_choice_q_and_a_index_with_context_en', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "head_qa", "evaluation": {"accuracy": 0.3089311859443631}, "template_name": "multiple_choice_q_and_a_index_with_context_en"}}, "health_fact_None": {"claim_explanation_classification": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='health_fact', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='claim_explanation_classification', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "health_fact", 
"evaluation": {"accuracy": 0.5591836734693878}, "template_name": "claim_explanation_classification"}, "claim_veracity_classification_after_reading_I_believe": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='health_fact', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='claim_veracity_classification_after_reading_I_believe', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "health_fact", "evaluation": {"accuracy": 0.34938775510204084}, "template_name": "claim_veracity_classification_after_reading_I_believe"}, "claim_veracity_classification_tell_me": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='health_fact', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='claim_veracity_classification_tell_me', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "health_fact", "evaluation": {"accuracy": 0.48244897959183675}, "template_name": "claim_veracity_classification_tell_me"}}, "hlgd_None": {"is_same_event_editor_asks": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, 
dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_editor_asks', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "hlgd", "evaluation": {"accuracy": 0.6926051232479459}, "template_name": "is_same_event_editor_asks"}, "is_same_event_interrogative_talk": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_interrogative_talk', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "hlgd", "evaluation": {"accuracy": 0.6582890285161914}, "template_name": "is_same_event_interrogative_talk"}, "is_same_event_refer": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name=None, template_name='is_same_event_refer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "hlgd", "evaluation": {"accuracy": 0.7858869018849686}, "template_name": "is_same_event_refer"}, "is_same_event_with_time_interrogative_related": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_with_time_interrogative_related', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "hlgd", "evaluation": {"accuracy": 0.7839536007733204}, "template_name": "is_same_event_with_time_interrogative_related"}, "is_same_event_with_time_interrogative_talk": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='hlgd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is_same_event_with_time_interrogative_talk', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "hlgd", "evaluation": {"accuracy": 0.7786370227162881}, "template_name": "is_same_event_with_time_interrogative_talk"}}, 
"hyperpartisan_news_detection_byarticle": {"consider_does_it_follow_a_hyperpartisan_argumentation": {"arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='consider_does_it_follow_a_hyperpartisan_argumentation', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "byarticle", "dataset_name": "hyperpartisan_news_detection", "evaluation": {"accuracy": 0.6232558139534884}, "template_name": "consider_does_it_follow_a_hyperpartisan_argumentation"}, "consider_it_exhibits_extreme_one_sidedness": {"arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='consider_it_exhibits_extreme_one_sidedness', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "byarticle", "dataset_name": "hyperpartisan_news_detection", "evaluation": {"accuracy": 0.6310077519379845}, "template_name": "consider_it_exhibits_extreme_one_sidedness"}, "consume_with_caution": {"arguments": "Namespace(config_name=None, dataset_config_name='byarticle', 
dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='consume_with_caution', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "byarticle", "dataset_name": "hyperpartisan_news_detection", "evaluation": {"accuracy": 0.6294573643410852}, "template_name": "consume_with_caution"}, "extreme_left_wing_or_right_wing": {"arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='extreme_left_wing_or_right_wing', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "byarticle", "dataset_name": "hyperpartisan_news_detection", "evaluation": {"accuracy": 0.6077519379844961}, "template_name": "extreme_left_wing_or_right_wing"}, "follows_hyperpartisan_argumentation": {"arguments": "Namespace(config_name=None, dataset_config_name='byarticle', dataset_name='hyperpartisan_news_detection', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='follows_hyperpartisan_argumentation', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "byarticle", "dataset_name": "hyperpartisan_news_detection", "evaluation": {"accuracy": 0.627906976744186}, "template_name": "follows_hyperpartisan_argumentation"}}, "liar_None": {"Given statement guess category": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='liar', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Given statement guess category', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "liar", "evaluation": {"accuracy": 0.19314641744548286}, "template_name": "Given statement guess category"}}, "lince_sa_spaeng": {"express sentiment": {"arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='express sentiment', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sa_spaeng", "dataset_name": "lince", "evaluation": {"accuracy": 0.5696611081226466}, "template_name": "express sentiment"}, "negation template": {"arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='negation template', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sa_spaeng", "dataset_name": "lince", "evaluation": {"accuracy": 0.3851533082302313}, "template_name": "negation template"}, "original poster expressed sentiment": {"arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='original poster expressed sentiment', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sa_spaeng", "dataset_name": "lince", "evaluation": {"accuracy": 0.5841850457235073}, "template_name": "original poster expressed sentiment"}, "sentiment trying to express": {"arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', 
max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='sentiment trying to express', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sa_spaeng", "dataset_name": "lince", "evaluation": {"accuracy": 0.5809575040344271}, "template_name": "sentiment trying to express"}, "the author seem": {"arguments": "Namespace(config_name=None, dataset_config_name='sa_spaeng', dataset_name='lince', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='the author seem', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sa_spaeng", "dataset_name": "lince", "evaluation": {"accuracy": 0.5771920387305003}, "template_name": "the author seem"}}, "math_qa_None": {"choose_correct_og": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, 
template_config_name=None, template_name='choose_correct_og', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "math_qa", "evaluation": {"accuracy": 0.23484087102177553}, "template_name": "choose_correct_og"}, "first_choice_then_problem": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='first_choice_then_problem', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "math_qa", "evaluation": {"accuracy": 0.2254606365159129}, "template_name": "first_choice_then_problem"}, "gre_problem": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='gre_problem', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "math_qa", "evaluation": {"accuracy": 0.21943048576214405}, "template_name": "gre_problem"}, "pick_the_correct": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='pick_the_correct', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "math_qa", "evaluation": {"accuracy": 0.2338358458961474}, "template_name": "pick_the_correct"}, "problem_set_type": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='math_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='test', target_max_length=256, template_config_name=None, template_name='problem_set_type', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "math_qa", "evaluation": {"accuracy": 0.29246231155778896}, "template_name": "problem_set_type"}}, "mlsum_es": {"layman_summ_es": {"bleu": 2.6830705121606706, "bleu_stderr": 0.14257713719805254, "rouge1_fmeasure": 0.19665942356583802, "rouge1_fmeasure_stderr": 0.00245186970283176, "rouge1_precision": 0.19994210865731296, "rouge1_precision_stderr": 0.0026547454621461738, "rouge1_recall": 0.22882499765155356, "rouge1_recall_stderr": 0.0030837265632016487, "rouge2_fmeasure": 0.0515077211753521, "rouge2_fmeasure_stderr": 0.0015699354248315028, "rouge2_precision": 0.05189155779128239, "rouge2_precision_stderr": 0.0016867882237885771, "rouge2_recall": 0.06074745104675877, "rouge2_recall_stderr": 
0.0018807564961523813, "rougeL_fmeasure": 0.14897381009906005, "rougeL_fmeasure_stderr": 0.0019744371501868186, "rougeL_precision": 0.15286714092332523, "rougeL_precision_stderr": 0.0021813478665272707, "rougeL_recall": 0.17277746632777954, "rougeL_recall_stderr": 0.002444939812221139, "rougeLsum_fmeasure": 0.15454418245332874, "rougeLsum_fmeasure_stderr": 0.0020344070829227297, "rougeLsum_precision": 0.15829244217518917, "rougeLsum_precision_stderr": 0.002241835408305656, "rougeLsum_recall": 0.17998021570910885, "rougeLsum_recall_stderr": 0.002565278973604084}, "palm_prompt": {"bleu": 3.341310161344892, "bleu_stderr": 0.12383760876849086, "rouge1_fmeasure": 0.23872886986952627, "rouge1_fmeasure_stderr": 0.0021217578248352883, "rouge1_precision": 0.2238936517609025, "rouge1_precision_stderr": 0.002299862104308459, "rouge1_recall": 0.31290976115097796, "rouge1_recall_stderr": 0.003157461966656448, "rouge2_fmeasure": 0.06770985280514573, "rouge2_fmeasure_stderr": 0.0016286057939871985, "rouge2_precision": 0.06209477646349353, "rouge2_precision_stderr": 0.001551591733457607, "rouge2_recall": 0.09261450488619867, "rouge2_recall_stderr": 0.002338770523626696, "rougeL_fmeasure": 0.17833059997868725, "rougeL_fmeasure_stderr": 0.0017648367718678965, "rougeL_precision": 0.16744902972078152, "rougeL_precision_stderr": 0.001881884121187265, "rougeL_recall": 0.23426964040901505, "rougeL_recall_stderr": 0.0025983311247854634, "rougeLsum_fmeasure": 0.1876134227034203, "rougeLsum_fmeasure_stderr": 0.0018064283175187946, "rougeLsum_precision": 0.1755707446810662, "rougeLsum_precision_stderr": 0.0019188012583382194, "rougeLsum_recall": 0.24827775226125046, "rougeLsum_recall_stderr": 0.0027727938693488186}, "summarise_this_in_es_few_sentences": {"bleu": 2.2245794650879462, "bleu_stderr": 0.07743700029169612, "rouge1_fmeasure": 0.21126423815884174, "rouge1_fmeasure_stderr": 0.0021771710222460634, "rouge1_precision": 0.18419910608261986, "rouge1_precision_stderr": 
0.002301564923577535, "rouge1_recall": 0.33528109600140793, "rouge1_recall_stderr": 0.0034321373331462294, "rouge2_fmeasure": 0.05913033007358818, "rouge2_fmeasure_stderr": 0.0014890254374386052, "rouge2_precision": 0.051112897675373886, "rouge2_precision_stderr": 0.0014393881241720322, "rouge2_recall": 0.09793074579590116, "rouge2_recall_stderr": 0.0024100392963833633, "rougeL_fmeasure": 0.15711042852214044, "rougeL_fmeasure_stderr": 0.0017251643310554304, "rougeL_precision": 0.13714024915254835, "rougeL_precision_stderr": 0.0018260510947169805, "rougeL_recall": 0.2521886801730905, "rougeL_recall_stderr": 0.0027817174072391373, "rougeLsum_fmeasure": 0.1670307582655529, "rougeLsum_fmeasure_stderr": 0.0018288687884684008, "rougeLsum_precision": 0.14518990658432604, "rougeLsum_precision_stderr": 0.001900911801455617, "rougeLsum_recall": 0.26906405645015485, "rougeLsum_recall_stderr": 0.0030454611701701234}}, "movie_rationales_None": {"Evidences + review": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='movie_rationales', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Evidences + review', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "movie_rationales", "evaluation": {"accuracy": 0.97}, "template_name": "Evidences + review"}, "Evidences sentiment classification": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='movie_rationales', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Evidences sentiment classification', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "movie_rationales", "evaluation": {"accuracy": 1.0}, "template_name": "Evidences sentiment classification"}, "Standard binary sentiment analysis": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='movie_rationales', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Standard binary sentiment analysis', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "movie_rationales", "evaluation": {"accuracy": 0.95}, "template_name": "Standard binary sentiment analysis"}}, "mwsc_None": {"in-the-sentence": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name=None, template_name='in-the-sentence', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "mwsc", "evaluation": {"accuracy": 0.6219512195121951}, "template_name": "in-the-sentence"}, "in-the-sentence-question-first": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='in-the-sentence-question-first', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "mwsc", "evaluation": {"accuracy": 0.5853658536585366}, "template_name": "in-the-sentence-question-first"}, "is-correct": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is-correct', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "mwsc", "evaluation": {"accuracy": 0.5365853658536586}, "template_name": "is-correct"}, "options-or": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='options-or', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "mwsc", "evaluation": {"accuracy": 0.6097560975609756}, "template_name": "options-or"}, "what-think": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='what-think', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "mwsc", "evaluation": {"accuracy": 0.6097560975609756}, "template_name": "what-think"}}, "onestop_english_None": {"ara_context": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='ara_context', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "onestop_english", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "ara_context"}, "assess": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='assess', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "onestop_english", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "assess"}, "determine_reading_level_from_the_first_three_sentences": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='determine_reading_level_from_the_first_three_sentences', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "onestop_english", "evaluation": {"accuracy": 0.5696649029982364}, "template_name": "determine_reading_level_from_the_first_three_sentences"}, "esl_context": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='esl_context', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "onestop_english", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "esl_context"}, "esl_variation": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='esl_variation', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "onestop_english", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "esl_variation"}}, "poem_sentiment_None": {"guess_sentiment_without_options_variation_1": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, 
template_name='guess_sentiment_without_options_variation_1', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "poem_sentiment", "evaluation": {"accuracy": 0.22857142857142856}, "template_name": "guess_sentiment_without_options_variation_1"}, "most_appropriate_sentiment": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='most_appropriate_sentiment', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "poem_sentiment", "evaluation": {"accuracy": 0.2571428571428571}, "template_name": "most_appropriate_sentiment"}, "positive_or_negative_sentiment_variation_1": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='positive_or_negative_sentiment_variation_1', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "poem_sentiment", "evaluation": {"accuracy": 0.2571428571428571}, "template_name": "positive_or_negative_sentiment_variation_1"}, 
"positive_or_negative_sentiment_variation_2": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='positive_or_negative_sentiment_variation_2', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "poem_sentiment", "evaluation": {"accuracy": 0.21904761904761905}, "template_name": "positive_or_negative_sentiment_variation_2"}, "question_answer_format": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_answer_format', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "poem_sentiment", "evaluation": {"accuracy": 0.24761904761904763}, "template_name": "question_answer_format"}}, "pubmed_qa_pqa_labeled": {"Long Answer to Final Decision": {"arguments": "Namespace(config_name=None, dataset_config_name='pqa_labeled', dataset_name='pubmed_qa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='Long Answer to Final Decision', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pqa_labeled", "dataset_name": "pubmed_qa", "evaluation": {"accuracy": 0.598}, "template_name": "Long Answer to Final Decision"}, "Question Answering (Short)": {"arguments": "Namespace(config_name=None, dataset_config_name='pqa_labeled', dataset_name='pubmed_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='Question Answering (Short)', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pqa_labeled", "dataset_name": "pubmed_qa", "evaluation": {"accuracy": 0.581}, "template_name": "Question Answering (Short)"}}, "riddle_sense_None": {"answer_given_question_without_options": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name=None, template_name='answer_given_question_without_options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "riddle_sense", "evaluation": {"accuracy": 0.4534769833496572}, "template_name": "answer_given_question_without_options"}, "most_suitable_answer": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='most_suitable_answer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "riddle_sense", "evaluation": {"accuracy": 0.4348677766895201}, "template_name": "most_suitable_answer"}, "question_answering": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_answering', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "riddle_sense", "evaluation": {"accuracy": 0.4407443682664055}, "template_name": "question_answering"}, "question_to_answer_index": {"arguments": "Namespace(config_name=None, 
dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_to_answer_index', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "riddle_sense", "evaluation": {"accuracy": 0.3878550440744368}, "template_name": "question_to_answer_index"}}, "scicite_None": {"Classify intent": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "scicite", "evaluation": {"accuracy": 0.15065502183406113}, "template_name": "Classify intent"}, "Classify intent (choices first)": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent (choices first)', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "scicite", "evaluation": {"accuracy": 0.1331877729257642}, "template_name": "Classify intent (choices first)"}, "Classify intent (select choice)": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent (select choice)', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "scicite", "evaluation": {"accuracy": 0.2652838427947598}, "template_name": "Classify intent (select choice)"}, "Classify intent w/section (select choice)": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent w/section (select choice)', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "scicite", "evaluation": {"accuracy": 0.3537117903930131}, "template_name": 
"Classify intent w/section (select choice)"}, "can_describe": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can_describe', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "scicite", "evaluation": {"accuracy": 0.15283842794759825}, "template_name": "can_describe"}}, "selqa_answer_selection_analysis": {"is-he-talking-about": {"arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is-he-talking-about', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "answer_selection_analysis", "dataset_name": "selqa", "evaluation": {"accuracy": 0.9121019108280255}, "template_name": "is-he-talking-about"}, "make-sense-rand": {"arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='make-sense-rand', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "answer_selection_analysis", "dataset_name": "selqa", "evaluation": {"accuracy": 0.9171974522292994}, "template_name": "make-sense-rand"}, "which-answer-1st-vs-random": {"arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='which-answer-1st-vs-random', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "answer_selection_analysis", "dataset_name": "selqa", "evaluation": {"accuracy": 0.7503184713375797}, "template_name": "which-answer-1st-vs-random"}, "would-make-sense-qu-rand": {"arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='would-make-sense-qu-rand', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "answer_selection_analysis", "dataset_name": "selqa", "evaluation": {"accuracy": 0.8993630573248408}, "template_name": "would-make-sense-qu-rand"}}, "snips_built_in_intents_None": {"categorize_query": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='categorize_query', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "snips_built_in_intents", "evaluation": {"accuracy": 0.47865853658536583}, "template_name": "categorize_query"}, "categorize_query_brief": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='categorize_query_brief', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "snips_built_in_intents", "evaluation": {"accuracy": 0.375}, "template_name": "categorize_query_brief"}, "intent_query": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, 
dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='intent_query', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "snips_built_in_intents", "evaluation": {"accuracy": 0.31402439024390244}, "template_name": "intent_query"}, "query_intent": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='query_intent', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "snips_built_in_intents", "evaluation": {"accuracy": 0.7012195121951219}, "template_name": "query_intent"}, "voice_intent": {"arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, 
template_config_name=None, template_name='voice_intent', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": null, "dataset_name": "snips_built_in_intents", "evaluation": {"accuracy": 0.6128048780487805}, "template_name": "voice_intent"}}, "wmt14_fr_en_en-fr": {"a_good_translation-en-fr-source+target": {"bleu": 2.125573406419127, "bleu_stderr": 0.09981676122698169}, "a_good_translation-en-fr-target": {"bleu": 1.5697853682886957, "bleu_stderr": 0.10176333685236229}, "gpt3-en-fr": {"bleu": 0.37928468482204986, "bleu_stderr": 0.03833854862936989}, "version-en-fr-target": {"bleu": 4.788559958687529, "bleu_stderr": 0.12647149552786194}, "xglm-en-fr-target": {"bleu": 2.186171298454336, "bleu_stderr": 0.09641163271059554}}, "wmt14_fr_en_fr-en": {"a_good_translation-fr-en-source+target": {"bleu": 30.388346190168132, "bleu_stderr": 0.28706919566129924}, "a_good_translation-fr-en-target": {"bleu": 22.361703612398195, "bleu_stderr": 0.43872418791072576}, "gpt3-fr-en": {"bleu": 17.167001660570335, "bleu_stderr": 0.3999014258297822}, "version-fr-en-target": {"bleu": 23.925613843737143, "bleu_stderr": 0.2692548707999714}, "xglm-fr-en-target": {"bleu": 14.10190003658709, "bleu_stderr": 0.1974741324240151}}, "wmt14_hi_en_en-hi": {"a_good_translation-en-hi-source+target": {"bleu": 0.18051438917625368, "bleu_stderr": 0.03338441915097909}, "a_good_translation-en-hi-target": {"bleu": 0.1812629246502659, "bleu_stderr": 0.04198901460363051}, "gpt-3-en-hi-target": {"bleu": 0.010782650615890082, "bleu_stderr": 0.003615918191553956}, "version-en-hi-target": {"bleu": 0.1858574511075315, "bleu_stderr": 0.029122685049572238}, "xglm-en-hi-target": {"bleu": 0.002225608801197892, "bleu_stderr": 0.0005988947090265846}}, "wmt14_hi_en_hi-en": {"a_good_translation-hi-en-source+target": {"bleu": 16.056644593701627, "bleu_stderr": 0.2809620281933667}, "a_good_translation-hi-en-target": {"bleu": 15.032491079468809, "bleu_stderr": 0.2628594862835867}, "gpt-3-hi-en-target": {"bleu": 
5.607403962346587, "bleu_stderr": 0.26092845447942553}, "version-hi-en-target": {"bleu": 15.167071858881462, "bleu_stderr": 0.2573529636593602}, "xglm-hi-en-target": {"bleu": 3.675518735361532, "bleu_stderr": 0.17101231729659816}}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_+_review/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_+_review/results.json new file mode 100644 index 0000000..f11a644 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_+_review/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "movie_rationales", + "dataset_config_name": null, + "template_name": "Evidences + review", + "evaluation": { + "accuracy": 0.97 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='movie_rationales', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Evidences + review', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_sentiment_classification/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_sentiment_classification/results.json new file mode 100644 index 0000000..86bfa54 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Evidences_sentiment_classification/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "movie_rationales", + "dataset_config_name": null, + "template_name": "Evidences sentiment classification", + "evaluation": { + 
"accuracy": 1.0 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='movie_rationales', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Evidences sentiment classification', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Standard_binary_sentiment_analysis/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Standard_binary_sentiment_analysis/results.json new file mode 100644 index 0000000..3cc8115 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/movie_rationales/Standard_binary_sentiment_analysis/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "movie_rationales", + "dataset_config_name": null, + "template_name": "Standard binary sentiment analysis", + "evaluation": { + "accuracy": 0.95 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='movie_rationales', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Standard binary sentiment analysis', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence-question-first/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence-question-first/results.json new file mode 100644 index 0000000..5d8199c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence-question-first/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "mwsc", + "dataset_config_name": null, + "template_name": "in-the-sentence-question-first", + "evaluation": { + "accuracy": 0.5853658536585366 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='in-the-sentence-question-first', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence/results.json new file mode 100644 index 0000000..9387ea6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/in-the-sentence/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "mwsc", + "dataset_config_name": null, + "template_name": "in-the-sentence", + "evaluation": { + "accuracy": 0.6219512195121951 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='in-the-sentence', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/is-correct/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/is-correct/results.json new file mode 100644 index 0000000..12cdade --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/is-correct/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "mwsc", + "dataset_config_name": null, + "template_name": "is-correct", + "evaluation": { + "accuracy": 0.5365853658536586 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is-correct', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/options-or/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/options-or/results.json new file mode 100644 index 0000000..3ad0826 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/options-or/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "mwsc", + "dataset_config_name": null, + "template_name": "options-or", + "evaluation": { + "accuracy": 0.6097560975609756 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, 
dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='options-or', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/what-think/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/what-think/results.json new file mode 100644 index 0000000..c36924b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/mwsc/what-think/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "mwsc", + "dataset_config_name": null, + "template_name": "what-think", + "evaluation": { + "accuracy": 0.6097560975609756 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='mwsc', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='what-think', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/ara_context/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/ara_context/results.json new file mode 100644 index 0000000..29fd3a9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/ara_context/results.json @@ -0,0 +1,9 @@ +{ + 
"dataset_name": "onestop_english", + "dataset_config_name": null, + "template_name": "ara_context", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='ara_context', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/assess/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/assess/results.json new file mode 100644 index 0000000..bd92819 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/assess/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "onestop_english", + "dataset_config_name": null, + "template_name": "assess", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='assess', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/determine_reading_level_from_the_first_three_sentences/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/determine_reading_level_from_the_first_three_sentences/results.json new file mode 100644 index 0000000..f934d0f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/determine_reading_level_from_the_first_three_sentences/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "onestop_english", + "dataset_config_name": null, + "template_name": "determine_reading_level_from_the_first_three_sentences", + "evaluation": { + "accuracy": 0.5696649029982364 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='determine_reading_level_from_the_first_three_sentences', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_context/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_context/results.json new file mode 100644 index 0000000..42cab1f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_context/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "onestop_english", + "dataset_config_name": null, + "template_name": "esl_context", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='esl_context', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_variation/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_variation/results.json new file mode 100644 index 0000000..6b26581 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/onestop_english/esl_variation/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "onestop_english", + "dataset_config_name": null, + "template_name": "esl_variation", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='onestop_english', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='esl_variation', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/guess_sentiment_without_options_variation_1/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/guess_sentiment_without_options_variation_1/results.json new file mode 100644 index 0000000..0c773ce --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/guess_sentiment_without_options_variation_1/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "poem_sentiment", + "dataset_config_name": null, + "template_name": "guess_sentiment_without_options_variation_1", + "evaluation": { + "accuracy": 0.22857142857142856 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guess_sentiment_without_options_variation_1', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/most_appropriate_sentiment/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/most_appropriate_sentiment/results.json new file mode 100644 index 0000000..ffc4203 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/most_appropriate_sentiment/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "poem_sentiment", + "dataset_config_name": null, + "template_name": "most_appropriate_sentiment", + "evaluation": { + "accuracy": 0.2571428571428571 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='most_appropriate_sentiment', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_1/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_1/results.json new file mode 100644 index 0000000..ecfd8ac --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_1/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "poem_sentiment", + "dataset_config_name": null, + "template_name": "positive_or_negative_sentiment_variation_1", + "evaluation": { + "accuracy": 0.2571428571428571 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='positive_or_negative_sentiment_variation_1', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_2/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_2/results.json new file mode 100644 index 0000000..1b7dbb4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/positive_or_negative_sentiment_variation_2/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": 
"poem_sentiment", + "dataset_config_name": null, + "template_name": "positive_or_negative_sentiment_variation_2", + "evaluation": { + "accuracy": 0.21904761904761905 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='positive_or_negative_sentiment_variation_2', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/question_answer_format/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/question_answer_format/results.json new file mode 100644 index 0000000..643fd4d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/poem_sentiment/question_answer_format/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "poem_sentiment", + "dataset_config_name": null, + "template_name": "question_answer_format", + "evaluation": { + "accuracy": 0.24761904761904763 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='poem_sentiment', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_answer_format', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Long_Answer_to_Final_Decision/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Long_Answer_to_Final_Decision/results.json new file mode 100644 index 0000000..06ffcd5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Long_Answer_to_Final_Decision/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "pubmed_qa", + "dataset_config_name": "pqa_labeled", + "template_name": "Long Answer to Final Decision", + "evaluation": { + "accuracy": 0.598 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pqa_labeled', dataset_name='pubmed_qa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='Long Answer to Final Decision', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Question_Answering_(Short)/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Question_Answering_(Short)/results.json new file mode 100644 index 0000000..f154092 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/pubmed_qa/pqa_labeled/Question_Answering_(Short)/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "pubmed_qa", + "dataset_config_name": "pqa_labeled", + "template_name": "Question Answering (Short)", + "evaluation": { + "accuracy": 0.581 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pqa_labeled', dataset_name='pubmed_qa', debug=False, 
dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='Question Answering (Short)', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/answer_given_question_without_options/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/answer_given_question_without_options/results.json new file mode 100644 index 0000000..53c774f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/answer_given_question_without_options/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "riddle_sense", + "dataset_config_name": null, + "template_name": "answer_given_question_without_options", + "evaluation": { + "accuracy": 0.4534769833496572 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='answer_given_question_without_options', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/most_suitable_answer/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/most_suitable_answer/results.json new 
file mode 100644 index 0000000..55df976 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/most_suitable_answer/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "riddle_sense", + "dataset_config_name": null, + "template_name": "most_suitable_answer", + "evaluation": { + "accuracy": 0.4348677766895201 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='most_suitable_answer', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_answering/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_answering/results.json new file mode 100644 index 0000000..9ff56a8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_answering/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "riddle_sense", + "dataset_config_name": null, + "template_name": "question_answering", + "evaluation": { + "accuracy": 0.4407443682664055 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', 
target_max_length=256, template_config_name=None, template_name='question_answering', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_to_answer_index/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_to_answer_index/results.json new file mode 100644 index 0000000..7366aa9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/riddle_sense/question_to_answer_index/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "riddle_sense", + "dataset_config_name": null, + "template_name": "question_to_answer_index", + "evaluation": { + "accuracy": 0.3878550440744368 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='riddle_sense', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='question_to_answer_index', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent/results.json new file mode 100644 index 0000000..6d0373a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "scicite", + "dataset_config_name": null, + "template_name": "Classify intent", + "evaluation": { + "accuracy": 0.15065502183406113 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(choices_first)/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(choices_first)/results.json new file mode 100644 index 0000000..604e75b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(choices_first)/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "scicite", + "dataset_config_name": null, + "template_name": "Classify intent (choices first)", + "evaluation": { + "accuracy": 0.1331877729257642 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent (choices first)', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(select_choice)/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(select_choice)/results.json new file mode 100644 index 0000000..2a9525e --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_(select_choice)/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "scicite", + "dataset_config_name": null, + "template_name": "Classify intent (select choice)", + "evaluation": { + "accuracy": 0.2652838427947598 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent (select choice)', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_w_section_(select_choice)/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_w_section_(select_choice)/results.json new file mode 100644 index 0000000..f9f64a4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/Classify_intent_w_section_(select_choice)/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "scicite", + "dataset_config_name": null, + "template_name": "Classify intent w/section (select choice)", + "evaluation": { + "accuracy": 0.3537117903930131 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, 
split='validation', target_max_length=256, template_config_name=None, template_name='Classify intent w/section (select choice)', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/scicite/can_describe/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/can_describe/results.json new file mode 100644 index 0000000..7ae40de --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/scicite/can_describe/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "scicite", + "dataset_config_name": null, + "template_name": "can_describe", + "evaluation": { + "accuracy": 0.15283842794759825 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='scicite', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can_describe', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/is-he-talking-about/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/is-he-talking-about/results.json new file mode 100644 index 0000000..0619577 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/is-he-talking-about/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "selqa", + "dataset_config_name": "answer_selection_analysis", + "template_name": "is-he-talking-about", + "evaluation": { + "accuracy": 0.9121019108280255 + }, + "arguments": "Namespace(config_name=None, 
dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='is-he-talking-about', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/make-sense-rand/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/make-sense-rand/results.json new file mode 100644 index 0000000..89666d3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/make-sense-rand/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "selqa", + "dataset_config_name": "answer_selection_analysis", + "template_name": "make-sense-rand", + "evaluation": { + "accuracy": 0.9171974522292994 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='make-sense-rand', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/which-answer-1st-vs-random/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/which-answer-1st-vs-random/results.json new file mode 100644 index 0000000..5243468 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/which-answer-1st-vs-random/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "selqa", + "dataset_config_name": "answer_selection_analysis", + "template_name": "which-answer-1st-vs-random", + "evaluation": { + "accuracy": 0.7503184713375797 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='which-answer-1st-vs-random', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/would-make-sense-qu-rand/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/would-make-sense-qu-rand/results.json new file mode 100644 index 0000000..95569aa --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/selqa/answer_selection_analysis/would-make-sense-qu-rand/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "selqa", + "dataset_config_name": "answer_selection_analysis", + "template_name": "would-make-sense-qu-rand", + "evaluation": { + "accuracy": 0.8993630573248408 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='answer_selection_analysis', dataset_name='selqa', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='would-make-sense-qu-rand', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json new file mode 100644 index 0000000..25150f0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json @@ -0,0 +1,132 @@ +{ + "results": [ + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "bleu": 2.6830705121606706, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "bleu_stderr": 0.14257713719805254 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rouge1_precision": 0.19994210865731296, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_precision_stderr": 0.0026547454621461738 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rouge1_recall": 0.22882499765155356, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_recall_stderr": 0.0030837265632016487 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rouge1_fmeasure": 
0.19665942356583802, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_fmeasure_stderr": 0.00245186970283176 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rouge2_precision": 0.05189155779128239, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_precision_stderr": 0.0016867882237885771 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rouge2_recall": 0.06074745104675877, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_recall_stderr": 0.0018807564961523813 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rouge2_fmeasure": 0.0515077211753521, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_fmeasure_stderr": 0.0015699354248315028 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rougeL_precision": 0.15286714092332523, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_precision_stderr": 0.0021813478665272707 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rougeL_recall": 0.17277746632777954, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_recall_stderr": 0.002444939812221139 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rougeL_fmeasure": 0.14897381009906005, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_fmeasure_stderr": 0.0019744371501868186 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rougeLsum_precision": 0.15829244217518917, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_precision_stderr": 0.002241835408305656 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rougeLsum_recall": 0.17998021570910885, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_recall_stderr": 
0.002565278973604084 + }, + { + "task_name": "mlsum_es", + "prompt_name": "layman_summ_es", + "rougeLsum_fmeasure": 0.15454418245332874, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020344070829227297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json new file mode 100644 index 0000000..f256ad0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json @@ -0,0 +1,132 @@ +{ + "results": [ + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "bleu": 3.341310161344892, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "bleu_stderr": 0.12383760876849086 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rouge1_precision": 0.2238936517609025, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_precision_stderr": 0.002299862104308459 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rouge1_recall": 0.31290976115097796, + 
"dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_recall_stderr": 0.003157461966656448 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rouge1_fmeasure": 0.23872886986952627, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_fmeasure_stderr": 0.0021217578248352883 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rouge2_precision": 0.06209477646349353, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_precision_stderr": 0.001551591733457607 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rouge2_recall": 0.09261450488619867, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_recall_stderr": 0.002338770523626696 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rouge2_fmeasure": 0.06770985280514573, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_fmeasure_stderr": 0.0016286057939871985 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rougeL_precision": 0.16744902972078152, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_precision_stderr": 0.001881884121187265 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rougeL_recall": 0.23426964040901505, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_recall_stderr": 0.0025983311247854634 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rougeL_fmeasure": 0.17833059997868725, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_fmeasure_stderr": 0.0017648367718678965 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rougeLsum_precision": 0.1755707446810662, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_precision_stderr": 0.0019188012583382194 + }, + { + "task_name": "mlsum_es", 
+ "prompt_name": "palm_prompt", + "rougeLsum_recall": 0.24827775226125046, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_recall_stderr": 0.0027727938693488186 + }, + { + "task_name": "mlsum_es", + "prompt_name": "palm_prompt", + "rougeLsum_fmeasure": 0.1876134227034203, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018064283175187946 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json new file mode 100644 index 0000000..9836abb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.json @@ -0,0 +1,132 @@ +{ + "results": [ + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "bleu": 2.2245794650879462, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "bleu_stderr": 0.07743700029169612 + }, + { + "task_name": "mlsum_es", + "prompt_name": 
"summarise_this_in_es_few_sentences", + "rouge1_precision": 0.18419910608261986, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_precision_stderr": 0.002301564923577535 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rouge1_recall": 0.33528109600140793, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_recall_stderr": 0.0034321373331462294 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rouge1_fmeasure": 0.21126423815884174, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge1_fmeasure_stderr": 0.0021771710222460634 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rouge2_precision": 0.051112897675373886, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_precision_stderr": 0.0014393881241720322 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rouge2_recall": 0.09793074579590116, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_recall_stderr": 0.0024100392963833633 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rouge2_fmeasure": 0.05913033007358818, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rouge2_fmeasure_stderr": 0.0014890254374386052 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rougeL_precision": 0.13714024915254835, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_precision_stderr": 0.0018260510947169805 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rougeL_recall": 0.2521886801730905, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_recall_stderr": 0.0027817174072391373 + }, + { + "task_name": 
"mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rougeL_fmeasure": 0.15711042852214044, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeL_fmeasure_stderr": 0.0017251643310554304 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rougeLsum_precision": 0.14518990658432604, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_precision_stderr": 0.001900911801455617 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rougeLsum_recall": 0.26906405645015485, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_recall_stderr": 0.0030454611701701234 + }, + { + "task_name": "mlsum_es", + "prompt_name": "summarise_this_in_es_few_sentences", + "rougeLsum_fmeasure": 0.1670307582655529, + "dataset_path": "GEM/mlsum", + "dataset_name": "es", + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018288687884684008 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 
0000000..e57f28a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "a_good_translation-en-fr-source+target", + "bleu": 2.125573406419127, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.09981676122698169 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..c95a2da --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "a_good_translation-en-fr-target", + "bleu": 1.5697853682886957, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 
0.10176333685236229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..0df9c94 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "a_good_translation-fr-en-source+target", + "bleu": 30.388346190168132, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.28706919566129924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 
1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..fe7150c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "a_good_translation-fr-en-target", + "bleu": 22.361703612398195, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.43872418791072576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..7d66c4b --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "gpt3-en-fr", + "bleu": 0.37928468482204986, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.03833854862936989 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..bd52a70 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "gpt3-fr-en", + "bleu": 17.167001660570335, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.3999014258297822 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..75adb44 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "version-en-fr-target", + "bleu": 4.788559958687529, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.12647149552786194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..f732688 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "version-fr-en-target", + "bleu": 23.925613843737143, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.2692548707999714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json new file mode 100644 index 0000000..ce92212 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "xglm-en-fr-target", + "bleu": 2.186171298454336, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.09641163271059554 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..561043e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_fr_en", + "prompt_name": "xglm-fr-en-target", + "bleu": 14.10190003658709, + "dataset_path": "wmt14", + "dataset_name": "fr-en", + "subset": null, + "bleu_stderr": 0.1974741324240151 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..1f4bf1d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "a_good_translation-en-hi-source+target", + "bleu": 0.18051438917625368, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.03338441915097909 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..f90cca0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "a_good_translation-en-hi-target", + "bleu": 0.1812629246502659, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.04198901460363051 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..ee358e5 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "a_good_translation-hi-en-source+target", + "bleu": 16.056644593701627, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.2809620281933667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..e7b7549 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "a_good_translation-hi-en-target", + "bleu": 15.032491079468809, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.2628594862835867 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..5c893aa --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "version-en-hi-target", + "bleu": 0.1858574511075315, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.029122685049572238 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..dee25dd --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "version-hi-en-target", + "bleu": 15.167071858881462, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.2573529636593602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..880912a --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "xglm-en-hi-target", + "bleu": 0.002225608801197892, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.0005988947090265846 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json new file mode 100644 index 0000000..973aa51 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "xglm-hi-en-target", + "bleu": 3.675518735361532, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.17101231729659816 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query/results.json new file mode 100644 index 0000000..31608b5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "snips_built_in_intents", + "dataset_config_name": null, + "template_name": "categorize_query", + "evaluation": { + "accuracy": 0.47865853658536583 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='categorize_query', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query_brief/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query_brief/results.json new file mode 100644 index 0000000..0ef9e1d --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/categorize_query_brief/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "snips_built_in_intents", + "dataset_config_name": null, + "template_name": "categorize_query_brief", + "evaluation": { + "accuracy": 0.375 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='categorize_query_brief', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/intent_query/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/intent_query/results.json new file mode 100644 index 0000000..ffa4069 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/intent_query/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "snips_built_in_intents", + "dataset_config_name": null, + "template_name": "intent_query", + "evaluation": { + "accuracy": 0.31402439024390244 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, 
template_config_name=None, template_name='intent_query', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/query_intent/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/query_intent/results.json new file mode 100644 index 0000000..4b53c41 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/query_intent/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "snips_built_in_intents", + "dataset_config_name": null, + "template_name": "query_intent", + "evaluation": { + "accuracy": 0.7012195121951219 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='query_intent', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/voice_intent/results.json b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/voice_intent/results.json new file mode 100644 index 0000000..1c21be0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/snips_built_in_intents/voice_intent/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "snips_built_in_intents", + "dataset_config_name": null, + "template_name": "voice_intent", + "evaluation": { + "accuracy": 0.6128048780487805 + }, + "arguments": "Namespace(config_name=None, dataset_config_name=None, dataset_name='snips_built_in_intents', debug=False, dtype='float16', 
max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=4, prefixlm=False, split='train', target_max_length=256, template_config_name=None, template_name='voice_intent', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json new file mode 100644 index 0000000..7fc7a47 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "gpt-3-en-hi-target", "bleu": 0.010782650615890082, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "eb27a29c-e238-4ebd-a675-456d2102a80e", "prompt_jinja": "What is the Hindi translation of: {{translation[\"en\"]}}\n||| {{translation[\"hi\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.003615918191553956}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json new file mode 100644 index 0000000..da07e5c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/agg.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json @@ -0,0 +1 @@ +{"results": [{"task_name": "wmt14_hi_en", "prompt_name": "gpt-3-hi-en-target", "bleu": 5.607403962346587, "fixed_answer_choice_list": null, "dataset_path": "wmt14", "dataset_name": "hi-en", "subset": null, "prompt_id": "ab0195bd-4abd-4d9e-8107-afa7a3a6f6fc", "prompt_jinja": "What is the English translation of : {{translation[\"hi\"]}}\n||| {{translation[\"en\"]}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26092845447942553}], "config": {"model": "hf-causal", "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl new file mode 100644 index 0000000..dabe4af --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6a183ea36dcc0c0e66297832e9189bcaf3b0357d8be3f23d6844fc2456bf91 +size 2502412 diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl new file mode 100644 index 0000000..a50b4d9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3526a0506acdca89e7a35d88ff848186f380dba60b9c3134c8d80b3f6754eb9e +size 2595172 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json new file mode 100644 index 0000000..9bb3787 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "gpt-3-en-hi-target", + "bleu": 0.010782650615890082, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.003615918191553956 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json new file mode 100644 index 0000000..7a4f39c --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/slim.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.json @@ -0,0 +1,24 @@ +{ + "results": [ + { + "task_name": "wmt14_hi_en", + "prompt_name": "gpt-3-hi-en-target", + "bleu": 5.607403962346587, + "dataset_path": "wmt14", + "dataset_name": "hi-en", + "subset": null, + "bleu_stderr": 0.26092845447942553 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,use_accelerate=True,tokenizer=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000,dtype=float16", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json new file mode 100644 index 0000000..799b50f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "GPT-3 style_arht", + "evaluation": { + "accuracy": 0.3634538152610442 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 
style_arht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json new file mode 100644 index 0000000..6271120 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "MNLI crowdsource_arht", + "evaluation": { + "accuracy": 0.3437751004016064 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json new file mode 100644 index 0000000..35adba1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "can we infer_arht", + "evaluation": { + "accuracy": 0.40923694779116465 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json new file mode 100644 index 0000000..d318445 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "guaranteed/possible/impossible_arht", + "evaluation": { + "accuracy": 0.342570281124498 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json new file mode 100644 index 0000000..75763f2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + 
"dataset_config_name": "ar", + "template_name": "justified in saying_arht", + "evaluation": { + "accuracy": 0.4321285140562249 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json new file mode 100644 index 0000000..2f8dedb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "GPT-3 style_esht", + "evaluation": { + "accuracy": 0.4795180722891566 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json new file mode 100644 index 0000000..cfe406f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "MNLI crowdsource_esht", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json new file mode 100644 index 0000000..2ebc4f9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "can we infer_esht", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json new file mode 100644 index 0000000..c07544e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "guaranteed/possible/impossible_esht", + "evaluation": { + "accuracy": 0.529718875502008 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json new file mode 100644 index 0000000..18b001c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "justified in saying_esht", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', 
dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json new file mode 100644 index 0000000..e335ce7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "GPT-3 style_frht", + "evaluation": { + "accuracy": 0.45863453815261046 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json new file mode 100644 index 0000000..bf70bb1 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "MNLI crowdsource_frht", + "evaluation": { + "accuracy": 0.42730923694779116 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json new file mode 100644 index 0000000..56acbcb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "can we infer_frht", + "evaluation": { + "accuracy": 0.40963855421686746 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)" +} 
\ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json new file mode 100644 index 0000000..de618ed --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "guaranteed/possible/impossible_frht", + "evaluation": { + "accuracy": 0.37309236947791163 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json new file mode 100644 index 0000000..3b84be5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "justified in saying_frht", + "evaluation": { + "accuracy": 0.4710843373493976 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json new file mode 100644 index 0000000..16c00f8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "GPT-3 style_hiht", + "evaluation": { + "accuracy": 0.3542168674698795 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json new file mode 100644 index 0000000..5e3aa47 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": 
"xnli", + "dataset_config_name": "hi", + "template_name": "MNLI crowdsource_hiht", + "evaluation": { + "accuracy": 0.4389558232931727 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json new file mode 100644 index 0000000..9778af1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "can we infer_hiht", + "evaluation": { + "accuracy": 0.41566265060240964 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json new file mode 100644 index 0000000..e725b82 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "guaranteed/possible/impossible_hiht", + "evaluation": { + "accuracy": 0.4927710843373494 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json new file mode 100644 index 0000000..98a8389 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "justified in saying_hiht", + "evaluation": { + "accuracy": 0.4562248995983936 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.csv b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.csv new file mode 100644 index 0000000..eda9e56 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.csv @@ -0,0 +1,50 @@ +dataset,prompt,metric,value +xnli_ar,GPT-3 style_arht,accuracy,0.3634538152610442 +xnli_ar,MNLI crowdsource_arht,accuracy,0.3437751004016064 +xnli_ar,can we infer_arht,accuracy,0.40923694779116465 +xnli_ar,guaranteed/possible/impossible_arht,accuracy,0.342570281124498 +xnli_ar,justified in saying_arht,accuracy,0.4321285140562249 +xnli_ar,median,accuracy,0.3634538152610442 +xnli_es,GPT-3 style_esht,accuracy,0.4795180722891566 +xnli_es,MNLI crowdsource_esht,accuracy,0.3333333333333333 +xnli_es,can we infer_esht,accuracy,0.3333333333333333 +xnli_es,guaranteed/possible/impossible_esht,accuracy,0.529718875502008 +xnli_es,justified in saying_esht,accuracy,0.3333333333333333 +xnli_es,median,accuracy,0.3333333333333333 +xnli_fr,GPT-3 style_frht,accuracy,0.45863453815261046 +xnli_fr,MNLI crowdsource_frht,accuracy,0.42730923694779116 +xnli_fr,can we infer_frht,accuracy,0.40963855421686746 +xnli_fr,guaranteed/possible/impossible_frht,accuracy,0.37309236947791163 +xnli_fr,justified in saying_frht,accuracy,0.4710843373493976 +xnli_fr,median,accuracy,0.42730923694779116 +xnli_hi,GPT-3 style_hiht,accuracy,0.3542168674698795 +xnli_hi,MNLI crowdsource_hiht,accuracy,0.4389558232931727 +xnli_hi,can we infer_hiht,accuracy,0.41566265060240964 +xnli_hi,guaranteed/possible/impossible_hiht,accuracy,0.4927710843373494 +xnli_hi,justified in 
saying_hiht,accuracy,0.4562248995983936 +xnli_hi,median,accuracy,0.4389558232931727 +xnli_sw,GPT-3 style_swht,accuracy,0.3389558232931727 +xnli_sw,MNLI crowdsource_swht,accuracy,0.3257028112449799 +xnli_sw,can we infer_swht,accuracy,0.3429718875502008 +xnli_sw,guaranteed/possible/impossible_swht,accuracy,0.3718875502008032 +xnli_sw,justified in saying_swht,accuracy,0.3409638554216867 +xnli_sw,median,accuracy,0.3409638554216867 +xnli_ur,GPT-3 style_urht,accuracy,0.3646586345381526 +xnli_ur,MNLI crowdsource_urht,accuracy,0.3538152610441767 +xnli_ur,can we infer_urht,accuracy,0.3610441767068273 +xnli_ur,guaranteed/possible/impossible_urht,accuracy,0.37670682730923694 +xnli_ur,justified in saying_urht,accuracy,0.3377510040160643 +xnli_ur,median,accuracy,0.3610441767068273 +xnli_vi,GPT-3 style_viht,accuracy,0.3357429718875502 +xnli_vi,MNLI crowdsource_viht,accuracy,0.3477911646586345 +xnli_vi,can we infer_viht,accuracy,0.3333333333333333 +xnli_vi,guaranteed/possible/impossible_viht,accuracy,0.39759036144578314 +xnli_vi,justified in saying_viht,accuracy,0.3333333333333333 +xnli_vi,median,accuracy,0.3357429718875502 +xnli_zh,GPT-3 style_zhht,accuracy,0.348995983935743 +xnli_zh,MNLI crowdsource_zhht,accuracy,0.45100401606425705 +xnli_zh,can we infer_zhht,accuracy,0.3931726907630522 +xnli_zh,guaranteed/possible/impossible_zhht,accuracy,0.39879518072289155 +xnli_zh,justified in saying_zhht,accuracy,0.3827309236947791 +xnli_zh,median,accuracy,0.3931726907630522 +multiple,average,multiple,0.3742469879518072 diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.json new file mode 100644 index 0000000..f811e1d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/merged.json @@ -0,0 +1 @@ +{"xnli_ar": {"GPT-3 style_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3634538152610442}, "template_name": "GPT-3 style_arht"}, "MNLI crowdsource_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3437751004016064}, "template_name": "MNLI crowdsource_arht"}, "can we infer_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40923694779116465}, "template_name": "can we infer_arht"}, "guaranteed/possible/impossible_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.342570281124498}, "template_name": "guaranteed/possible/impossible_arht"}, "justified in saying_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4321285140562249}, "template_name": "justified in saying_arht"}}, "xnli_es": {"GPT-3 style_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4795180722891566}, "template_name": "GPT-3 style_esht"}, "MNLI crowdsource_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_esht"}, "can we infer_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_esht"}, "guaranteed/possible/impossible_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.529718875502008}, "template_name": "guaranteed/possible/impossible_esht"}, "justified in saying_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "justified in saying_esht"}}, "xnli_fr": {"GPT-3 style_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45863453815261046}, "template_name": "GPT-3 style_frht"}, "MNLI crowdsource_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42730923694779116}, "template_name": "MNLI crowdsource_frht"}, "can we infer_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40963855421686746}, "template_name": "can we infer_frht"}, "guaranteed/possible/impossible_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37309236947791163}, "template_name": "guaranteed/possible/impossible_frht"}, "justified in saying_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4710843373493976}, "template_name": "justified in saying_frht"}}, "xnli_hi": {"GPT-3 style_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3542168674698795}, "template_name": "GPT-3 style_hiht"}, "MNLI crowdsource_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4389558232931727}, "template_name": "MNLI crowdsource_hiht"}, "can we infer_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41566265060240964}, "template_name": "can we infer_hiht"}, "guaranteed/possible/impossible_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4927710843373494}, "template_name": "guaranteed/possible/impossible_hiht"}, "justified in saying_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4562248995983936}, "template_name": "justified in saying_hiht"}}, "xnli_sw": {"GPT-3 style_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3389558232931727}, "template_name": "GPT-3 style_swht"}, "MNLI crowdsource_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3257028112449799}, "template_name": "MNLI crowdsource_swht"}, "can we infer_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3429718875502008}, "template_name": "can we infer_swht"}, "guaranteed/possible/impossible_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3718875502008032}, "template_name": "guaranteed/possible/impossible_swht"}, "justified in saying_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3409638554216867}, "template_name": "justified in saying_swht"}}, "xnli_ur": {"GPT-3 style_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3646586345381526}, "template_name": "GPT-3 style_urht"}, "MNLI crowdsource_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3538152610441767}, "template_name": "MNLI crowdsource_urht"}, "can we infer_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3610441767068273}, "template_name": "can we infer_urht"}, "guaranteed/possible/impossible_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37670682730923694}, "template_name": "guaranteed/possible/impossible_urht"}, "justified in saying_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3377510040160643}, "template_name": "justified in saying_urht"}}, "xnli_vi": {"GPT-3 style_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3357429718875502}, "template_name": "GPT-3 style_viht"}, "MNLI crowdsource_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3477911646586345}, "template_name": "MNLI crowdsource_viht"}, "can we infer_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_viht"}, "guaranteed/possible/impossible_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39759036144578314}, "template_name": "guaranteed/possible/impossible_viht"}, "justified in saying_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "justified in saying_viht"}}, "xnli_zh": {"GPT-3 style_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.348995983935743}, "template_name": "GPT-3 style_zhht"}, "MNLI crowdsource_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45100401606425705}, "template_name": "MNLI crowdsource_zhht"}, "can we infer_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3931726907630522}, "template_name": "can we infer_zhht"}, "guaranteed/possible/impossible_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39879518072289155}, "template_name": "guaranteed/possible/impossible_zhht"}, "justified in saying_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3827309236947791}, "template_name": "justified in saying_zhht"}}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json new file mode 100644 index 0000000..24438f9 --- /dev/null 
+++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "GPT-3 style_swht", + "evaluation": { + "accuracy": 0.3389558232931727 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json new file mode 100644 index 0000000..eb153c5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "MNLI crowdsource_swht", + "evaluation": { + "accuracy": 0.3257028112449799 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json new file mode 100644 index 0000000..8032dbc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "can we infer_swht", + "evaluation": { + "accuracy": 0.3429718875502008 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json new file mode 100644 index 0000000..5cc2164 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "guaranteed/possible/impossible_swht", + "evaluation": { + "accuracy": 0.3718875502008032 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json new file mode 100644 index 0000000..17b1cd7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "justified in saying_swht", + "evaluation": { + "accuracy": 0.3409638554216867 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json new file mode 100644 index 0000000..b93c09e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json @@ 
-0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "GPT-3 style_urht", + "evaluation": { + "accuracy": 0.3646586345381526 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json new file mode 100644 index 0000000..7c2a870 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "MNLI crowdsource_urht", + "evaluation": { + "accuracy": 0.3538152610441767 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json new file mode 100644 index 0000000..382ea3d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "can we infer_urht", + "evaluation": { + "accuracy": 0.3610441767068273 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json new file mode 100644 index 0000000..b3cc697 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "guaranteed/possible/impossible_urht", + "evaluation": { + "accuracy": 0.37670682730923694 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json new file mode 100644 index 0000000..be4af4c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "justified in saying_urht", + "evaluation": { + "accuracy": 0.3377510040160643 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json new file mode 100644 index 0000000..5543194 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "GPT-3 style_viht", + 
"evaluation": { + "accuracy": 0.3357429718875502 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json new file mode 100644 index 0000000..400b33c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "MNLI crowdsource_viht", + "evaluation": { + "accuracy": 0.3477911646586345 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json new file mode 100644 index 0000000..48ad17f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "can we infer_viht", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json new file mode 100644 index 0000000..0159d00 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "guaranteed/possible/impossible_viht", + "evaluation": { + "accuracy": 0.39759036144578314 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json new file mode 100644 index 0000000..1ee850f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "justified in saying_viht", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json new file mode 100644 index 0000000..83b7551 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "GPT-3 style_zhht", + "evaluation": { + "accuracy": 0.348995983935743 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', 
dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json new file mode 100644 index 0000000..8198f5c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "MNLI crowdsource_zhht", + "evaluation": { + "accuracy": 0.45100401606425705 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json new file mode 100644 index 0000000..f31b1b7 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "can we infer_zhht", + "evaluation": { + "accuracy": 0.3931726907630522 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json new file mode 100644 index 0000000..1b8e5cd --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "guaranteed/possible/impossible_zhht", + "evaluation": { + "accuracy": 0.39879518072289155 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', 
template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json new file mode 100644 index 0000000..7b0ea07 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "justified in saying_zhht", + "evaluation": { + "accuracy": 0.3827309236947791 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json new file mode 100644 index 0000000..1e2ff21 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "GPT-3 style_armt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json new file mode 100644 index 0000000..bbe6891 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "MNLI crowdsource_armt", + "evaluation": { + "accuracy": 0.42891566265060244 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json new file mode 100644 index 0000000..f087151 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json @@ -0,0 +1,9 @@ +{ + 
"dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "can we infer_armt", + "evaluation": { + "accuracy": 0.3353413654618474 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json new file mode 100644 index 0000000..6239c2d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "guaranteed/possible/impossible_armt", + "evaluation": { + "accuracy": 0.3755020080321285 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file 
diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json new file mode 100644 index 0000000..2bd8fbe --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ar", + "template_name": "justified in saying_armt", + "evaluation": { + "accuracy": 0.3349397590361446 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json new file mode 100644 index 0000000..a566cc7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "GPT-3 style_esmt", + "evaluation": { + "accuracy": 0.5220883534136547 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json new file mode 100644 index 0000000..e3132eb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "MNLI crowdsource_esmt", + "evaluation": { + "accuracy": 0.4847389558232932 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json new file mode 100644 index 0000000..7a3a5d2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "can we infer_esmt", + "evaluation": { + "accuracy": 
0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json new file mode 100644 index 0000000..60b1ddf --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "guaranteed/possible/impossible_esmt", + "evaluation": { + "accuracy": 0.3449799196787149 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json new file mode 100644 index 0000000..c0a185f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "es", + "template_name": "justified in saying_esmt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json new file mode 100644 index 0000000..5e79340 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "GPT-3 style_frmt", + "evaluation": { + "accuracy": 0.4791164658634538 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json new file mode 100644 index 0000000..f2cd5c2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "MNLI crowdsource_frmt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json new file mode 100644 index 0000000..f56d15e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "can we infer_frmt", + "evaluation": { + "accuracy": 0.42248995983935744 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json new file mode 100644 index 0000000..fff37a1 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "guaranteed/possible/impossible_frmt", + "evaluation": { + "accuracy": 0.41847389558232934 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json new file mode 100644 index 0000000..ef27c54 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "fr", + "template_name": "justified in saying_frmt", + "evaluation": { + "accuracy": 0.378714859437751 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json new file mode 100644 index 0000000..a6f9d56 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "GPT-3 style_himt", + "evaluation": { + "accuracy": 0.3389558232931727 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} 
\ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json new file mode 100644 index 0000000..8a27483 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "MNLI crowdsource_himt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json new file mode 100644 index 0000000..0b6baa0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "can we infer_himt", + "evaluation": { + "accuracy": 0.3542168674698795 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json new file mode 100644 index 0000000..d34e7fc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "hi", + "template_name": "guaranteed/possible/impossible_himt", + "evaluation": { + "accuracy": 0.3353413654618474 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json new file mode 100644 index 0000000..55d5629 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + 
"dataset_config_name": "hi", + "template_name": "justified in saying_himt", + "evaluation": { + "accuracy": 0.39879518072289155 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.csv b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.csv new file mode 100644 index 0000000..ac34fb9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.csv @@ -0,0 +1,50 @@ +dataset,prompt,metric,value +xnli_ar,GPT-3 style_armt,accuracy,0.3333333333333333 +xnli_ar,MNLI crowdsource_armt,accuracy,0.42891566265060244 +xnli_ar,can we infer_armt,accuracy,0.3353413654618474 +xnli_ar,guaranteed/possible/impossible_armt,accuracy,0.3755020080321285 +xnli_ar,justified in saying_armt,accuracy,0.3349397590361446 +xnli_ar,median,accuracy,0.3353413654618474 +xnli_es,GPT-3 style_esmt,accuracy,0.5220883534136547 +xnli_es,MNLI crowdsource_esmt,accuracy,0.4847389558232932 +xnli_es,can we infer_esmt,accuracy,0.3333333333333333 +xnli_es,guaranteed/possible/impossible_esmt,accuracy,0.3449799196787149 +xnli_es,justified in saying_esmt,accuracy,0.3333333333333333 +xnli_es,median,accuracy,0.3449799196787149 +xnli_fr,GPT-3 style_frmt,accuracy,0.4791164658634538 +xnli_fr,MNLI crowdsource_frmt,accuracy,0.3333333333333333 +xnli_fr,can we infer_frmt,accuracy,0.42248995983935744 
+xnli_fr,guaranteed/possible/impossible_frmt,accuracy,0.41847389558232934 +xnli_fr,justified in saying_frmt,accuracy,0.378714859437751 +xnli_fr,median,accuracy,0.41847389558232934 +xnli_hi,GPT-3 style_himt,accuracy,0.3389558232931727 +xnli_hi,MNLI crowdsource_himt,accuracy,0.3333333333333333 +xnli_hi,can we infer_himt,accuracy,0.3542168674698795 +xnli_hi,guaranteed/possible/impossible_himt,accuracy,0.3353413654618474 +xnli_hi,justified in saying_himt,accuracy,0.39879518072289155 +xnli_hi,median,accuracy,0.3389558232931727 +xnli_sw,GPT-3 style_swmt,accuracy,0.3333333333333333 +xnli_sw,MNLI crowdsource_swmt,accuracy,0.3333333333333333 +xnli_sw,can we infer_swmt,accuracy,0.334136546184739 +xnli_sw,guaranteed/possible/impossible_swmt,accuracy,0.3236947791164659 +xnli_sw,justified in saying_swmt,accuracy,0.3321285140562249 +xnli_sw,median,accuracy,0.3333333333333333 +xnli_ur,GPT-3 style_urmt,accuracy,0.3751004016064257 +xnli_ur,MNLI crowdsource_urmt,accuracy,0.3751004016064257 +xnli_ur,can we infer_urmt,accuracy,0.329718875502008 +xnli_ur,guaranteed/possible/impossible_urmt,accuracy,0.3337349397590361 +xnli_ur,justified in saying_urmt,accuracy,0.3285140562248996 +xnli_ur,median,accuracy,0.3337349397590361 +xnli_vi,GPT-3 style_vimt,accuracy,0.3333333333333333 +xnli_vi,MNLI crowdsource_vimt,accuracy,0.3333333333333333 +xnli_vi,can we infer_vimt,accuracy,0.342570281124498 +xnli_vi,guaranteed/possible/impossible_vimt,accuracy,0.3333333333333333 +xnli_vi,justified in saying_vimt,accuracy,0.3365461847389558 +xnli_vi,median,accuracy,0.3333333333333333 +xnli_zh,GPT-3 style_zhmt,accuracy,0.3606425702811245 +xnli_zh,MNLI crowdsource_zhmt,accuracy,0.39598393574297186 +xnli_zh,can we infer_zhmt,accuracy,0.351004016064257 +xnli_zh,guaranteed/possible/impossible_zhmt,accuracy,0.3473895582329317 +xnli_zh,justified in saying_zhmt,accuracy,0.3409638554216867 +xnli_zh,median,accuracy,0.351004016064257 +multiple,average,multiple,0.348644578313253 diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.json new file mode 100644 index 0000000..0f9888f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/merged.json @@ -0,0 +1 @@ +{"xnli_ar": {"GPT-3 style_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_armt"}, "MNLI crowdsource_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42891566265060244}, "template_name": "MNLI crowdsource_armt"}, "can we infer_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3353413654618474}, "template_name": "can we infer_armt"}, "guaranteed/possible/impossible_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3755020080321285}, "template_name": "guaranteed/possible/impossible_armt"}, "justified in saying_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in 
saying_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3349397590361446}, "template_name": "justified in saying_armt"}}, "xnli_es": {"GPT-3 style_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5220883534136547}, "template_name": "GPT-3 style_esmt"}, "MNLI crowdsource_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4847389558232932}, "template_name": "MNLI crowdsource_esmt"}, "can we infer_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_esmt"}, "guaranteed/possible/impossible_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3449799196787149}, "template_name": "guaranteed/possible/impossible_esmt"}, "justified in saying_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in 
saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "justified in saying_esmt"}}, "xnli_fr": {"GPT-3 style_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4791164658634538}, "template_name": "GPT-3 style_frmt"}, "MNLI crowdsource_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_frmt"}, "can we infer_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42248995983935744}, "template_name": "can we infer_frmt"}, "guaranteed/possible/impossible_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41847389558232934}, "template_name": "guaranteed/possible/impossible_frmt"}, "justified in saying_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in 
saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.378714859437751}, "template_name": "justified in saying_frmt"}}, "xnli_hi": {"GPT-3 style_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3389558232931727}, "template_name": "GPT-3 style_himt"}, "MNLI crowdsource_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_himt"}, "can we infer_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3542168674698795}, "template_name": "can we infer_himt"}, "guaranteed/possible/impossible_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3353413654618474}, "template_name": "guaranteed/possible/impossible_himt"}, "justified in saying_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in 
saying_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39879518072289155}, "template_name": "justified in saying_himt"}}, "xnli_sw": {"GPT-3 style_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_swmt"}, "MNLI crowdsource_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_swmt"}, "can we infer_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.334136546184739}, "template_name": "can we infer_swmt"}, "guaranteed/possible/impossible_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3236947791164659}, "template_name": "guaranteed/possible/impossible_swmt"}, "justified in saying_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3321285140562249}, "template_name": "justified in saying_swmt"}}, "xnli_ur": {"GPT-3 style_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3751004016064257}, "template_name": "GPT-3 style_urmt"}, "MNLI crowdsource_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3751004016064257}, "template_name": "MNLI crowdsource_urmt"}, "can we infer_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.329718875502008}, "template_name": "can we infer_urmt"}, "guaranteed/possible/impossible_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3337349397590361}, "template_name": "guaranteed/possible/impossible_urmt"}, "justified in saying_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3285140562248996}, "template_name": "justified in saying_urmt"}}, "xnli_vi": {"GPT-3 style_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_vimt"}, "MNLI crowdsource_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_vimt"}, "can we infer_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.342570281124498}, "template_name": "can we infer_vimt"}, "guaranteed/possible/impossible_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "guaranteed/possible/impossible_vimt"}, "justified in saying_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3365461847389558}, "template_name": "justified in saying_vimt"}}, "xnli_zh": {"GPT-3 style_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3606425702811245}, "template_name": "GPT-3 style_zhmt"}, "MNLI crowdsource_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39598393574297186}, "template_name": "MNLI crowdsource_zhmt"}, "can we infer_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.351004016064257}, "template_name": "can we infer_zhmt"}, "guaranteed/possible/impossible_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3473895582329317}, "template_name": "guaranteed/possible/impossible_zhmt"}, "justified in saying_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3409638554216867}, "template_name": "justified in saying_zhmt"}}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json new file mode 100644 index 0000000..ab03e75 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "GPT-3 style_swmt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json new file mode 100644 index 0000000..cad905a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "MNLI crowdsource_swmt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json new file mode 100644 index 0000000..ae32ecf --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "can we infer_swmt", + "evaluation": { + "accuracy": 0.334136546184739 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json new file mode 100644 index 0000000..bc070b1 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "guaranteed/possible/impossible_swmt", + "evaluation": { + "accuracy": 0.3236947791164659 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json new file mode 100644 index 0000000..469ed48 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "sw", + "template_name": "justified in saying_swmt", + "evaluation": { + "accuracy": 0.3321285140562249 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', 
template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json new file mode 100644 index 0000000..939a337 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "GPT-3 style_urmt", + "evaluation": { + "accuracy": 0.3751004016064257 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json new file mode 100644 index 0000000..d1e112a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "MNLI crowdsource_urmt", + "evaluation": { + "accuracy": 0.3751004016064257 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json new file mode 100644 index 0000000..bd42764 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "can we infer_urmt", + "evaluation": { + "accuracy": 0.329718875502008 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json new file mode 100644 index 0000000..de68349 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "guaranteed/possible/impossible_urmt", + "evaluation": { + "accuracy": 0.3337349397590361 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json new file mode 100644 index 0000000..dea45d6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "ur", + "template_name": "justified in saying_urmt", + "evaluation": { + "accuracy": 0.3285140562248996 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', 
template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json new file mode 100644 index 0000000..88ad694 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "GPT-3 style_vimt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json new file mode 100644 index 0000000..d13bb2e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "MNLI crowdsource_vimt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json new file mode 100644 index 0000000..78b2960 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "can we infer_vimt", + "evaluation": { + "accuracy": 0.342570281124498 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json new file mode 100644 index 0000000..a850aec --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "guaranteed/possible/impossible_vimt", + "evaluation": { + "accuracy": 0.3333333333333333 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json new file mode 100644 index 0000000..ad7ace4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "vi", + "template_name": "justified in saying_vimt", + "evaluation": { + "accuracy": 0.3365461847389558 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', 
template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json new file mode 100644 index 0000000..cbc7215 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "GPT-3 style_zhmt", + "evaluation": { + "accuracy": 0.3606425702811245 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json new file mode 100644 index 0000000..a54f53e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "MNLI crowdsource_zhmt", + "evaluation": { + "accuracy": 0.39598393574297186 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json new file mode 100644 index 0000000..d30234a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "can we infer_zhmt", + "evaluation": { + "accuracy": 0.351004016064257 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json new file mode 100644 index 0000000..d80007c --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "guaranteed/possible/impossible_zhmt", + "evaluation": { + "accuracy": 0.3473895582329317 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json new file mode 100644 index 0000000..3adbec9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xnli", + "dataset_config_name": "zh", + "template_name": "justified in saying_zhmt", + "evaluation": { + "accuracy": 0.3409638554216867 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', 
template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhht/results.json new file mode 100644 index 0000000..d22fb9c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Answer Given options_zhht", + "evaluation": { + "accuracy": 0.7054930509596293 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhht/results.json new file mode 100644 index 0000000..7456ae0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Choose Story Ending_zhht", + 
"evaluation": { + "accuracy": 0.7948378557246857 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Generate_Ending_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Generate_Ending_zhht/results.json new file mode 100644 index 0000000..8845595 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Generate_Ending_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Generate Ending_zhht", + "evaluation": { + "accuracy": 0.6366644606221046 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhht/results.json new file mode 100644 index 0000000..bae4452 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Novel Correct Ending_zhht", + "evaluation": { + "accuracy": 0.7782925215089345 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhht/results.json new file mode 100644 index 0000000..cfec774 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Story Continuation and Options_zhht", + "evaluation": { + "accuracy": 0.771012574454004 + }, + "arguments": 
"Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/Replace_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/Replace_zhht/results.json new file mode 100644 index 0000000..9f7e240 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/Replace_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "Replace_zhht", + "evaluation": { + "accuracy": 0.5178571428571429 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/True_or_False_zhht/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/True_or_False_zhht/results.json new file mode 100644 index 0000000..904e44e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/True_or_False_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "True or False_zhht", + "evaluation": { + "accuracy": 0.5218253968253969 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhht/results.json new file mode 100644 index 0000000..991789c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "does underscore refer to_zhht", + "evaluation": { + "accuracy": 0.4662698412698413 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/stand_for_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/stand_for_zhht/results.json new file mode 100644 index 0000000..97352c6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/stand_for_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "stand for_zhht", + "evaluation": { + "accuracy": 0.49404761904761907 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/underscore_refer_to_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/underscore_refer_to_zhht/results.json new file mode 100644 index 
0000000..b14a42d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/Muennighoff_xwinograd/zh/underscore_refer_to_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "underscore refer to_zhht", + "evaluation": { + "accuracy": 0.44047619047619047 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.csv b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.csv new file mode 100644 index 0000000..c609030 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.csv @@ -0,0 +1,20 @@ +dataset,prompt,metric,value +xcopa_zh,C1 or C2? 
premise_zhht,accuracy,0.55 +xcopa_zh,best_option_zhht,accuracy,0.67 +xcopa_zh,cause_effect_zhht,accuracy,0.79 +xcopa_zh,i_am_hesitating_zhht,accuracy,0.77 +xcopa_zh,plausible_alternatives_zhht,accuracy,0.75 +xcopa_zh,median,accuracy,0.75 +xstory_cloze_zh,Answer Given options_zhht,accuracy,0.7054930509596293 +xstory_cloze_zh,Choose Story Ending_zhht,accuracy,0.7948378557246857 +xstory_cloze_zh,Generate Ending_zhht,accuracy,0.6366644606221046 +xstory_cloze_zh,Novel Correct Ending_zhht,accuracy,0.7782925215089345 +xstory_cloze_zh,Story Continuation and Options_zhht,accuracy,0.771012574454004 +xstory_cloze_zh,median,accuracy,0.771012574454004 +xwinograd_zh,Replace_zhht,accuracy,0.5178571428571429 +xwinograd_zh,True or False_zhht,accuracy,0.5218253968253969 +xwinograd_zh,does underscore refer to_zhht,accuracy,0.4662698412698413 +xwinograd_zh,stand for_zhht,accuracy,0.49404761904761907 +xwinograd_zh,underscore refer to_zhht,accuracy,0.44047619047619047 +xwinograd_zh,median,accuracy,0.49404761904761907 +multiple,average,multiple,0.6716867311672077 diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.json new file mode 100644 index 0000000..2d21054 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/merged.json @@ -0,0 +1 @@ +{"Muennighoff/xstory_cloze_zh": {"Answer Given options_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhht', 
tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7054930509596293}, "template_name": "Answer Given options_zhht"}, "Choose Story Ending_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7948378557246857}, "template_name": "Choose Story Ending_zhht"}, "Generate Ending_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6366644606221046}, "template_name": "Generate Ending_zhht"}, "Novel Correct Ending_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, 
dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7782925215089345}, "template_name": "Novel Correct Ending_zhht"}, "Story Continuation and Options_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.771012574454004}, "template_name": "Story Continuation and Options_zhht"}}, "Muennighoff/xwinograd_zh": {"Replace_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5178571428571429}, "template_name": "Replace_zhht"}, "True or False_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5218253968253969}, "template_name": "True or False_zhht"}, "does underscore refer to_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4662698412698413}, "template_name": "does underscore refer to_zhht"}, "stand for_zhht": {"arguments": 
"Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49404761904761907}, "template_name": "stand for_zhht"}, "underscore refer to_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.44047619047619047}, "template_name": "underscore refer to_zhht"}}, "xcopa_zh": {"C1 or C2? 
premise_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.55}, "template_name": "C1 or C2? premise_zhht"}, "best_option_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "best_option_zhht"}, "cause_effect_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='zh', template_name='cause_effect_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.79}, "template_name": "cause_effect_zhht"}, "i_am_hesitating_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "i_am_hesitating_zhht"}, "plausible_alternatives_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.75}, "template_name": "plausible_alternatives_zhht"}}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/C1_or_C2?_premise_zhht/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/C1_or_C2?_premise_zhht/results.json new file mode 100644 index 0000000..9577870 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/C1_or_C2?_premise_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "C1 or C2? premise_zhht", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/best_option_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/best_option_zhht/results.json new file mode 100644 index 0000000..2208fe7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/best_option_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "best_option_zhht", + "evaluation": { + "accuracy": 0.67 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/cause_effect_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/cause_effect_zhht/results.json new file mode 100644 index 0000000..6a65623 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/cause_effect_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "cause_effect_zhht", + "evaluation": { + "accuracy": 0.79 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/i_am_hesitating_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/i_am_hesitating_zhht/results.json new file mode 100644 index 0000000..a7e7857 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/i_am_hesitating_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "i_am_hesitating_zhht", + "evaluation": { + "accuracy": 0.77 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', 
debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/plausible_alternatives_zhht/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/plausible_alternatives_zhht/results.json new file mode 100644 index 0000000..27209fa --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopaht/xcopa/zh/plausible_alternatives_zhht/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "plausible_alternatives_zhht", + "evaluation": { + "accuracy": 0.75 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhht', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json new 
file mode 100644 index 0000000..97baacb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Answer Given options_armt", + "evaluation": { + "accuracy": 0.7061548643282595 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json new file mode 100644 index 0000000..211272a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Choose Story Ending_armt", + "evaluation": { + "accuracy": 0.786896095301125 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json new file mode 100644 index 0000000..b6a05ec --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Generate Ending_armt", + "evaluation": { + "accuracy": 0.600926538716082 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json new file mode 100644 index 0000000..2627c80 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Novel Correct Ending_armt", + "evaluation": { + "accuracy": 0.7511581733951026 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json new file mode 100644 index 0000000..130c917 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Story Continuation and Options_armt", + "evaluation": { + "accuracy": 0.757114493712773 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json new file mode 100644 index 0000000..eb2edab --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Answer Given options_esmt", + "evaluation": { + "accuracy": 0.7902051621442753 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json new file mode 100644 index 0000000..f5747a6 --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Choose Story Ending_esmt", + "evaluation": { + "accuracy": 0.8160158835208471 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json new file mode 100644 index 0000000..3a2651c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Generate Ending_esmt", + "evaluation": { + "accuracy": 0.657180675049636 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json new file mode 100644 index 0000000..1e21b93 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Novel Correct Ending_esmt", + "evaluation": { + "accuracy": 0.784910655195235 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json new file mode 100644 index 0000000..bea6eba --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Story Continuation and Options_esmt", + "evaluation": { + "accuracy": 0.7696889477167439 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json new file mode 100644 index 0000000..beb5d05 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Answer Given options_eumt", + "evaluation": { + "accuracy": 0.6227663798808736 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json new file mode 100644 index 0000000..a3b0a30 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Choose Story Ending_eumt", + "evaluation": { + "accuracy": 0.6763732627399074 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json new file mode 100644 index 0000000..6895c7d --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Generate Ending_eumt", + "evaluation": { + "accuracy": 0.5737921906022502 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json new file mode 100644 index 0000000..1235fb9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Novel Correct Ending_eumt", + "evaluation": { + "accuracy": 0.686300463269358 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json new file mode 100644 index 0000000..8ef3e26 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Story Continuation and Options_eumt", + "evaluation": { + "accuracy": 0.6637988087359364 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json new file mode 100644 index 0000000..0a0b1d6 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Answer Given options_himt", + "evaluation": { + "accuracy": 0.6697551290536069 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json new file mode 100644 index 0000000..975a804 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Choose Story Ending_himt", + "evaluation": { + "accuracy": 0.7160820648577101 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json new file mode 100644 index 0000000..be02b48 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Generate Ending_himt", + "evaluation": { + "accuracy": 0.5923229649238915 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json new file mode 100644 index 0000000..02c07fc --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Novel Correct Ending_himt", + "evaluation": { + "accuracy": 0.6882859033752482 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json new file mode 100644 index 0000000..9c2dcc6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Story Continuation and Options_himt", + "evaluation": { + "accuracy": 0.7048312375909993 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json new file mode 100644 index 0000000..88b3537 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Answer Given options_idmt", + "evaluation": { + "accuracy": 0.7346128391793514 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json new file mode 100644 index 0000000..7957b67 --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Choose Story Ending_idmt", + "evaluation": { + "accuracy": 0.7511581733951026 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json new file mode 100644 index 0000000..77381f7 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Generate Ending_idmt", + "evaluation": { + "accuracy": 0.6201191264063534 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json new file mode 100644 index 0000000..9ba130d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Novel Correct Ending_idmt", + "evaluation": { + "accuracy": 0.728656518861681 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json new file mode 100644 index 0000000..116040f --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Story Continuation and Options_idmt", + "evaluation": { + "accuracy": 0.7412309728656519 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json new file mode 100644 index 0000000..2d64ec6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Answer Given options_zhmt", + "evaluation": { + "accuracy": 0.7425545996029119 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json new file mode 100644 index 0000000..cf2e866 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Choose Story Ending_zhmt", + "evaluation": { + "accuracy": 0.7941760423560555 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json new file mode 100644 index 0000000..488337f --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Generate Ending_zhmt", + "evaluation": { + "accuracy": 0.6247518199867638 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json new file mode 100644 index 0000000..9ffe987 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Novel Correct Ending_zhmt", + "evaluation": { + "accuracy": 0.7842488418266049 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json new file mode 100644 index 0000000..71c74b5 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Story Continuation and Options_zhmt", + "evaluation": { + "accuracy": 0.8034414295168762 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json new file mode 100644 index 0000000..0bffd92 --- /dev/null +++ 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "Replace_frmt", + "evaluation": { + "accuracy": 0.5180722891566265 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json new file mode 100644 index 0000000..5102b11 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "True or False_frmt", + "evaluation": { + "accuracy": 0.46987951807228917 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json new file mode 100644 index 0000000..99238a9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "does underscore refer to_frmt", + "evaluation": { + "accuracy": 0.5421686746987951 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json new file mode 100644 index 0000000..983d921 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "stand 
for_frmt", + "evaluation": { + "accuracy": 0.5060240963855421 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json new file mode 100644 index 0000000..4cfaf56 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "underscore refer to_frmt", + "evaluation": { + "accuracy": 0.5421686746987951 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json new file mode 100644 index 0000000..37284b8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "Replace_ptmt", + "evaluation": { + "accuracy": 0.5057034220532319 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json new file mode 100644 index 0000000..e0293e2 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "True or False_ptmt", + "evaluation": { + "accuracy": 0.5133079847908745 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json new file mode 100644 index 0000000..bd50bda --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "does underscore refer to_ptmt", + "evaluation": { + "accuracy": 0.5209125475285171 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json new file mode 100644 index 0000000..2c0661e --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "stand for_ptmt", + "evaluation": { + "accuracy": 0.5209125475285171 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json new file mode 100644 index 0000000..01c913c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "underscore refer to_ptmt", + "evaluation": { + "accuracy": 0.49049429657794674 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json new file mode 100644 index 0000000..caedc0a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "Replace_zhmt", + "evaluation": { + "accuracy": 0.5238095238095238 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json new file mode 100644 index 0000000..756637b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json @@ 
-0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "True or False_zhmt", + "evaluation": { + "accuracy": 0.5138888888888888 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json new file mode 100644 index 0000000..8961af4 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "does underscore refer to_zhmt", + "evaluation": { + "accuracy": 0.49404761904761907 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', 
template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json new file mode 100644 index 0000000..f0fd6eb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "stand for_zhmt", + "evaluation": { + "accuracy": 0.49603174603174605 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json new file mode 100644 index 0000000..3499bce --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "underscore refer to_zhmt", + "evaluation": { + "accuracy": 0.503968253968254 + }, + "arguments": 
"Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.csv b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.csv new file mode 100644 index 0000000..13f4b69 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.csv @@ -0,0 +1,86 @@ +dataset,prompt,metric,value +xcopa_id,C1 or C2? premise_idmt,accuracy,0.51 +xcopa_id,best_option_idmt,accuracy,0.53 +xcopa_id,cause_effect_idmt,accuracy,0.69 +xcopa_id,i_am_hesitating_idmt,accuracy,0.64 +xcopa_id,plausible_alternatives_idmt,accuracy,0.7 +xcopa_id,median,accuracy,0.64 +xcopa_sw,C1 or C2? premise_swmt,accuracy,0.6 +xcopa_sw,best_option_swmt,accuracy,0.62 +xcopa_sw,cause_effect_swmt,accuracy,0.49 +xcopa_sw,i_am_hesitating_swmt,accuracy,0.56 +xcopa_sw,plausible_alternatives_swmt,accuracy,0.54 +xcopa_sw,median,accuracy,0.56 +xcopa_ta,C1 or C2? premise_tamt,accuracy,0.52 +xcopa_ta,best_option_tamt,accuracy,0.55 +xcopa_ta,cause_effect_tamt,accuracy,0.63 +xcopa_ta,i_am_hesitating_tamt,accuracy,0.63 +xcopa_ta,plausible_alternatives_tamt,accuracy,0.66 +xcopa_ta,median,accuracy,0.63 +xcopa_vi,C1 or C2? premise_vimt,accuracy,0.55 +xcopa_vi,best_option_vimt,accuracy,0.61 +xcopa_vi,cause_effect_vimt,accuracy,0.64 +xcopa_vi,i_am_hesitating_vimt,accuracy,0.6 +xcopa_vi,plausible_alternatives_vimt,accuracy,0.64 +xcopa_vi,median,accuracy,0.61 +xcopa_zh,C1 or C2? 
premise_zhmt,accuracy,0.52 +xcopa_zh,best_option_zhmt,accuracy,0.61 +xcopa_zh,cause_effect_zhmt,accuracy,0.75 +xcopa_zh,i_am_hesitating_zhmt,accuracy,0.72 +xcopa_zh,plausible_alternatives_zhmt,accuracy,0.76 +xcopa_zh,median,accuracy,0.72 +xstory_cloze_ar,Answer Given options_armt,accuracy,0.7061548643282595 +xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.786896095301125 +xstory_cloze_ar,Generate Ending_armt,accuracy,0.600926538716082 +xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.7511581733951026 +xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.757114493712773 +xstory_cloze_ar,median,accuracy,0.7511581733951026 +xstory_cloze_es,Answer Given options_esmt,accuracy,0.7902051621442753 +xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.8160158835208471 +xstory_cloze_es,Generate Ending_esmt,accuracy,0.657180675049636 +xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.784910655195235 +xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.7696889477167439 +xstory_cloze_es,median,accuracy,0.784910655195235 +xstory_cloze_eu,Answer Given options_eumt,accuracy,0.6227663798808736 +xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.6763732627399074 +xstory_cloze_eu,Generate Ending_eumt,accuracy,0.5737921906022502 +xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.686300463269358 +xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.6637988087359364 +xstory_cloze_eu,median,accuracy,0.6637988087359364 +xstory_cloze_hi,Answer Given options_himt,accuracy,0.6697551290536069 +xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.7160820648577101 +xstory_cloze_hi,Generate Ending_himt,accuracy,0.5923229649238915 +xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.6882859033752482 +xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.7048312375909993 +xstory_cloze_hi,median,accuracy,0.6882859033752482 +xstory_cloze_id,Answer Given options_idmt,accuracy,0.7346128391793514 +xstory_cloze_id,Choose Story 
Ending_idmt,accuracy,0.7511581733951026 +xstory_cloze_id,Generate Ending_idmt,accuracy,0.6201191264063534 +xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.728656518861681 +xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.7412309728656519 +xstory_cloze_id,median,accuracy,0.7346128391793514 +xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.7425545996029119 +xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.7941760423560555 +xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.6247518199867638 +xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.7842488418266049 +xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.8034414295168762 +xstory_cloze_zh,median,accuracy,0.7842488418266049 +xwinograd_fr,Replace_frmt,accuracy,0.5180722891566265 +xwinograd_fr,True or False_frmt,accuracy,0.46987951807228917 +xwinograd_fr,does underscore refer to_frmt,accuracy,0.5421686746987951 +xwinograd_fr,stand for_frmt,accuracy,0.5060240963855421 +xwinograd_fr,underscore refer to_frmt,accuracy,0.5421686746987951 +xwinograd_fr,median,accuracy,0.5180722891566265 +xwinograd_pt,Replace_ptmt,accuracy,0.5057034220532319 +xwinograd_pt,True or False_ptmt,accuracy,0.5133079847908745 +xwinograd_pt,does underscore refer to_ptmt,accuracy,0.5209125475285171 +xwinograd_pt,stand for_ptmt,accuracy,0.5209125475285171 +xwinograd_pt,underscore refer to_ptmt,accuracy,0.49049429657794674 +xwinograd_pt,median,accuracy,0.5133079847908745 +xwinograd_zh,Replace_zhmt,accuracy,0.5238095238095238 +xwinograd_zh,True or False_zhmt,accuracy,0.5138888888888888 +xwinograd_zh,does underscore refer to_zhmt,accuracy,0.49404761904761907 +xwinograd_zh,stand for_zhmt,accuracy,0.49603174603174605 +xwinograd_zh,underscore refer to_zhmt,accuracy,0.503968253968254 +xwinograd_zh,median,accuracy,0.503968253968254 +multiple,average,multiple,0.6501688392588024 diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.json new 
file mode 100644 index 0000000..2b9b3dc --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.json @@ -0,0 +1 @@ +{"Muennighoff/xstory_cloze_ar": {"Answer Given options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7061548643282595}, "template_name": "Answer Given options_armt"}, "Choose Story Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.786896095301125}, "template_name": "Choose Story Ending_armt"}, "Generate Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.600926538716082}, "template_name": "Generate Ending_armt"}, "Novel Correct Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7511581733951026}, "template_name": "Novel Correct Ending_armt"}, "Story Continuation and Options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.757114493712773}, "template_name": "Story Continuation and Options_armt"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7902051621442753}, "template_name": "Answer Given options_esmt"}, "Choose Story Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8160158835208471}, "template_name": "Choose Story Ending_esmt"}, "Generate 
Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.657180675049636}, "template_name": "Generate Ending_esmt"}, "Novel Correct Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.784910655195235}, "template_name": "Novel Correct Ending_esmt"}, "Story Continuation and Options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7696889477167439}, "template_name": "Story Continuation and Options_esmt"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6227663798808736}, "template_name": "Answer Given options_eumt"}, "Choose Story Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story 
Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6763732627399074}, "template_name": "Choose Story Ending_eumt"}, "Generate Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5737921906022502}, "template_name": "Generate Ending_eumt"}, "Novel Correct Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.686300463269358}, "template_name": "Novel Correct Ending_eumt"}, "Story Continuation and Options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', 
dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6637988087359364}, "template_name": "Story Continuation and Options_eumt"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6697551290536069}, "template_name": "Answer Given options_himt"}, "Choose Story Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7160820648577101}, "template_name": "Choose Story Ending_himt"}, "Generate Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5923229649238915}, "template_name": "Generate Ending_himt"}, "Novel Correct Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6882859033752482}, "template_name": "Novel Correct Ending_himt"}, "Story Continuation and Options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7048312375909993}, "template_name": "Story Continuation and Options_himt"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7346128391793514}, "template_name": "Answer Given options_idmt"}, "Choose Story Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', 
dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7511581733951026}, "template_name": "Choose Story Ending_idmt"}, "Generate Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6201191264063534}, "template_name": "Generate Ending_idmt"}, "Novel Correct Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.728656518861681}, "template_name": "Novel Correct Ending_idmt"}, "Story Continuation and Options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7412309728656519}, "template_name": "Story Continuation and Options_idmt"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", 
"evaluation": {"accuracy": 0.7425545996029119}, "template_name": "Answer Given options_zhmt"}, "Choose Story Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7941760423560555}, "template_name": "Choose Story Ending_zhmt"}, "Generate Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6247518199867638}, "template_name": "Generate Ending_zhmt"}, "Novel Correct Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7842488418266049}, "template_name": "Novel Correct Ending_zhmt"}, "Story Continuation and Options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8034414295168762}, "template_name": "Story Continuation and Options_zhmt"}}, "Muennighoff/xwinograd_fr": {"Replace_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5180722891566265}, "template_name": "Replace_frmt"}, "True or False_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.46987951807228917}, "template_name": "True or False_frmt"}, "does underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5421686746987951}, "template_name": "does underscore refer to_frmt"}, "stand for_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', 
dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5060240963855421}, "template_name": "stand for_frmt"}, "underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5421686746987951}, "template_name": "underscore refer to_frmt"}}, "Muennighoff/xwinograd_pt": {"Replace_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, 
prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5057034220532319}, "template_name": "Replace_ptmt"}, "True or False_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5133079847908745}, "template_name": "True or False_ptmt"}, "does underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5209125475285171}, "template_name": "does underscore refer to_ptmt"}, "stand for_ptmt": {"arguments": "Namespace(config_name=None, 
dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5209125475285171}, "template_name": "stand for_ptmt"}, "underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49049429657794674}, "template_name": "underscore refer to_ptmt"}}, "Muennighoff/xwinograd_zh": {"Replace_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5238095238095238}, "template_name": "Replace_zhmt"}, "True or False_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5138888888888888}, "template_name": "True or False_zhmt"}, "does underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49404761904761907}, "template_name": "does underscore refer to_zhmt"}, "stand for_zhmt": {"arguments": 
"Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49603174603174605}, "template_name": "stand for_zhmt"}, "underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.503968253968254}, "template_name": "underscore refer to_zhmt"}}, "xcopa_id": {"C1 or C2? 
premise_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.51}, "template_name": "C1 or C2? premise_idmt"}, "best_option_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.53}, "template_name": "best_option_idmt"}, "cause_effect_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.69}, "template_name": "cause_effect_idmt"}, "i_am_hesitating_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "i_am_hesitating_idmt"}, "plausible_alternatives_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.7}, "template_name": "plausible_alternatives_idmt"}}, "xcopa_sw": {"C1 or C2? 
premise_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_swmt"}, "best_option_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "best_option_swmt"}, "cause_effect_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.49}, "template_name": "cause_effect_swmt"}, "i_am_hesitating_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "i_am_hesitating_swmt"}, "plausible_alternatives_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.54}, "template_name": "plausible_alternatives_swmt"}}, "xcopa_ta": {"C1 or C2? 
premise_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.52}, "template_name": "C1 or C2? premise_tamt"}, "best_option_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.55}, "template_name": "best_option_tamt"}, "cause_effect_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "cause_effect_tamt"}, "i_am_hesitating_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "i_am_hesitating_tamt"}, "plausible_alternatives_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "plausible_alternatives_tamt"}}, "xcopa_vi": {"C1 or C2? 
premise_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.55}, "template_name": "C1 or C2? premise_vimt"}, "best_option_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "best_option_vimt"}, "cause_effect_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "cause_effect_vimt"}, "i_am_hesitating_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "i_am_hesitating_vimt"}, "plausible_alternatives_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives_vimt"}}, "xcopa_zh": {"C1 or C2? 
premise_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.52}, "template_name": "C1 or C2? premise_zhmt"}, "best_option_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "best_option_zhmt"}, "cause_effect_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.75}, "template_name": "cause_effect_zhmt"}, "i_am_hesitating_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "i_am_hesitating_zhmt"}, "plausible_alternatives_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.76}, "template_name": "plausible_alternatives_zhmt"}}} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json 
b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json new file mode 100644 index 0000000..9ebe029 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "C1 or C2? premise_idmt", + "evaluation": { + "accuracy": 0.51 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/best_option_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/best_option_idmt/results.json new file mode 100644 index 0000000..7eba63a --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/best_option_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "best_option_idmt", + "evaluation": { + "accuracy": 0.53 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/cause_effect_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/cause_effect_idmt/results.json new file mode 100644 index 0000000..3dbf522 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/cause_effect_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "cause_effect_idmt", + "evaluation": { + "accuracy": 0.69 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/i_am_hesitating_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/i_am_hesitating_idmt/results.json new file mode 100644 index 0000000..443e40f --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/i_am_hesitating_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "i_am_hesitating_idmt", + "evaluation": { + "accuracy": 0.64 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', 
debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/plausible_alternatives_idmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/plausible_alternatives_idmt/results.json new file mode 100644 index 0000000..13e42e9 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/plausible_alternatives_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "plausible_alternatives_idmt", + "evaluation": { + "accuracy": 0.7 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json new file mode 100644 index 0000000..a2a0e80 --- 
/dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "C1 or C2? premise_swmt", + "evaluation": { + "accuracy": 0.6 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/best_option_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/best_option_swmt/results.json new file mode 100644 index 0000000..f8b9cd0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/best_option_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "best_option_swmt", + "evaluation": { + "accuracy": 0.62 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, 
use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/cause_effect_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/cause_effect_swmt/results.json new file mode 100644 index 0000000..5fc9371 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/cause_effect_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "cause_effect_swmt", + "evaluation": { + "accuracy": 0.49 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json new file mode 100644 index 0000000..744698b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "i_am_hesitating_swmt", + "evaluation": { + "accuracy": 0.56 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json new file mode 100644 index 0000000..07c5e09 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "plausible_alternatives_swmt", + "evaluation": { + "accuracy": 0.54 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json new file mode 100644 index 0000000..ae0ea79 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + 
"dataset_config_name": "ta", + "template_name": "C1 or C2? premise_tamt", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/best_option_tamt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/best_option_tamt/results.json new file mode 100644 index 0000000..5ccdaf0 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/best_option_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "best_option_tamt", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/cause_effect_tamt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/cause_effect_tamt/results.json new file mode 100644 index 0000000..f726f27 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/cause_effect_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "cause_effect_tamt", + "evaluation": { + "accuracy": 0.63 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json new file mode 100644 index 0000000..b69edcb --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "i_am_hesitating_tamt", + "evaluation": { + "accuracy": 0.63 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json new file mode 100644 index 0000000..9faee89 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "plausible_alternatives_tamt", + "evaluation": { + "accuracy": 0.66 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json new file mode 100644 index 0000000..cf439f3 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + 
"dataset_config_name": "vi", + "template_name": "C1 or C2? premise_vimt", + "evaluation": { + "accuracy": 0.55 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/best_option_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/best_option_vimt/results.json new file mode 100644 index 0000000..fabb6f8 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/best_option_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "best_option_vimt", + "evaluation": { + "accuracy": 0.61 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/cause_effect_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/cause_effect_vimt/results.json new file mode 100644 index 0000000..79af1aa --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/cause_effect_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "cause_effect_vimt", + "evaluation": { + "accuracy": 0.64 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json new file mode 100644 index 0000000..4ca7d6d --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "i_am_hesitating_vimt", + "evaluation": { + "accuracy": 0.6 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json new file mode 100644 index 0000000..2b5bd88 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "plausible_alternatives_vimt", + "evaluation": { + "accuracy": 0.64 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json new file mode 100644 index 0000000..a6de90b --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + 
"dataset_config_name": "zh", + "template_name": "C1 or C2? premise_zhmt", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/best_option_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/best_option_zhmt/results.json new file mode 100644 index 0000000..1e68673 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/best_option_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "best_option_zhmt", + "evaluation": { + "accuracy": 0.61 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git 
a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/cause_effect_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/cause_effect_zhmt/results.json new file mode 100644 index 0000000..6f04541 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/cause_effect_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "cause_effect_zhmt", + "evaluation": { + "accuracy": 0.75 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json new file mode 100644 index 0000000..3ca4d5c --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "i_am_hesitating_zhmt", + "evaluation": { + "accuracy": 0.72 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json new file mode 100644 index 0000000..c8250e6 --- /dev/null +++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "plausible_alternatives_zhmt", + "evaluation": { + "accuracy": 0.76 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..26db4d3 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc2a1a1f12c3259b6728a588bb7bb599171536af24e0d7daf017c8d12483407 +size 14138070936 diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..2ca6730 --- /dev/null +++ b/pytorch_model.bin @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f144185745aa045c4fe3ed87aefcb4095872dc75bc181663a06d4fef100867a +size 14138162687 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..25bc396 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "", "eos_token": "", "unk_token": "", "pad_token": ""} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..370bd68 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa39cd4b1500feb205bcce3b9703a4373414cafe4970e0657b413f7ddd2a9d3 +size 14500438 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..c633450 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"unk_token": "", "eos_token": "", "bos_token": "", "pad_token": "", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class": "BloomTokenizerFast"} \ No newline at end of file