13 KiB
13 KiB
| 1 | dataset | prompt | metric | value |
|---|---|---|---|---|
| 2 | amazon_reviews_multi_en | prompt_body_title_to_star | accuracy | 0.6176 |
| 3 | amazon_reviews_multi_en | prompt_review_to_star | accuracy | 0.5592 |
| 4 | amazon_reviews_multi_en | prompt_title_to_star | accuracy | 0.3922 |
| 5 | amazon_reviews_multi_en | median | accuracy | 0.5592 |
| 6 | amazon_reviews_multi_es | prompt_body_title_to_star | accuracy | 0.5526 |
| 7 | amazon_reviews_multi_es | prompt_review_to_star | accuracy | 0.5296 |
| 8 | amazon_reviews_multi_es | prompt_title_to_star | accuracy | 0.3646 |
| 9 | amazon_reviews_multi_es | median | accuracy | 0.5296 |
| 10 | amazon_reviews_multi_fr | prompt_body_title_to_star | accuracy | 0.5332 |
| 11 | amazon_reviews_multi_fr | prompt_review_to_star | accuracy | 0.5182 |
| 12 | amazon_reviews_multi_fr | prompt_title_to_star | accuracy | 0.3644 |
| 13 | amazon_reviews_multi_fr | median | accuracy | 0.5182 |
| 14 | amazon_reviews_multi_zh | prompt_body_title_to_star | accuracy | 0.5174 |
| 15 | amazon_reviews_multi_zh | prompt_review_to_star | accuracy | 0.5006 |
| 16 | amazon_reviews_multi_zh | prompt_title_to_star | accuracy | 0.3874 |
| 17 | amazon_reviews_multi_zh | median | accuracy | 0.5006 |
| 18 | aqua_rat_raw | Answer questions from options | accuracy | 0.24015748031496062 |
| 19 | aqua_rat_raw | answer_quiz | accuracy | 0.22440944881889763 |
| 20 | aqua_rat_raw | select_the_best_option | accuracy | 0.2559055118110236 |
| 21 | aqua_rat_raw | median | accuracy | 0.24015748031496062 |
| 22 | art_None | choose_hypothesis | accuracy | 0.5926892950391645 |
| 23 | art_None | choose_hypothesis_believable | accuracy | 0.5711488250652742 |
| 24 | art_None | choose_hypothesis_desc | accuracy | 0.5169712793733682 |
| 25 | art_None | choose_hypothesis_likely | accuracy | 0.5300261096605744 |
| 26 | art_None | choose_hypothesis_options | accuracy | 0.5672323759791122 |
| 27 | art_None | median | accuracy | 0.5672323759791122 |
| 28 | banking77_None | direct_to_which_department | accuracy | 0.16753246753246753 |
| 29 | banking77_None | help_page_topic | accuracy | 0.26785714285714285 |
| 30 | banking77_None | rephrase_as_banking_term | accuracy | 0.274025974025974 |
| 31 | banking77_None | median | accuracy | 0.26785714285714285 |
| 32 | blbooksgenre_title_genre_classifiction | classify | accuracy | 0.25057603686635943 |
| 33 | blbooksgenre_title_genre_classifiction | multi-choice | accuracy | 0.25057603686635943 |
| 34 | blbooksgenre_title_genre_classifiction | premise_context_first | accuracy | 0.7321428571428571 |
| 35 | blbooksgenre_title_genre_classifiction | median | accuracy | 0.25057603686635943 |
| 36 | blimp_adjunct_island | grammatical_between_1_2 | accuracy | 0.512 |
| 37 | blimp_adjunct_island | grammatical_between_A_B | accuracy | 0.464 |
| 38 | blimp_adjunct_island | grammatical_which_one_1_2 | accuracy | 0.512 |
| 39 | blimp_adjunct_island | single_sentence_bad_yes_no | accuracy | 0.52 |
| 40 | blimp_adjunct_island | single_sentence_good_yes_no | accuracy | 0.493 |
| 41 | blimp_adjunct_island | median | accuracy | 0.512 |
| 42 | climate_fever_None | claim_and_all_supporting_evidences | accuracy | 0.3166123778501629 |
| 43 | climate_fever_None | fifth_evidence_and_claim_itemization | accuracy | 0.4749185667752443 |
| 44 | climate_fever_None | first_evidence_and_claim_itemization | accuracy | 0.22996742671009773 |
| 45 | climate_fever_None | second_evidence_and_claim_itemization | accuracy | 0.24625407166123778 |
| 46 | climate_fever_None | third_evidence_claim_pair | accuracy | 0.24234527687296417 |
| 47 | climate_fever_None | median | accuracy | 0.24625407166123778 |
| 48 | codah_codah | affirmative_instruction_after_sentence_and_choices | accuracy | 0.6693083573487032 |
| 49 | codah_codah | affirmative_instruction_before_sentence_and_choices | accuracy | 0.6509365994236311 |
| 50 | codah_codah | interrogative_instruction_after_sentence_and_choices | accuracy | 0.6761527377521613 |
| 51 | codah_codah | median | accuracy | 0.6693083573487032 |
| 52 | commonsense_qa_None | answer_given_question_without_options | accuracy | 0.6388206388206388 |
| 53 | commonsense_qa_None | most_suitable_answer | accuracy | 0.7313677313677314 |
| 54 | commonsense_qa_None | question_answering | accuracy | 0.7158067158067158 |
| 55 | commonsense_qa_None | median | accuracy | 0.7158067158067158 |
| 56 | conv_ai_3_None | ambiguous | accuracy | 0.39040207522697795 |
| 57 | conv_ai_3_None | clarification_needed | accuracy | 0.39040207522697795 |
| 58 | conv_ai_3_None | directly_answer | accuracy | 0.6095979247730221 |
| 59 | conv_ai_3_None | score_give_number | accuracy | 0.057933419801124084 |
| 60 | conv_ai_3_None | score_how_much | accuracy | 0.010376134889753566 |
| 61 | conv_ai_3_None | median | accuracy | 0.39040207522697795 |
| 62 | craigslist_bargains_None | best deal | accuracy | 0.5192629815745393 |
| 63 | craigslist_bargains_None | good deal for seller | accuracy | 0.2529313232830821 |
| 64 | craigslist_bargains_None | good deal for seller no list price | accuracy | 0.09715242881072027 |
| 65 | craigslist_bargains_None | good deal for seller no list price implicit | accuracy | 0.24623115577889448 |
| 66 | craigslist_bargains_None | median | accuracy | 0.2495812395309883 |
| 67 | emotion_None | answer_question_with_emotion_label | accuracy | 0.3375 |
| 68 | emotion_None | answer_with_class_label | accuracy | 0.214 |
| 69 | emotion_None | choose_the_best_emotion_label | accuracy | 0.312 |
| 70 | emotion_None | reply_with_emoation_label | accuracy | 0.4495 |
| 71 | emotion_None | median | accuracy | 0.32475 |
| 72 | financial_phrasebank_sentences_allagree | bullish_neutral_bearish | accuracy | 0.3878091872791519 |
| 73 | financial_phrasebank_sentences_allagree | complementary_industries | accuracy | 0.10114840989399293 |
| 74 | financial_phrasebank_sentences_allagree | sentiment | accuracy | 0.35644876325088337 |
| 75 | financial_phrasebank_sentences_allagree | share_price_option | accuracy | 0.3670494699646643 |
| 76 | financial_phrasebank_sentences_allagree | word_comes_to_mind | accuracy | 0.08259717314487633 |
| 77 | financial_phrasebank_sentences_allagree | median | accuracy | 0.35644876325088337 |
| 78 | glue_cola | Following sentence acceptable | accuracy | 0.37583892617449666 |
| 79 | glue_cola | Make sense yes no | accuracy | 0.33940556088207097 |
| 80 | glue_cola | Previous sentence acceptable | accuracy | 0.31255992329817833 |
| 81 | glue_cola | editing | accuracy | 0.3844678811121764 |
| 82 | glue_cola | is_this_correct | accuracy | 0.37775647171620325 |
| 83 | glue_cola | median | accuracy | 0.37583892617449666 |
| 84 | glue_sst2 | following positive negative | accuracy | 0.9426605504587156 |
| 85 | glue_sst2 | happy or mad | accuracy | 0.8279816513761468 |
| 86 | glue_sst2 | positive negative after | accuracy | 0.9472477064220184 |
| 87 | glue_sst2 | review | accuracy | 0.9254587155963303 |
| 88 | glue_sst2 | said | accuracy | 0.9059633027522935 |
| 89 | glue_sst2 | median | accuracy | 0.9254587155963303 |
| 90 | head_qa_en | multiple_choice_a_and_q_en | accuracy | 0.29428989751098095 |
| 91 | head_qa_en | multiple_choice_a_and_q_with_context_en | accuracy | 0.29502196193265007 |
| 92 | head_qa_en | multiple_choice_q_and_a_en | accuracy | 0.3938506588579795 |
| 93 | head_qa_en | multiple_choice_q_and_a_index_en | accuracy | 0.30307467057101023 |
| 94 | head_qa_en | multiple_choice_q_and_a_index_with_context_en | accuracy | 0.30234260614934116 |
| 95 | head_qa_en | median | accuracy | 0.30234260614934116 |
| 96 | head_qa_es | multiple_choice_a_and_q_en | accuracy | 0.2730600292825769 |
| 97 | head_qa_es | multiple_choice_a_and_q_with_context_en | accuracy | 0.27232796486090777 |
| 98 | head_qa_es | multiple_choice_q_and_a_en | accuracy | 0.36530014641288433 |
| 99 | head_qa_es | multiple_choice_q_and_a_index_en | accuracy | 0.3074670571010249 |
| 100 | head_qa_es | multiple_choice_q_and_a_index_with_context_en | accuracy | 0.3089311859443631 |
| 101 | head_qa_es | median | accuracy | 0.3074670571010249 |
| 102 | health_fact_None | claim_explanation_classification | accuracy | 0.5591836734693878 |
| 103 | health_fact_None | claim_veracity_classification_after_reading_I_believe | accuracy | 0.34938775510204084 |
| 104 | health_fact_None | claim_veracity_classification_tell_me | accuracy | 0.48244897959183675 |
| 105 | health_fact_None | median | accuracy | 0.48244897959183675 |
| 106 | hlgd_None | is_same_event_editor_asks | accuracy | 0.6926051232479459 |
| 107 | hlgd_None | is_same_event_interrogative_talk | accuracy | 0.6582890285161914 |
| 108 | hlgd_None | is_same_event_refer | accuracy | 0.7858869018849686 |
| 109 | hlgd_None | is_same_event_with_time_interrogative_related | accuracy | 0.7839536007733204 |
| 110 | hlgd_None | is_same_event_with_time_interrogative_talk | accuracy | 0.7786370227162881 |
| 111 | hlgd_None | median | accuracy | 0.7786370227162881 |
| 112 | hyperpartisan_news_detection_byarticle | consider_does_it_follow_a_hyperpartisan_argumentation | accuracy | 0.6232558139534884 |
| 113 | hyperpartisan_news_detection_byarticle | consider_it_exhibits_extreme_one_sidedness | accuracy | 0.6310077519379845 |
| 114 | hyperpartisan_news_detection_byarticle | consume_with_caution | accuracy | 0.6294573643410852 |
| 115 | hyperpartisan_news_detection_byarticle | extreme_left_wing_or_right_wing | accuracy | 0.6077519379844961 |
| 116 | hyperpartisan_news_detection_byarticle | follows_hyperpartisan_argumentation | accuracy | 0.627906976744186 |
| 117 | hyperpartisan_news_detection_byarticle | median | accuracy | 0.627906976744186 |
| 118 | liar_None | Given statement guess category | accuracy | 0.19314641744548286 |
| 119 | liar_None | median | accuracy | 0.19314641744548286 |
| 120 | lince_sa_spaeng | express sentiment | accuracy | 0.5696611081226466 |
| 121 | lince_sa_spaeng | negation template | accuracy | 0.3851533082302313 |
| 122 | lince_sa_spaeng | original poster expressed sentiment | accuracy | 0.5841850457235073 |
| 123 | lince_sa_spaeng | sentiment trying to express | accuracy | 0.5809575040344271 |
| 124 | lince_sa_spaeng | the author seem | accuracy | 0.5771920387305003 |
| 125 | lince_sa_spaeng | median | accuracy | 0.5771920387305003 |
| 126 | math_qa_None | choose_correct_og | accuracy | 0.23484087102177553 |
| 127 | math_qa_None | first_choice_then_problem | accuracy | 0.2254606365159129 |
| 128 | math_qa_None | gre_problem | accuracy | 0.21943048576214405 |
| 129 | math_qa_None | pick_the_correct | accuracy | 0.2338358458961474 |
| 130 | math_qa_None | problem_set_type | accuracy | 0.29246231155778896 |
| 131 | math_qa_None | median | accuracy | 0.2338358458961474 |
| 132 | mlsum_es | layman_summ_es | bleu | 0.026830705121606707 |
| 133 | mlsum_es | palm_prompt | bleu | 0.033413101613448924 |
| 134 | mlsum_es | summarise_this_in_es_few_sentences | bleu | 0.02224579465087946 |
| 135 | mlsum_es | median | bleu | 0.026830705121606707 |
| 136 | movie_rationales_None | Evidences + review | accuracy | 0.97 |
| 137 | movie_rationales_None | Evidences sentiment classification | accuracy | 1.0 |
| 138 | movie_rationales_None | Standard binary sentiment analysis | accuracy | 0.95 |
| 139 | movie_rationales_None | median | accuracy | 0.97 |
| 140 | mwsc_None | in-the-sentence | accuracy | 0.6219512195121951 |
| 141 | mwsc_None | in-the-sentence-question-first | accuracy | 0.5853658536585366 |
| 142 | mwsc_None | is-correct | accuracy | 0.5365853658536586 |
| 143 | mwsc_None | options-or | accuracy | 0.6097560975609756 |
| 144 | mwsc_None | what-think | accuracy | 0.6097560975609756 |
| 145 | mwsc_None | median | accuracy | 0.6097560975609756 |
| 146 | onestop_english_None | ara_context | accuracy | 0.3333333333333333 |
| 147 | onestop_english_None | assess | accuracy | 0.3333333333333333 |
| 148 | onestop_english_None | determine_reading_level_from_the_first_three_sentences | accuracy | 0.5696649029982364 |
| 149 | onestop_english_None | esl_context | accuracy | 0.3333333333333333 |
| 150 | onestop_english_None | esl_variation | accuracy | 0.3333333333333333 |
| 151 | onestop_english_None | median | accuracy | 0.3333333333333333 |
| 152 | poem_sentiment_None | guess_sentiment_without_options_variation_1 | accuracy | 0.22857142857142856 |
| 153 | poem_sentiment_None | most_appropriate_sentiment | accuracy | 0.2571428571428571 |
| 154 | poem_sentiment_None | positive_or_negative_sentiment_variation_1 | accuracy | 0.2571428571428571 |
| 155 | poem_sentiment_None | positive_or_negative_sentiment_variation_2 | accuracy | 0.21904761904761905 |
| 156 | poem_sentiment_None | question_answer_format | accuracy | 0.24761904761904763 |
| 157 | poem_sentiment_None | median | accuracy | 0.24761904761904763 |
| 158 | pubmed_qa_pqa_labeled | Long Answer to Final Decision | accuracy | 0.598 |
| 159 | pubmed_qa_pqa_labeled | Question Answering (Short) | accuracy | 0.581 |
| 160 | pubmed_qa_pqa_labeled | median | accuracy | 0.5894999999999999 |
| 161 | riddle_sense_None | answer_given_question_without_options | accuracy | 0.4534769833496572 |
| 162 | riddle_sense_None | most_suitable_answer | accuracy | 0.4348677766895201 |
| 163 | riddle_sense_None | question_answering | accuracy | 0.4407443682664055 |
| 164 | riddle_sense_None | question_to_answer_index | accuracy | 0.3878550440744368 |
| 165 | riddle_sense_None | median | accuracy | 0.43780607247796277 |
| 166 | scicite_None | Classify intent | accuracy | 0.15065502183406113 |
| 167 | scicite_None | Classify intent (choices first) | accuracy | 0.1331877729257642 |
| 168 | scicite_None | Classify intent (select choice) | accuracy | 0.2652838427947598 |
| 169 | scicite_None | Classify intent w/section (select choice) | accuracy | 0.3537117903930131 |
| 170 | scicite_None | can_describe | accuracy | 0.15283842794759825 |
| 171 | scicite_None | median | accuracy | 0.15283842794759825 |
| 172 | selqa_answer_selection_analysis | is-he-talking-about | accuracy | 0.9121019108280255 |
| 173 | selqa_answer_selection_analysis | make-sense-rand | accuracy | 0.9171974522292994 |
| 174 | selqa_answer_selection_analysis | which-answer-1st-vs-random | accuracy | 0.7503184713375797 |
| 175 | selqa_answer_selection_analysis | would-make-sense-qu-rand | accuracy | 0.8993630573248408 |
| 176 | selqa_answer_selection_analysis | median | accuracy | 0.9057324840764331 |
| 177 | snips_built_in_intents_None | categorize_query | accuracy | 0.47865853658536583 |
| 178 | snips_built_in_intents_None | categorize_query_brief | accuracy | 0.375 |
| 179 | snips_built_in_intents_None | intent_query | accuracy | 0.31402439024390244 |
| 180 | snips_built_in_intents_None | query_intent | accuracy | 0.7012195121951219 |
| 181 | snips_built_in_intents_None | voice_intent | accuracy | 0.6128048780487805 |
| 182 | snips_built_in_intents_None | median | accuracy | 0.47865853658536583 |
| 183 | wmt14_fr_en_en-fr | a_good_translation-en-fr-source+target | bleu | 0.02125573406419127 |
| 184 | wmt14_fr_en_en-fr | a_good_translation-en-fr-target | bleu | 0.015697853682886957 |
| 185 | wmt14_fr_en_en-fr | gpt3-en-fr | bleu | 0.0037928468482204985 |
| 186 | wmt14_fr_en_en-fr | version-en-fr-target | bleu | 0.047885599586875285 |
| 187 | wmt14_fr_en_en-fr | xglm-en-fr-target | bleu | 0.021861712984543362 |
| 188 | wmt14_fr_en_en-fr | median | bleu | 0.02125573406419127 |
| 189 | wmt14_fr_en_fr-en | a_good_translation-fr-en-source+target | bleu | 0.3038834619016813 |
| 190 | wmt14_fr_en_fr-en | a_good_translation-fr-en-target | bleu | 0.22361703612398195 |
| 191 | wmt14_fr_en_fr-en | gpt3-fr-en | bleu | 0.17167001660570336 |
| 192 | wmt14_fr_en_fr-en | version-fr-en-target | bleu | 0.23925613843737142 |
| 193 | wmt14_fr_en_fr-en | xglm-fr-en-target | bleu | 0.1410190003658709 |
| 194 | wmt14_fr_en_fr-en | median | bleu | 0.22361703612398195 |
| 195 | wmt14_hi_en_en-hi | a_good_translation-en-hi-source+target | bleu | 0.0018051438917625368 |
| 196 | wmt14_hi_en_en-hi | a_good_translation-en-hi-target | bleu | 0.0018126292465026588 |
| 197 | wmt14_hi_en_en-hi | gpt-3-en-hi-target | bleu | 0.00010782650615890081 |
| 198 | wmt14_hi_en_en-hi | version-en-hi-target | bleu | 0.0018585745110753149 |
| 199 | wmt14_hi_en_en-hi | xglm-en-hi-target | bleu | 2.225608801197892e-05 |
| 200 | wmt14_hi_en_en-hi | median | bleu | 0.0018051438917625368 |
| 201 | wmt14_hi_en_hi-en | a_good_translation-hi-en-source+target | bleu | 0.16056644593701627 |
| 202 | wmt14_hi_en_hi-en | a_good_translation-hi-en-target | bleu | 0.1503249107946881 |
| 203 | wmt14_hi_en_hi-en | gpt-3-hi-en-target | bleu | 0.05607403962346587 |
| 204 | wmt14_hi_en_hi-en | version-hi-en-target | bleu | 0.15167071858881462 |
| 205 | wmt14_hi_en_hi-en | xglm-hi-en-target | bleu | 0.03675518735361532 |
| 206 | wmt14_hi_en_hi-en | median | bleu | 0.1503249107946881 |
| 207 | multiple | average | multiple | 0.42128315936464156 |