208 lines
13 KiB
CSV
208 lines
13 KiB
CSV
dataset,prompt,metric,value
|
|
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.6176
|
|
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.5592
|
|
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.3922
|
|
amazon_reviews_multi_en,median,accuracy,0.5592
|
|
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.5526
|
|
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.5296
|
|
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.3646
|
|
amazon_reviews_multi_es,median,accuracy,0.5296
|
|
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.5332
|
|
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.5182
|
|
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3644
|
|
amazon_reviews_multi_fr,median,accuracy,0.5182
|
|
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.5174
|
|
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.5006
|
|
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3874
|
|
amazon_reviews_multi_zh,median,accuracy,0.5006
|
|
aqua_rat_raw,Answer questions from options,accuracy,0.24015748031496062
|
|
aqua_rat_raw,answer_quiz,accuracy,0.22440944881889763
|
|
aqua_rat_raw,select_the_best_option,accuracy,0.2559055118110236
|
|
aqua_rat_raw,median,accuracy,0.24015748031496062
|
|
art_None,choose_hypothesis,accuracy,0.5926892950391645
|
|
art_None,choose_hypothesis_believable,accuracy,0.5711488250652742
|
|
art_None,choose_hypothesis_desc,accuracy,0.5169712793733682
|
|
art_None,choose_hypothesis_likely,accuracy,0.5300261096605744
|
|
art_None,choose_hypothesis_options,accuracy,0.5672323759791122
|
|
art_None,median,accuracy,0.5672323759791122
|
|
banking77_None,direct_to_which_department,accuracy,0.16753246753246753
|
|
banking77_None,help_page_topic,accuracy,0.26785714285714285
|
|
banking77_None,rephrase_as_banking_term,accuracy,0.274025974025974
|
|
banking77_None,median,accuracy,0.26785714285714285
|
|
blbooksgenre_title_genre_classifiction,classify,accuracy,0.25057603686635943
|
|
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.25057603686635943
|
|
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7321428571428571
|
|
blbooksgenre_title_genre_classifiction,median,accuracy,0.25057603686635943
|
|
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.512
|
|
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.464
|
|
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.512
|
|
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.52
|
|
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.493
|
|
blimp_adjunct_island,median,accuracy,0.512
|
|
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.3166123778501629
|
|
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.4749185667752443
|
|
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.22996742671009773
|
|
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.24625407166123778
|
|
climate_fever_None,third_evidence_claim_pair,accuracy,0.24234527687296417
|
|
climate_fever_None,median,accuracy,0.24625407166123778
|
|
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.6693083573487032
|
|
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.6509365994236311
|
|
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.6761527377521613
|
|
codah_codah,median,accuracy,0.6693083573487032
|
|
commonsense_qa_None,answer_given_question_without_options,accuracy,0.6388206388206388
|
|
commonsense_qa_None,most_suitable_answer,accuracy,0.7313677313677314
|
|
commonsense_qa_None,question_answering,accuracy,0.7158067158067158
|
|
commonsense_qa_None,median,accuracy,0.7158067158067158
|
|
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
|
|
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
|
|
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
|
|
conv_ai_3_None,score_give_number,accuracy,0.057933419801124084
|
|
conv_ai_3_None,score_how_much,accuracy,0.010376134889753566
|
|
conv_ai_3_None,median,accuracy,0.39040207522697795
|
|
craigslist_bargains_None,best deal,accuracy,0.5192629815745393
|
|
craigslist_bargains_None,good deal for seller,accuracy,0.2529313232830821
|
|
craigslist_bargains_None,good deal for seller no list price,accuracy,0.09715242881072027
|
|
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.24623115577889448
|
|
craigslist_bargains_None,median,accuracy,0.2495812395309883
|
|
emotion_None,answer_question_with_emotion_label,accuracy,0.3375
|
|
emotion_None,answer_with_class_label,accuracy,0.214
|
|
emotion_None,choose_the_best_emotion_label,accuracy,0.312
|
|
emotion_None,reply_with_emoation_label,accuracy,0.4495
|
|
emotion_None,median,accuracy,0.32475
|
|
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3878091872791519
|
|
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.10114840989399293
|
|
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.35644876325088337
|
|
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.3670494699646643
|
|
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.08259717314487633
|
|
financial_phrasebank_sentences_allagree,median,accuracy,0.35644876325088337
|
|
glue_cola,Following sentence acceptable,accuracy,0.37583892617449666
|
|
glue_cola,Make sense yes no,accuracy,0.33940556088207097
|
|
glue_cola,Previous sentence acceptable,accuracy,0.31255992329817833
|
|
glue_cola,editing,accuracy,0.3844678811121764
|
|
glue_cola,is_this_correct,accuracy,0.37775647171620325
|
|
glue_cola,median,accuracy,0.37583892617449666
|
|
glue_sst2,following positive negative,accuracy,0.9426605504587156
|
|
glue_sst2,happy or mad,accuracy,0.8279816513761468
|
|
glue_sst2,positive negative after,accuracy,0.9472477064220184
|
|
glue_sst2,review,accuracy,0.9254587155963303
|
|
glue_sst2,said,accuracy,0.9059633027522935
|
|
glue_sst2,median,accuracy,0.9254587155963303
|
|
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.29428989751098095
|
|
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.29502196193265007
|
|
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.3938506588579795
|
|
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.30307467057101023
|
|
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.30234260614934116
|
|
head_qa_en,median,accuracy,0.30234260614934116
|
|
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.2730600292825769
|
|
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.27232796486090777
|
|
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.36530014641288433
|
|
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.3074670571010249
|
|
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3089311859443631
|
|
head_qa_es,median,accuracy,0.3074670571010249
|
|
health_fact_None,claim_explanation_classification,accuracy,0.5591836734693878
|
|
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.34938775510204084
|
|
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.48244897959183675
|
|
health_fact_None,median,accuracy,0.48244897959183675
|
|
hlgd_None,is_same_event_editor_asks,accuracy,0.6926051232479459
|
|
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6582890285161914
|
|
hlgd_None,is_same_event_refer,accuracy,0.7858869018849686
|
|
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.7839536007733204
|
|
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.7786370227162881
|
|
hlgd_None,median,accuracy,0.7786370227162881
|
|
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6232558139534884
|
|
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6294573643410852
|
|
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6077519379844961
|
|
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.627906976744186
|
|
hyperpartisan_news_detection_byarticle,median,accuracy,0.627906976744186
|
|
liar_None,Given statement guess category,accuracy,0.19314641744548286
|
|
liar_None,median,accuracy,0.19314641744548286
|
|
lince_sa_spaeng,express sentiment,accuracy,0.5696611081226466
|
|
lince_sa_spaeng,negation template,accuracy,0.3851533082302313
|
|
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5841850457235073
|
|
lince_sa_spaeng,sentiment trying to express,accuracy,0.5809575040344271
|
|
lince_sa_spaeng,the author seem,accuracy,0.5771920387305003
|
|
lince_sa_spaeng,median,accuracy,0.5771920387305003
|
|
math_qa_None,choose_correct_og,accuracy,0.23484087102177553
|
|
math_qa_None,first_choice_then_problem,accuracy,0.2254606365159129
|
|
math_qa_None,gre_problem,accuracy,0.21943048576214405
|
|
math_qa_None,pick_the_correct,accuracy,0.2338358458961474
|
|
math_qa_None,problem_set_type,accuracy,0.29246231155778896
|
|
math_qa_None,median,accuracy,0.2338358458961474
|
|
mlsum_es,layman_summ_es,bleu,0.026830705121606707
|
|
mlsum_es,palm_prompt,bleu,0.033413101613448924
|
|
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.02224579465087946
|
|
mlsum_es,median,bleu,0.026830705121606707
|
|
movie_rationales_None,Evidences + review,accuracy,0.97
|
|
movie_rationales_None,Evidences sentiment classification,accuracy,1.0
|
|
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.95
|
|
movie_rationales_None,median,accuracy,0.97
|
|
mwsc_None,in-the-sentence,accuracy,0.6219512195121951
|
|
mwsc_None,in-the-sentence-question-first,accuracy,0.5853658536585366
|
|
mwsc_None,is-correct,accuracy,0.5365853658536586
|
|
mwsc_None,options-or,accuracy,0.6097560975609756
|
|
mwsc_None,what-think,accuracy,0.6097560975609756
|
|
mwsc_None,median,accuracy,0.6097560975609756
|
|
onestop_english_None,ara_context,accuracy,0.3333333333333333
|
|
onestop_english_None,assess,accuracy,0.3333333333333333
|
|
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.5696649029982364
|
|
onestop_english_None,esl_context,accuracy,0.3333333333333333
|
|
onestop_english_None,esl_variation,accuracy,0.3333333333333333
|
|
onestop_english_None,median,accuracy,0.3333333333333333
|
|
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.22857142857142856
|
|
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.2571428571428571
|
|
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.2571428571428571
|
|
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.21904761904761905
|
|
poem_sentiment_None,question_answer_format,accuracy,0.24761904761904763
|
|
poem_sentiment_None,median,accuracy,0.24761904761904763
|
|
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.598
|
|
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.581
|
|
pubmed_qa_pqa_labeled,median,accuracy,0.5894999999999999
|
|
riddle_sense_None,answer_given_question_without_options,accuracy,0.4534769833496572
|
|
riddle_sense_None,most_suitable_answer,accuracy,0.4348677766895201
|
|
riddle_sense_None,question_answering,accuracy,0.4407443682664055
|
|
riddle_sense_None,question_to_answer_index,accuracy,0.3878550440744368
|
|
riddle_sense_None,median,accuracy,0.43780607247796277
|
|
scicite_None,Classify intent,accuracy,0.15065502183406113
|
|
scicite_None,Classify intent (choices first),accuracy,0.1331877729257642
|
|
scicite_None,Classify intent (select choice),accuracy,0.2652838427947598
|
|
scicite_None,Classify intent w/section (select choice),accuracy,0.3537117903930131
|
|
scicite_None,can_describe,accuracy,0.15283842794759825
|
|
scicite_None,median,accuracy,0.15283842794759825
|
|
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9121019108280255
|
|
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9171974522292994
|
|
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.7503184713375797
|
|
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.8993630573248408
|
|
selqa_answer_selection_analysis,median,accuracy,0.9057324840764331
|
|
snips_built_in_intents_None,categorize_query,accuracy,0.47865853658536583
|
|
snips_built_in_intents_None,categorize_query_brief,accuracy,0.375
|
|
snips_built_in_intents_None,intent_query,accuracy,0.31402439024390244
|
|
snips_built_in_intents_None,query_intent,accuracy,0.7012195121951219
|
|
snips_built_in_intents_None,voice_intent,accuracy,0.6128048780487805
|
|
snips_built_in_intents_None,median,accuracy,0.47865853658536583
|
|
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.02125573406419127
|
|
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.015697853682886957
|
|
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.0037928468482204985
|
|
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.047885599586875285
|
|
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.021861712984543362
|
|
wmt14_fr_en_en-fr,median,bleu,0.02125573406419127
|
|
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.3038834619016813
|
|
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.22361703612398195
|
|
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.17167001660570336
|
|
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.23925613843737142
|
|
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.1410190003658709
|
|
wmt14_fr_en_fr-en,median,bleu,0.22361703612398195
|
|
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.0018051438917625368
|
|
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0018126292465026588
|
|
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.00010782650615890081
|
|
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.0018585745110753149
|
|
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,2.225608801197892e-05
|
|
wmt14_hi_en_en-hi,median,bleu,0.0018051438917625368
|
|
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.16056644593701627
|
|
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.1503249107946881
|
|
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,0.05607403962346587
|
|
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.15167071858881462
|
|
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.03675518735361532
|
|
wmt14_hi_en_hi-en,median,bleu,0.1503249107946881
|
|
multiple,average,multiple,0.42128315936464156
|