初始化项目,由ModelHub XC社区提供模型

Model: bigscience/bloomz-7b1-p3
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-15 07:40:14 +08:00
commit 78a6661ff1
634 changed files with 7477 additions and 0 deletions

View File

@@ -0,0 +1,207 @@
dataset,prompt,metric,value
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.6176
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.5592
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.3922
amazon_reviews_multi_en,median,accuracy,0.5592
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.5526
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.5296
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.3646
amazon_reviews_multi_es,median,accuracy,0.5296
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.5332
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.5182
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3644
amazon_reviews_multi_fr,median,accuracy,0.5182
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.5174
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.5006
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3874
amazon_reviews_multi_zh,median,accuracy,0.5006
aqua_rat_raw,Answer questions from options,accuracy,0.24015748031496062
aqua_rat_raw,answer_quiz,accuracy,0.22440944881889763
aqua_rat_raw,select_the_best_option,accuracy,0.2559055118110236
aqua_rat_raw,median,accuracy,0.24015748031496062
art_None,choose_hypothesis,accuracy,0.5926892950391645
art_None,choose_hypothesis_believable,accuracy,0.5711488250652742
art_None,choose_hypothesis_desc,accuracy,0.5169712793733682
art_None,choose_hypothesis_likely,accuracy,0.5300261096605744
art_None,choose_hypothesis_options,accuracy,0.5672323759791122
art_None,median,accuracy,0.5672323759791122
banking77_None,direct_to_which_department,accuracy,0.16753246753246753
banking77_None,help_page_topic,accuracy,0.26785714285714285
banking77_None,rephrase_as_banking_term,accuracy,0.274025974025974
banking77_None,median,accuracy,0.26785714285714285
blbooksgenre_title_genre_classifiction,classify,accuracy,0.25057603686635943
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.25057603686635943
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7321428571428571
blbooksgenre_title_genre_classifiction,median,accuracy,0.25057603686635943
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.512
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.464
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.512
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.52
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.493
blimp_adjunct_island,median,accuracy,0.512
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.3166123778501629
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.4749185667752443
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.22996742671009773
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.24625407166123778
climate_fever_None,third_evidence_claim_pair,accuracy,0.24234527687296417
climate_fever_None,median,accuracy,0.24625407166123778
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.6693083573487032
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.6509365994236311
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.6761527377521613
codah_codah,median,accuracy,0.6693083573487032
commonsense_qa_None,answer_given_question_without_options,accuracy,0.6388206388206388
commonsense_qa_None,most_suitable_answer,accuracy,0.7313677313677314
commonsense_qa_None,question_answering,accuracy,0.7158067158067158
commonsense_qa_None,median,accuracy,0.7158067158067158
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
conv_ai_3_None,score_give_number,accuracy,0.057933419801124084
conv_ai_3_None,score_how_much,accuracy,0.010376134889753566
conv_ai_3_None,median,accuracy,0.39040207522697795
craigslist_bargains_None,best deal,accuracy,0.5192629815745393
craigslist_bargains_None,good deal for seller,accuracy,0.2529313232830821
craigslist_bargains_None,good deal for seller no list price,accuracy,0.09715242881072027
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.24623115577889448
craigslist_bargains_None,median,accuracy,0.2495812395309883
emotion_None,answer_question_with_emotion_label,accuracy,0.3375
emotion_None,answer_with_class_label,accuracy,0.214
emotion_None,choose_the_best_emotion_label,accuracy,0.312
emotion_None,reply_with_emoation_label,accuracy,0.4495
emotion_None,median,accuracy,0.32475
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3878091872791519
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.10114840989399293
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.35644876325088337
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.3670494699646643
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.08259717314487633
financial_phrasebank_sentences_allagree,median,accuracy,0.35644876325088337
glue_cola,Following sentence acceptable,accuracy,0.37583892617449666
glue_cola,Make sense yes no,accuracy,0.33940556088207097
glue_cola,Previous sentence acceptable,accuracy,0.31255992329817833
glue_cola,editing,accuracy,0.3844678811121764
glue_cola,is_this_correct,accuracy,0.37775647171620325
glue_cola,median,accuracy,0.37583892617449666
glue_sst2,following positive negative,accuracy,0.9426605504587156
glue_sst2,happy or mad,accuracy,0.8279816513761468
glue_sst2,positive negative after,accuracy,0.9472477064220184
glue_sst2,review,accuracy,0.9254587155963303
glue_sst2,said,accuracy,0.9059633027522935
glue_sst2,median,accuracy,0.9254587155963303
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.29428989751098095
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.29502196193265007
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.3938506588579795
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.30307467057101023
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.30234260614934116
head_qa_en,median,accuracy,0.30234260614934116
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.2730600292825769
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.27232796486090777
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.36530014641288433
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.3074670571010249
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3089311859443631
head_qa_es,median,accuracy,0.3074670571010249
health_fact_None,claim_explanation_classification,accuracy,0.5591836734693878
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.34938775510204084
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.48244897959183675
health_fact_None,median,accuracy,0.48244897959183675
hlgd_None,is_same_event_editor_asks,accuracy,0.6926051232479459
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6582890285161914
hlgd_None,is_same_event_refer,accuracy,0.7858869018849686
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.7839536007733204
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.7786370227162881
hlgd_None,median,accuracy,0.7786370227162881
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6232558139534884
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6294573643410852
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6077519379844961
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.627906976744186
hyperpartisan_news_detection_byarticle,median,accuracy,0.627906976744186
liar_None,Given statement guess category,accuracy,0.19314641744548286
liar_None,median,accuracy,0.19314641744548286
lince_sa_spaeng,express sentiment,accuracy,0.5696611081226466
lince_sa_spaeng,negation template,accuracy,0.3851533082302313
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5841850457235073
lince_sa_spaeng,sentiment trying to express,accuracy,0.5809575040344271
lince_sa_spaeng,the author seem,accuracy,0.5771920387305003
lince_sa_spaeng,median,accuracy,0.5771920387305003
math_qa_None,choose_correct_og,accuracy,0.23484087102177553
math_qa_None,first_choice_then_problem,accuracy,0.2254606365159129
math_qa_None,gre_problem,accuracy,0.21943048576214405
math_qa_None,pick_the_correct,accuracy,0.2338358458961474
math_qa_None,problem_set_type,accuracy,0.29246231155778896
math_qa_None,median,accuracy,0.2338358458961474
mlsum_es,layman_summ_es,bleu,0.026830705121606707
mlsum_es,palm_prompt,bleu,0.033413101613448924
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.02224579465087946
mlsum_es,median,bleu,0.026830705121606707
movie_rationales_None,Evidences + review,accuracy,0.97
movie_rationales_None,Evidences sentiment classification,accuracy,1.0
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.95
movie_rationales_None,median,accuracy,0.97
mwsc_None,in-the-sentence,accuracy,0.6219512195121951
mwsc_None,in-the-sentence-question-first,accuracy,0.5853658536585366
mwsc_None,is-correct,accuracy,0.5365853658536586
mwsc_None,options-or,accuracy,0.6097560975609756
mwsc_None,what-think,accuracy,0.6097560975609756
mwsc_None,median,accuracy,0.6097560975609756
onestop_english_None,ara_context,accuracy,0.3333333333333333
onestop_english_None,assess,accuracy,0.3333333333333333
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.5696649029982364
onestop_english_None,esl_context,accuracy,0.3333333333333333
onestop_english_None,esl_variation,accuracy,0.3333333333333333
onestop_english_None,median,accuracy,0.3333333333333333
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.22857142857142856
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.2571428571428571
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.2571428571428571
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.21904761904761905
poem_sentiment_None,question_answer_format,accuracy,0.24761904761904763
poem_sentiment_None,median,accuracy,0.24761904761904763
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.598
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.581
pubmed_qa_pqa_labeled,median,accuracy,0.5894999999999999
riddle_sense_None,answer_given_question_without_options,accuracy,0.4534769833496572
riddle_sense_None,most_suitable_answer,accuracy,0.4348677766895201
riddle_sense_None,question_answering,accuracy,0.4407443682664055
riddle_sense_None,question_to_answer_index,accuracy,0.3878550440744368
riddle_sense_None,median,accuracy,0.43780607247796277
scicite_None,Classify intent,accuracy,0.15065502183406113
scicite_None,Classify intent (choices first),accuracy,0.1331877729257642
scicite_None,Classify intent (select choice),accuracy,0.2652838427947598
scicite_None,Classify intent w/section (select choice),accuracy,0.3537117903930131
scicite_None,can_describe,accuracy,0.15283842794759825
scicite_None,median,accuracy,0.15283842794759825
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9121019108280255
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9171974522292994
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.7503184713375797
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.8993630573248408
selqa_answer_selection_analysis,median,accuracy,0.9057324840764331
snips_built_in_intents_None,categorize_query,accuracy,0.47865853658536583
snips_built_in_intents_None,categorize_query_brief,accuracy,0.375
snips_built_in_intents_None,intent_query,accuracy,0.31402439024390244
snips_built_in_intents_None,query_intent,accuracy,0.7012195121951219
snips_built_in_intents_None,voice_intent,accuracy,0.6128048780487805
snips_built_in_intents_None,median,accuracy,0.47865853658536583
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.02125573406419127
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.015697853682886957
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.0037928468482204985
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.047885599586875285
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.021861712984543362
wmt14_fr_en_en-fr,median,bleu,0.02125573406419127
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.3038834619016813
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.22361703612398195
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.17167001660570336
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.23925613843737142
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.1410190003658709
wmt14_fr_en_fr-en,median,bleu,0.22361703612398195
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.0018051438917625368
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0018126292465026588
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.00010782650615890081
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.0018585745110753149
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,2.225608801197892e-05
wmt14_hi_en_en-hi,median,bleu,0.0018051438917625368
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.16056644593701627
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.1503249107946881
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,0.05607403962346587
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.15167071858881462
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.03675518735361532
wmt14_hi_en_hi-en,median,bleu,0.1503249107946881
multiple,average,multiple,0.42128315936464156
1 dataset prompt metric value
2 amazon_reviews_multi_en prompt_body_title_to_star accuracy 0.6176
3 amazon_reviews_multi_en prompt_review_to_star accuracy 0.5592
4 amazon_reviews_multi_en prompt_title_to_star accuracy 0.3922
5 amazon_reviews_multi_en median accuracy 0.5592
6 amazon_reviews_multi_es prompt_body_title_to_star accuracy 0.5526
7 amazon_reviews_multi_es prompt_review_to_star accuracy 0.5296
8 amazon_reviews_multi_es prompt_title_to_star accuracy 0.3646
9 amazon_reviews_multi_es median accuracy 0.5296
10 amazon_reviews_multi_fr prompt_body_title_to_star accuracy 0.5332
11 amazon_reviews_multi_fr prompt_review_to_star accuracy 0.5182
12 amazon_reviews_multi_fr prompt_title_to_star accuracy 0.3644
13 amazon_reviews_multi_fr median accuracy 0.5182
14 amazon_reviews_multi_zh prompt_body_title_to_star accuracy 0.5174
15 amazon_reviews_multi_zh prompt_review_to_star accuracy 0.5006
16 amazon_reviews_multi_zh prompt_title_to_star accuracy 0.3874
17 amazon_reviews_multi_zh median accuracy 0.5006
18 aqua_rat_raw Answer questions from options accuracy 0.24015748031496062
19 aqua_rat_raw answer_quiz accuracy 0.22440944881889763
20 aqua_rat_raw select_the_best_option accuracy 0.2559055118110236
21 aqua_rat_raw median accuracy 0.24015748031496062
22 art_None choose_hypothesis accuracy 0.5926892950391645
23 art_None choose_hypothesis_believable accuracy 0.5711488250652742
24 art_None choose_hypothesis_desc accuracy 0.5169712793733682
25 art_None choose_hypothesis_likely accuracy 0.5300261096605744
26 art_None choose_hypothesis_options accuracy 0.5672323759791122
27 art_None median accuracy 0.5672323759791122
28 banking77_None direct_to_which_department accuracy 0.16753246753246753
29 banking77_None help_page_topic accuracy 0.26785714285714285
30 banking77_None rephrase_as_banking_term accuracy 0.274025974025974
31 banking77_None median accuracy 0.26785714285714285
32 blbooksgenre_title_genre_classifiction classify accuracy 0.25057603686635943
33 blbooksgenre_title_genre_classifiction multi-choice accuracy 0.25057603686635943
34 blbooksgenre_title_genre_classifiction premise_context_first accuracy 0.7321428571428571
35 blbooksgenre_title_genre_classifiction median accuracy 0.25057603686635943
36 blimp_adjunct_island grammatical_between_1_2 accuracy 0.512
37 blimp_adjunct_island grammatical_between_A_B accuracy 0.464
38 blimp_adjunct_island grammatical_which_one_1_2 accuracy 0.512
39 blimp_adjunct_island single_sentence_bad_yes_no accuracy 0.52
40 blimp_adjunct_island single_sentence_good_yes_no accuracy 0.493
41 blimp_adjunct_island median accuracy 0.512
42 climate_fever_None claim_and_all_supporting_evidences accuracy 0.3166123778501629
43 climate_fever_None fifth_evidence_and_claim_itemization accuracy 0.4749185667752443
44 climate_fever_None first_evidence_and_claim_itemization accuracy 0.22996742671009773
45 climate_fever_None second_evidence_and_claim_itemization accuracy 0.24625407166123778
46 climate_fever_None third_evidence_claim_pair accuracy 0.24234527687296417
47 climate_fever_None median accuracy 0.24625407166123778
48 codah_codah affirmative_instruction_after_sentence_and_choices accuracy 0.6693083573487032
49 codah_codah affirmative_instruction_before_sentence_and_choices accuracy 0.6509365994236311
50 codah_codah interrogative_instruction_after_sentence_and_choices accuracy 0.6761527377521613
51 codah_codah median accuracy 0.6693083573487032
52 commonsense_qa_None answer_given_question_without_options accuracy 0.6388206388206388
53 commonsense_qa_None most_suitable_answer accuracy 0.7313677313677314
54 commonsense_qa_None question_answering accuracy 0.7158067158067158
55 commonsense_qa_None median accuracy 0.7158067158067158
56 conv_ai_3_None ambiguous accuracy 0.39040207522697795
57 conv_ai_3_None clarification_needed accuracy 0.39040207522697795
58 conv_ai_3_None directly_answer accuracy 0.6095979247730221
59 conv_ai_3_None score_give_number accuracy 0.057933419801124084
60 conv_ai_3_None score_how_much accuracy 0.010376134889753566
61 conv_ai_3_None median accuracy 0.39040207522697795
62 craigslist_bargains_None best deal accuracy 0.5192629815745393
63 craigslist_bargains_None good deal for seller accuracy 0.2529313232830821
64 craigslist_bargains_None good deal for seller no list price accuracy 0.09715242881072027
65 craigslist_bargains_None good deal for seller no list price implicit accuracy 0.24623115577889448
66 craigslist_bargains_None median accuracy 0.2495812395309883
67 emotion_None answer_question_with_emotion_label accuracy 0.3375
68 emotion_None answer_with_class_label accuracy 0.214
69 emotion_None choose_the_best_emotion_label accuracy 0.312
70 emotion_None reply_with_emoation_label accuracy 0.4495
71 emotion_None median accuracy 0.32475
72 financial_phrasebank_sentences_allagree bullish_neutral_bearish accuracy 0.3878091872791519
73 financial_phrasebank_sentences_allagree complementary_industries accuracy 0.10114840989399293
74 financial_phrasebank_sentences_allagree sentiment accuracy 0.35644876325088337
75 financial_phrasebank_sentences_allagree share_price_option accuracy 0.3670494699646643
76 financial_phrasebank_sentences_allagree word_comes_to_mind accuracy 0.08259717314487633
77 financial_phrasebank_sentences_allagree median accuracy 0.35644876325088337
78 glue_cola Following sentence acceptable accuracy 0.37583892617449666
79 glue_cola Make sense yes no accuracy 0.33940556088207097
80 glue_cola Previous sentence acceptable accuracy 0.31255992329817833
81 glue_cola editing accuracy 0.3844678811121764
82 glue_cola is_this_correct accuracy 0.37775647171620325
83 glue_cola median accuracy 0.37583892617449666
84 glue_sst2 following positive negative accuracy 0.9426605504587156
85 glue_sst2 happy or mad accuracy 0.8279816513761468
86 glue_sst2 positive negative after accuracy 0.9472477064220184
87 glue_sst2 review accuracy 0.9254587155963303
88 glue_sst2 said accuracy 0.9059633027522935
89 glue_sst2 median accuracy 0.9254587155963303
90 head_qa_en multiple_choice_a_and_q_en accuracy 0.29428989751098095
91 head_qa_en multiple_choice_a_and_q_with_context_en accuracy 0.29502196193265007
92 head_qa_en multiple_choice_q_and_a_en accuracy 0.3938506588579795
93 head_qa_en multiple_choice_q_and_a_index_en accuracy 0.30307467057101023
94 head_qa_en multiple_choice_q_and_a_index_with_context_en accuracy 0.30234260614934116
95 head_qa_en median accuracy 0.30234260614934116
96 head_qa_es multiple_choice_a_and_q_en accuracy 0.2730600292825769
97 head_qa_es multiple_choice_a_and_q_with_context_en accuracy 0.27232796486090777
98 head_qa_es multiple_choice_q_and_a_en accuracy 0.36530014641288433
99 head_qa_es multiple_choice_q_and_a_index_en accuracy 0.3074670571010249
100 head_qa_es multiple_choice_q_and_a_index_with_context_en accuracy 0.3089311859443631
101 head_qa_es median accuracy 0.3074670571010249
102 health_fact_None claim_explanation_classification accuracy 0.5591836734693878
103 health_fact_None claim_veracity_classification_after_reading_I_believe accuracy 0.34938775510204084
104 health_fact_None claim_veracity_classification_tell_me accuracy 0.48244897959183675
105 health_fact_None median accuracy 0.48244897959183675
106 hlgd_None is_same_event_editor_asks accuracy 0.6926051232479459
107 hlgd_None is_same_event_interrogative_talk accuracy 0.6582890285161914
108 hlgd_None is_same_event_refer accuracy 0.7858869018849686
109 hlgd_None is_same_event_with_time_interrogative_related accuracy 0.7839536007733204
110 hlgd_None is_same_event_with_time_interrogative_talk accuracy 0.7786370227162881
111 hlgd_None median accuracy 0.7786370227162881
112 hyperpartisan_news_detection_byarticle consider_does_it_follow_a_hyperpartisan_argumentation accuracy 0.6232558139534884
113 hyperpartisan_news_detection_byarticle consider_it_exhibits_extreme_one_sidedness accuracy 0.6310077519379845
114 hyperpartisan_news_detection_byarticle consume_with_caution accuracy 0.6294573643410852
115 hyperpartisan_news_detection_byarticle extreme_left_wing_or_right_wing accuracy 0.6077519379844961
116 hyperpartisan_news_detection_byarticle follows_hyperpartisan_argumentation accuracy 0.627906976744186
117 hyperpartisan_news_detection_byarticle median accuracy 0.627906976744186
118 liar_None Given statement guess category accuracy 0.19314641744548286
119 liar_None median accuracy 0.19314641744548286
120 lince_sa_spaeng express sentiment accuracy 0.5696611081226466
121 lince_sa_spaeng negation template accuracy 0.3851533082302313
122 lince_sa_spaeng original poster expressed sentiment accuracy 0.5841850457235073
123 lince_sa_spaeng sentiment trying to express accuracy 0.5809575040344271
124 lince_sa_spaeng the author seem accuracy 0.5771920387305003
125 lince_sa_spaeng median accuracy 0.5771920387305003
126 math_qa_None choose_correct_og accuracy 0.23484087102177553
127 math_qa_None first_choice_then_problem accuracy 0.2254606365159129
128 math_qa_None gre_problem accuracy 0.21943048576214405
129 math_qa_None pick_the_correct accuracy 0.2338358458961474
130 math_qa_None problem_set_type accuracy 0.29246231155778896
131 math_qa_None median accuracy 0.2338358458961474
132 mlsum_es layman_summ_es bleu 0.026830705121606707
133 mlsum_es palm_prompt bleu 0.033413101613448924
134 mlsum_es summarise_this_in_es_few_sentences bleu 0.02224579465087946
135 mlsum_es median bleu 0.026830705121606707
136 movie_rationales_None Evidences + review accuracy 0.97
137 movie_rationales_None Evidences sentiment classification accuracy 1.0
138 movie_rationales_None Standard binary sentiment analysis accuracy 0.95
139 movie_rationales_None median accuracy 0.97
140 mwsc_None in-the-sentence accuracy 0.6219512195121951
141 mwsc_None in-the-sentence-question-first accuracy 0.5853658536585366
142 mwsc_None is-correct accuracy 0.5365853658536586
143 mwsc_None options-or accuracy 0.6097560975609756
144 mwsc_None what-think accuracy 0.6097560975609756
145 mwsc_None median accuracy 0.6097560975609756
146 onestop_english_None ara_context accuracy 0.3333333333333333
147 onestop_english_None assess accuracy 0.3333333333333333
148 onestop_english_None determine_reading_level_from_the_first_three_sentences accuracy 0.5696649029982364
149 onestop_english_None esl_context accuracy 0.3333333333333333
150 onestop_english_None esl_variation accuracy 0.3333333333333333
151 onestop_english_None median accuracy 0.3333333333333333
152 poem_sentiment_None guess_sentiment_without_options_variation_1 accuracy 0.22857142857142856
153 poem_sentiment_None most_appropriate_sentiment accuracy 0.2571428571428571
154 poem_sentiment_None positive_or_negative_sentiment_variation_1 accuracy 0.2571428571428571
155 poem_sentiment_None positive_or_negative_sentiment_variation_2 accuracy 0.21904761904761905
156 poem_sentiment_None question_answer_format accuracy 0.24761904761904763
157 poem_sentiment_None median accuracy 0.24761904761904763
158 pubmed_qa_pqa_labeled Long Answer to Final Decision accuracy 0.598
159 pubmed_qa_pqa_labeled Question Answering (Short) accuracy 0.581
160 pubmed_qa_pqa_labeled median accuracy 0.5894999999999999
161 riddle_sense_None answer_given_question_without_options accuracy 0.4534769833496572
162 riddle_sense_None most_suitable_answer accuracy 0.4348677766895201
163 riddle_sense_None question_answering accuracy 0.4407443682664055
164 riddle_sense_None question_to_answer_index accuracy 0.3878550440744368
165 riddle_sense_None median accuracy 0.43780607247796277
166 scicite_None Classify intent accuracy 0.15065502183406113
167 scicite_None Classify intent (choices first) accuracy 0.1331877729257642
168 scicite_None Classify intent (select choice) accuracy 0.2652838427947598
169 scicite_None Classify intent w/section (select choice) accuracy 0.3537117903930131
170 scicite_None can_describe accuracy 0.15283842794759825
171 scicite_None median accuracy 0.15283842794759825
172 selqa_answer_selection_analysis is-he-talking-about accuracy 0.9121019108280255
173 selqa_answer_selection_analysis make-sense-rand accuracy 0.9171974522292994
174 selqa_answer_selection_analysis which-answer-1st-vs-random accuracy 0.7503184713375797
175 selqa_answer_selection_analysis would-make-sense-qu-rand accuracy 0.8993630573248408
176 selqa_answer_selection_analysis median accuracy 0.9057324840764331
177 snips_built_in_intents_None categorize_query accuracy 0.47865853658536583
178 snips_built_in_intents_None categorize_query_brief accuracy 0.375
179 snips_built_in_intents_None intent_query accuracy 0.31402439024390244
180 snips_built_in_intents_None query_intent accuracy 0.7012195121951219
181 snips_built_in_intents_None voice_intent accuracy 0.6128048780487805
182 snips_built_in_intents_None median accuracy 0.47865853658536583
183 wmt14_fr_en_en-fr a_good_translation-en-fr-source+target bleu 0.02125573406419127
184 wmt14_fr_en_en-fr a_good_translation-en-fr-target bleu 0.015697853682886957
185 wmt14_fr_en_en-fr gpt3-en-fr bleu 0.0037928468482204985
186 wmt14_fr_en_en-fr version-en-fr-target bleu 0.047885599586875285
187 wmt14_fr_en_en-fr xglm-en-fr-target bleu 0.021861712984543362
188 wmt14_fr_en_en-fr median bleu 0.02125573406419127
189 wmt14_fr_en_fr-en a_good_translation-fr-en-source+target bleu 0.3038834619016813
190 wmt14_fr_en_fr-en a_good_translation-fr-en-target bleu 0.22361703612398195
191 wmt14_fr_en_fr-en gpt3-fr-en bleu 0.17167001660570336
192 wmt14_fr_en_fr-en version-fr-en-target bleu 0.23925613843737142
193 wmt14_fr_en_fr-en xglm-fr-en-target bleu 0.1410190003658709
194 wmt14_fr_en_fr-en median bleu 0.22361703612398195
195 wmt14_hi_en_en-hi a_good_translation-en-hi-source+target bleu 0.0018051438917625368
196 wmt14_hi_en_en-hi a_good_translation-en-hi-target bleu 0.0018126292465026588
197 wmt14_hi_en_en-hi gpt-3-en-hi-target bleu 0.00010782650615890081
198 wmt14_hi_en_en-hi version-en-hi-target bleu 0.0018585745110753149
199 wmt14_hi_en_en-hi xglm-en-hi-target bleu 2.225608801197892e-05
200 wmt14_hi_en_en-hi median bleu 0.0018051438917625368
201 wmt14_hi_en_hi-en a_good_translation-hi-en-source+target bleu 0.16056644593701627
202 wmt14_hi_en_hi-en a_good_translation-hi-en-target bleu 0.1503249107946881
203 wmt14_hi_en_hi-en gpt-3-hi-en-target bleu 0.05607403962346587
204 wmt14_hi_en_hi-en version-hi-en-target bleu 0.15167071858881462
205 wmt14_hi_en_hi-en xglm-hi-en-target bleu 0.03675518735361532
206 wmt14_hi_en_hi-en median bleu 0.1503249107946881
207 multiple average multiple 0.42128315936464156