bloomz-7b1-p3/merged.csv at main

Files

ModelHub XC 78a6661ff1 初始化项目，由ModelHub XC社区提供模型

Model: bigscience/bloomz-7b1-p3
Source: Original Platform

2026-06-15 07:40:14 +08:00

13 KiB

Raw Permalink Blame History

1	dataset	prompt	metric	value
2	amazon_reviews_multi_en	prompt_body_title_to_star	accuracy	0.6176
3	amazon_reviews_multi_en	prompt_review_to_star	accuracy	0.5592
4	amazon_reviews_multi_en	prompt_title_to_star	accuracy	0.3922
5	amazon_reviews_multi_en	median	accuracy	0.5592
6	amazon_reviews_multi_es	prompt_body_title_to_star	accuracy	0.5526
7	amazon_reviews_multi_es	prompt_review_to_star	accuracy	0.5296
8	amazon_reviews_multi_es	prompt_title_to_star	accuracy	0.3646
9	amazon_reviews_multi_es	median	accuracy	0.5296
10	amazon_reviews_multi_fr	prompt_body_title_to_star	accuracy	0.5332
11	amazon_reviews_multi_fr	prompt_review_to_star	accuracy	0.5182
12	amazon_reviews_multi_fr	prompt_title_to_star	accuracy	0.3644
13	amazon_reviews_multi_fr	median	accuracy	0.5182
14	amazon_reviews_multi_zh	prompt_body_title_to_star	accuracy	0.5174
15	amazon_reviews_multi_zh	prompt_review_to_star	accuracy	0.5006
16	amazon_reviews_multi_zh	prompt_title_to_star	accuracy	0.3874
17	amazon_reviews_multi_zh	median	accuracy	0.5006
18	aqua_rat_raw	Answer questions from options	accuracy	0.24015748031496062
19	aqua_rat_raw	answer_quiz	accuracy	0.22440944881889763
20	aqua_rat_raw	select_the_best_option	accuracy	0.2559055118110236
21	aqua_rat_raw	median	accuracy	0.24015748031496062
22	art_None	choose_hypothesis	accuracy	0.5926892950391645
23	art_None	choose_hypothesis_believable	accuracy	0.5711488250652742
24	art_None	choose_hypothesis_desc	accuracy	0.5169712793733682
25	art_None	choose_hypothesis_likely	accuracy	0.5300261096605744
26	art_None	choose_hypothesis_options	accuracy	0.5672323759791122
27	art_None	median	accuracy	0.5672323759791122
28	banking77_None	direct_to_which_department	accuracy	0.16753246753246753
29	banking77_None	help_page_topic	accuracy	0.26785714285714285
30	banking77_None	rephrase_as_banking_term	accuracy	0.274025974025974
31	banking77_None	median	accuracy	0.26785714285714285
32	blbooksgenre_title_genre_classifiction	classify	accuracy	0.25057603686635943
33	blbooksgenre_title_genre_classifiction	multi-choice	accuracy	0.25057603686635943
34	blbooksgenre_title_genre_classifiction	premise_context_first	accuracy	0.7321428571428571
35	blbooksgenre_title_genre_classifiction	median	accuracy	0.25057603686635943
36	blimp_adjunct_island	grammatical_between_1_2	accuracy	0.512
37	blimp_adjunct_island	grammatical_between_A_B	accuracy	0.464
38	blimp_adjunct_island	grammatical_which_one_1_2	accuracy	0.512
39	blimp_adjunct_island	single_sentence_bad_yes_no	accuracy	0.52
40	blimp_adjunct_island	single_sentence_good_yes_no	accuracy	0.493
41	blimp_adjunct_island	median	accuracy	0.512
42	climate_fever_None	claim_and_all_supporting_evidences	accuracy	0.3166123778501629
43	climate_fever_None	fifth_evidence_and_claim_itemization	accuracy	0.4749185667752443
44	climate_fever_None	first_evidence_and_claim_itemization	accuracy	0.22996742671009773
45	climate_fever_None	second_evidence_and_claim_itemization	accuracy	0.24625407166123778
46	climate_fever_None	third_evidence_claim_pair	accuracy	0.24234527687296417
47	climate_fever_None	median	accuracy	0.24625407166123778
48	codah_codah	affirmative_instruction_after_sentence_and_choices	accuracy	0.6693083573487032
49	codah_codah	affirmative_instruction_before_sentence_and_choices	accuracy	0.6509365994236311
50	codah_codah	interrogative_instruction_after_sentence_and_choices	accuracy	0.6761527377521613
51	codah_codah	median	accuracy	0.6693083573487032
52	commonsense_qa_None	answer_given_question_without_options	accuracy	0.6388206388206388
53	commonsense_qa_None	most_suitable_answer	accuracy	0.7313677313677314
54	commonsense_qa_None	question_answering	accuracy	0.7158067158067158
55	commonsense_qa_None	median	accuracy	0.7158067158067158
56	conv_ai_3_None	ambiguous	accuracy	0.39040207522697795
57	conv_ai_3_None	clarification_needed	accuracy	0.39040207522697795
58	conv_ai_3_None	directly_answer	accuracy	0.6095979247730221
59	conv_ai_3_None	score_give_number	accuracy	0.057933419801124084
60	conv_ai_3_None	score_how_much	accuracy	0.010376134889753566
61	conv_ai_3_None	median	accuracy	0.39040207522697795
62	craigslist_bargains_None	best deal	accuracy	0.5192629815745393
63	craigslist_bargains_None	good deal for seller	accuracy	0.2529313232830821
64	craigslist_bargains_None	good deal for seller no list price	accuracy	0.09715242881072027
65	craigslist_bargains_None	good deal for seller no list price implicit	accuracy	0.24623115577889448
66	craigslist_bargains_None	median	accuracy	0.2495812395309883
67	emotion_None	answer_question_with_emotion_label	accuracy	0.3375
68	emotion_None	answer_with_class_label	accuracy	0.214
69	emotion_None	choose_the_best_emotion_label	accuracy	0.312
70	emotion_None	reply_with_emoation_label	accuracy	0.4495
71	emotion_None	median	accuracy	0.32475
72	financial_phrasebank_sentences_allagree	bullish_neutral_bearish	accuracy	0.3878091872791519
73	financial_phrasebank_sentences_allagree	complementary_industries	accuracy	0.10114840989399293
74	financial_phrasebank_sentences_allagree	sentiment	accuracy	0.35644876325088337
75	financial_phrasebank_sentences_allagree	share_price_option	accuracy	0.3670494699646643
76	financial_phrasebank_sentences_allagree	word_comes_to_mind	accuracy	0.08259717314487633
77	financial_phrasebank_sentences_allagree	median	accuracy	0.35644876325088337
78	glue_cola	Following sentence acceptable	accuracy	0.37583892617449666
79	glue_cola	Make sense yes no	accuracy	0.33940556088207097
80	glue_cola	Previous sentence acceptable	accuracy	0.31255992329817833
81	glue_cola	editing	accuracy	0.3844678811121764
82	glue_cola	is_this_correct	accuracy	0.37775647171620325
83	glue_cola	median	accuracy	0.37583892617449666
84	glue_sst2	following positive negative	accuracy	0.9426605504587156
85	glue_sst2	happy or mad	accuracy	0.8279816513761468
86	glue_sst2	positive negative after	accuracy	0.9472477064220184
87	glue_sst2	review	accuracy	0.9254587155963303
88	glue_sst2	said	accuracy	0.9059633027522935
89	glue_sst2	median	accuracy	0.9254587155963303
90	head_qa_en	multiple_choice_a_and_q_en	accuracy	0.29428989751098095
91	head_qa_en	multiple_choice_a_and_q_with_context_en	accuracy	0.29502196193265007
92	head_qa_en	multiple_choice_q_and_a_en	accuracy	0.3938506588579795
93	head_qa_en	multiple_choice_q_and_a_index_en	accuracy	0.30307467057101023
94	head_qa_en	multiple_choice_q_and_a_index_with_context_en	accuracy	0.30234260614934116
95	head_qa_en	median	accuracy	0.30234260614934116
96	head_qa_es	multiple_choice_a_and_q_en	accuracy	0.2730600292825769
97	head_qa_es	multiple_choice_a_and_q_with_context_en	accuracy	0.27232796486090777
98	head_qa_es	multiple_choice_q_and_a_en	accuracy	0.36530014641288433
99	head_qa_es	multiple_choice_q_and_a_index_en	accuracy	0.3074670571010249
100	head_qa_es	multiple_choice_q_and_a_index_with_context_en	accuracy	0.3089311859443631
101	head_qa_es	median	accuracy	0.3074670571010249
102	health_fact_None	claim_explanation_classification	accuracy	0.5591836734693878
103	health_fact_None	claim_veracity_classification_after_reading_I_believe	accuracy	0.34938775510204084
104	health_fact_None	claim_veracity_classification_tell_me	accuracy	0.48244897959183675
105	health_fact_None	median	accuracy	0.48244897959183675
106	hlgd_None	is_same_event_editor_asks	accuracy	0.6926051232479459
107	hlgd_None	is_same_event_interrogative_talk	accuracy	0.6582890285161914
108	hlgd_None	is_same_event_refer	accuracy	0.7858869018849686
109	hlgd_None	is_same_event_with_time_interrogative_related	accuracy	0.7839536007733204
110	hlgd_None	is_same_event_with_time_interrogative_talk	accuracy	0.7786370227162881
111	hlgd_None	median	accuracy	0.7786370227162881
112	hyperpartisan_news_detection_byarticle	consider_does_it_follow_a_hyperpartisan_argumentation	accuracy	0.6232558139534884
113	hyperpartisan_news_detection_byarticle	consider_it_exhibits_extreme_one_sidedness	accuracy	0.6310077519379845
114	hyperpartisan_news_detection_byarticle	consume_with_caution	accuracy	0.6294573643410852
115	hyperpartisan_news_detection_byarticle	extreme_left_wing_or_right_wing	accuracy	0.6077519379844961
116	hyperpartisan_news_detection_byarticle	follows_hyperpartisan_argumentation	accuracy	0.627906976744186
117	hyperpartisan_news_detection_byarticle	median	accuracy	0.627906976744186
118	liar_None	Given statement guess category	accuracy	0.19314641744548286
119	liar_None	median	accuracy	0.19314641744548286
120	lince_sa_spaeng	express sentiment	accuracy	0.5696611081226466
121	lince_sa_spaeng	negation template	accuracy	0.3851533082302313
122	lince_sa_spaeng	original poster expressed sentiment	accuracy	0.5841850457235073
123	lince_sa_spaeng	sentiment trying to express	accuracy	0.5809575040344271
124	lince_sa_spaeng	the author seem	accuracy	0.5771920387305003
125	lince_sa_spaeng	median	accuracy	0.5771920387305003
126	math_qa_None	choose_correct_og	accuracy	0.23484087102177553
127	math_qa_None	first_choice_then_problem	accuracy	0.2254606365159129
128	math_qa_None	gre_problem	accuracy	0.21943048576214405
129	math_qa_None	pick_the_correct	accuracy	0.2338358458961474
130	math_qa_None	problem_set_type	accuracy	0.29246231155778896
131	math_qa_None	median	accuracy	0.2338358458961474
132	mlsum_es	layman_summ_es	bleu	0.026830705121606707
133	mlsum_es	palm_prompt	bleu	0.033413101613448924
134	mlsum_es	summarise_this_in_es_few_sentences	bleu	0.02224579465087946
135	mlsum_es	median	bleu	0.026830705121606707
136	movie_rationales_None	Evidences + review	accuracy	0.97
137	movie_rationales_None	Evidences sentiment classification	accuracy	1.0
138	movie_rationales_None	Standard binary sentiment analysis	accuracy	0.95
139	movie_rationales_None	median	accuracy	0.97
140	mwsc_None	in-the-sentence	accuracy	0.6219512195121951
141	mwsc_None	in-the-sentence-question-first	accuracy	0.5853658536585366
142	mwsc_None	is-correct	accuracy	0.5365853658536586
143	mwsc_None	options-or	accuracy	0.6097560975609756
144	mwsc_None	what-think	accuracy	0.6097560975609756
145	mwsc_None	median	accuracy	0.6097560975609756
146	onestop_english_None	ara_context	accuracy	0.3333333333333333
147	onestop_english_None	assess	accuracy	0.3333333333333333
148	onestop_english_None	determine_reading_level_from_the_first_three_sentences	accuracy	0.5696649029982364
149	onestop_english_None	esl_context	accuracy	0.3333333333333333
150	onestop_english_None	esl_variation	accuracy	0.3333333333333333
151	onestop_english_None	median	accuracy	0.3333333333333333
152	poem_sentiment_None	guess_sentiment_without_options_variation_1	accuracy	0.22857142857142856
153	poem_sentiment_None	most_appropriate_sentiment	accuracy	0.2571428571428571
154	poem_sentiment_None	positive_or_negative_sentiment_variation_1	accuracy	0.2571428571428571
155	poem_sentiment_None	positive_or_negative_sentiment_variation_2	accuracy	0.21904761904761905
156	poem_sentiment_None	question_answer_format	accuracy	0.24761904761904763
157	poem_sentiment_None	median	accuracy	0.24761904761904763
158	pubmed_qa_pqa_labeled	Long Answer to Final Decision	accuracy	0.598
159	pubmed_qa_pqa_labeled	Question Answering (Short)	accuracy	0.581
160	pubmed_qa_pqa_labeled	median	accuracy	0.5894999999999999
161	riddle_sense_None	answer_given_question_without_options	accuracy	0.4534769833496572
162	riddle_sense_None	most_suitable_answer	accuracy	0.4348677766895201
163	riddle_sense_None	question_answering	accuracy	0.4407443682664055
164	riddle_sense_None	question_to_answer_index	accuracy	0.3878550440744368
165	riddle_sense_None	median	accuracy	0.43780607247796277
166	scicite_None	Classify intent	accuracy	0.15065502183406113
167	scicite_None	Classify intent (choices first)	accuracy	0.1331877729257642
168	scicite_None	Classify intent (select choice)	accuracy	0.2652838427947598
169	scicite_None	Classify intent w/section (select choice)	accuracy	0.3537117903930131
170	scicite_None	can_describe	accuracy	0.15283842794759825
171	scicite_None	median	accuracy	0.15283842794759825
172	selqa_answer_selection_analysis	is-he-talking-about	accuracy	0.9121019108280255
173	selqa_answer_selection_analysis	make-sense-rand	accuracy	0.9171974522292994
174	selqa_answer_selection_analysis	which-answer-1st-vs-random	accuracy	0.7503184713375797
175	selqa_answer_selection_analysis	would-make-sense-qu-rand	accuracy	0.8993630573248408
176	selqa_answer_selection_analysis	median	accuracy	0.9057324840764331
177	snips_built_in_intents_None	categorize_query	accuracy	0.47865853658536583
178	snips_built_in_intents_None	categorize_query_brief	accuracy	0.375
179	snips_built_in_intents_None	intent_query	accuracy	0.31402439024390244
180	snips_built_in_intents_None	query_intent	accuracy	0.7012195121951219
181	snips_built_in_intents_None	voice_intent	accuracy	0.6128048780487805
182	snips_built_in_intents_None	median	accuracy	0.47865853658536583
183	wmt14_fr_en_en-fr	a_good_translation-en-fr-source+target	bleu	0.02125573406419127
184	wmt14_fr_en_en-fr	a_good_translation-en-fr-target	bleu	0.015697853682886957
185	wmt14_fr_en_en-fr	gpt3-en-fr	bleu	0.0037928468482204985
186	wmt14_fr_en_en-fr	version-en-fr-target	bleu	0.047885599586875285
187	wmt14_fr_en_en-fr	xglm-en-fr-target	bleu	0.021861712984543362
188	wmt14_fr_en_en-fr	median	bleu	0.02125573406419127
189	wmt14_fr_en_fr-en	a_good_translation-fr-en-source+target	bleu	0.3038834619016813
190	wmt14_fr_en_fr-en	a_good_translation-fr-en-target	bleu	0.22361703612398195
191	wmt14_fr_en_fr-en	gpt3-fr-en	bleu	0.17167001660570336
192	wmt14_fr_en_fr-en	version-fr-en-target	bleu	0.23925613843737142
193	wmt14_fr_en_fr-en	xglm-fr-en-target	bleu	0.1410190003658709
194	wmt14_fr_en_fr-en	median	bleu	0.22361703612398195
195	wmt14_hi_en_en-hi	a_good_translation-en-hi-source+target	bleu	0.0018051438917625368
196	wmt14_hi_en_en-hi	a_good_translation-en-hi-target	bleu	0.0018126292465026588
197	wmt14_hi_en_en-hi	gpt-3-en-hi-target	bleu	0.00010782650615890081
198	wmt14_hi_en_en-hi	version-en-hi-target	bleu	0.0018585745110753149
199	wmt14_hi_en_en-hi	xglm-en-hi-target	bleu	2.225608801197892e-05
200	wmt14_hi_en_en-hi	median	bleu	0.0018051438917625368
201	wmt14_hi_en_hi-en	a_good_translation-hi-en-source+target	bleu	0.16056644593701627
202	wmt14_hi_en_hi-en	a_good_translation-hi-en-target	bleu	0.1503249107946881
203	wmt14_hi_en_hi-en	gpt-3-hi-en-target	bleu	0.05607403962346587
204	wmt14_hi_en_hi-en	version-hi-en-target	bleu	0.15167071858881462
205	wmt14_hi_en_hi-en	xglm-hi-en-target	bleu	0.03675518735361532
206	wmt14_hi_en_hi-en	median	bleu	0.1503249107946881
207	multiple	average	multiple	0.42128315936464156

13 KiB Raw Permalink Blame History

13 KiB

Raw Permalink Blame History