初始化项目，由ModelHub XC社区提供模型

Model: bigscience/bloomz-7b1-p3 Source: Original Platform
2026-06-15 07:40:14 +08:00
commit 78a6661ff1
634 changed files with 7477 additions and 0 deletions
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Answer Given options_armt",
+  "evaluation": {
+    "accuracy": 0.7061548643282595
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Choose Story Ending_armt",
+  "evaluation": {
+    "accuracy": 0.786896095301125
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Generate Ending_armt",
+  "evaluation": {
+    "accuracy": 0.600926538716082
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Novel Correct Ending_armt",
+  "evaluation": {
+    "accuracy": 0.7511581733951026
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Story Continuation and Options_armt",
+  "evaluation": {
+    "accuracy": 0.757114493712773
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Answer Given options_esmt",
+  "evaluation": {
+    "accuracy": 0.7902051621442753
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Choose Story Ending_esmt",
+  "evaluation": {
+    "accuracy": 0.8160158835208471
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Generate Ending_esmt",
+  "evaluation": {
+    "accuracy": 0.657180675049636
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Novel Correct Ending_esmt",
+  "evaluation": {
+    "accuracy": 0.784910655195235
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Story Continuation and Options_esmt",
+  "evaluation": {
+    "accuracy": 0.7696889477167439
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Answer Given options_eumt",
+  "evaluation": {
+    "accuracy": 0.6227663798808736
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Choose Story Ending_eumt",
+  "evaluation": {
+    "accuracy": 0.6763732627399074
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Generate Ending_eumt",
+  "evaluation": {
+    "accuracy": 0.5737921906022502
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Novel Correct Ending_eumt",
+  "evaluation": {
+    "accuracy": 0.686300463269358
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Story Continuation and Options_eumt",
+  "evaluation": {
+    "accuracy": 0.6637988087359364
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Answer Given options_himt",
+  "evaluation": {
+    "accuracy": 0.6697551290536069
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Choose Story Ending_himt",
+  "evaluation": {
+    "accuracy": 0.7160820648577101
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Generate Ending_himt",
+  "evaluation": {
+    "accuracy": 0.5923229649238915
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Novel Correct Ending_himt",
+  "evaluation": {
+    "accuracy": 0.6882859033752482
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Story Continuation and Options_himt",
+  "evaluation": {
+    "accuracy": 0.7048312375909993
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Answer Given options_idmt",
+  "evaluation": {
+    "accuracy": 0.7346128391793514
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Choose Story Ending_idmt",
+  "evaluation": {
+    "accuracy": 0.7511581733951026
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Generate Ending_idmt",
+  "evaluation": {
+    "accuracy": 0.6201191264063534
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Novel Correct Ending_idmt",
+  "evaluation": {
+    "accuracy": 0.728656518861681
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Story Continuation and Options_idmt",
+  "evaluation": {
+    "accuracy": 0.7412309728656519
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Answer Given options_zhmt",
+  "evaluation": {
+    "accuracy": 0.7425545996029119
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Choose Story Ending_zhmt",
+  "evaluation": {
+    "accuracy": 0.7941760423560555
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Generate Ending_zhmt",
+  "evaluation": {
+    "accuracy": 0.6247518199867638
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Novel Correct Ending_zhmt",
+  "evaluation": {
+    "accuracy": 0.7842488418266049
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Story Continuation and Options_zhmt",
+  "evaluation": {
+    "accuracy": 0.8034414295168762
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "Replace_frmt",
+  "evaluation": {
+    "accuracy": 0.5180722891566265
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "True or False_frmt",
+  "evaluation": {
+    "accuracy": 0.46987951807228917
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "does underscore refer to_frmt",
+  "evaluation": {
+    "accuracy": 0.5421686746987951
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "stand for_frmt",
+  "evaluation": {
+    "accuracy": 0.5060240963855421
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "underscore refer to_frmt",
+  "evaluation": {
+    "accuracy": 0.5421686746987951
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "Replace_ptmt",
+  "evaluation": {
+    "accuracy": 0.5057034220532319
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "True or False_ptmt",
+  "evaluation": {
+    "accuracy": 0.5133079847908745
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "does underscore refer to_ptmt",
+  "evaluation": {
+    "accuracy": 0.5209125475285171
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "stand for_ptmt",
+  "evaluation": {
+    "accuracy": 0.5209125475285171
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "underscore refer to_ptmt",
+  "evaluation": {
+    "accuracy": 0.49049429657794674
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "Replace_zhmt",
+  "evaluation": {
+    "accuracy": 0.5238095238095238
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "True or False_zhmt",
+  "evaluation": {
+    "accuracy": 0.5138888888888888
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "does underscore refer to_zhmt",
+  "evaluation": {
+    "accuracy": 0.49404761904761907
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "stand for_zhmt",
+  "evaluation": {
+    "accuracy": 0.49603174603174605
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "underscore refer to_zhmt",
+  "evaluation": {
+    "accuracy": 0.503968253968254
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.csv
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.csv
@@ -0,0 +1,86 @@
+dataset,prompt,metric,value
+xcopa_id,C1 or C2? premise_idmt,accuracy,0.51
+xcopa_id,best_option_idmt,accuracy,0.53
+xcopa_id,cause_effect_idmt,accuracy,0.69
+xcopa_id,i_am_hesitating_idmt,accuracy,0.64
+xcopa_id,plausible_alternatives_idmt,accuracy,0.7
+xcopa_id,median,accuracy,0.64
+xcopa_sw,C1 or C2? premise_swmt,accuracy,0.6
+xcopa_sw,best_option_swmt,accuracy,0.62
+xcopa_sw,cause_effect_swmt,accuracy,0.49
+xcopa_sw,i_am_hesitating_swmt,accuracy,0.56
+xcopa_sw,plausible_alternatives_swmt,accuracy,0.54
+xcopa_sw,median,accuracy,0.56
+xcopa_ta,C1 or C2? premise_tamt,accuracy,0.52
+xcopa_ta,best_option_tamt,accuracy,0.55
+xcopa_ta,cause_effect_tamt,accuracy,0.63
+xcopa_ta,i_am_hesitating_tamt,accuracy,0.63
+xcopa_ta,plausible_alternatives_tamt,accuracy,0.66
+xcopa_ta,median,accuracy,0.63
+xcopa_vi,C1 or C2? premise_vimt,accuracy,0.55
+xcopa_vi,best_option_vimt,accuracy,0.61
+xcopa_vi,cause_effect_vimt,accuracy,0.64
+xcopa_vi,i_am_hesitating_vimt,accuracy,0.6
+xcopa_vi,plausible_alternatives_vimt,accuracy,0.64
+xcopa_vi,median,accuracy,0.61
+xcopa_zh,C1 or C2? premise_zhmt,accuracy,0.52
+xcopa_zh,best_option_zhmt,accuracy,0.61
+xcopa_zh,cause_effect_zhmt,accuracy,0.75
+xcopa_zh,i_am_hesitating_zhmt,accuracy,0.72
+xcopa_zh,plausible_alternatives_zhmt,accuracy,0.76
+xcopa_zh,median,accuracy,0.72
+xstory_cloze_ar,Answer Given options_armt,accuracy,0.7061548643282595
+xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.786896095301125
+xstory_cloze_ar,Generate Ending_armt,accuracy,0.600926538716082
+xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.7511581733951026
+xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.757114493712773
+xstory_cloze_ar,median,accuracy,0.7511581733951026
+xstory_cloze_es,Answer Given options_esmt,accuracy,0.7902051621442753
+xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.8160158835208471
+xstory_cloze_es,Generate Ending_esmt,accuracy,0.657180675049636
+xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.784910655195235
+xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.7696889477167439
+xstory_cloze_es,median,accuracy,0.784910655195235
+xstory_cloze_eu,Answer Given options_eumt,accuracy,0.6227663798808736
+xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.6763732627399074
+xstory_cloze_eu,Generate Ending_eumt,accuracy,0.5737921906022502
+xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.686300463269358
+xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.6637988087359364
+xstory_cloze_eu,median,accuracy,0.6637988087359364
+xstory_cloze_hi,Answer Given options_himt,accuracy,0.6697551290536069
+xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.7160820648577101
+xstory_cloze_hi,Generate Ending_himt,accuracy,0.5923229649238915
+xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.6882859033752482
+xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.7048312375909993
+xstory_cloze_hi,median,accuracy,0.6882859033752482
+xstory_cloze_id,Answer Given options_idmt,accuracy,0.7346128391793514
+xstory_cloze_id,Choose Story Ending_idmt,accuracy,0.7511581733951026
+xstory_cloze_id,Generate Ending_idmt,accuracy,0.6201191264063534
+xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.728656518861681
+xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.7412309728656519
+xstory_cloze_id,median,accuracy,0.7346128391793514
+xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.7425545996029119
+xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.7941760423560555
+xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.6247518199867638
+xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.7842488418266049
+xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.8034414295168762
+xstory_cloze_zh,median,accuracy,0.7842488418266049
+xwinograd_fr,Replace_frmt,accuracy,0.5180722891566265
+xwinograd_fr,True or False_frmt,accuracy,0.46987951807228917
+xwinograd_fr,does underscore refer to_frmt,accuracy,0.5421686746987951
+xwinograd_fr,stand for_frmt,accuracy,0.5060240963855421
+xwinograd_fr,underscore refer to_frmt,accuracy,0.5421686746987951
+xwinograd_fr,median,accuracy,0.5180722891566265
+xwinograd_pt,Replace_ptmt,accuracy,0.5057034220532319
+xwinograd_pt,True or False_ptmt,accuracy,0.5133079847908745
+xwinograd_pt,does underscore refer to_ptmt,accuracy,0.5209125475285171
+xwinograd_pt,stand for_ptmt,accuracy,0.5209125475285171
+xwinograd_pt,underscore refer to_ptmt,accuracy,0.49049429657794674
+xwinograd_pt,median,accuracy,0.5133079847908745
+xwinograd_zh,Replace_zhmt,accuracy,0.5238095238095238
+xwinograd_zh,True or False_zhmt,accuracy,0.5138888888888888
+xwinograd_zh,does underscore refer to_zhmt,accuracy,0.49404761904761907
+xwinograd_zh,stand for_zhmt,accuracy,0.49603174603174605
+xwinograd_zh,underscore refer to_zhmt,accuracy,0.503968253968254
+xwinograd_zh,median,accuracy,0.503968253968254
+multiple,average,multiple,0.6501688392588024
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/merged.json
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "C1 or C2? premise_idmt",
+  "evaluation": {
+    "accuracy": 0.51
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/best_option_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/best_option_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "best_option_idmt",
+  "evaluation": {
+    "accuracy": 0.53
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/cause_effect_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/cause_effect_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "cause_effect_idmt",
+  "evaluation": {
+    "accuracy": 0.69
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/i_am_hesitating_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/i_am_hesitating_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "i_am_hesitating_idmt",
+  "evaluation": {
+    "accuracy": 0.64
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/plausible_alternatives_idmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/id/plausible_alternatives_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "plausible_alternatives_idmt",
+  "evaluation": {
+    "accuracy": 0.7
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "C1 or C2? premise_swmt",
+  "evaluation": {
+    "accuracy": 0.6
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/best_option_swmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/best_option_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "best_option_swmt",
+  "evaluation": {
+    "accuracy": 0.62
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/cause_effect_swmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/cause_effect_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "cause_effect_swmt",
+  "evaluation": {
+    "accuracy": 0.49
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "i_am_hesitating_swmt",
+  "evaluation": {
+    "accuracy": 0.56
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "plausible_alternatives_swmt",
+  "evaluation": {
+    "accuracy": 0.54
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "C1 or C2? premise_tamt",
+  "evaluation": {
+    "accuracy": 0.52
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/best_option_tamt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/best_option_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "best_option_tamt",
+  "evaluation": {
+    "accuracy": 0.55
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/cause_effect_tamt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/cause_effect_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "cause_effect_tamt",
+  "evaluation": {
+    "accuracy": 0.63
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "i_am_hesitating_tamt",
+  "evaluation": {
+    "accuracy": 0.63
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "plausible_alternatives_tamt",
+  "evaluation": {
+    "accuracy": 0.66
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "C1 or C2? premise_vimt",
+  "evaluation": {
+    "accuracy": 0.55
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/best_option_vimt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/best_option_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "best_option_vimt",
+  "evaluation": {
+    "accuracy": 0.61
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/cause_effect_vimt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/cause_effect_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "cause_effect_vimt",
+  "evaluation": {
+    "accuracy": 0.64
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "i_am_hesitating_vimt",
+  "evaluation": {
+    "accuracy": 0.6
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "plausible_alternatives_vimt",
+  "evaluation": {
+    "accuracy": 0.64
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "C1 or C2? premise_zhmt",
+  "evaluation": {
+    "accuracy": 0.52
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/best_option_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/best_option_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "best_option_zhmt",
+  "evaluation": {
+    "accuracy": 0.61
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/cause_effect_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/cause_effect_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "cause_effect_zhmt",
+  "evaluation": {
+    "accuracy": 0.75
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "i_am_hesitating_zhmt",
+  "evaluation": {
+    "accuracy": 0.72
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_xwinostorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "plausible_alternatives_zhmt",
+  "evaluation": {
+    "accuracy": 0.76
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}