初始化项目，由ModelHub XC社区提供模型

Model: bigscience/bloomz-7b1-p3 Source: Original Platform
2026-06-15 07:40:14 +08:00
commit 78a6661ff1
634 changed files with 7477 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,75 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+ 
+ 
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+ 
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+ 
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+ 
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ 
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+ 
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.gguf* filter=lfs diff=lfs merge=lfs -text
+*.ggml filter=lfs diff=lfs merge=lfs -text
+*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+ 
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=layman_summ_es.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=palm_prompt.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=mlsum_es.templates=summarise_this_in_es_few_sentences.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:40.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-fr-en.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=gpt3-en-fr.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=version-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=xglm-en-fr-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:43.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-source+target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=a_good_translation-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=version-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=xglm-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_fr_en.templates=a_good_translation-fr-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-09T23:48:38.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-en-hi-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation_bloomz-7b1-p3/evaluation_val/wmt14_hi_en/examples.limited=3000.model=p31lossseqglobal_step1000.task=wmt14_hi_en.templates=gpt-3-hi-en-target.fewshot=0.batchsize=16.seed=1234.timestamp=2022-09-10T11:48:47.jsonl filter=lfs diff=lfs merge=lfs -text
+
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,882 @@
+---
+datasets:
+- Muennighoff/P3
+license: bigscience-bloom-rail-1.0
+language:
+- ak
+- ar
+- as
+- bm
+- bn
+- ca
+- code
+- en
+- es
+- eu
+- fon
+- fr
+- gu
+- hi
+- id
+- ig
+- ki
+- kn
+- lg
+- ln
+- ml
+- mr
+- ne
+- nso
+- ny
+- or
+- pa
+- pt
+- rn
+- rw
+- sn
+- st
+- sw
+- ta
+- te
+- tn
+- ts
+- tum
+- tw
+- ur
+- vi
+- wo
+- xh
+- yo
+- zh
+- zu
+programming_language: 
+- C
+- C++
+- C#
+- Go
+- Java
+- JavaScript
+- Lua
+- PHP
+- Python
+- Ruby
+- Rust
+- Scala
+- TypeScript
+pipeline_tag: text-generation
+widget:
+- text: "一个传奇的开端，一个不灭的神话，这不仅仅是一部电影，而是作为一个走进新时代的标签，永远彪炳史册。Would you rate the previous review as positive, neutral or negative?"
+  example_title: "zh-en sentiment"
+- text: "一个传奇的开端，一个不灭的神话，这不仅仅是一部电影，而是作为一个走进新时代的标签，永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评？"
+  example_title: "zh-zh sentiment"
+- text: "Suggest at least five related search terms to \"Mạng neural nhân tạo\"."
+  example_title: "vi-en query"
+- text: "Proposez au moins cinq mots clés concernant «Réseau de neurones artificiels»."
+  example_title: "fr-fr query"
+- text: "Explain in a sentence in Telugu what is backpropagation in neural networks."
+  example_title: "te-en qa"
+- text: "Why is the sky blue?"
+  example_title: "en-en qa"
+- text: "Write a fairy tale about a troll saving a princess from a dangerous dragon. The fairy tale is a masterpiece that has achieved praise worldwide and its moral is \"Heroes Come in All Shapes and Sizes\". Story (in Spanish):"
+  example_title: "es-en fable"
+- text: "Write a fable about wood elves living in a forest that is suddenly invaded by ogres. The fable is a masterpiece that has achieved praise worldwide and its moral is \"Violence is the last refuge of the incompetent\". Fable (in Hindi):"
+  example_title: "hi-en fable"
+model-index:
+- name: bloomz-7b1-p3
+  results:
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: winogrande
+      name: Winogrande XL (xl)
+      config: xl
+      split: validation
+      revision: a80f460359d1e9a67c006011c94de42a8759430c
+    metrics:
+    - type: Accuracy
+      value: 54.06
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (en)
+      config: en
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 53.72
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (fr)
+      config: fr
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 55.42
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (jp)
+      config: jp
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 51.93
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (pt)
+      config: pt
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 53.99
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (ru)
+      config: ru
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 53.97
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (zh)
+      config: zh
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 52.98
+  - task:
+      type: Natural language inference
+    dataset:
+      type: anli
+      name: ANLI (r1)
+      config: r1
+      split: validation
+      revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
+    metrics:
+    - type: Accuracy
+      value: 35.1
+  - task:
+      type: Natural language inference
+    dataset:
+      type: anli
+      name: ANLI (r2)
+      config: r2
+      split: validation
+      revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
+    metrics:
+    - type: Accuracy
+      value: 35.4
+  - task:
+      type: Natural language inference
+    dataset:
+      type: anli
+      name: ANLI (r3)
+      config: r3
+      split: validation
+      revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
+    metrics:
+    - type: Accuracy
+      value: 37.58
+  - task:
+      type: Natural language inference
+    dataset:
+      type: super_glue
+      name: SuperGLUE (cb)
+      config: cb
+      split: validation
+      revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
+    metrics:
+    - type: Accuracy
+      value: 62.5
+  - task:
+      type: Natural language inference
+    dataset:
+      type: super_glue
+      name: SuperGLUE (rte)
+      config: rte
+      split: validation
+      revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
+    metrics:
+    - type: Accuracy
+      value: 78.7
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (ar)
+      config: ar
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 50.64
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (bg)
+      config: bg
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 43.98
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (de)
+      config: de
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 47.03
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (el)
+      config: el
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 41.89
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (en)
+      config: en
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 55.9
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (es)
+      config: es
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 53.73
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (fr)
+      config: fr
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 53.37
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (hi)
+      config: hi
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 49.84
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (ru)
+      config: ru
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 46.55
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (sw)
+      config: sw
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 43.49
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (th)
+      config: th
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 43.17
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (tr)
+      config: tr
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 40.44
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (ur)
+      config: ur
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 45.18
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (vi)
+      config: vi
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 51.97
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (zh)
+      config: zh
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 52.29
+  - task:
+      type: Program synthesis
+    dataset:
+      type: openai_humaneval
+      name: HumanEval
+      config: None
+      split: test
+      revision: e8dc562f5de170c54b5481011dd9f4fa04845771
+    metrics:
+    - type: Pass@1
+      value: 1.55
+    - type: Pass@10
+      value: 4.12
+    - type: Pass@100
+      value: 9.60
+  - task:
+      type: Sentence completion
+    dataset:
+      type: story_cloze
+      name: StoryCloze (2016)
+      config: "2016"
+      split: validation
+      revision: e724c6f8cdf7c7a2fb229d862226e15b023ee4db
+    metrics:
+    - type: Accuracy
+      value: 87.07
+  - task:
+      type: Sentence completion
+    dataset:
+      type: super_glue
+      name: SuperGLUE (copa)
+      config: copa
+      split: validation
+      revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
+    metrics:
+    - type: Accuracy
+      value: 81.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (et)
+      config: et
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 57.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (ht)
+      config: ht
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 56.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (id)
+      config: id
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 70.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (it)
+      config: it
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 60.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (qu)
+      config: qu
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 54.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (sw)
+      config: sw
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 62.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (ta)
+      config: ta
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 71.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (th)
+      config: th
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 63.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (tr)
+      config: tr
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 58.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (vi)
+      config: vi
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 67.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (zh)
+      config: zh
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 79.0
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (ar)
+      config: ar
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 78.69
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (es)
+      config: es
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 82.93
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (eu)
+      config: eu
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 70.42
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (hi)
+      config: hi
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 72.2
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (id)
+      config: id
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 77.1
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (my)
+      config: my
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 51.49
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (ru)
+      config: ru
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 66.45
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (sw)
+      config: sw
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 60.82
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (te)
+      config: te
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 63.14
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (zh)
+      config: zh
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 80.34
+---
+
+![xmtf](https://github.com/bigscience-workshop/xmtf/blob/master/xmtf_banner.png?raw=true)
+
+#  Table of Contents
+
+1. [Model Summary](#model-summary)
+2. [Use](#use)
+3. [Limitations](#limitations)
+4. [Training](#training)
+5. [Evaluation](#evaluation)
+6. [Citation](#citation)
+
+# Model Summary
+
+> We present BLOOMZ & mT0, a family of models capable of following human instructions in dozens of languages zero-shot. We finetune BLOOM & mT5 pretrained multilingual language models on our crosslingual task mixture (xP3) and find the resulting models capable of crosslingual generalization to unseen tasks & languages.
+
+- **Repository:** [bigscience-workshop/xmtf](https://github.com/bigscience-workshop/xmtf)
+- **Paper:** [Crosslingual Generalization through Multitask Finetuning](https://arxiv.org/abs/2211.01786)
+- **Point of Contact:** [Niklas Muennighoff](mailto:niklas@hf.co)
+- **Languages:** Refer to [bloom](https://huggingface.co/bigscience/bloom) for pretraining & [xP3](https://huggingface.co/datasets/bigscience/xP3) for finetuning language proportions. It understands both pretraining & finetuning languages.
+- **BLOOMZ & mT0 Model Family:**
+
+<div class="max-w-full overflow-auto">
+<table>
+  <tr>
+<th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/bigscience/xP3>xP3</a>. Recommended for prompting in English.</th>
+</tr>
+<tr>
+<td>Parameters</td>
+<td>300M</td>
+<td>580M</td>
+<td>1.2B</td>
+<td>3.7B</td>
+<td>13B</td>
+<td>560M</td>
+<td>1.1B</td>
+<td>1.7B</td>
+<td>3B</td>
+<td>7.1B</td>
+<td>176B</td>
+</tr>
+<tr>
+<td>Finetuned Model</td>
+<td><a href=https://huggingface.co/bigscience/mt0-small>mt0-small</a></td>  
+<td><a href=https://huggingface.co/bigscience/mt0-base>mt0-base</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-large>mt0-large</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xl>mt0-xl</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xxl>mt0-xxl</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-560m>bloomz-560m</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-1b1>bloomz-1b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-1b7>bloomz-1b7</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-3b>bloomz-3b</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-7b1>bloomz-7b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz>bloomz</a></td>
+</tr>
+</tr>
+  <tr>
+<th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/bigscience/xP3mt>xP3mt</a>. Recommended for prompting in non-English.</th>
+</tr>
+<tr>
+<td>Finetuned Model</td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xxl-mt>mt0-xxl-mt</a></td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-7b1-mt>bloomz-7b1-mt</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-mt>bloomz-mt</a></td>
+</tr>
+<tr><th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/Muennighoff/P3>P3</a>. Released for research purposes only. Strictly inferior to above models!</th>
+</tr>
+<tr>
+<td>Finetuned Model</td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xxl-p3>mt0-xxl-p3</a></td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-7b1-p3>bloomz-7b1-p3</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-p3>bloomz-p3</a></td>
+</tr>
+<tr><th colspan="12">Original pretrained checkpoints. Not recommended.</th></tr>
+<tr>
+<td>Pretrained Model</td>
+<td><a href=https://huggingface.co/google/mt5-small>mt5-small</a></td>  
+<td><a href=https://huggingface.co/google/mt5-base>mt5-base</a></td>
+<td><a href=https://huggingface.co/google/mt5-large>mt5-large</a></td>
+<td><a href=https://huggingface.co/google/mt5-xl>mt5-xl</a></td>
+<td><a href=https://huggingface.co/google/mt5-xxl>mt5-xxl</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-560m>bloom-560m</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-1b1>bloom-1b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-1b7>bloom-1b7</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-3b>bloom-3b</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-7b1>bloom-7b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom>bloom</a></td>
+</tr>
+</table>
+</div>
+
+
+# Use
+
+## Intended use
+
+We recommend using the model to perform tasks expressed in natural language. For example, given the prompt "*Translate to English: Je t’aime.*", the model will most likely answer "*I love you.*". Some prompt ideas from our paper: 
+- 一个传奇的开端，一个不灭的神话，这不仅仅是一部电影，而是作为一个走进新时代的标签，永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评?
+- Suggest at least five related search terms to "Mạng neural nhân tạo".
+- Write a fairy tale about a troll saving a princess from a dangerous dragon. The fairy tale is a masterpiece that has achieved praise worldwide and its moral is "Heroes Come in All Shapes and Sizes". Story (in Spanish):
+- Explain in a sentence in Telugu what is backpropagation in neural networks.
+
+**Feel free to share your generations in the Community tab!**
+
+## How to use
+
+### CPU
+
+<details>
+<summary> Click to expand </summary>
+
+```python
+# pip install -q transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+checkpoint = "bigscience/bloomz-7b1-p3"
+
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint)
+
+inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+
+</details>
+
+### GPU
+
+<details>
+<summary> Click to expand </summary>
+
+```python
+# pip install -q transformers accelerate
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+checkpoint = "bigscience/bloomz-7b1-p3"
+
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
+
+inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+
+</details>
+
+### GPU in 8bit
+
+<details>
+<summary> Click to expand </summary>
+
+```python
+# pip install -q transformers accelerate bitsandbytes
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+checkpoint = "bigscience/bloomz-7b1-p3"
+
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", load_in_8bit=True)
+
+inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+
+</details>
+
+<!-- Necessary for whitespace -->
+###
+
+# Limitations
+
+**Prompt Engineering:** The performance may vary depending on the prompt. For BLOOMZ models, we recommend making it very clear when the input stops to avoid the model trying to continue it. For example, the prompt "*Translate to English: Je t'aime*" without the full stop (.) at the end may result in the model trying to continue the French sentence. Better prompts are e.g. "*Translate to English: Je t'aime.*", "*Translate to English: Je t'aime. Translation:*", "*What is "Je t'aime." in English?*", where it is clear to the model when it should answer. Further, we recommend providing the model as much context as possible. For example, if you want it to answer in Telugu, then tell the model, e.g. "*Explain in a sentence in Telugu what is backpropagation in neural networks.*".
+
+# Training
+
+## Model
+
+- **Architecture:** Same as [bloom-7b1](https://huggingface.co/bigscience/bloom-7b1), also refer to the `config.json` file
+- **Finetuning steps:** 1000
+- **Finetuning tokens:** 4.19 billion
+- **Finetuning layout:** 1x pipeline parallel, 1x tensor parallel, 64x data parallel
+- **Precision:** float16
+
+## Hardware
+
+- **CPUs:** AMD CPUs with 512GB memory per node
+- **GPUs:** 64 A100 80GB GPUs with 8 GPUs per node (8 nodes) using NVLink 4 inter-gpu connects, 4 OmniPath links
+- **Communication:** NCCL-communications network with a fully dedicated subnet
+
+## Software
+
+- **Orchestration:** [Megatron-DeepSpeed](https://github.com/bigscience-workshop/Megatron-DeepSpeed)
+- **Optimizer & parallelism:** [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+- **Neural networks:** [PyTorch](https://github.com/pytorch/pytorch) (pytorch-1.11 w/ CUDA-11.5)
+- **FP16 if applicable:** [apex](https://github.com/NVIDIA/apex)
+
+# Evaluation
+
+We refer to Table 7 from our [paper](https://arxiv.org/abs/2211.01786) & [bigscience/evaluation-results](https://huggingface.co/datasets/bigscience/evaluation-results) for zero-shot results on unseen tasks. The sidebar reports zero-shot performance of the best prompt per dataset config.
+
+# Citation
+```bibtex
+@misc{muennighoff2022crosslingual,
+      title={Crosslingual Generalization through Multitask Finetuning}, 
+      author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
+      year={2022},
+      eprint={2211.01786},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
--- a/config.json
+++ b/config.json
@@ -0,0 +1,31 @@
+{
+  "apply_residual_connection_post_layernorm": false,
+  "architectures": [
+    "BloomForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attention_softmax_in_fp32": true,
+  "bias_dropout_fusion": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_dropout": 0.0,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "masked_softmax_fusion": true,
+  "model_type": "bloom",
+  "n_embed": 4096,
+  "n_inner": null,
+  "n_layer": 30,
+  "num_attention_heads": 32,
+  "offset_alibi": 100,
+  "pad_token_id": 3,
+  "pretraining_tp": 4,
+  "seq_length": 2048,
+  "skip_bias_add": true,
+  "skip_bias_add_qkv": false,
+  "slow_but_exact": false,
+  "transformers_version": "4.21.0.dev0",
+  "unk_token_id": 0,
+  "use_cache": true,
+  "vocab_size": 250880
+}
--- a/configuration.json
+++ b/configuration.json
@@ -0,0 +1 @@
+{"framework": "pytorch", "task": "text-generation", "allow_remote": true}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.7518199867637326
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.7749834546657842
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.586366644606221
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.7518199867637326
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.7438782263401721
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.7835870284579749
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.8292521508934481
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.6399735274652548
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.7935142289874255
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.7888815354070152
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.7041694242223693
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.6823295830575777
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.5625413633355394
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.6671078755790867
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.671740569159497
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.6915949702183984
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.7220383851753805
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.5883520847121112
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.6743878226340172
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.6816677696889477
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.7445400397088021
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.771012574454004
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.6029119788219722
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.7485109199205824
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.7438782263401721
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.7610853739245532
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.7961614824619457
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.6214427531436135
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.7696889477167439
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.7670416942422237
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "en",
+  "template_name": "Replace",
+  "evaluation": {
+    "accuracy": 0.5225806451612903
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "en",
+  "template_name": "True or False",
+  "evaluation": {
+    "accuracy": 0.48946236559139783
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "en",
+  "template_name": "does underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5281720430107527
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "en",
+  "template_name": "stand for",
+  "evaluation": {
+    "accuracy": 0.5062365591397849
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "en",
+  "template_name": "underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5372043010752688
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "Replace",
+  "evaluation": {
+    "accuracy": 0.5060240963855421
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "True or False",
+  "evaluation": {
+    "accuracy": 0.5421686746987951
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "does underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5542168674698795
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "stand for",
+  "evaluation": {
+    "accuracy": 0.4819277108433735
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5301204819277109
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "Replace",
+  "evaluation": {
+    "accuracy": 0.5133079847908745
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "True or False",
+  "evaluation": {
+    "accuracy": 0.4714828897338403
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "does underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5209125475285171
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "stand for",
+  "evaluation": {
+    "accuracy": 0.5019011406844106
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5399239543726235
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "Replace",
+  "evaluation": {
+    "accuracy": 0.5257936507936508
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "True or False",
+  "evaluation": {
+    "accuracy": 0.5297619047619048
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "does underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5218253968253969
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "stand for",
+  "evaluation": {
+    "accuracy": 0.4444444444444444
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5198412698412699
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r1",
+  "template_name": "GPT-3 style",
+  "evaluation": {
+    "accuracy": 0.351
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r1",
+  "template_name": "MNLI crowdsource",
+  "evaluation": {
+    "accuracy": 0.334
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/can_we_infer/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r1",
+  "template_name": "can we infer",
+  "evaluation": {
+    "accuracy": 0.351
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r1",
+  "template_name": "guaranteed/possible/impossible",
+  "evaluation": {
+    "accuracy": 0.288
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r1",
+  "template_name": "justified in saying",
+  "evaluation": {
+    "accuracy": 0.345
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r2",
+  "template_name": "GPT-3 style",
+  "evaluation": {
+    "accuracy": 0.339
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r2",
+  "template_name": "MNLI crowdsource",
+  "evaluation": {
+    "accuracy": 0.335
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/can_we_infer/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r2",
+  "template_name": "can we infer",
+  "evaluation": {
+    "accuracy": 0.354
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r2",
+  "template_name": "guaranteed/possible/impossible",
+  "evaluation": {
+    "accuracy": 0.297
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r2",
+  "template_name": "justified in saying",
+  "evaluation": {
+    "accuracy": 0.345
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r3",
+  "template_name": "GPT-3 style",
+  "evaluation": {
+    "accuracy": 0.37583333333333335
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r3",
+  "template_name": "MNLI crowdsource",
+  "evaluation": {
+    "accuracy": 0.3408333333333333
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/can_we_infer/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r3",
+  "template_name": "can we infer",
+  "evaluation": {
+    "accuracy": 0.36333333333333334
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r3",
+  "template_name": "guaranteed/possible/impossible",
+  "evaluation": {
+    "accuracy": 0.31083333333333335
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "anli",
+  "dataset_config_name": "dev_r3",
+  "template_name": "justified in saying",
+  "evaluation": {
+    "accuracy": 0.34
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/merged.csv
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/merged.csv
@@ -0,0 +1,194 @@
+dataset,prompt,metric,value
+anli_dev_r1,GPT-3 style,accuracy,0.351
+anli_dev_r1,MNLI crowdsource,accuracy,0.334
+anli_dev_r1,can we infer,accuracy,0.351
+anli_dev_r1,guaranteed/possible/impossible,accuracy,0.288
+anli_dev_r1,justified in saying,accuracy,0.345
+anli_dev_r1,median,accuracy,0.345
+anli_dev_r2,GPT-3 style,accuracy,0.339
+anli_dev_r2,MNLI crowdsource,accuracy,0.335
+anli_dev_r2,can we infer,accuracy,0.354
+anli_dev_r2,guaranteed/possible/impossible,accuracy,0.297
+anli_dev_r2,justified in saying,accuracy,0.345
+anli_dev_r2,median,accuracy,0.339
+anli_dev_r3,GPT-3 style,accuracy,0.37583333333333335
+anli_dev_r3,MNLI crowdsource,accuracy,0.3408333333333333
+anli_dev_r3,can we infer,accuracy,0.36333333333333334
+anli_dev_r3,guaranteed/possible/impossible,accuracy,0.31083333333333335
+anli_dev_r3,justified in saying,accuracy,0.34
+anli_dev_r3,median,accuracy,0.3408333333333333
+story_cloze_2016,Answer Given options,accuracy,0.8305718866916088
+story_cloze_2016,Choose Story Ending,accuracy,0.8706574024585783
+story_cloze_2016,Generate Ending,accuracy,0.7183324425440941
+story_cloze_2016,Novel Correct Ending,accuracy,0.848743987172635
+story_cloze_2016,Story Continuation and Options,accuracy,0.8466060929983966
+story_cloze_2016,median,accuracy,0.8466060929983966
+super_glue_cb,GPT-3 style,accuracy,0.625
+super_glue_cb,MNLI crowdsource,accuracy,0.08928571428571429
+super_glue_cb,can we infer,accuracy,0.5892857142857143
+super_glue_cb,guaranteed/possible/impossible,accuracy,0.5
+super_glue_cb,justified in saying,accuracy,0.5357142857142857
+super_glue_cb,median,accuracy,0.5357142857142857
+super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66
+super_glue_copa,best_option,accuracy,0.67
+super_glue_copa,cause_effect,accuracy,0.78
+super_glue_copa,i_am_hesitating,accuracy,0.8
+super_glue_copa,plausible_alternatives,accuracy,0.81
+super_glue_copa,median,accuracy,0.78
+super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
+super_glue_rte,MNLI crowdsource,accuracy,0.7220216606498195
+super_glue_rte,does it follow that,accuracy,0.6678700361010831
+super_glue_rte,guaranteed true,accuracy,0.6714801444043321
+super_glue_rte,should assume,accuracy,0.6678700361010831
+super_glue_rte,median,accuracy,0.6714801444043321
+winogrande_winogrande_xl,Replace,accuracy,0.5406471981057617
+winogrande_winogrande_xl,True or False,accuracy,0.5074980268350434
+winogrande_winogrande_xl,does underscore refer to,accuracy,0.5177584846093133
+winogrande_winogrande_xl,stand for,accuracy,0.510655090765588
+winogrande_winogrande_xl,underscore refer to,accuracy,0.5256511444356748
+winogrande_winogrande_xl,median,accuracy,0.5177584846093133
+xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.47
+xcopa_id,best_option,accuracy,0.51
+xcopa_id,cause_effect,accuracy,0.65
+xcopa_id,i_am_hesitating,accuracy,0.66
+xcopa_id,plausible_alternatives,accuracy,0.67
+xcopa_id,median,accuracy,0.65
+xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.58
+xcopa_sw,best_option,accuracy,0.57
+xcopa_sw,cause_effect,accuracy,0.46
+xcopa_sw,i_am_hesitating,accuracy,0.48
+xcopa_sw,plausible_alternatives,accuracy,0.45
+xcopa_sw,median,accuracy,0.48
+xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57
+xcopa_ta,best_option,accuracy,0.67
+xcopa_ta,cause_effect,accuracy,0.71
+xcopa_ta,i_am_hesitating,accuracy,0.71
+xcopa_ta,plausible_alternatives,accuracy,0.69
+xcopa_ta,median,accuracy,0.69
+xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
+xcopa_vi,best_option,accuracy,0.61
+xcopa_vi,cause_effect,accuracy,0.67
+xcopa_vi,i_am_hesitating,accuracy,0.66
+xcopa_vi,plausible_alternatives,accuracy,0.65
+xcopa_vi,median,accuracy,0.65
+xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.62
+xcopa_zh,best_option,accuracy,0.61
+xcopa_zh,cause_effect,accuracy,0.77
+xcopa_zh,i_am_hesitating,accuracy,0.72
+xcopa_zh,plausible_alternatives,accuracy,0.74
+xcopa_zh,median,accuracy,0.72
+xnli_ar,GPT-3 style,accuracy,0.5040160642570282
+xnli_ar,MNLI crowdsource,accuracy,0.39879518072289155
+xnli_ar,can we infer,accuracy,0.506425702811245
+xnli_ar,guaranteed/possible/impossible,accuracy,0.4799196787148594
+xnli_ar,justified in saying,accuracy,0.41526104417670684
+xnli_ar,median,accuracy,0.4799196787148594
+xnli_en,GPT-3 style,accuracy,0.5590361445783133
+xnli_en,MNLI crowdsource,accuracy,0.342570281124498
+xnli_en,can we infer,accuracy,0.5449799196787148
+xnli_en,guaranteed/possible/impossible,accuracy,0.41164658634538154
+xnli_en,justified in saying,accuracy,0.4634538152610442
+xnli_en,median,accuracy,0.4634538152610442
+xnli_es,GPT-3 style,accuracy,0.5373493975903615
+xnli_es,MNLI crowdsource,accuracy,0.40441767068273093
+xnli_es,can we infer,accuracy,0.5277108433734939
+xnli_es,guaranteed/possible/impossible,accuracy,0.44216867469879517
+xnli_es,justified in saying,accuracy,0.4534136546184739
+xnli_es,median,accuracy,0.4534136546184739
+xnli_fr,GPT-3 style,accuracy,0.5248995983935743
+xnli_fr,MNLI crowdsource,accuracy,0.3895582329317269
+xnli_fr,can we infer,accuracy,0.5337349397590362
+xnli_fr,guaranteed/possible/impossible,accuracy,0.42971887550200805
+xnli_fr,justified in saying,accuracy,0.4738955823293173
+xnli_fr,median,accuracy,0.4738955823293173
+xnli_hi,GPT-3 style,accuracy,0.4983935742971888
+xnli_hi,MNLI crowdsource,accuracy,0.38714859437751004
+xnli_hi,can we infer,accuracy,0.45542168674698796
+xnli_hi,guaranteed/possible/impossible,accuracy,0.41405622489959837
+xnli_hi,justified in saying,accuracy,0.38795180722891565
+xnli_hi,median,accuracy,0.41405622489959837
+xnli_sw,GPT-3 style,accuracy,0.43493975903614457
+xnli_sw,MNLI crowdsource,accuracy,0.363855421686747
+xnli_sw,can we infer,accuracy,0.42891566265060244
+xnli_sw,guaranteed/possible/impossible,accuracy,0.3457831325301205
+xnli_sw,justified in saying,accuracy,0.3650602409638554
+xnli_sw,median,accuracy,0.3650602409638554
+xnli_ur,GPT-3 style,accuracy,0.43493975903614457
+xnli_ur,MNLI crowdsource,accuracy,0.3895582329317269
+xnli_ur,can we infer,accuracy,0.45180722891566266
+xnli_ur,guaranteed/possible/impossible,accuracy,0.40120481927710844
+xnli_ur,justified in saying,accuracy,0.37630522088353413
+xnli_ur,median,accuracy,0.40120481927710844
+xnli_vi,GPT-3 style,accuracy,0.5196787148594377
+xnli_vi,MNLI crowdsource,accuracy,0.38112449799196785
+xnli_vi,can we infer,accuracy,0.5080321285140562
+xnli_vi,guaranteed/possible/impossible,accuracy,0.38393574297188754
+xnli_vi,justified in saying,accuracy,0.43614457831325304
+xnli_vi,median,accuracy,0.43614457831325304
+xnli_zh,GPT-3 style,accuracy,0.5052208835341365
+xnli_zh,MNLI crowdsource,accuracy,0.4
+xnli_zh,can we infer,accuracy,0.5228915662650603
+xnli_zh,guaranteed/possible/impossible,accuracy,0.4738955823293173
+xnli_zh,justified in saying,accuracy,0.45863453815261046
+xnli_zh,median,accuracy,0.4738955823293173
+xstory_cloze_ar,Answer Given options,accuracy,0.7518199867637326
+xstory_cloze_ar,Choose Story Ending,accuracy,0.7749834546657842
+xstory_cloze_ar,Generate Ending,accuracy,0.586366644606221
+xstory_cloze_ar,Novel Correct Ending,accuracy,0.7518199867637326
+xstory_cloze_ar,Story Continuation and Options,accuracy,0.7438782263401721
+xstory_cloze_ar,median,accuracy,0.7518199867637326
+xstory_cloze_es,Answer Given options,accuracy,0.7835870284579749
+xstory_cloze_es,Choose Story Ending,accuracy,0.8292521508934481
+xstory_cloze_es,Generate Ending,accuracy,0.6399735274652548
+xstory_cloze_es,Novel Correct Ending,accuracy,0.7935142289874255
+xstory_cloze_es,Story Continuation and Options,accuracy,0.7888815354070152
+xstory_cloze_es,median,accuracy,0.7888815354070152
+xstory_cloze_eu,Answer Given options,accuracy,0.7041694242223693
+xstory_cloze_eu,Choose Story Ending,accuracy,0.6823295830575777
+xstory_cloze_eu,Generate Ending,accuracy,0.5625413633355394
+xstory_cloze_eu,Novel Correct Ending,accuracy,0.6671078755790867
+xstory_cloze_eu,Story Continuation and Options,accuracy,0.671740569159497
+xstory_cloze_eu,median,accuracy,0.671740569159497
+xstory_cloze_hi,Answer Given options,accuracy,0.6915949702183984
+xstory_cloze_hi,Choose Story Ending,accuracy,0.7220383851753805
+xstory_cloze_hi,Generate Ending,accuracy,0.5883520847121112
+xstory_cloze_hi,Novel Correct Ending,accuracy,0.6743878226340172
+xstory_cloze_hi,Story Continuation and Options,accuracy,0.6816677696889477
+xstory_cloze_hi,median,accuracy,0.6816677696889477
+xstory_cloze_id,Answer Given options,accuracy,0.7445400397088021
+xstory_cloze_id,Choose Story Ending,accuracy,0.771012574454004
+xstory_cloze_id,Generate Ending,accuracy,0.6029119788219722
+xstory_cloze_id,Novel Correct Ending,accuracy,0.7485109199205824
+xstory_cloze_id,Story Continuation and Options,accuracy,0.7438782263401721
+xstory_cloze_id,median,accuracy,0.7445400397088021
+xstory_cloze_zh,Answer Given options,accuracy,0.7610853739245532
+xstory_cloze_zh,Choose Story Ending,accuracy,0.7961614824619457
+xstory_cloze_zh,Generate Ending,accuracy,0.6214427531436135
+xstory_cloze_zh,Novel Correct Ending,accuracy,0.7696889477167439
+xstory_cloze_zh,Story Continuation and Options,accuracy,0.7670416942422237
+xstory_cloze_zh,median,accuracy,0.7670416942422237
+xwinograd_en,Replace,accuracy,0.5225806451612903
+xwinograd_en,True or False,accuracy,0.48946236559139783
+xwinograd_en,does underscore refer to,accuracy,0.5281720430107527
+xwinograd_en,stand for,accuracy,0.5062365591397849
+xwinograd_en,underscore refer to,accuracy,0.5372043010752688
+xwinograd_en,median,accuracy,0.5225806451612903
+xwinograd_fr,Replace,accuracy,0.5060240963855421
+xwinograd_fr,True or False,accuracy,0.5421686746987951
+xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795
+xwinograd_fr,stand for,accuracy,0.4819277108433735
+xwinograd_fr,underscore refer to,accuracy,0.5301204819277109
+xwinograd_fr,median,accuracy,0.5301204819277109
+xwinograd_pt,Replace,accuracy,0.5133079847908745
+xwinograd_pt,True or False,accuracy,0.4714828897338403
+xwinograd_pt,does underscore refer to,accuracy,0.5209125475285171
+xwinograd_pt,stand for,accuracy,0.5019011406844106
+xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
+xwinograd_pt,median,accuracy,0.5133079847908745
+xwinograd_zh,Replace,accuracy,0.5257936507936508
+xwinograd_zh,True or False,accuracy,0.5297619047619048
+xwinograd_zh,does underscore refer to,accuracy,0.5218253968253969
+xwinograd_zh,stand for,accuracy,0.4444444444444444
+xwinograd_zh,underscore refer to,accuracy,0.5198412698412699
+xwinograd_zh,median,accuracy,0.5218253968253969
+multiple,average,multiple,0.5631550819200618
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/merged.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/merged.json
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "story_cloze",
+  "dataset_config_name": "2016",
+  "template_name": "Answer Given options",
+  "evaluation": {
+    "accuracy": 0.8305718866916088
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "story_cloze",
+  "dataset_config_name": "2016",
+  "template_name": "Choose Story Ending",
+  "evaluation": {
+    "accuracy": 0.8706574024585783
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "story_cloze",
+  "dataset_config_name": "2016",
+  "template_name": "Generate Ending",
+  "evaluation": {
+    "accuracy": 0.7183324425440941
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "story_cloze",
+  "dataset_config_name": "2016",
+  "template_name": "Novel Correct Ending",
+  "evaluation": {
+    "accuracy": 0.848743987172635
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "story_cloze",
+  "dataset_config_name": "2016",
+  "template_name": "Story Continuation and Options",
+  "evaluation": {
+    "accuracy": 0.8466060929983966
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/GPT-3_style/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "cb",
+  "template_name": "GPT-3 style",
+  "evaluation": {
+    "accuracy": 0.625
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "cb",
+  "template_name": "MNLI crowdsource",
+  "evaluation": {
+    "accuracy": 0.08928571428571429
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/can_we_infer/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "cb",
+  "template_name": "can we infer",
+  "evaluation": {
+    "accuracy": 0.5892857142857143
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "cb",
+  "template_name": "guaranteed/possible/impossible",
+  "evaluation": {
+    "accuracy": 0.5
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/justified_in_saying/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/cb/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "cb",
+  "template_name": "justified in saying",
+  "evaluation": {
+    "accuracy": 0.5357142857142857
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise,_so_because…/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise,_so_because…/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "copa",
+  "template_name": "C1 or C2? premise, so/because\u2026",
+  "evaluation": {
+    "accuracy": 0.66
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise, so/because\u2026', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "copa",
+  "template_name": "C1 or C2? premise, so/because\u2026",
+  "evaluation": {
+    "accuracy": 0.66
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/best_option/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/best_option/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "copa",
+  "template_name": "best_option",
+  "evaluation": {
+    "accuracy": 0.67
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/cause_effect/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "copa",
+  "template_name": "cause_effect",
+  "evaluation": {
+    "accuracy": 0.78
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "copa",
+  "template_name": "i_am_hesitating",
+  "evaluation": {
+    "accuracy": 0.8
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "copa",
+  "template_name": "plausible_alternatives",
+  "evaluation": {
+    "accuracy": 0.81
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/GPT-3_style/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "rte",
+  "template_name": "GPT-3 style",
+  "evaluation": {
+    "accuracy": 0.7870036101083032
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "rte",
+  "template_name": "MNLI crowdsource",
+  "evaluation": {
+    "accuracy": 0.7220216606498195
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "rte",
+  "template_name": "does it follow that",
+  "evaluation": {
+    "accuracy": 0.6678700361010831
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/guaranteed_true/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/guaranteed_true/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "rte",
+  "template_name": "guaranteed true",
+  "evaluation": {
+    "accuracy": 0.6714801444043321
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/should_assume/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/super_glue/rte/should_assume/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "super_glue",
+  "dataset_config_name": "rte",
+  "template_name": "should assume",
+  "evaluation": {
+    "accuracy": 0.6678700361010831
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "winogrande",
+  "dataset_config_name": "winogrande_xl",
+  "template_name": "Replace",
+  "evaluation": {
+    "accuracy": 0.5406471981057617
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "winogrande",
+  "dataset_config_name": "winogrande_xl",
+  "template_name": "True or False",
+  "evaluation": {
+    "accuracy": 0.5074980268350434
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "winogrande",
+  "dataset_config_name": "winogrande_xl",
+  "template_name": "does underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5177584846093133
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "winogrande",
+  "dataset_config_name": "winogrande_xl",
+  "template_name": "stand for",
+  "evaluation": {
+    "accuracy": 0.510655090765588
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "winogrande",
+  "dataset_config_name": "winogrande_xl",
+  "template_name": "underscore refer to",
+  "evaluation": {
+    "accuracy": 0.5256511444356748
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "C1 or C2? premise, so/because\u2026",
+  "evaluation": {
+    "accuracy": 0.47
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/best_option/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/best_option/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "best_option",
+  "evaluation": {
+    "accuracy": 0.51
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/cause_effect/results.json
+++ b/evaluation_bloomz-7b1-p3/evaluation_l1/xcopa/id/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "cause_effect",
+  "evaluation": {
+    "accuracy": 0.65
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/p31lossseqglobal_step1000/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`{"framework": "pytorch", "task": "text-generation", "allow_remote": true}`