# Config file for https://github.com/malteos/lm-datasets
#
# EU top-5 (en,fr,es,de,it) + code
# target size: 300B tokens (train first for 200B tokens)

# a fixed random seed for shuffling etc.
seed: 0

# data split settings

# number of documents in the validation split: len(dataset) * ratio
validation_ratio: 0.005
# to be used as a validation set, the dataset must have at least n docs
# (written without a digit separator: `1_000` is an int only under YAML 1.1
# and would be read as a string by YAML 1.2 core-schema loaders)
validation_min_total_docs: 1000
# the number of documents in a validation split is capped at this number
validation_max_split_docs: 1000
# a split must have at least this number of documents, otherwise it is discarded
validation_min_split_docs: 10
# NOTE(review): presumably the share of the data used for tokenizer training;
# confirm against the consuming tool
tokenizer_train_ratio: 0.1

# Sources selected as a whole.
# NOTE(review): presumably this pulls in every dataset of the source (cf. the
# starcoder_* keys in sampling_factor_by_dataset_id); confirm against the consumer
selected_source_ids:
  - starcoder

# Individually selected datasets, grouped by language.
selected_dataset_ids:
  # english
  - pes2o
  - math_amps
  - eurlex_en
  - wikipedia_20231101_en
  - wikibooks_en
  - wikiquote_en
  - wikinews_en
  - wikisource_en
  - wikivoyage_en
  - colossal_oscar_2015-14_en
  - colossal_oscar_2016-40_en
  - colossal_oscar_2017-43_en
  - colossal_oscar_2018-47_en
  - colossal_oscar_2019-22_en
  - colossal_oscar_2020-24_en
  - colossal_oscar_2020-45_en
  - colossal_oscar_2021-49_en
  - colossal_oscar_2022-27_en
  - colossal_oscar_2022-49_en
  - colossal_oscar_2023-14_en
  - colossal_oscar_2023-23_en
  - pile_of_law_r_legaladvice
  - pile_of_law_atticus_contracts
  - pile_of_law_un_debates
  - proof_pile2_open_web_math
  - parlamint_gb
  - redpajama_stackexchange

  # french
  - cabernet
  - eurlex_fr
  - legal_mc4_fr
  - wikipedia_20231101_fr
  - wikibooks_fr
  - wikiquote_fr
  - wikinews_fr
  - wikisource_fr
  - wikivoyage_fr
  - colossal_oscar_2015-14_fr
  - colossal_oscar_2016-40_fr
  - colossal_oscar_2017-43_fr
  - colossal_oscar_2018-47_fr
  - colossal_oscar_2019-22_fr
  - colossal_oscar_2020-24_fr
  - colossal_oscar_2020-45_fr
  - colossal_oscar_2021-49_fr
  - colossal_oscar_2022-27_fr
  - colossal_oscar_2022-49_fr
  - colossal_oscar_2023-14_fr
  - colossal_oscar_2023-23_fr
  - opensubtitles_fr
  - parlamint_fr

  # spanish
  - spanish_legal
  - eurlex_es
  - legal_mc4_es
  - wikipedia_20231101_es
  - wikibooks_es
  - wikiquote_es
  - wikinews_es
  - wikisource_es
  - wikivoyage_es
  - colossal_oscar_2015-14_es
  - colossal_oscar_2016-40_es
  - colossal_oscar_2017-43_es
  - colossal_oscar_2018-47_es
  - colossal_oscar_2019-22_es
  - colossal_oscar_2020-24_es
  - colossal_oscar_2020-45_es
  - colossal_oscar_2021-49_es
  - colossal_oscar_2022-27_es
  - colossal_oscar_2022-49_es
  - colossal_oscar_2023-14_es
  - colossal_oscar_2023-23_es
  - opensubtitles_es
  - parlamint_es

  # german
  - openlegaldata
  - dewac
  - eurlex_de
  - legal_mc4_de
  - wikipedia_20231101_de
  - wikibooks_de
  - wikiquote_de
  - wikinews_de
  - wikisource_de
  - wikivoyage_de
  - colossal_oscar_2015-14_de
  - colossal_oscar_2016-40_de
  - colossal_oscar_2017-43_de
  - colossal_oscar_2018-47_de
  - colossal_oscar_2019-22_de
  - colossal_oscar_2020-24_de
  - colossal_oscar_2020-45_de
  - colossal_oscar_2021-49_de
  - colossal_oscar_2022-27_de
  - colossal_oscar_2022-49_de
  - colossal_oscar_2023-14_de
  - colossal_oscar_2023-23_de
  - open_discourse_bundestag
  - tagesschau_2018_2023
  - opensubtitles_de
  - parlamint_at

  # italian
  - itwac
  - eurlex_it
  - legal_mc4_it
  - wikipedia_20231101_it
  - wikibooks_it
  - wikiquote_it
  - wikinews_it
  - wikisource_it
  - wikivoyage_it
  - colossal_oscar_2015-14_it
  - colossal_oscar_2016-40_it
  - colossal_oscar_2017-43_it
  - colossal_oscar_2018-47_it
  - colossal_oscar_2019-22_it
  - colossal_oscar_2020-24_it
  - colossal_oscar_2020-45_it
  - colossal_oscar_2021-49_it
  - colossal_oscar_2022-27_it
  - colossal_oscar_2022-49_it
  - colossal_oscar_2023-14_it
  - colossal_oscar_2023-23_it
  - opensubtitles_it
  - parlamint_it

  # translation pairs (language codes are part of the dataset ID)
  - tatoeba_translation_en_fr
  - tatoeba_translation_en_es
  - tatoeba_translation_en_it
  - tatoeba_translation_fr_it
  - tatoeba_translation_es_fr
  - tatoeba_translation_es_it
  - tatoeba_translation_de_en
  - tatoeba_translation_de_fr
  - tatoeba_translation_de_es
  - tatoeba_translation_de_it
  - opus100_translation_de_en
  - opus100_translation_en_es
  - opus100_translation_en_fr
  - opus100_translation_en_it
  - wmt19_translation_de_en
  - wmt19_translation_fr_de

# Per-dataset sampling factors, keyed by dataset ID.
# NOTE(review): presumably a factor below 1 down-samples and a factor above 1
# up-samples the dataset; confirm against the consuming tool.
sampling_factor_by_dataset_id:
  # curated corpora
  redpajama_stackexchange: 0.1
  pes2o: 0.1
  math_amps: 0.1
  openlegaldata: 0.75
  dewac: 0.05
  itwac: 1
  cabernet: 1
  spanish_legal: 0.1

  # eurlex
  eurlex_de: 0.5
  eurlex_en: 0.5
  eurlex_es: 1
  eurlex_fr: 1
  eurlex_it: 1

  # legal_mc4
  legal_mc4_de: 0.1
  legal_mc4_es: 0.25
  legal_mc4_fr: 0.25
  legal_mc4_it: 1

  # wiki projects (de)
  wikipedia_20231101_de: 2
  wikibooks_de: 1
  wikiquote_de: 1
  wikinews_de: 2
  wikisource_de: 1
  wikivoyage_de: 1

  # wiki projects (en)
  wikipedia_20231101_en: 1
  wikibooks_en: 1
  wikiquote_en: 0.25
  wikinews_en: 1
  wikisource_en: 1
  wikivoyage_en: 1

  # wiki projects (es)
  wikipedia_20231101_es: 2
  wikibooks_es: 1
  wikiquote_es: 1
  wikinews_es: 2
  wikisource_es: 1
  wikivoyage_es: 1

  # wiki projects (fr)
  wikipedia_20231101_fr: 2
  wikibooks_fr: 1
  wikiquote_fr: 1
  wikinews_fr: 2
  wikisource_fr: 1
  wikivoyage_fr: 1

  # wiki projects (it)
  wikipedia_20231101_it: 2
  wikibooks_it: 1
  wikiquote_it: 1
  wikinews_it: 2
  wikisource_it: 1
  wikivoyage_it: 1

  # colossal_oscar (de)
  colossal_oscar_2015-14_de: 1
  colossal_oscar_2016-40_de: 0.95
  colossal_oscar_2017-43_de: 0.1
  colossal_oscar_2018-47_de: 0.1
  colossal_oscar_2019-22_de: 0.1
  colossal_oscar_2020-24_de: 0.1
  colossal_oscar_2020-45_de: 0.1
  colossal_oscar_2021-49_de: 0.1
  colossal_oscar_2022-27_de: 0.1
  colossal_oscar_2022-49_de: 0.1
  colossal_oscar_2023-14_de: 0.95
  colossal_oscar_2023-23_de: 1

  # colossal_oscar (en)
  colossal_oscar_2015-14_en: 0.05
  colossal_oscar_2016-40_en: 0.05
  colossal_oscar_2017-43_en: 0.001
  colossal_oscar_2018-47_en: 0.001
  colossal_oscar_2019-22_en: 0.001
  colossal_oscar_2020-24_en: 0.001
  colossal_oscar_2020-45_en: 0.001
  colossal_oscar_2021-49_en: 0.001
  colossal_oscar_2022-27_en: 0.001
  colossal_oscar_2022-49_en: 0.001
  colossal_oscar_2023-14_en: 0.05
  colossal_oscar_2023-23_en: 0.05

  # colossal_oscar (es)
  colossal_oscar_2015-14_es: 1
  colossal_oscar_2016-40_es: 1
  colossal_oscar_2017-43_es: 0.25
  colossal_oscar_2018-47_es: 0.1
  colossal_oscar_2019-22_es: 0.1
  colossal_oscar_2020-24_es: 0.1
  colossal_oscar_2020-45_es: 0.1
  colossal_oscar_2021-49_es: 0.1
  colossal_oscar_2022-27_es: 0.1
  colossal_oscar_2022-49_es: 0.3
  colossal_oscar_2023-14_es: 1
  colossal_oscar_2023-23_es: 1

  # colossal_oscar (fr)
  colossal_oscar_2015-14_fr: 1
  colossal_oscar_2016-40_fr: 1
  colossal_oscar_2017-43_fr: 0.25
  colossal_oscar_2018-47_fr: 0.25
  colossal_oscar_2019-22_fr: 0.1
  colossal_oscar_2020-24_fr: 0.1
  colossal_oscar_2020-45_fr: 0.1
  colossal_oscar_2021-49_fr: 0.1
  colossal_oscar_2022-27_fr: 0.1
  colossal_oscar_2022-49_fr: 0.75
  colossal_oscar_2023-14_fr: 1
  colossal_oscar_2023-23_fr: 1

  # starcoder (code)
  starcoder_emacs-lisp: 0.1
  starcoder_literate-haskell: 0.1
  starcoder_shell: 0.1
  starcoder_ada: 0.1
  starcoder_erlang: 0.1
  starcoder_lua: 0.1
  starcoder_smalltalk: 0.1
  starcoder_agda: 0.1
  starcoder_f-sharp: 0.1
  starcoder_makefile: 0.1
  starcoder_solidity: 0.1
  starcoder_alloy: 0.1
  starcoder_fortran: 0.1
  starcoder_maple: 0.1
  starcoder_sparql: 0.1
  starcoder_antlr: 0.1
  starcoder_git-commits-cleaned: 0.05
  starcoder_markdown: 0.05
  starcoder_sql: 0.1
  starcoder_applescript: 0.1
  starcoder_github-issues-filtered-structured: 0.075
  starcoder_mathematica: 0.1
  starcoder_stan: 0.1
  starcoder_assembly: 0.1
  starcoder_glsl: 0.1
  starcoder_matlab: 0.1
  starcoder_standard-ml: 0.1
  starcoder_augeas: 0.1
  starcoder_go: 0.05
  starcoder_ocaml: 0.1
  starcoder_stata: 0.1
  starcoder_awk: 0.1
  starcoder_groovy: 0.1
  starcoder_pascal: 0.1
  starcoder_systemverilog: 0.1
  starcoder_batchfile: 0.1
  starcoder_haskell: 0.1
  starcoder_perl: 0.1
  starcoder_tcl: 0.1
  starcoder_bluespec: 0.1
  starcoder_html: 0.05
  starcoder_php: 0.05
  starcoder_tcsh: 0.1
  starcoder_c: 0.05
  starcoder_idris: 0.1
  starcoder_powershell: 0.1
  starcoder_tex: 0.1
  starcoder_c-sharp: 0.05
  starcoder_isabelle: 0.1
  starcoder_prolog: 0.1
  starcoder_thrift: 0.1
  starcoder_clojure: 0.1
  starcoder_java: 0.05
  starcoder_protocol-buffer: 0.1
  starcoder_typescript: 0.05
  starcoder_cmake: 0.1
  starcoder_java-server-pages: 0.1
  starcoder_python: 0.05
  starcoder_verilog: 0.1
  starcoder_coffeescript: 0.1
  starcoder_javascript: 0.05
  starcoder_r: 0.1
  starcoder_vhdl: 0.1
  starcoder_common-lisp: 0.1
  starcoder_json: 0.1
  starcoder_racket: 0.1
  starcoder_visual-basic: 0.1
  starcoder_cpp: 0.05
  starcoder_julia: 0.1
  starcoder_restructuredtext: 0.1
  starcoder_xslt: 0.1
  starcoder_css: 0.1
  starcoder_jupyter-scripts-dedup-filtered: 0.1
  starcoder_rmarkdown: 0.1
  starcoder_yacc: 0.1
  starcoder_cuda: 0.1
  starcoder_jupyter-structured-clean-dedup: 0.1
  starcoder_ruby: 0.1
  starcoder_yaml: 0.1
  starcoder_dart: 0.1
  starcoder_kotlin: 0.1
  starcoder_rust: 0.1
  starcoder_zig: 0.1
  starcoder_dockerfile: 0.1
  starcoder_lean: 0.1
  starcoder_sas: 0.1
  starcoder_elixir: 0.1
  starcoder_literate-agda: 0.1
  starcoder_scala: 0.1
  starcoder_elm: 0.1
  starcoder_literate-coffeescript: 0.1
  starcoder_scheme: 0.1

  # pile_of_law and other curated corpora
  pile_of_law_r_legaladvice: 1
  pile_of_law_atticus_contracts: 0.25
  pile_of_law_un_debates: 1
  open_discourse_bundestag: 0.5
  tagesschau_2018_2023: 1
  proof_pile2_open_web_math: 0.25

  # translation pairs
  tatoeba_translation_en_fr: 1
  tatoeba_translation_en_es: 1
  tatoeba_translation_en_it: 1
  tatoeba_translation_fr_it: 1
  tatoeba_translation_es_fr: 1
  tatoeba_translation_es_it: 1
  tatoeba_translation_de_en: 1
  tatoeba_translation_de_fr: 1
  tatoeba_translation_de_es: 1
  tatoeba_translation_de_it: 1
  opus100_translation_de_en: 1
  opus100_translation_en_es: 1
  opus100_translation_en_fr: 1
  opus100_translation_en_it: 1
  wmt19_translation_de_en: 1
  wmt19_translation_fr_de: 1

  # opensubtitles
  opensubtitles_es: 1
  opensubtitles_fr: 1
  opensubtitles_de: 1
  opensubtitles_it: 1

  # parlamint
  parlamint_es: 1
  parlamint_it: 1
  parlamint_at: 1
  parlamint_fr: 1
  parlamint_gb: 1

  # colossal_oscar (it)
  colossal_oscar_2015-14_it: 1
  colossal_oscar_2016-40_it: 1
  colossal_oscar_2017-43_it: 0.75
  colossal_oscar_2018-47_it: 0.75
  colossal_oscar_2019-22_it: 0.75
  colossal_oscar_2020-24_it: 0.75
  colossal_oscar_2020-45_it: 0.75
  colossal_oscar_2021-49_it: 0.75
  colossal_oscar_2022-27_it: 0.75
  colossal_oscar_2022-49_it: 0.75
  colossal_oscar_2023-14_it: 0.9
  colossal_oscar_2023-23_it: 1