{
  "num_threads": 224,
  "split_by_whitespace": true,
  "model_type": "unigram",
  "vocab_size": 250680,
  "character_coverage": 0.9999,
  "byte_fallback": true,
  "split_by_number": true,
  "split_digits": true,
  "normalization_rule_name": "nfkc",
  "max_sentence_length": 4096,
  "shuffle_input_sentence": true,
  "input_sentence_size": 0,
  "train_extremely_large_corpus": true,
  "allow_whitespace_only_pieces": true,
  "required_chars": "",
  "remove_extra_whitespaces": false,
  "user_defined_symbols": [
    "<s>",
    "</s>",
    "<pad>",
    "<eod>",
    "<placeholder_tok_0>",
    "<placeholder_tok_1>",
    "<placeholder_tok_2>",
    "<placeholder_tok_3>",
    "<placeholder_tok_4>",
    "<placeholder_tok_5>",
    "<placeholder_tok_6>",
    "<placeholder_tok_7>",
    "<placeholder_tok_8>",
    "<placeholder_tok_9>",
    "<placeholder_tok_10>",
    "<placeholder_tok_11>",
    "<placeholder_tok_12>",
    "<placeholder_tok_13>",
    "<placeholder_tok_14>",
    "<placeholder_tok_15>",
    "<placeholder_tok_16>",
    "<placeholder_tok_17>",
    "<placeholder_tok_18>",
    "<placeholder_tok_19>",
    "<placeholder_tok_20>",
    "<placeholder_tok_21>",
    "<placeholder_tok_22>",
    "<placeholder_tok_23>",
    "<placeholder_tok_24>",
    "<placeholder_tok_25>",
    "<placeholder_tok_26>",
    "<placeholder_tok_27>",
    "<placeholder_tok_28>",
    "<placeholder_tok_29>",
    "<placeholder_tok_30>",
    "<placeholder_tok_31>",
    "<placeholder_tok_32>",
    "<placeholder_tok_33>",
    "<placeholder_tok_34>",
    "<placeholder_tok_35>",
    "<placeholder_tok_36>",
    "<placeholder_tok_37>",
    "<placeholder_tok_38>",
    "<placeholder_tok_39>",
    "<placeholder_tok_40>",
    "<placeholder_tok_41>",
    "<placeholder_tok_42>",
    "<placeholder_tok_43>",
    "<placeholder_tok_44>",
    "<placeholder_tok_45>",
    "<placeholder_tok_46>",
    "<placeholder_tok_47>",
    "<placeholder_tok_48>",
    "<placeholder_tok_49>",
    "<placeholder_tok_50>",
    "<placeholder_tok_51>",
    "<placeholder_tok_52>",
    "<placeholder_tok_53>",
    "<placeholder_tok_54>",
    "<placeholder_tok_55>",
    "<placeholder_tok_56>",
    "<placeholder_tok_57>",
    "<placeholder_tok_58>",
    "<placeholder_tok_59>",
    "<placeholder_tok_60>",
    "<placeholder_tok_61>",
    "<placeholder_tok_62>",
    "<placeholder_tok_63>",
    "<placeholder_tok_64>",
    "<placeholder_tok_65>",
    "<placeholder_tok_66>",
    "<placeholder_tok_67>",
    "<placeholder_tok_68>",
    "<placeholder_tok_69>",
    "<placeholder_tok_70>",
    "<placeholder_tok_71>",
    "<placeholder_tok_72>",
    "<placeholder_tok_73>",
    "<placeholder_tok_74>",
    "<placeholder_tok_75>",
    "<placeholder_tok_76>",
    "<placeholder_tok_77>",
    "<placeholder_tok_78>",
    "<placeholder_tok_79>",
    "<placeholder_tok_80>",
    "<placeholder_tok_81>",
    "<placeholder_tok_82>",
    "<placeholder_tok_83>",
    "<placeholder_tok_84>",
    "<placeholder_tok_85>",
    "<placeholder_tok_86>",
    "<placeholder_tok_87>",
    "<placeholder_tok_88>",
    "<placeholder_tok_89>",
    "<placeholder_tok_90>",
    "<placeholder_tok_91>",
    "<placeholder_tok_92>",
    "<placeholder_tok_93>",
    "<placeholder_tok_94>",
    "<placeholder_tok_95>",
    "<placeholder_tok_96>",
    "<placeholder_tok_97>",
    "<placeholder_tok_98>",
    "<placeholder_tok_99>",
    "<placeholder_tok_100>",
    "<placeholder_tok_101>",
    "<placeholder_tok_102>",
    "<placeholder_tok_103>",
    "<placeholder_tok_104>",
    "<placeholder_tok_105>",
    "<placeholder_tok_106>",
    "<placeholder_tok_107>",
    "<placeholder_tok_108>",
    "<placeholder_tok_109>",
    "<placeholder_tok_110>",
    "<placeholder_tok_111>",
    "<placeholder_tok_112>",
    "<placeholder_tok_113>",
    "<placeholder_tok_114>",
    "<placeholder_tok_115>",
    "<placeholder_tok_116>",
    "<placeholder_tok_117>",
    "<placeholder_tok_118>",
    "<placeholder_tok_119>",
    "<placeholder_tok_120>",
    "<placeholder_tok_121>",
    "<placeholder_tok_122>",
    "<placeholder_tok_123>",
    "<placeholder_tok_124>",
    "<placeholder_tok_125>",
    "<placeholder_tok_126>",
    "<placeholder_tok_127>",
    "<placeholder_tok_128>",
    "<placeholder_tok_129>",
    "<placeholder_tok_130>",
    "<placeholder_tok_131>",
    "<placeholder_tok_132>",
    "<placeholder_tok_133>",
    "<placeholder_tok_134>",
    "<placeholder_tok_135>",
    "<placeholder_tok_136>",
    "<placeholder_tok_137>",
    "<placeholder_tok_138>",
    "<placeholder_tok_139>",
    "<placeholder_tok_140>",
    "<placeholder_tok_141>",
    "<placeholder_tok_142>",
    "<placeholder_tok_143>",
    "<placeholder_tok_144>",
    "<placeholder_tok_145>",
    "<placeholder_tok_146>",
    "<placeholder_tok_147>",
    "<placeholder_tok_148>",
    "<placeholder_tok_149>",
    "<placeholder_tok_150>",
    "<placeholder_tok_151>",
    "<placeholder_tok_152>",
    "<placeholder_tok_153>",
    "<placeholder_tok_154>",
    "<placeholder_tok_155>",
    "<placeholder_tok_156>",
    "<placeholder_tok_157>",
    "<placeholder_tok_158>",
    "<placeholder_tok_159>",
    "<placeholder_tok_160>",
    "<placeholder_tok_161>",
    "<placeholder_tok_162>",
    "<placeholder_tok_163>",
    "<placeholder_tok_164>",
    "<placeholder_tok_165>",
    "<placeholder_tok_166>",
    "<placeholder_tok_167>",
    "<placeholder_tok_168>",
    "<placeholder_tok_169>",
    "<placeholder_tok_170>",
    "<placeholder_tok_171>",
    "<placeholder_tok_172>",
    "<placeholder_tok_173>",
    "<placeholder_tok_174>",
    "<placeholder_tok_175>",
    "<placeholder_tok_176>",
    "<placeholder_tok_177>",
    "<placeholder_tok_178>",
    "<placeholder_tok_179>",
    "<placeholder_tok_180>",
    "<placeholder_tok_181>",
    "<placeholder_tok_182>",
    "<placeholder_tok_183>",
    "<placeholder_tok_184>",
    "<placeholder_tok_185>",
    "<placeholder_tok_186>",
    "<placeholder_tok_187>",
    "<placeholder_tok_188>",
    "<placeholder_tok_189>",
    "<placeholder_tok_190>",
    "<placeholder_tok_191>",
    "<placeholder_tok_192>",
    "<placeholder_tok_193>",
    "<placeholder_tok_194>",
    "<placeholder_tok_195>",
    "<placeholder_tok_196>",
    "<placeholder_tok_197>",
    "<placeholder_tok_198>",
    "<placeholder_tok_199>",
    "<placeholder_tok_200>",
    "<placeholder_tok_201>",
    "<placeholder_tok_202>",
    "<placeholder_tok_203>",
    "<placeholder_tok_204>",
    "<placeholder_tok_205>",
    "<placeholder_tok_206>",
    "<placeholder_tok_207>",
    "<placeholder_tok_208>",
    "<placeholder_tok_209>",
    "<placeholder_tok_210>",
    "<placeholder_tok_211>",
    "<placeholder_tok_212>",
    "<placeholder_tok_213>",
    "<placeholder_tok_214>",
    "<placeholder_tok_215>",
    "<placeholder_tok_216>",
    "<placeholder_tok_217>",
    "<placeholder_tok_218>",
    "<placeholder_tok_219>",
    "<placeholder_tok_220>",
    "<placeholder_tok_221>",
    "<placeholder_tok_222>",
    "<placeholder_tok_223>",
    "<placeholder_tok_224>",
    "<placeholder_tok_225>",
    "<placeholder_tok_226>",
    "<placeholder_tok_227>",
    "<placeholder_tok_228>",
    "<placeholder_tok_229>",
    "<placeholder_tok_230>",
    "<placeholder_tok_231>",
    "<placeholder_tok_232>",
    "<placeholder_tok_233>",
    "<placeholder_tok_234>",
    "<placeholder_tok_235>",
    "<placeholder_tok_236>",
    "<placeholder_tok_237>",
    "<placeholder_tok_238>",
    "<placeholder_tok_239>",
    "<placeholder_tok_240>",
    "<placeholder_tok_241>",
    "<placeholder_tok_242>",
    "<placeholder_tok_243>",
    "<placeholder_tok_244>",
    "<placeholder_tok_245>",
    "<placeholder_tok_246>",
    "<placeholder_tok_247>",
    "<placeholder_tok_248>",
    "<placeholder_tok_249>",
    "<placeholder_tok_250>",
    "<placeholder_tok_251>",
    "<placeholder_tok_252>",
    "<placeholder_tok_253>",
    "<placeholder_tok_254>",
    "<placeholder_tok_255>"
  ],
  "datasets_dir": "/home/fhgiais/gptx_ablations/bias_analysis/data/tokenizer/temp/",
  "save_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24",
  "text_key": "text",
  "cache_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24/cache",
  "library": "sentencepiece",
  "auto_map": {
    "AutoTokenizer": [
      "gptx_tokenizer.SPTokenizer",
      null
    ]
  },
  "tokenizer_class": "SPTokenizer"
}