smolgen-pubchem-46M-base/tokenizer.json

{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 33,
      "content": "[PAD]",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 34,
      "content": "[EOS]",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 35,
      "content": "[BOS]",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": {
    "type": "Split",
    "pattern": {
      "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|\\+|\\/|:|@|\\?|>|\\*|\\$|%[0-9]{2}|[0-9])"
    },
    "behavior": "Isolated",
    "invert": false
  },
  "post_processor": null,
  "decoder": {
    "type": "Fuse"
  },
  "model": {
    "type": "WordPiece",
    "unk_token": "[UNK]",
    "continuing_subword_prefix": "##",
    "max_input_chars_per_word": 100,
    "vocab": {
      "#": 0,
      "=": 1,
      "-": 2,
      "(": 3,
      ")": 4,
      "1": 5,
      "2": 6,
      "3": 7,
      "4": 8,
      "5": 9,
      "6": 10,
      "7": 11,
      "8": 12,
      "9": 13,
      "%10": 14,
      "Br": 15,
      "C": 16,
      "Cl": 17,
      "F": 18,
      "N": 19,
      "O": 20,
      "S": 21,
      "[N+]": 22,
      "[N-]": 23,
      "[O-]": 24,
      "[S+]": 25,
      "[n+]": 26,
      "[nH]": 27,
      "c": 28,
      "n": 29,
      "o": 30,
      "s": 31,
      "[UNK]": 32
    }
  }
}
初始化项目，由ModelHub XC社区提供模型
Model: ddidacus/smolgen-pubchem-46M-base
Source: Original Platform
2026-04-17 05:58:14 +08:00
			`{`
			`"version": "1.0",`
			`"truncation": null,`
			`"padding": null,`
			`"added_tokens": [`
			`{`
			`"id": 33,`
			`"content": "[PAD]",`
			`"single_word": false,`
			`"lstrip": false,`
			`"rstrip": false,`
			`"normalized": false,`
			`"special": true`
			`},`
			`{`
			`"id": 34,`
			`"content": "[EOS]",`
			`"single_word": false,`
			`"lstrip": false,`
			`"rstrip": false,`
			`"normalized": false,`
			`"special": true`
			`},`
			`{`
			`"id": 35,`
			`"content": "[BOS]",`
			`"single_word": false,`
			`"lstrip": false,`
			`"rstrip": false,`
			`"normalized": false,`
			`"special": true`
			`}`
			`],`
			`"normalizer": null,`
			`"pre_tokenizer": {`
			`"type": "Split",`
			`"pattern": {`
			`"Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|\\+|\\/|:|@|\\?|>|\\*|\\$|%[0-9]{2}|[0-9])"`
			`},`
			`"behavior": "Isolated",`
			`"invert": false`
			`},`
			`"post_processor": null,`
			`"decoder": {`
			`"type": "Fuse"`
			`},`
			`"model": {`
			`"type": "WordPiece",`
			`"unk_token": "[UNK]",`
			`"continuing_subword_prefix": "##",`
			`"max_input_chars_per_word": 100,`
			`"vocab": {`
			`"#": 0,`
			`"=": 1,`
			`"-": 2,`
			`"(": 3,`
			`")": 4,`
			`"1": 5,`
			`"2": 6,`
			`"3": 7,`
			`"4": 8,`
			`"5": 9,`
			`"6": 10,`
			`"7": 11,`
			`"8": 12,`
			`"9": 13,`
			`"%10": 14,`
			`"Br": 15,`
			`"C": 16,`
			`"Cl": 17,`
			`"F": 18,`
			`"N": 19,`
			`"O": 20,`
			`"S": 21,`
			`"[N+]": 22,`
			`"[N-]": 23,`
			`"[O-]": 24,`
			`"[S+]": 25,`
			`"[n+]": 26,`
			`"[nH]": 27,`
			`"c": 28,`
			`"n": 29,`
			`"o": 30,`
			`"s": 31,`
			`"[UNK]": 32`
			`}`
			`}`
			`}`