commit 40bc8331f472a1427e804b00cc63a6465d07e365
Author: ModelHub XC
Date:   Fri Apr 17 05:58:14 2026 +0800

    Initialize project; model provided by the ModelHub XC community

    Model: ddidacus/smolgen-pubchem-46M-base
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..45834b5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,86 @@
+---
+library_name: transformers
+tags:
+  - chemistry
+  - drug-discovery
+  - molecule-generation
+  - smiles
+---
+
+# smolgen-pubchem-46M-base
+
+A 46M-parameter causal language model for de novo molecule generation, trained on SMILES strings from PubChem.
+
+## Training Data
+
+The model was pretrained on ~40 million molecules sourced from PubChem and filtered by:
+- **Heavy atom count**: only molecules of drug-like size retained
+- **Structural alerts**: compounds flagged by common medicinal chemistry filters removed
+- **Salt removal**: only the largest fragment of each compound kept
+
+## Model Architecture
+
+Decoder-only Transformer (LlamaForCausalLM) with grouped-query attention (GQA):
+
+| Parameter | Value |
+|---|---|
+| Hidden size | 576 |
+| Intermediate size | 1536 |
+| Layers | 13 |
+| Attention heads | 9 (3 KV heads) |
+| Max sequence length | 8192 |
+| Vocabulary size | 36 |
+
+These settings give roughly 46M parameters; at bfloat16 precision this is consistent with the ~92 MB `model.safetensors` shard.
+
+## Tokenizer
+
+This model uses the **REINVENT4 tokenizer**, a chemistry-aware tokenizer that splits SMILES strings based on a hand-crafted regex covering atoms, bonds, ring closures, branches, and bracket atoms. The vocabulary has 36 tokens.
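+
+To illustrate the splitting behavior, here is a minimal sketch using Python's `re` module and the pattern stored in this repository's `tokenizer.json` (for illustration only; the `split_smiles` helper below is not part of the repository, and in practice the tokenizer loaded in the Usage section applies this pattern for you):
+
+```python
+import re
+
+# Chemistry-aware splitting pattern, copied verbatim from tokenizer.json.
+SMILES_PATTERN = re.compile(
+    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p"
+    r"|\(|\)|\.|=|#|\+|\/|:|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
+)
+
+def split_smiles(smiles: str) -> list[str]:
+    """Split a SMILES string into tokens such as 'Cl', '[nH]', or '%10'."""
+    # re.split with a capturing group keeps the matches; drop the empty gaps.
+    return [t for t in SMILES_PATTERN.split(smiles) if t]
+
+print(split_smiles("CC(=O)Oc1ccccc1C(=O)O"))  # aspirin
+# ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1',
+#  'C', '(', '=', 'O', ')', 'O']
+```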
+
+## Usage
+
+Prompt the model with just the `[BOS]` token to generate novel SMILES from scratch. The tokenizer defines no post-processor, so it never adds special tokens on its own, and an empty prompt would encode to zero tokens:
+
+```python
+from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast
+
+model = AutoModelForCausalLM.from_pretrained("ddidacus/smolgen-pubchem-46M-base")
+tokenizer = PreTrainedTokenizerFast.from_pretrained("ddidacus/smolgen-pubchem-46M-base")
+
+# No post-processor is configured, so [BOS] must be supplied explicitly.
+inputs = tokenizer(tokenizer.bos_token, return_tensors="pt")
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=128,
+    do_sample=True,
+    temperature=1.0,
+    num_return_sequences=10,
+    eos_token_id=tokenizer.eos_token_id,
+    pad_token_id=tokenizer.pad_token_id,
+)
+
+smiles_list = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+print(smiles_list)
+```
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..a3159e1
--- /dev/null
+++ b/config.json
@@ -0,0 +1,32 @@
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 35,
+  "dtype": "bfloat16",
+  "eos_token_id": 34,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "is_llama_config": true,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 13,
+  "num_key_value_heads": 3,
+  "pad_token_id": 33,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "vocab_size": 36
+}
\ No newline at end of file
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..afabd72
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 35,
+  "eos_token_id": [
+    34
+  ],
+  "pad_token_id": 33,
+  "transformers_version": "4.57.1"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..4dd3347
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c4a2d60ea7fe19b2b551584af90293c7238a9ce06b181eefb58def0ea956ff3
+size 92098208
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..a88550a
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "[BOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[EOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..fffd654
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,88 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 33,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 34,
+      "content": "[EOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 35,
+      "content": "[BOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Split",
+    "pattern": {
+      "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|\\+|\\/|:|@|\\?|>|\\*|\\$|%[0-9]{2}|[0-9])"
+    },
+    "behavior": "Isolated",
+    "invert": false
+  },
+  "post_processor": null,
+  "decoder": {
+    "type": "Fuse"
+  },
+  "model": {
+    "type": "WordPiece",
+    "unk_token": "[UNK]",
+    "continuing_subword_prefix": "##",
+    "max_input_chars_per_word": 100,
+    "vocab": {
+      "#": 0,
+      "=": 1,
+      "-": 2,
+      "(": 3,
+      ")": 4,
+      "1": 5,
+      "2": 6,
+      "3": 7,
+      "4": 8,
+      "5": 9,
+      "6": 10,
+      "7": 11,
+      "8": 12,
+      "9": 13,
+      "%10": 14,
+      "Br": 15,
+      "C": 16,
+      "Cl": 17,
+      "F": 18,
+      "N": 19,
+      "O": 20,
+      "S": 21,
+      "[N+]": 22,
+      "[N-]": 23,
+      "[O-]": 24,
+      "[S+]": 25,
+      "[n+]": 26,
+      "[nH]": 27,
+      "c": 28,
+      "n": 29,
+      "o": 30,
+      "s": 31,
+      "[UNK]": 32
+    }
+  }
+}
\ No newline at end of file
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..a65bb3e
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,36 @@
+{
+  "added_tokens_decoder": {
+    "33": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "34": {
+      "content": "[EOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "35": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[BOS]",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "[EOS]",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "[UNK]"
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000..703b111
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b8ce13bf8737609791db6f67cc04739574ce8c497c18d379481ca5ebb4d6db3
+size 6289