commit 642077d52ef081aa2eb4d254b49e11d72e5cd611 Author: ModelHub XC Date: Tue Jun 2 20:06:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: kojima-lab/molcrawl-protein-sequence-gpt2-small Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..2094d6f --- /dev/null 
+++ b/README.md @@ -0,0 +1,67 @@ +--- +license: apache-2.0 +tags: +- pytorch +- gpt2 +- protein +pipeline_tag: text-generation +--- + +# molcrawl-protein-sequence-gpt2-small + +## Model Description + +GPT-2 small (124M parameters) foundation model pre-trained on protein amino acid sequences from the MolCrawl dataset. + +- **Model Type**: gpt2 +- **Data Type**: Protein +- **Training Date**: 2026-04-24 + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +model = AutoModelForCausalLM.from_pretrained("kojima-lab/molcrawl-protein-sequence-gpt2-small") +tokenizer = AutoTokenizer.from_pretrained("kojima-lab/molcrawl-protein-sequence-gpt2-small") + +# Generate protein sequence +prompt = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGT" +inputs = tokenizer(prompt, return_tensors="pt") +with torch.no_grad(): + output_ids = model.generate( + **inputs, + max_new_tokens=50, + do_sample=True, + temperature=0.8, + eos_token_id=None, # disable early stop (note: config.json in this repo sets eos_token_id=2 and bos_token_id=0) + pad_token_id=0, + ) +print(tokenizer.decode(output_ids[0], skip_special_tokens=True)) + +``` + +## Source Code + +Training pipeline, configuration files, and data preparation scripts are +available in the MolCrawl GitHub repository: +[https://github.com/mmai-framework-lab/MolCrawl](https://github.com/mmai-framework-lab/MolCrawl) + +## License + +This model is released under the Apache-2.0 license. 
+ +## Citation + +If you use this model, please cite: + +```bibtex +@misc{molcrawl_protein_sequence_gpt2_small, + title={molcrawl-protein-sequence-gpt2-small}, + author={{RIKEN}}, + year={2026}, + publisher={{Hugging Face}}, + url={{https://huggingface.co/kojima-lab/molcrawl-protein-sequence-gpt2-small}} +} +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..94095b8 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "GPT2LMHeadModel" + ], + "model_type": "gpt2", + "vocab_size": 33, + "n_positions": 1024, + "n_ctx": 1024, + "n_embd": 768, + "n_layer": 12, + "n_head": 12, + "n_inner": 3072, + "activation_function": "gelu_new", + "resid_pdrop": 0.1, + "embd_pdrop": 0.1, + "attn_pdrop": 0.1, + "layer_norm_epsilon": 1e-05, + "initializer_range": 0.02, + "use_cache": true, + "bos_token_id": 0, + "eos_token_id": 2, + "transformers_version": "4.0.0", + "_name_or_path": "riken-gpt2", + "_riken_model_args": { + "n_layer": 12, + "n_head": 12, + "n_embd": 768, + "block_size": 1024, + "bias": false, + "vocab_size": 33, + "dropout": 0.1 + }, + "_riken_bias": false, + "pad_token_id": 1 +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..c968c91 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a8d4a89ed2220f3b0b668a6d0afad271b132343954fcfdeb80850d979200f0 +size 343486080 diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..31e1c7e --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a713335969d98276789fc511695deca307f4fb26387e861bb3dca68a896ec3f5 +size 343087848 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..c907ee1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "additional_special_tokens": [ + "|" + ], + "cls_token": "<cls>", + "eos_token": "<eos>", 
+ "mask_token": "<mask>", + "pad_token": "<pad>", + "unk_token": "<unk>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..81c797f --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,167 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "<cls>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "<pad>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "<eos>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "<unk>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 31, + "content": "|", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 32, + "content": "<mask>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": null, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<cls>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<eos>", + "type_id": 0 + } + } + ], + "pair": [ + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<cls>": { + "id": "<cls>", + "ids": [ + 0 + ], + "tokens": [ + "<cls>" + ] + }, + "<eos>": { + "id": "<eos>", + "ids": [ + 2 + ], + "tokens": [ + "<eos>" + ] + } + } + }, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "<unk>", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "<cls>": 0, + "<pad>": 1, + "<eos>": 2, + "<unk>": 3, 
+ "L": 4, + "A": 5, + "G": 6, + "V": 7, + "S": 8, + "E": 9, + "R": 10, + "T": 11, + "I": 12, + "D": 13, + "P": 14, + "K": 15, + "Q": 16, + "N": 17, + "F": 18, + "Y": 19, + "M": 20, + "H": 21, + "W": 22, + "C": 23, + "X": 24, + "B": 25, + "U": 26, + "Z": 27, + "O": 28, + ".": 29, + "-": 30, + "|": 31, + "<mask>": 32 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..78fa4e7 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<cls>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "<eos>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "31": { + "content": "|", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32": { + "content": "<mask>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "|" + ], + "bos_token": "<cls>", + "clean_up_tokenization_spaces": false, + "cls_token": "<cls>", + "eos_token": "<eos>", + "mask_token": "<mask>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": "<unk>" +} diff --git a/training_args.json b/training_args.json new file mode 100644 index 0000000..24e4f7f --- /dev/null +++ b/training_args.json @@ -0,0 +1,17 @@ +{ + "iteration": 5000, + "best_val_loss": 2.814284086227417, + "early_stopping_counter": 4, + "learning_rate": 0.0006, + "batch_size": 12, + "block_size": 1024, 
"model_args": { + "n_layer": 12, + "n_head": 12, + "n_embd": 768, + "block_size": 1024, + "bias": false, + "vocab_size": 33, + "dropout": 0.1 + } +} \ No newline at end of file