From 960c23a316816fb31dcabe11e8b81f9af6c4a836 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 3 Jun 2026 04:21:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: kojima-lab/molcrawl-protein-sequence-proteingym-gpt2-small Source: Original Platform --- .gitattributes | 35 +++++++++ README.md | 71 +++++++++++++++++ config.json | 44 +++++++++++ generation_config.json | 6 ++ model.safetensors | 3 + pytorch_model.bin | 3 + special_tokens_map.json | 10 +++ tokenizer.json | 167 ++++++++++++++++++++++++++++++++++++++++ tokenizer_config.json | 64 +++++++++++++++ training_args.json | 17 ++++ 10 files changed, 420 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 pytorch_model.bin create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 training_args.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text 
+*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..1fda0aa --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +--- +license: apache-2.0 +tags: +- pytorch +- gpt2 +- protein +pipeline_tag: text-generation +--- + +# molcrawl-protein-sequence-proteingym-gpt2-small + +## Model Description + +GPT-2 small (124M parameters) fine-tuned on [ProteinGym](https://proteingym.org/) protein sequence data, starting from the `molcrawl-protein-sequence-gpt2-small` pre-trained model. 
+ +## Datasets + +- **ProteinGym**: [https://proteingym.org/](https://proteingym.org/) (Fine-tuning dataset) + +- **Model Type**: gpt2 +- **Data Type**: Protein +- **Training Date**: 2026-04-24 + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +model = AutoModelForCausalLM.from_pretrained("kojima-lab/molcrawl-protein-sequence-proteingym-gpt2-small") +tokenizer = AutoTokenizer.from_pretrained("kojima-lab/molcrawl-protein-sequence-proteingym-gpt2-small") + +# Generate protein sequence +prompt = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGT" +inputs = tokenizer(prompt, return_tensors="pt") +with torch.no_grad(): + output_ids = model.generate( + **inputs, + max_new_tokens=50, + do_sample=True, + temperature=0.8, + eos_token_id=None, # generation_config.json has legacy eos_token_id=0 (config.json uses 2); disable early stop + pad_token_id=0, + ) +print(tokenizer.decode(output_ids[0], skip_special_tokens=True)) + +``` + +## Source Code + +Training pipeline, configuration files, and data preparation scripts are +available in the MolCrawl GitHub repository: +[https://github.com/mmai-framework-lab/MolCrawl](https://github.com/mmai-framework-lab/MolCrawl) + +## License + +This model is released under the Apache-2.0 license. 
+ +## Citation + +If you use this model, please cite: + +```bibtex +@misc{molcrawl_protein_sequence_proteingym_gpt2_small, + title={molcrawl-protein-sequence-proteingym-gpt2-small}, + author={{RIKEN}}, + year={2026}, + publisher={{Hugging Face}}, + url={{https://huggingface.co/kojima-lab/molcrawl-protein-sequence-proteingym-gpt2-small}} +} +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..64aa587 --- /dev/null +++ b/config.json @@ -0,0 +1,44 @@ +{ + "_name_or_path": "learning_source_20260316/protein_sequence_proteingym/gpt2-output/protein_sequence_proteingym-small/checkpoint-2000", + "_riken_bias": false, + "_riken_model_args": { + "bias": false, + "block_size": 1024, + "dropout": 0.1, + "n_embd": 768, + "n_head": 12, + "n_layer": 12, + "vocab_size": 33 + }, + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 0, + "embd_pdrop": 0.1, + "eos_token_id": 2, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_inner": 3072, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "torch_dtype": "float32", + "transformers_version": "4.45.1", + "use_cache": true, + "vocab_size": 33, + "pad_token_id": 1 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1a5ca11 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "transformers_version": "4.45.1" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..48a3fe7 --- /dev/null +++ 
b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef7028b78fb234175161a2a51ebdbaff856dd5201fd90aef7acdf50b8a6b54e +size 343486080 diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..2ee8ed8 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b4b51d0de548fb596b28168ba8bfc544551a24f35196d02914a2d58bee6690 +size 343087848 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..c907ee1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "additional_special_tokens": [ + "|" + ], + "cls_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..81c797f --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,167 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 31, + "content": "|", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 32, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": null, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + 
"type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], + "pair": [ + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 0 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "L": 4, + "A": 5, + "G": 6, + "V": 7, + "S": 8, + "E": 9, + "R": 10, + "T": 11, + "I": 12, + "D": 13, + "P": 14, + "K": 15, + "Q": 16, + "N": 17, + "F": 18, + "Y": 19, + "M": 20, + "H": 21, + "W": 22, + "C": 23, + "X": 24, + "B": 25, + "U": 26, + "Z": 27, + "O": 28, + ".": 29, + "-": 30, + "|": 31, + "": 32 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..78fa4e7 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "31": { + "content": "|", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32": { + "content": "", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "|" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": "" +} diff --git a/training_args.json b/training_args.json new file mode 100644 index 0000000..bddb75c --- /dev/null +++ b/training_args.json @@ -0,0 +1,17 @@ +{ + "iteration": 2000, + "best_val_loss": 0.06397182494401932, + "early_stopping_counter": 0, + "learning_rate": 1e-05, + "batch_size": 12, + "block_size": 1024, + "model_args": { + "n_layer": 12, + "n_head": 12, + "n_embd": 768, + "block_size": 1024, + "bias": false, + "vocab_size": 33, + "dropout": 0.1 + } +} \ No newline at end of file