commit fd87dccaf9b8156b288b6c77128b893af58f1a5d Author: ModelHub XC Date: Sat Apr 11 17:09:57 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: NorseDrunkenSailor/Qwen_smol_GH114 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..c80b68c --- /dev/null +++ 
b/README.md @@ -0,0 +1,83 @@ +--- +language: +- en +license: apache-2.0 +library_name: transformers +model_type: qwen2 +tags: +- biology +- protein-language-model +- saprot +- 3Di +- enzymeml +- reinforcement-learning +datasets: +- westlake-repl/AF2_UniRef50 +pipeline_tag: text-generation +--- + +# Qwen2 SaPROT-3Di CLM for GH114 + +## Model Description +This is a **Qwen2-style** protein language model trained on **SaPROT 3Di-aware** protein sequences. Unlike SaPROT, it is a causal language model (CLM) rather than a masked language model (MLM), so it is generative, which is useful for DPO and the Hugging Face TRL trainers. + +This model serves as a specialized base model designed for **GH114 reinforcement alignment**. It captures the structural and sequence properties of glycoside hydrolase family 114 (GH114) enzymes and their structural neighbors. + +This model was specifically developed for the **AMLD Intelligence Summit 2026 EnzymeML workshop**. + +## Training Details + +### Pre-training +The model was pre-trained on the [westlake-repl/AF2_UniRef50](https://huggingface.co/datasets/westlake-repl/AF2_UniRef50) dataset. This provides a robust foundation of protein structure-sequence understanding using the SaPROT 3Di alphabet. Training used a batch size of 896 and a sequence length of 512 for 10k steps (smol training run), about 4.58 billion tokens in total. Final train loss: 3.3809; validation loss: 3.4621. + +### Fine-tuning +Following pre-training, the model was fine-tuned on a curated dataset of **≈700,000 structural homologs**. These homologs were selected based on shared **InterPro domains** with the GH114 dataset (IPR004352, IPR017853, IPR013785, IPR000254), ensuring the model is highly sensitive to the structural motifs relevant to this specific enzyme family. Anything within 90% sequence identity of the 55 GH114 sequences was removed from the training set. Two validation sets were used concurrently to monitor in-distribution overfitting (i.i.d.) and out-of-distribution generalization on the homologs of interest. +4k Steps. 
896 batch size, 512 max len. Train loss: 1.7648; validation loss: 1.8568. + +## Intended Use +* **Primary Use:** As a base model for Reinforcement Learning (RL) alignment tasks targeting the FLOPP GH114 enzymes. log p(x). +* **Context:** AMLD Intelligence Summit 2026 (EnzymeML Workshop). +* **Input:** 3Di-encoded protein sequences (structure-aware tokens). + +## How to Use +You can load this model using the Hugging Face `transformers` library. + +*Note: Ensure your input sequences are converted to the 3Di format (Foldseek alphabet) before passing them to the model.* + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +# Load model and tokenizer +model_name = "NorseDrunkenSailor/Qwen_smol_GH114" +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) + +# Example input (3Di sequence) +sequence = "M#L#HdSdLdLdAdAdSdFdAd" +inputs = tokenizer(sequence, return_tensors="pt") + +# Generate continuation or embeddings +outputs = model.generate(**inputs, max_new_tokens=200) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +## Acknowledgements & Citations + +This model relies on the 3Di alphabet from Foldseek and the SaProt idea of using these concatenated 3Di-sequence tokens in a PLM. 
+ +```bibtex +@article{su2023saprot, + title={SaProt: Protein Language Modeling with Structure-aware Vocabulary}, + author={Su, Jin and Han, Chenchen and Zhou, Yuyang and Shan, Junjie and Zhou, Xibin and Yuan, Fajie}, + journal={bioRxiv}, + year={2023}, + publisher={Cold Spring Harbor Laboratory} +} + +@article{van2023foldseek, + title={Foldseek: fast and accurate protein structure search}, + author={van Kempen, Michel and others}, + journal={Nature Biotechnology}, + year={2024} +} +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..3477ba0 --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "dtype": "bfloat16", + "hidden_act": "silu", + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 2048, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 8, + "num_hidden_layers": 8, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "4.57.6", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 512 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..9f0f075 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,4 @@ +{ + "_from_model_config": true, + "transformers_version": "4.57.6" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..5400871 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5113101feb4464df3e20edcbf6f50d407f2580deddc10355d730dce472849bc2 +size 61382024 diff --git a/special_tokens_map.json 
b/special_tokens_map.json new file mode 100644 index 0000000..ef5f0f7 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,37 @@ +{ + "cls_token": { + "content": "<cls>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<eos>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "<mask>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..1db6a08 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,53 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<cls>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "<eos>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "<mask>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "clean_up_tokenization_spaces": false, + "cls_token": "<cls>", + "eos_token": "<eos>", + "extra_special_tokens": {}, + "mask_token": "<mask>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "tokenizer_class": "EsmTokenizer", + "unk_token": "<unk>" +} diff --git a/vocab.txt b/vocab.txt new file mode 100644 index 0000000..51872fe --- /dev/null +++ b/vocab.txt @@ -0,0 
+1,446 @@ + + + + + +Ap +Ay +An +Aw +Ar +Aq +Ah +Ag +Ad +Al +Av +At +Am +Af +As +Aa +Ae +Ai +Ak +Ac +A# +Cp +Cy +Cn +Cw +Cr +Cq +Ch +Cg +Cd +Cl +Cv +Ct +Cm +Cf +Cs +Ca +Ce +Ci +Ck +Cc +C# +Dp +Dy +Dn +Dw +Dr +Dq +Dh +Dg +Dd +Dl +Dv +Dt +Dm +Df +Ds +Da +De +Di +Dk +Dc +D# +Ep +Ey +En +Ew +Er +Eq +Eh +Eg +Ed +El +Ev +Et +Em +Ef +Es +Ea +Ee +Ei +Ek +Ec +E# +Fp +Fy +Fn +Fw +Fr +Fq +Fh +Fg +Fd +Fl +Fv +Ft +Fm +Ff +Fs +Fa +Fe +Fi +Fk +Fc +F# +Gp +Gy +Gn +Gw +Gr +Gq +Gh +Gg +Gd +Gl +Gv +Gt +Gm +Gf +Gs +Ga +Ge +Gi +Gk +Gc +G# +Hp +Hy +Hn +Hw +Hr +Hq +Hh +Hg +Hd +Hl +Hv +Ht +Hm +Hf +Hs +Ha +He +Hi +Hk +Hc +H# +Ip +Iy +In +Iw +Ir +Iq +Ih +Ig +Id +Il +Iv +It +Im +If +Is +Ia +Ie +Ii +Ik +Ic +I# +Kp +Ky +Kn +Kw +Kr +Kq +Kh +Kg +Kd +Kl +Kv +Kt +Km +Kf +Ks +Ka +Ke +Ki +Kk +Kc +K# +Lp +Ly +Ln +Lw +Lr +Lq +Lh +Lg +Ld +Ll +Lv +Lt +Lm +Lf +Ls +La +Le +Li +Lk +Lc +L# +Mp +My +Mn +Mw +Mr +Mq +Mh +Mg +Md +Ml +Mv +Mt +Mm +Mf +Ms +Ma +Me +Mi +Mk +Mc +M# +Np +Ny +Nn +Nw +Nr +Nq +Nh +Ng +Nd +Nl +Nv +Nt +Nm +Nf +Ns +Na +Ne +Ni +Nk +Nc +N# +Pp +Py +Pn +Pw +Pr +Pq +Ph +Pg +Pd +Pl +Pv +Pt +Pm +Pf +Ps +Pa +Pe +Pi +Pk +Pc +P# +Qp +Qy +Qn +Qw +Qr +Qq +Qh +Qg +Qd +Ql +Qv +Qt +Qm +Qf +Qs +Qa +Qe +Qi +Qk +Qc +Q# +Rp +Ry +Rn +Rw +Rr +Rq +Rh +Rg +Rd +Rl +Rv +Rt +Rm +Rf +Rs +Ra +Re +Ri +Rk +Rc +R# +Sp +Sy +Sn +Sw +Sr +Sq +Sh +Sg +Sd +Sl +Sv +St +Sm +Sf +Ss +Sa +Se +Si +Sk +Sc +S# +Tp +Ty +Tn +Tw +Tr +Tq +Th +Tg +Td +Tl +Tv +Tt +Tm +Tf +Ts +Ta +Te +Ti +Tk +Tc +T# +Vp +Vy +Vn +Vw +Vr +Vq +Vh +Vg +Vd +Vl +Vv +Vt +Vm +Vf +Vs +Va +Ve +Vi +Vk +Vc +V# +Wp +Wy +Wn +Ww +Wr +Wq +Wh +Wg +Wd +Wl +Wv +Wt +Wm +Wf +Ws +Wa +We +Wi +Wk +Wc +W# +Yp +Yy +Yn +Yw +Yr +Yq +Yh +Yg +Yd +Yl +Yv +Yt +Ym +Yf +Ys +Ya +Ye +Yi +Yk +Yc +Y# +#p +#y +#n +#w +#r +#q +#h +#g +#d +#l +#v +#t +#m +#f +#s +#a +#e +#i +#k +#c +## \ No newline at end of file