commit f475fec050612be874711821d4a9fcf4db58fb8a Author: ModelHub XC Date: Mon May 18 12:49:32 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: GenerTeam/GENERator-v2-eukaryote-3b-base Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..e2e175a --- /dev/null +++ 
b/README.md @@ -0,0 +1,183 @@ +--- +library_name: transformers +license: mit +pipeline_tag: text-generation +tags: +- biology +- genomics +- long-context +--- + +# GENERator-v2-eukaryote-3b-base model + +## **Important Notice** +If you are using **GENERator** for sequence generation, please ensure that the length of each input sequence is a multiple of **6**. This can be achieved by either: +1. Padding the sequence on the left with `'A'` (**left padding**); +2. Truncating the sequence from the left (**left truncation**). + +This requirement arises because **GENERator** employs a 6-mer tokenizer. If the input sequence length is not a multiple of **6**, the tokenizer will append an `'<oov>'` (out-of-vocabulary) token to the end of the token sequence. This can result in uninformative subsequent generations, such as repeated `'AAAAAA'`. + +We apologize for any inconvenience this may cause and recommend adhering to the above guidelines to ensure accurate and meaningful generation results. + + +## About +In this repository, we present GENERator-v2, a generative genomic foundation model with enhanced performance in the eukaryotic domain. More technical details are provided in the GENERator-v2 [technical report](https://www.biorxiv.org/content/10.64898/2026.01.27.702015v1). + +Python scripts for downstream analysis are available on GitHub: [https://github.com/GenerTeam/GENERator](https://github.com/GenerTeam/GENERator). + + +## How to use +### Simple example1: generation + +```python + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +# Load the tokenizer and model. +tokenizer = AutoTokenizer.from_pretrained("GenerTeam/GENERator-v2-eukaryote-3b-base", trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained("GenerTeam/GENERator-v2-eukaryote-3b-base") +config = model.config + +max_length = config.max_position_embeddings + +# Define input sequences. 
+sequences = [ + "ATGAGGTGGCAAGAAATGGGCTAC", + "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT" +] + +def left_padding(sequence, padding_char='A', multiple=6): + remainder = len(sequence) % multiple + if remainder != 0: + padding_length = multiple - remainder + return padding_char * padding_length + sequence + return sequence + +def left_truncation(sequence, multiple=6): + remainder = len(sequence) % multiple + if remainder != 0: + return sequence[remainder:] + return sequence + +# Apply left_padding to all sequences +# padded_sequences = [left_padding(seq) for seq in sequences] + +# Apply left_truncation to all sequences +truncated_sequences = [left_truncation(seq) for seq in sequences] + +# Process the sequences +sequences = [tokenizer.bos_token + sequence for sequence in truncated_sequences] + +# Tokenize the sequences +tokenizer.padding_side = "left" +inputs = tokenizer( + sequences, + add_special_tokens=False, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length +) + +# Generate the sequences +with torch.inference_mode(): + outputs = model.generate(**inputs, max_new_tokens=32, temperature=0.00001, top_k=1) + +# Decode the generated sequences +decoded_sequences = tokenizer.batch_decode(outputs, skip_special_tokens=True) + +# Print the decoded sequences +print(decoded_sequences) + +# It is expected to observe non-sense decoded sequences (e.g., 'AAAAAA') +# The input sequences are too short to provide sufficient context. 
+``` + +### Simple example2: embedding + +```python + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +# Load the tokenizer and model +tokenizer = AutoTokenizer.from_pretrained("GENERator-v2-eukaryote-3b-base", trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained("GENERator-v2-eukaryote-3b-base") + +# Get model configuration +config = model.config +max_length = config.max_position_embeddings + +# Define input sequences +sequences = [ + "ATGAGGTGGCAAGAAATGGGCTAC", + "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT" +] + +# Truncate each sequence to the nearest multiple of 6 +processed_sequences = [tokenizer.bos_token + seq[:len(seq)//6*6] for seq in sequences] + +# Tokenization +tokenizer.padding_side = "right" +inputs = tokenizer( + processed_sequences, + add_special_tokens=True, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length +) + +# Model Inference +with torch.inference_mode(): + outputs = model(**inputs, output_hidden_states=True) + +hidden_states = outputs.hidden_states[-1] +attention_mask = inputs["attention_mask"] + +# Option 1: Last token (EOS) embedding +last_token_indices = attention_mask.sum(dim=1) - 1 +eos_embeddings = hidden_states[torch.arange(hidden_states.size(0)), last_token_indices, :] + +# Option 2: Mean pooling over all tokens +expanded_mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).to(torch.float32) +sum_embeddings = torch.sum(hidden_states * expanded_mask, dim=1) +mean_embeddings = sum_embeddings / expanded_mask.sum(dim=1) + +# Output +print("EOS (Last Token) Embeddings:", eos_embeddings) +print("Mean Pooling Embeddings:", mean_embeddings) + +# ============================================================================ +# Additional notes: +# - The preprocessing step ensures sequences are multiples of 6 for 6-mer tokenizer +# - For causal LM, the last token embedding (EOS) is commonly used +# - Mean pooling considers all tokens including BOS and content 
tokens +# - The choice depends on your downstream task requirements +# - Both methods handle variable sequence lengths via attention mask +# ============================================================================ + +``` + +## Citation +``` +@article {li2026generator2, + author = {Li, Qiuyi and Zhan, Zhihao and Feng, Shikun and Zhu, Yiheng and He, Yuan and Wu, Wei and Shi, Zhenghang and Wang, Shengjie and Hu, Zongyong and Yang, Zhao and Li, Jiaoyang and Tang, Jian and Liu, Haiguang and Qin, Tao}, + title = {GENERator-v2: Reconciling Coarse Tokenization with Single-Nucleotide Resolution in Genomic Language Modeling}, + elocation-id = {2026.01.27.702015}, + year = {2026}, + doi = {10.64898/2026.01.27.702015}, + publisher = {Cold Spring Harbor Laboratory}, + URL = {https://www.biorxiv.org/content/early/2026/05/04/2026.01.27.702015}, + journal = {bioRxiv} +} + +@article{wu2025generator, + title={GENERator: a long-context generative genomic foundation model}, + author={Wu, Wei and Li, Qiuyi and Li, Mingyang and Fu, Kun and Feng, Fuli and Ye, Jieping and Xiong, Hui and Wang, Zheng}, + journal={arXiv preprint arXiv:2502.07272}, + year={2025} +} + +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..b45935e --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "GENERatorForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8448, + "max_position_embeddings": 16384, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 30, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0", + "use_cache": true, + "vocab_size": 4128, + "auto_map": 
{ + "AutoModelForCausalLM": "modeling_generator.GENERatorForCausalLM" + } +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..619b676 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.44.0" +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..ec86a90 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ee2725691b536b4b783971e1f4edd7c85c7b55ee6274941054c64ca979b6ebc +size 4996117216 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..9b26f37 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf8118e434b5877d14a777d26d588f76d0d4fc0c18f5afa4f3eac4fad7f292b +size 4964291160 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..50cca5a --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654dee9a787a61aa762738556059d896841ee62d85abcdf9d99ed8db98536c4a +size 2032674024 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..7a22ed8 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,280 @@ +{ + "metadata": { + "total_size": 11993051136 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + 
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + 
"model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + 
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/modeling_generator.py b/modeling_generator.py new file mode 100644 index 0000000..8b0e511 --- /dev/null +++ b/modeling_generator.py @@ -0,0 +1,236 @@ +""" +GENERator with bp-level generation and scoring. 
class _BPLogitsProcessor(LogitsProcessor):
    """Collapse next-token scores onto a k-mer that is chosen base by base.

    For every row, the scores of the DNA k-mer tokens are soft-maxed,
    marginalised into one 4-way distribution per base position, one base is
    picked per position (argmax or sampling), and the k-mer spelled by those
    bases becomes the only token with a finite score.

    The processor is meant to run last in the logits-processor chain so that
    temperature / top-k / top-p etc. have already reshaped the scores.
    NOTE(review): whether sampling warpers really run before user-supplied
    processors depends on the installed transformers version -- confirm.
    """

    def __init__(self, kmer_ids, bp_base_index, flat_idx_to_token_id, bp_powers, k, do_sample):
        """Store the lookup tables used by ``__call__``.

        Args:
            kmer_ids: vocabulary ids of the DNA k-mer tokens (column index
                into ``scores``).
            bp_base_index: per position, the base index (0-3) of every k-mer;
                indexed as ``bp_base_index[pos]``.
            flat_idx_to_token_id: lookup from flat k-mer index to token id.
            bp_powers: positional weights that turn k base indices into the
                flat k-mer index.
            k: number of bases per token.
            do_sample: sample each base when True, take the argmax otherwise.
        """
        self.k = k
        self.do_sample = do_sample
        self.kmer_ids = kmer_ids
        self.bp_powers = bp_powers
        self.bp_base_index = bp_base_index
        self.flat_idx_to_token_id = flat_idx_to_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return scores that are ``-inf`` everywhere except the selected k-mer.

        ``input_ids`` is accepted for interface compatibility and is not read.
        """
        batch_size = scores.shape[0]

        # Probability mass over DNA k-mers only: [B, num_kmers].
        kmer_logits = scores[:, self.kmer_ids].float()
        kmer_probs = F.softmax(kmer_logits, dim=-1)

        # Per-base marginals [B, k, 4]: for each position, add up the
        # probability of all k-mers carrying a given base there.
        bp_probs = torch.zeros(batch_size, self.k, 4, device=scores.device, dtype=kmer_probs.dtype)
        for pos in range(self.k):
            base_of_kmer = self.bp_base_index[pos]  # [num_kmers], values 0..3
            bp_probs[:, pos] = torch.stack(
                [kmer_probs[:, base_of_kmer == nt].sum(dim=-1) for nt in range(4)],
                dim=-1,
            )

        # One base per position: [B, k].
        if not self.do_sample:
            base_indices = bp_probs.argmax(dim=-1)
        else:
            drawn = torch.multinomial(bp_probs.view(-1, 4), 1)
            base_indices = drawn.view(batch_size, self.k)

        # Bases -> flat k-mer index -> token id: [B].
        flat_idx = (base_indices * self.bp_powers).sum(dim=-1)
        selected = self.flat_idx_to_token_id[flat_idx]

        # One-hot in score space, so greedy decoding and sampling downstream
        # both end up on the bp-selected token.
        forced = torch.full_like(scores, float("-inf"))
        rows = torch.arange(batch_size, device=scores.device)
        forced[rows, selected] = 0.0
        return forced
class GENERatorForCausalLM(LlamaForCausalLM):
    """LlamaForCausalLM with base-pair (bp) level generation and scoring.

    The underlying model predicts k-mer tokens.  This subclass marginalises
    the k-mer distribution into k independent 4-way base distributions and
    uses them to

    * select every generated token base by base (``generate``, also exposed
      under the documented name ``generate_bp``);
    * score existing DNA at single-base resolution (``score_sequence``).

    Both features need the lookup tables built by ``setup_tokenizer``;
    ``from_pretrained`` tries to build them automatically.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Load the model and, on a best-effort basis, its tokenizer.

        The tokenizer is loaded from the same path/identifier as the model
        (first positional argument or ``pretrained_model_name_or_path``).
        Failure to load it is reported on stdout and does not abort loading;
        call ``setup_tokenizer`` manually in that case.
        """
        model = super().from_pretrained(*args, **kwargs)

        model_path = args[0] if args else kwargs.get("pretrained_model_name_or_path")
        if not model_path:
            return model

        try:
            from transformers import AutoTokenizer

            # NOTE: trust_remote_code=True runs the tokenizer code shipped
            # with the checkpoint.
            tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            model.setup_tokenizer(tokenizer)
        except Exception as e:  # deliberate best effort: the model stays usable
            print(f"Could not auto-load tokenizer: {e}")
            print(" Call model.setup_tokenizer(tokenizer) manually if needed")
        else:
            print("Tokenizer automatically loaded and configured for bp-level scoring")

        return model

    def setup_tokenizer(self, tokenizer):
        """Cache the tokenizer and precompute the bp-level lookup tables.

        Registers four non-persistent buffers on the model's device:

        * ``_kmer_ids`` [num_kmers]: token ids of all pure-ATCG k-mers,
          sorted by token id;
        * ``_bp_base_index`` [k, num_kmers]: base index (0-3) of k-mer ``j``
          at position ``pos``;
        * ``_bp_powers`` [k]: ``4 ** (k - 1 - pos)``;
        * ``_flat_idx_to_token_id`` [4 ** k]: flat k-mer index
          (``sum(base_idx[i] * 4 ** (k - 1 - i))``) -> token id.

        Args:
            tokenizer: object exposing ``k`` (k-mer size) and ``vocab``
                (token -> id mapping).
        """
        self.tokenizer = tokenizer
        k = tokenizer.k
        self.k = k

        device = next(self.parameters()).device

        # DNA k-mers of the tokenizer vocabulary, ordered by token id.
        kmer_items = sorted(
            (
                (kmer, tid)
                for kmer, tid in tokenizer.vocab.items()
                if len(kmer) == k and all(b in "ATCG" for b in kmer)
            ),
            key=lambda item: item[1],
        )
        kmers = [kmer for kmer, _ in kmer_items]
        kmer_ids = [tid for _, tid in kmer_items]

        self.register_buffer(
            "_kmer_ids",
            torch.tensor(kmer_ids, dtype=torch.long, device=device),
            persistent=False,
        )

        # Built as one nested list so the table is created in a single call
        # instead of k * num_kmers scalar tensor writes.
        bp_base_index = torch.tensor(
            [[BASE_TO_IDX[kmer[pos]] for kmer in kmers] for pos in range(k)],
            dtype=torch.long,
        ).reshape(k, len(kmers))
        self.register_buffer("_bp_base_index", bp_base_index.to(device), persistent=False)

        bp_powers = torch.tensor(
            [4 ** i for i in range(k - 1, -1, -1)], dtype=torch.long, device=device
        )
        self.register_buffer("_bp_powers", bp_powers, persistent=False)

        # Sized 4 ** k (not num_kmers) so every possible flat index is
        # addressable even if the vocabulary lacks some k-mers; missing
        # entries keep id 0.
        flat_to_tid = [0] * (4 ** k)
        for kmer, tid in kmer_items:
            flat_idx = 0
            for base in kmer:
                flat_idx = flat_idx * 4 + BASE_TO_IDX[base]
            flat_to_tid[flat_idx] = tid
        self.register_buffer(
            "_flat_idx_to_token_id",
            torch.tensor(flat_to_tid, dtype=torch.long, device=device),
            persistent=False,
        )

    def _require_bp_tables(self):
        """Raise if ``setup_tokenizer`` has not been called yet."""
        if not (hasattr(self, "tokenizer") and hasattr(self, "_bp_base_index")):
            # Explicit raise (same exception type as the former ``assert``)
            # so the check also fires under ``python -O``.
            raise AssertionError("Call setup_tokenizer(tokenizer) first")

    def compute_bp_probs(self, logits):
        """Compute per-base marginal probabilities from token logits.

        Args:
            logits: [B, V] or [B, L, V]
        Returns:
            bp_probs: [B, k, 4] or [B, L, k, 4]
        """
        squeeze = logits.dim() == 2
        if squeeze:
            logits = logits.unsqueeze(1)

        # Softmax over the DNA k-mer tokens only.
        kmer_probs = F.softmax(logits[:, :, self._kmer_ids].float(), dim=-1)
        batch, length, _ = kmer_probs.shape
        bp_probs = torch.zeros(
            batch, length, self.k, 4, device=logits.device, dtype=kmer_probs.dtype
        )
        for pos in range(self.k):
            base_of_kmer = self._bp_base_index[pos]
            for nt in range(4):
                # Mass of all k-mers that carry base ``nt`` at ``pos``.
                bp_probs[:, :, pos, nt] = kmer_probs[:, :, base_of_kmer == nt].sum(dim=-1)

        return bp_probs.squeeze(1) if squeeze else bp_probs

    def generate(self, inputs=None, generation_config=None, **kwargs):
        """Like the parent ``generate()``, but tokens are chosen base by base.

        A ``_BPLogitsProcessor`` is appended after any user-supplied
        ``logits_processor`` entries; it replaces the scores by a one-hot on
        the k-mer assembled from the per-base marginals.  ``do_sample`` is
        taken from ``kwargs`` if given, otherwise from the generation config.
        Output shape and type are identical to the parent ``generate()``.

        Raises:
            AssertionError: if ``setup_tokenizer`` has not been called.
        """
        self._require_bp_tables()

        gc = generation_config or self.generation_config
        do_sample = kwargs.get("do_sample", getattr(gc, "do_sample", False))

        bp_proc = _BPLogitsProcessor(
            kmer_ids=self._kmer_ids,
            bp_base_index=self._bp_base_index,
            flat_idx_to_token_id=self._flat_idx_to_token_id,
            bp_powers=self._bp_powers,
            k=self.k,
            do_sample=do_sample,
        )
        existing = list(kwargs.pop("logits_processor", None) or [])
        kwargs["logits_processor"] = LogitsProcessorList(existing + [bp_proc])

        return super().generate(inputs=inputs, generation_config=generation_config, **kwargs)

    def generate_bp(self, inputs=None, generation_config=None, **kwargs):
        """Alias of ``generate()`` under the name used in the documentation."""
        return self.generate(inputs=inputs, generation_config=generation_config, **kwargs)

    @torch.no_grad()
    def score_sequence(self, sequences: Union[str, list]):
        """Score DNA sequence(s) at base resolution.

        Each sequence is right-padded with 'A' to a multiple of k, prefixed
        with the tokenizer's BOS token and run through the model once.  The
        logits at position ``t`` predict token ``t + 1``, so the logits of
        BOS plus the first ``n - 1`` tokens give the distributions for all
        ``n`` tokens of the sequence.

        Args:
            sequences: single DNA string or list of DNA strings (ACGT only).

        Returns:
            ``(bp_probs, actual_probs)`` for a single sequence, or
            ``(list of bp_probs, list of actual_probs)`` for a list.
            ``bp_probs[i]``: [seq_len_i, 4], P(base | context) per position;
            ``actual_probs[i]``: [seq_len_i], P(actual base | context).

        Raises:
            AssertionError: if ``setup_tokenizer`` has not been called.
            ValueError: if the tokenizer defines no BOS token.
        """
        self._require_bp_tables()

        is_single = isinstance(sequences, str)
        if is_single:
            sequences = [sequences]
        if not sequences:
            return [], []

        k = self.k
        # Right-pad to a multiple of k; the padding bases are cut off again
        # below, and being on the right they cannot influence earlier bases.
        padded = [s + "A" * (-len(s) % k) for s in sequences]

        # Prepend BOS manually (training format).  Taken from the tokenizer
        # rather than hard-coded so the prefix always is a real vocab token.
        bos = self.tokenizer.bos_token
        if not bos:
            raise ValueError("score_sequence requires a tokenizer with a bos_token")
        tagged = [bos + s for s in padded]

        inputs = self.tokenizer(
            tagged, return_tensors="pt", padding=True, add_special_tokens=False
        )
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)

        logits = self(input_ids, attention_mask=attention_mask, return_dict=True).logits
        bp_probs_all = self.compute_bp_probs(logits)  # [B, L, k, 4]

        # Index of the first attended token (BOS) in every row: 0 for
        # right-padded batches, the pad count for left-padded ones.
        first_real = attention_mask.to(torch.long).argmax(dim=1).tolist()

        bp_results, actual_results = [], []
        for i, (seq, pad_seq) in enumerate(zip(sequences, padded)):
            num_tokens = len(pad_seq) // k
            start = first_real[i]
            seq_bp = bp_probs_all[i, start:start + num_tokens]  # [num_tokens, k, 4]
            seq_bp = seq_bp.reshape(-1, 4)[: len(seq)]  # [orig_len, 4]
            bp_results.append(seq_bp)
            actual_results.append(self._extract_actual_probs(seq_bp, seq))

        if is_single:
            return bp_results[0], actual_results[0]
        return bp_results, actual_results

    def _extract_actual_probs(self, bp_probs: torch.Tensor, sequence: str) -> torch.Tensor:
        """Pick, per position, the probability assigned to the observed base.

        For 'N' the largest of the four base probabilities is used.

        Args:
            bp_probs: [len(sequence), 4] per-base distributions.
            sequence: bases drawn from the keys of ``BASE_TO_IDX``.

        Returns:
            Tensor of shape [len(sequence)] on ``bp_probs``' device and dtype.
        """
        if not sequence:
            return torch.zeros(0, device=bp_probs.device, dtype=bp_probs.dtype)

        base_idx = torch.tensor(
            [BASE_TO_IDX[base] for base in sequence],
            dtype=torch.long,
            device=bp_probs.device,
        )
        rows = bp_probs[: len(sequence)]
        # 'N' is encoded as -1; clamp so gather stays in range, then override.
        observed = rows.gather(1, base_idx.clamp(min=0).unsqueeze(1)).squeeze(1)
        return torch.where(base_idx < 0, rows.max(dim=-1).values, observed)
+1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.py b/tokenizer.py new file mode 100644 index 0000000..2032ae9 --- /dev/null +++ b/tokenizer.py @@ -0,0 +1,163 @@ +import itertools +import os +import json +import re +from typing import List, Optional, Tuple +from transformers import PreTrainedTokenizer + +class DNAKmerTokenizer(PreTrainedTokenizer): + def __init__(self, k, **kwargs): + self.k = k + self.special_tokens = [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "<+>", + "<->", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + self.kmers = [ + "".join(kmer) for kmer in itertools.product("ATCG", repeat=self.k) + ] + self.vocab = { + token: i for i, token in enumerate(self.special_tokens + self.kmers) + } + self.ids_to_tokens = {v: k for k, v in self.vocab.items()} + self.special_token_pattern = re.compile( + "|".join(re.escape(token) for token in self.special_tokens) + ) + self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+") + kwargs.setdefault("bos_token", "") + kwargs.setdefault("eos_token", "") + kwargs.setdefault("unk_token", "") + kwargs.setdefault("pad_token", "") + super().__init__(**kwargs) + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab) + + def _tokenize(self, text, **kwargs) -> List[str]: + tokens = [] + pos = 0 + while pos < len(text): + special_match = self.special_token_pattern.match(text, pos) + if special_match: + 
tokens.append(special_match.group()) + pos = special_match.end() + else: + dna_match = self.dna_pattern.match(text, pos) + if dna_match: + dna_seq = dna_match.group() + tokens.append(dna_seq) + pos = dna_match.end() + else: + tokens.append(text[pos]) + pos += 1 + return tokens + + def _convert_token_to_id(self, token: str) -> int: + return self.vocab.get(token, self.vocab[""]) + + def _convert_id_to_token(self, index: int) -> str: + return self.ids_to_tokens.get(index, "") + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return "".join(tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if token_ids_1 is None: + return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def get_special_tokens_mask( + self, token_ids_0, token_ids_1=None, already_has_special_tokens=False + ): + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0, token_ids_1, already_has_special_tokens=True + ) + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def prepare_for_model(self, *args, **kwargs): + encoding = super().prepare_for_model(*args, **kwargs) + if "token_type_ids" in encoding: + del encoding["token_type_ids"] + return encoding + + def save_vocabulary( + self, save_directory: str, filename_prefix: Optional[str] = None + ) -> Tuple[str]: + import os + + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + "vocab.txt", + ) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + writer.write(token + "\n") + return (vocab_file,) + + def save_pretrained(self, save_directory: str, **kwargs): + vocab_files = super().save_pretrained(save_directory, **kwargs) + 
tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json") + + # 读取现有的配置或创建新的 + if os.path.exists(tokenizer_config_path): + with open(tokenizer_config_path, "r", encoding="utf-8") as f: + config = json.load(f) + else: + config = {} + + # 添加auto_map配置 + config.update({ + "auto_map": { + "AutoTokenizer": [ + "tokenizer.DNAKmerTokenizer", + None + ] + }, + }) + + # 添加kmer配置 + config.update({ + "k": self.k + }) + + # 保存配置 + with open(tokenizer_config_path, "w", encoding="utf-8") as f: + json.dump(config, f, ensure_ascii=False, indent=2) + + return vocab_files diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..361749a --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,60 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenizer.DNAKmerTokenizer", + null + ] + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "eos_token": "", + "extra_special_tokens": {}, + "kmer": 6, + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "DNAKmerTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false, + "k": 6 +} \ No newline at end of file diff --git a/vocab.txt b/vocab.txt new file mode 100644 index 0000000..df86e2a --- 
/dev/null +++ b/vocab.txt @@ -0,0 +1,4128 @@ + + + + + + + + + +<+> +<-> + + + + + + + + + + + + + + + + + + + + + +AAAAAA +AAAAAT +AAAAAC +AAAAAG +AAAATA +AAAATT +AAAATC +AAAATG +AAAACA +AAAACT +AAAACC +AAAACG +AAAAGA +AAAAGT +AAAAGC +AAAAGG +AAATAA +AAATAT +AAATAC +AAATAG +AAATTA +AAATTT +AAATTC +AAATTG +AAATCA +AAATCT +AAATCC +AAATCG +AAATGA +AAATGT +AAATGC +AAATGG +AAACAA +AAACAT +AAACAC +AAACAG +AAACTA +AAACTT +AAACTC +AAACTG +AAACCA +AAACCT +AAACCC +AAACCG +AAACGA +AAACGT +AAACGC +AAACGG +AAAGAA +AAAGAT +AAAGAC +AAAGAG +AAAGTA +AAAGTT +AAAGTC +AAAGTG +AAAGCA +AAAGCT +AAAGCC +AAAGCG +AAAGGA +AAAGGT +AAAGGC +AAAGGG +AATAAA +AATAAT +AATAAC +AATAAG +AATATA +AATATT +AATATC +AATATG +AATACA +AATACT +AATACC +AATACG +AATAGA +AATAGT +AATAGC +AATAGG +AATTAA +AATTAT +AATTAC +AATTAG +AATTTA +AATTTT +AATTTC +AATTTG +AATTCA +AATTCT +AATTCC +AATTCG +AATTGA +AATTGT +AATTGC +AATTGG +AATCAA +AATCAT +AATCAC +AATCAG +AATCTA +AATCTT +AATCTC +AATCTG +AATCCA +AATCCT +AATCCC +AATCCG +AATCGA +AATCGT +AATCGC +AATCGG +AATGAA +AATGAT +AATGAC +AATGAG +AATGTA +AATGTT +AATGTC +AATGTG +AATGCA +AATGCT +AATGCC +AATGCG +AATGGA +AATGGT +AATGGC +AATGGG +AACAAA +AACAAT +AACAAC +AACAAG +AACATA +AACATT +AACATC +AACATG +AACACA +AACACT +AACACC +AACACG +AACAGA +AACAGT +AACAGC +AACAGG +AACTAA +AACTAT +AACTAC +AACTAG +AACTTA +AACTTT +AACTTC +AACTTG +AACTCA +AACTCT +AACTCC +AACTCG +AACTGA +AACTGT +AACTGC +AACTGG +AACCAA +AACCAT +AACCAC +AACCAG +AACCTA +AACCTT +AACCTC +AACCTG +AACCCA +AACCCT +AACCCC +AACCCG +AACCGA +AACCGT +AACCGC +AACCGG +AACGAA +AACGAT +AACGAC +AACGAG +AACGTA +AACGTT +AACGTC +AACGTG +AACGCA +AACGCT +AACGCC +AACGCG +AACGGA +AACGGT +AACGGC +AACGGG +AAGAAA +AAGAAT +AAGAAC +AAGAAG +AAGATA +AAGATT +AAGATC +AAGATG +AAGACA +AAGACT +AAGACC +AAGACG +AAGAGA +AAGAGT +AAGAGC +AAGAGG +AAGTAA +AAGTAT +AAGTAC +AAGTAG +AAGTTA +AAGTTT +AAGTTC +AAGTTG +AAGTCA +AAGTCT +AAGTCC +AAGTCG +AAGTGA +AAGTGT +AAGTGC +AAGTGG +AAGCAA +AAGCAT +AAGCAC +AAGCAG +AAGCTA +AAGCTT +AAGCTC +AAGCTG +AAGCCA +AAGCCT +AAGCCC 
+AAGCCG +AAGCGA +AAGCGT +AAGCGC +AAGCGG +AAGGAA +AAGGAT +AAGGAC +AAGGAG +AAGGTA +AAGGTT +AAGGTC +AAGGTG +AAGGCA +AAGGCT +AAGGCC +AAGGCG +AAGGGA +AAGGGT +AAGGGC +AAGGGG +ATAAAA +ATAAAT +ATAAAC +ATAAAG +ATAATA +ATAATT +ATAATC +ATAATG +ATAACA +ATAACT +ATAACC +ATAACG +ATAAGA +ATAAGT +ATAAGC +ATAAGG +ATATAA +ATATAT +ATATAC +ATATAG +ATATTA +ATATTT +ATATTC +ATATTG +ATATCA +ATATCT +ATATCC +ATATCG +ATATGA +ATATGT +ATATGC +ATATGG +ATACAA +ATACAT +ATACAC +ATACAG +ATACTA +ATACTT +ATACTC +ATACTG +ATACCA +ATACCT +ATACCC +ATACCG +ATACGA +ATACGT +ATACGC +ATACGG +ATAGAA +ATAGAT +ATAGAC +ATAGAG +ATAGTA +ATAGTT +ATAGTC +ATAGTG +ATAGCA +ATAGCT +ATAGCC +ATAGCG +ATAGGA +ATAGGT +ATAGGC +ATAGGG +ATTAAA +ATTAAT +ATTAAC +ATTAAG +ATTATA +ATTATT +ATTATC +ATTATG +ATTACA +ATTACT +ATTACC +ATTACG +ATTAGA +ATTAGT +ATTAGC +ATTAGG +ATTTAA +ATTTAT +ATTTAC +ATTTAG +ATTTTA +ATTTTT +ATTTTC +ATTTTG +ATTTCA +ATTTCT +ATTTCC +ATTTCG +ATTTGA +ATTTGT +ATTTGC +ATTTGG +ATTCAA +ATTCAT +ATTCAC +ATTCAG +ATTCTA +ATTCTT +ATTCTC +ATTCTG +ATTCCA +ATTCCT +ATTCCC +ATTCCG +ATTCGA +ATTCGT +ATTCGC +ATTCGG +ATTGAA +ATTGAT +ATTGAC +ATTGAG +ATTGTA +ATTGTT +ATTGTC +ATTGTG +ATTGCA +ATTGCT +ATTGCC +ATTGCG +ATTGGA +ATTGGT +ATTGGC +ATTGGG +ATCAAA +ATCAAT +ATCAAC +ATCAAG +ATCATA +ATCATT +ATCATC +ATCATG +ATCACA +ATCACT +ATCACC +ATCACG +ATCAGA +ATCAGT +ATCAGC +ATCAGG +ATCTAA +ATCTAT +ATCTAC +ATCTAG +ATCTTA +ATCTTT +ATCTTC +ATCTTG +ATCTCA +ATCTCT +ATCTCC +ATCTCG +ATCTGA +ATCTGT +ATCTGC +ATCTGG +ATCCAA +ATCCAT +ATCCAC +ATCCAG +ATCCTA +ATCCTT +ATCCTC +ATCCTG +ATCCCA +ATCCCT +ATCCCC +ATCCCG +ATCCGA +ATCCGT +ATCCGC +ATCCGG +ATCGAA +ATCGAT +ATCGAC +ATCGAG +ATCGTA +ATCGTT +ATCGTC +ATCGTG +ATCGCA +ATCGCT +ATCGCC +ATCGCG +ATCGGA +ATCGGT +ATCGGC +ATCGGG +ATGAAA +ATGAAT +ATGAAC +ATGAAG +ATGATA +ATGATT +ATGATC +ATGATG +ATGACA +ATGACT +ATGACC +ATGACG +ATGAGA +ATGAGT +ATGAGC +ATGAGG +ATGTAA +ATGTAT +ATGTAC +ATGTAG +ATGTTA +ATGTTT +ATGTTC +ATGTTG +ATGTCA +ATGTCT +ATGTCC +ATGTCG +ATGTGA +ATGTGT +ATGTGC +ATGTGG +ATGCAA +ATGCAT +ATGCAC +ATGCAG +ATGCTA 
+ATGCTT +ATGCTC +ATGCTG +ATGCCA +ATGCCT +ATGCCC +ATGCCG +ATGCGA +ATGCGT +ATGCGC +ATGCGG +ATGGAA +ATGGAT +ATGGAC +ATGGAG +ATGGTA +ATGGTT +ATGGTC +ATGGTG +ATGGCA +ATGGCT +ATGGCC +ATGGCG +ATGGGA +ATGGGT +ATGGGC +ATGGGG +ACAAAA +ACAAAT +ACAAAC +ACAAAG +ACAATA +ACAATT +ACAATC +ACAATG +ACAACA +ACAACT +ACAACC +ACAACG +ACAAGA +ACAAGT +ACAAGC +ACAAGG +ACATAA +ACATAT +ACATAC +ACATAG +ACATTA +ACATTT +ACATTC +ACATTG +ACATCA +ACATCT +ACATCC +ACATCG +ACATGA +ACATGT +ACATGC +ACATGG +ACACAA +ACACAT +ACACAC +ACACAG +ACACTA +ACACTT +ACACTC +ACACTG +ACACCA +ACACCT +ACACCC +ACACCG +ACACGA +ACACGT +ACACGC +ACACGG +ACAGAA +ACAGAT +ACAGAC +ACAGAG +ACAGTA +ACAGTT +ACAGTC +ACAGTG +ACAGCA +ACAGCT +ACAGCC +ACAGCG +ACAGGA +ACAGGT +ACAGGC +ACAGGG +ACTAAA +ACTAAT +ACTAAC +ACTAAG +ACTATA +ACTATT +ACTATC +ACTATG +ACTACA +ACTACT +ACTACC +ACTACG +ACTAGA +ACTAGT +ACTAGC +ACTAGG +ACTTAA +ACTTAT +ACTTAC +ACTTAG +ACTTTA +ACTTTT +ACTTTC +ACTTTG +ACTTCA +ACTTCT +ACTTCC +ACTTCG +ACTTGA +ACTTGT +ACTTGC +ACTTGG +ACTCAA +ACTCAT +ACTCAC +ACTCAG +ACTCTA +ACTCTT +ACTCTC +ACTCTG +ACTCCA +ACTCCT +ACTCCC +ACTCCG +ACTCGA +ACTCGT +ACTCGC +ACTCGG +ACTGAA +ACTGAT +ACTGAC +ACTGAG +ACTGTA +ACTGTT +ACTGTC +ACTGTG +ACTGCA +ACTGCT +ACTGCC +ACTGCG +ACTGGA +ACTGGT +ACTGGC +ACTGGG +ACCAAA +ACCAAT +ACCAAC +ACCAAG +ACCATA +ACCATT +ACCATC +ACCATG +ACCACA +ACCACT +ACCACC +ACCACG +ACCAGA +ACCAGT +ACCAGC +ACCAGG +ACCTAA +ACCTAT +ACCTAC +ACCTAG +ACCTTA +ACCTTT +ACCTTC +ACCTTG +ACCTCA +ACCTCT +ACCTCC +ACCTCG +ACCTGA +ACCTGT +ACCTGC +ACCTGG +ACCCAA +ACCCAT +ACCCAC +ACCCAG +ACCCTA +ACCCTT +ACCCTC +ACCCTG +ACCCCA +ACCCCT +ACCCCC +ACCCCG +ACCCGA +ACCCGT +ACCCGC +ACCCGG +ACCGAA +ACCGAT +ACCGAC +ACCGAG +ACCGTA +ACCGTT +ACCGTC +ACCGTG +ACCGCA +ACCGCT +ACCGCC +ACCGCG +ACCGGA +ACCGGT +ACCGGC +ACCGGG +ACGAAA +ACGAAT +ACGAAC +ACGAAG +ACGATA +ACGATT +ACGATC +ACGATG +ACGACA +ACGACT +ACGACC +ACGACG +ACGAGA +ACGAGT +ACGAGC +ACGAGG +ACGTAA +ACGTAT +ACGTAC +ACGTAG +ACGTTA +ACGTTT +ACGTTC +ACGTTG +ACGTCA +ACGTCT +ACGTCC +ACGTCG +ACGTGA +ACGTGT +ACGTGC 
+ACGTGG +ACGCAA +ACGCAT +ACGCAC +ACGCAG +ACGCTA +ACGCTT +ACGCTC +ACGCTG +ACGCCA +ACGCCT +ACGCCC +ACGCCG +ACGCGA +ACGCGT +ACGCGC +ACGCGG +ACGGAA +ACGGAT +ACGGAC +ACGGAG +ACGGTA +ACGGTT +ACGGTC +ACGGTG +ACGGCA +ACGGCT +ACGGCC +ACGGCG +ACGGGA +ACGGGT +ACGGGC +ACGGGG +AGAAAA +AGAAAT +AGAAAC +AGAAAG +AGAATA +AGAATT +AGAATC +AGAATG +AGAACA +AGAACT +AGAACC +AGAACG +AGAAGA +AGAAGT +AGAAGC +AGAAGG +AGATAA +AGATAT +AGATAC +AGATAG +AGATTA +AGATTT +AGATTC +AGATTG +AGATCA +AGATCT +AGATCC +AGATCG +AGATGA +AGATGT +AGATGC +AGATGG +AGACAA +AGACAT +AGACAC +AGACAG +AGACTA +AGACTT +AGACTC +AGACTG +AGACCA +AGACCT +AGACCC +AGACCG +AGACGA +AGACGT +AGACGC +AGACGG +AGAGAA +AGAGAT +AGAGAC +AGAGAG +AGAGTA +AGAGTT +AGAGTC +AGAGTG +AGAGCA +AGAGCT +AGAGCC +AGAGCG +AGAGGA +AGAGGT +AGAGGC +AGAGGG +AGTAAA +AGTAAT +AGTAAC +AGTAAG +AGTATA +AGTATT +AGTATC +AGTATG +AGTACA +AGTACT +AGTACC +AGTACG +AGTAGA +AGTAGT +AGTAGC +AGTAGG +AGTTAA +AGTTAT +AGTTAC +AGTTAG +AGTTTA +AGTTTT +AGTTTC +AGTTTG +AGTTCA +AGTTCT +AGTTCC +AGTTCG +AGTTGA +AGTTGT +AGTTGC +AGTTGG +AGTCAA +AGTCAT +AGTCAC +AGTCAG +AGTCTA +AGTCTT +AGTCTC +AGTCTG +AGTCCA +AGTCCT +AGTCCC +AGTCCG +AGTCGA +AGTCGT +AGTCGC +AGTCGG +AGTGAA +AGTGAT +AGTGAC +AGTGAG +AGTGTA +AGTGTT +AGTGTC +AGTGTG +AGTGCA +AGTGCT +AGTGCC +AGTGCG +AGTGGA +AGTGGT +AGTGGC +AGTGGG +AGCAAA +AGCAAT +AGCAAC +AGCAAG +AGCATA +AGCATT +AGCATC +AGCATG +AGCACA +AGCACT +AGCACC +AGCACG +AGCAGA +AGCAGT +AGCAGC +AGCAGG +AGCTAA +AGCTAT +AGCTAC +AGCTAG +AGCTTA +AGCTTT +AGCTTC +AGCTTG +AGCTCA +AGCTCT +AGCTCC +AGCTCG +AGCTGA +AGCTGT +AGCTGC +AGCTGG +AGCCAA +AGCCAT +AGCCAC +AGCCAG +AGCCTA +AGCCTT +AGCCTC +AGCCTG +AGCCCA +AGCCCT +AGCCCC +AGCCCG +AGCCGA +AGCCGT +AGCCGC +AGCCGG +AGCGAA +AGCGAT +AGCGAC +AGCGAG +AGCGTA +AGCGTT +AGCGTC +AGCGTG +AGCGCA +AGCGCT +AGCGCC +AGCGCG +AGCGGA +AGCGGT +AGCGGC +AGCGGG +AGGAAA +AGGAAT +AGGAAC +AGGAAG +AGGATA +AGGATT +AGGATC +AGGATG +AGGACA +AGGACT +AGGACC +AGGACG +AGGAGA +AGGAGT +AGGAGC +AGGAGG +AGGTAA +AGGTAT +AGGTAC +AGGTAG +AGGTTA +AGGTTT +AGGTTC +AGGTTG +AGGTCA 
+AGGTCT +AGGTCC +AGGTCG +AGGTGA +AGGTGT +AGGTGC +AGGTGG +AGGCAA +AGGCAT +AGGCAC +AGGCAG +AGGCTA +AGGCTT +AGGCTC +AGGCTG +AGGCCA +AGGCCT +AGGCCC +AGGCCG +AGGCGA +AGGCGT +AGGCGC +AGGCGG +AGGGAA +AGGGAT +AGGGAC +AGGGAG +AGGGTA +AGGGTT +AGGGTC +AGGGTG +AGGGCA +AGGGCT +AGGGCC +AGGGCG +AGGGGA +AGGGGT +AGGGGC +AGGGGG +TAAAAA +TAAAAT +TAAAAC +TAAAAG +TAAATA +TAAATT +TAAATC +TAAATG +TAAACA +TAAACT +TAAACC +TAAACG +TAAAGA +TAAAGT +TAAAGC +TAAAGG +TAATAA +TAATAT +TAATAC +TAATAG +TAATTA +TAATTT +TAATTC +TAATTG +TAATCA +TAATCT +TAATCC +TAATCG +TAATGA +TAATGT +TAATGC +TAATGG +TAACAA +TAACAT +TAACAC +TAACAG +TAACTA +TAACTT +TAACTC +TAACTG +TAACCA +TAACCT +TAACCC +TAACCG +TAACGA +TAACGT +TAACGC +TAACGG +TAAGAA +TAAGAT +TAAGAC +TAAGAG +TAAGTA +TAAGTT +TAAGTC +TAAGTG +TAAGCA +TAAGCT +TAAGCC +TAAGCG +TAAGGA +TAAGGT +TAAGGC +TAAGGG +TATAAA +TATAAT +TATAAC +TATAAG +TATATA +TATATT +TATATC +TATATG +TATACA +TATACT +TATACC +TATACG +TATAGA +TATAGT +TATAGC +TATAGG +TATTAA +TATTAT +TATTAC +TATTAG +TATTTA +TATTTT +TATTTC +TATTTG +TATTCA +TATTCT +TATTCC +TATTCG +TATTGA +TATTGT +TATTGC +TATTGG +TATCAA +TATCAT +TATCAC +TATCAG +TATCTA +TATCTT +TATCTC +TATCTG +TATCCA +TATCCT +TATCCC +TATCCG +TATCGA +TATCGT +TATCGC +TATCGG +TATGAA +TATGAT +TATGAC +TATGAG +TATGTA +TATGTT +TATGTC +TATGTG +TATGCA +TATGCT +TATGCC +TATGCG +TATGGA +TATGGT +TATGGC +TATGGG +TACAAA +TACAAT +TACAAC +TACAAG +TACATA +TACATT +TACATC +TACATG +TACACA +TACACT +TACACC +TACACG +TACAGA +TACAGT +TACAGC +TACAGG +TACTAA +TACTAT +TACTAC +TACTAG +TACTTA +TACTTT +TACTTC +TACTTG +TACTCA +TACTCT +TACTCC +TACTCG +TACTGA +TACTGT +TACTGC +TACTGG +TACCAA +TACCAT +TACCAC +TACCAG +TACCTA +TACCTT +TACCTC +TACCTG +TACCCA +TACCCT +TACCCC +TACCCG +TACCGA +TACCGT +TACCGC +TACCGG +TACGAA +TACGAT +TACGAC +TACGAG +TACGTA +TACGTT +TACGTC +TACGTG +TACGCA +TACGCT +TACGCC +TACGCG +TACGGA +TACGGT +TACGGC +TACGGG +TAGAAA +TAGAAT +TAGAAC +TAGAAG +TAGATA +TAGATT +TAGATC +TAGATG +TAGACA +TAGACT +TAGACC +TAGACG +TAGAGA +TAGAGT +TAGAGC +TAGAGG +TAGTAA +TAGTAT +TAGTAC 
+TAGTAG +TAGTTA +TAGTTT +TAGTTC +TAGTTG +TAGTCA +TAGTCT +TAGTCC +TAGTCG +TAGTGA +TAGTGT +TAGTGC +TAGTGG +TAGCAA +TAGCAT +TAGCAC +TAGCAG +TAGCTA +TAGCTT +TAGCTC +TAGCTG +TAGCCA +TAGCCT +TAGCCC +TAGCCG +TAGCGA +TAGCGT +TAGCGC +TAGCGG +TAGGAA +TAGGAT +TAGGAC +TAGGAG +TAGGTA +TAGGTT +TAGGTC +TAGGTG +TAGGCA +TAGGCT +TAGGCC +TAGGCG +TAGGGA +TAGGGT +TAGGGC +TAGGGG +TTAAAA +TTAAAT +TTAAAC +TTAAAG +TTAATA +TTAATT +TTAATC +TTAATG +TTAACA +TTAACT +TTAACC +TTAACG +TTAAGA +TTAAGT +TTAAGC +TTAAGG +TTATAA +TTATAT +TTATAC +TTATAG +TTATTA +TTATTT +TTATTC +TTATTG +TTATCA +TTATCT +TTATCC +TTATCG +TTATGA +TTATGT +TTATGC +TTATGG +TTACAA +TTACAT +TTACAC +TTACAG +TTACTA +TTACTT +TTACTC +TTACTG +TTACCA +TTACCT +TTACCC +TTACCG +TTACGA +TTACGT +TTACGC +TTACGG +TTAGAA +TTAGAT +TTAGAC +TTAGAG +TTAGTA +TTAGTT +TTAGTC +TTAGTG +TTAGCA +TTAGCT +TTAGCC +TTAGCG +TTAGGA +TTAGGT +TTAGGC +TTAGGG +TTTAAA +TTTAAT +TTTAAC +TTTAAG +TTTATA +TTTATT +TTTATC +TTTATG +TTTACA +TTTACT +TTTACC +TTTACG +TTTAGA +TTTAGT +TTTAGC +TTTAGG +TTTTAA +TTTTAT +TTTTAC +TTTTAG +TTTTTA +TTTTTT +TTTTTC +TTTTTG +TTTTCA +TTTTCT +TTTTCC +TTTTCG +TTTTGA +TTTTGT +TTTTGC +TTTTGG +TTTCAA +TTTCAT +TTTCAC +TTTCAG +TTTCTA +TTTCTT +TTTCTC +TTTCTG +TTTCCA +TTTCCT +TTTCCC +TTTCCG +TTTCGA +TTTCGT +TTTCGC +TTTCGG +TTTGAA +TTTGAT +TTTGAC +TTTGAG +TTTGTA +TTTGTT +TTTGTC +TTTGTG +TTTGCA +TTTGCT +TTTGCC +TTTGCG +TTTGGA +TTTGGT +TTTGGC +TTTGGG +TTCAAA +TTCAAT +TTCAAC +TTCAAG +TTCATA +TTCATT +TTCATC +TTCATG +TTCACA +TTCACT +TTCACC +TTCACG +TTCAGA +TTCAGT +TTCAGC +TTCAGG +TTCTAA +TTCTAT +TTCTAC +TTCTAG +TTCTTA +TTCTTT +TTCTTC +TTCTTG +TTCTCA +TTCTCT +TTCTCC +TTCTCG +TTCTGA +TTCTGT +TTCTGC +TTCTGG +TTCCAA +TTCCAT +TTCCAC +TTCCAG +TTCCTA +TTCCTT +TTCCTC +TTCCTG +TTCCCA +TTCCCT +TTCCCC +TTCCCG +TTCCGA +TTCCGT +TTCCGC +TTCCGG +TTCGAA +TTCGAT +TTCGAC +TTCGAG +TTCGTA +TTCGTT +TTCGTC +TTCGTG +TTCGCA +TTCGCT +TTCGCC +TTCGCG +TTCGGA +TTCGGT +TTCGGC +TTCGGG +TTGAAA +TTGAAT +TTGAAC +TTGAAG +TTGATA +TTGATT +TTGATC +TTGATG +TTGACA +TTGACT +TTGACC +TTGACG +TTGAGA 
+TTGAGT +TTGAGC +TTGAGG +TTGTAA +TTGTAT +TTGTAC +TTGTAG +TTGTTA +TTGTTT +TTGTTC +TTGTTG +TTGTCA +TTGTCT +TTGTCC +TTGTCG +TTGTGA +TTGTGT +TTGTGC +TTGTGG +TTGCAA +TTGCAT +TTGCAC +TTGCAG +TTGCTA +TTGCTT +TTGCTC +TTGCTG +TTGCCA +TTGCCT +TTGCCC +TTGCCG +TTGCGA +TTGCGT +TTGCGC +TTGCGG +TTGGAA +TTGGAT +TTGGAC +TTGGAG +TTGGTA +TTGGTT +TTGGTC +TTGGTG +TTGGCA +TTGGCT +TTGGCC +TTGGCG +TTGGGA +TTGGGT +TTGGGC +TTGGGG +TCAAAA +TCAAAT +TCAAAC +TCAAAG +TCAATA +TCAATT +TCAATC +TCAATG +TCAACA +TCAACT +TCAACC +TCAACG +TCAAGA +TCAAGT +TCAAGC +TCAAGG +TCATAA +TCATAT +TCATAC +TCATAG +TCATTA +TCATTT +TCATTC +TCATTG +TCATCA +TCATCT +TCATCC +TCATCG +TCATGA +TCATGT +TCATGC +TCATGG +TCACAA +TCACAT +TCACAC +TCACAG +TCACTA +TCACTT +TCACTC +TCACTG +TCACCA +TCACCT +TCACCC +TCACCG +TCACGA +TCACGT +TCACGC +TCACGG +TCAGAA +TCAGAT +TCAGAC +TCAGAG +TCAGTA +TCAGTT +TCAGTC +TCAGTG +TCAGCA +TCAGCT +TCAGCC +TCAGCG +TCAGGA +TCAGGT +TCAGGC +TCAGGG +TCTAAA +TCTAAT +TCTAAC +TCTAAG +TCTATA +TCTATT +TCTATC +TCTATG +TCTACA +TCTACT +TCTACC +TCTACG +TCTAGA +TCTAGT +TCTAGC +TCTAGG +TCTTAA +TCTTAT +TCTTAC +TCTTAG +TCTTTA +TCTTTT +TCTTTC +TCTTTG +TCTTCA +TCTTCT +TCTTCC +TCTTCG +TCTTGA +TCTTGT +TCTTGC +TCTTGG +TCTCAA +TCTCAT +TCTCAC +TCTCAG +TCTCTA +TCTCTT +TCTCTC +TCTCTG +TCTCCA +TCTCCT +TCTCCC +TCTCCG +TCTCGA +TCTCGT +TCTCGC +TCTCGG +TCTGAA +TCTGAT +TCTGAC +TCTGAG +TCTGTA +TCTGTT +TCTGTC +TCTGTG +TCTGCA +TCTGCT +TCTGCC +TCTGCG +TCTGGA +TCTGGT +TCTGGC +TCTGGG +TCCAAA +TCCAAT +TCCAAC +TCCAAG +TCCATA +TCCATT +TCCATC +TCCATG +TCCACA +TCCACT +TCCACC +TCCACG +TCCAGA +TCCAGT +TCCAGC +TCCAGG +TCCTAA +TCCTAT +TCCTAC +TCCTAG +TCCTTA +TCCTTT +TCCTTC +TCCTTG +TCCTCA +TCCTCT +TCCTCC +TCCTCG +TCCTGA +TCCTGT +TCCTGC +TCCTGG +TCCCAA +TCCCAT +TCCCAC +TCCCAG +TCCCTA +TCCCTT +TCCCTC +TCCCTG +TCCCCA +TCCCCT +TCCCCC +TCCCCG +TCCCGA +TCCCGT +TCCCGC +TCCCGG +TCCGAA +TCCGAT +TCCGAC +TCCGAG +TCCGTA +TCCGTT +TCCGTC +TCCGTG +TCCGCA +TCCGCT +TCCGCC +TCCGCG +TCCGGA +TCCGGT +TCCGGC +TCCGGG +TCGAAA +TCGAAT +TCGAAC +TCGAAG +TCGATA +TCGATT +TCGATC 
+TCGATG +TCGACA +TCGACT +TCGACC +TCGACG +TCGAGA +TCGAGT +TCGAGC +TCGAGG +TCGTAA +TCGTAT +TCGTAC +TCGTAG +TCGTTA +TCGTTT +TCGTTC +TCGTTG +TCGTCA +TCGTCT +TCGTCC +TCGTCG +TCGTGA +TCGTGT +TCGTGC +TCGTGG +TCGCAA +TCGCAT +TCGCAC +TCGCAG +TCGCTA +TCGCTT +TCGCTC +TCGCTG +TCGCCA +TCGCCT +TCGCCC +TCGCCG +TCGCGA +TCGCGT +TCGCGC +TCGCGG +TCGGAA +TCGGAT +TCGGAC +TCGGAG +TCGGTA +TCGGTT +TCGGTC +TCGGTG +TCGGCA +TCGGCT +TCGGCC +TCGGCG +TCGGGA +TCGGGT +TCGGGC +TCGGGG +TGAAAA +TGAAAT +TGAAAC +TGAAAG +TGAATA +TGAATT +TGAATC +TGAATG +TGAACA +TGAACT +TGAACC +TGAACG +TGAAGA +TGAAGT +TGAAGC +TGAAGG +TGATAA +TGATAT +TGATAC +TGATAG +TGATTA +TGATTT +TGATTC +TGATTG +TGATCA +TGATCT +TGATCC +TGATCG +TGATGA +TGATGT +TGATGC +TGATGG +TGACAA +TGACAT +TGACAC +TGACAG +TGACTA +TGACTT +TGACTC +TGACTG +TGACCA +TGACCT +TGACCC +TGACCG +TGACGA +TGACGT +TGACGC +TGACGG +TGAGAA +TGAGAT +TGAGAC +TGAGAG +TGAGTA +TGAGTT +TGAGTC +TGAGTG +TGAGCA +TGAGCT +TGAGCC +TGAGCG +TGAGGA +TGAGGT +TGAGGC +TGAGGG +TGTAAA +TGTAAT +TGTAAC +TGTAAG +TGTATA +TGTATT +TGTATC +TGTATG +TGTACA +TGTACT +TGTACC +TGTACG +TGTAGA +TGTAGT +TGTAGC +TGTAGG +TGTTAA +TGTTAT +TGTTAC +TGTTAG +TGTTTA +TGTTTT +TGTTTC +TGTTTG +TGTTCA +TGTTCT +TGTTCC +TGTTCG +TGTTGA +TGTTGT +TGTTGC +TGTTGG +TGTCAA +TGTCAT +TGTCAC +TGTCAG +TGTCTA +TGTCTT +TGTCTC +TGTCTG +TGTCCA +TGTCCT +TGTCCC +TGTCCG +TGTCGA +TGTCGT +TGTCGC +TGTCGG +TGTGAA +TGTGAT +TGTGAC +TGTGAG +TGTGTA +TGTGTT +TGTGTC +TGTGTG +TGTGCA +TGTGCT +TGTGCC +TGTGCG +TGTGGA +TGTGGT +TGTGGC +TGTGGG +TGCAAA +TGCAAT +TGCAAC +TGCAAG +TGCATA +TGCATT +TGCATC +TGCATG +TGCACA +TGCACT +TGCACC +TGCACG +TGCAGA +TGCAGT +TGCAGC +TGCAGG +TGCTAA +TGCTAT +TGCTAC +TGCTAG +TGCTTA +TGCTTT +TGCTTC +TGCTTG +TGCTCA +TGCTCT +TGCTCC +TGCTCG +TGCTGA +TGCTGT +TGCTGC +TGCTGG +TGCCAA +TGCCAT +TGCCAC +TGCCAG +TGCCTA +TGCCTT +TGCCTC +TGCCTG +TGCCCA +TGCCCT +TGCCCC +TGCCCG +TGCCGA +TGCCGT +TGCCGC +TGCCGG +TGCGAA +TGCGAT +TGCGAC +TGCGAG +TGCGTA +TGCGTT +TGCGTC +TGCGTG +TGCGCA +TGCGCT +TGCGCC +TGCGCG +TGCGGA +TGCGGT +TGCGGC +TGCGGG +TGGAAA 
+TGGAAT +TGGAAC +TGGAAG +TGGATA +TGGATT +TGGATC +TGGATG +TGGACA +TGGACT +TGGACC +TGGACG +TGGAGA +TGGAGT +TGGAGC +TGGAGG +TGGTAA +TGGTAT +TGGTAC +TGGTAG +TGGTTA +TGGTTT +TGGTTC +TGGTTG +TGGTCA +TGGTCT +TGGTCC +TGGTCG +TGGTGA +TGGTGT +TGGTGC +TGGTGG +TGGCAA +TGGCAT +TGGCAC +TGGCAG +TGGCTA +TGGCTT +TGGCTC +TGGCTG +TGGCCA +TGGCCT +TGGCCC +TGGCCG +TGGCGA +TGGCGT +TGGCGC +TGGCGG +TGGGAA +TGGGAT +TGGGAC +TGGGAG +TGGGTA +TGGGTT +TGGGTC +TGGGTG +TGGGCA +TGGGCT +TGGGCC +TGGGCG +TGGGGA +TGGGGT +TGGGGC +TGGGGG +CAAAAA +CAAAAT +CAAAAC +CAAAAG +CAAATA +CAAATT +CAAATC +CAAATG +CAAACA +CAAACT +CAAACC +CAAACG +CAAAGA +CAAAGT +CAAAGC +CAAAGG +CAATAA +CAATAT +CAATAC +CAATAG +CAATTA +CAATTT +CAATTC +CAATTG +CAATCA +CAATCT +CAATCC +CAATCG +CAATGA +CAATGT +CAATGC +CAATGG +CAACAA +CAACAT +CAACAC +CAACAG +CAACTA +CAACTT +CAACTC +CAACTG +CAACCA +CAACCT +CAACCC +CAACCG +CAACGA +CAACGT +CAACGC +CAACGG +CAAGAA +CAAGAT +CAAGAC +CAAGAG +CAAGTA +CAAGTT +CAAGTC +CAAGTG +CAAGCA +CAAGCT +CAAGCC +CAAGCG +CAAGGA +CAAGGT +CAAGGC +CAAGGG +CATAAA +CATAAT +CATAAC +CATAAG +CATATA +CATATT +CATATC +CATATG +CATACA +CATACT +CATACC +CATACG +CATAGA +CATAGT +CATAGC +CATAGG +CATTAA +CATTAT +CATTAC +CATTAG +CATTTA +CATTTT +CATTTC +CATTTG +CATTCA +CATTCT +CATTCC +CATTCG +CATTGA +CATTGT +CATTGC +CATTGG +CATCAA +CATCAT +CATCAC +CATCAG +CATCTA +CATCTT +CATCTC +CATCTG +CATCCA +CATCCT +CATCCC +CATCCG +CATCGA +CATCGT +CATCGC +CATCGG +CATGAA +CATGAT +CATGAC +CATGAG +CATGTA +CATGTT +CATGTC +CATGTG +CATGCA +CATGCT +CATGCC +CATGCG +CATGGA +CATGGT +CATGGC +CATGGG +CACAAA +CACAAT +CACAAC +CACAAG +CACATA +CACATT +CACATC +CACATG +CACACA +CACACT +CACACC +CACACG +CACAGA +CACAGT +CACAGC +CACAGG +CACTAA +CACTAT +CACTAC +CACTAG +CACTTA +CACTTT +CACTTC +CACTTG +CACTCA +CACTCT +CACTCC +CACTCG +CACTGA +CACTGT +CACTGC +CACTGG +CACCAA +CACCAT +CACCAC +CACCAG +CACCTA +CACCTT +CACCTC +CACCTG +CACCCA +CACCCT +CACCCC +CACCCG +CACCGA +CACCGT +CACCGC +CACCGG +CACGAA +CACGAT +CACGAC +CACGAG +CACGTA +CACGTT +CACGTC +CACGTG +CACGCA +CACGCT +CACGCC 
+CACGCG +CACGGA +CACGGT +CACGGC +CACGGG +CAGAAA +CAGAAT +CAGAAC +CAGAAG +CAGATA +CAGATT +CAGATC +CAGATG +CAGACA +CAGACT +CAGACC +CAGACG +CAGAGA +CAGAGT +CAGAGC +CAGAGG +CAGTAA +CAGTAT +CAGTAC +CAGTAG +CAGTTA +CAGTTT +CAGTTC +CAGTTG +CAGTCA +CAGTCT +CAGTCC +CAGTCG +CAGTGA +CAGTGT +CAGTGC +CAGTGG +CAGCAA +CAGCAT +CAGCAC +CAGCAG +CAGCTA +CAGCTT +CAGCTC +CAGCTG +CAGCCA +CAGCCT +CAGCCC +CAGCCG +CAGCGA +CAGCGT +CAGCGC +CAGCGG +CAGGAA +CAGGAT +CAGGAC +CAGGAG +CAGGTA +CAGGTT +CAGGTC +CAGGTG +CAGGCA +CAGGCT +CAGGCC +CAGGCG +CAGGGA +CAGGGT +CAGGGC +CAGGGG +CTAAAA +CTAAAT +CTAAAC +CTAAAG +CTAATA +CTAATT +CTAATC +CTAATG +CTAACA +CTAACT +CTAACC +CTAACG +CTAAGA +CTAAGT +CTAAGC +CTAAGG +CTATAA +CTATAT +CTATAC +CTATAG +CTATTA +CTATTT +CTATTC +CTATTG +CTATCA +CTATCT +CTATCC +CTATCG +CTATGA +CTATGT +CTATGC +CTATGG +CTACAA +CTACAT +CTACAC +CTACAG +CTACTA +CTACTT +CTACTC +CTACTG +CTACCA +CTACCT +CTACCC +CTACCG +CTACGA +CTACGT +CTACGC +CTACGG +CTAGAA +CTAGAT +CTAGAC +CTAGAG +CTAGTA +CTAGTT +CTAGTC +CTAGTG +CTAGCA +CTAGCT +CTAGCC +CTAGCG +CTAGGA +CTAGGT +CTAGGC +CTAGGG +CTTAAA +CTTAAT +CTTAAC +CTTAAG +CTTATA +CTTATT +CTTATC +CTTATG +CTTACA +CTTACT +CTTACC +CTTACG +CTTAGA +CTTAGT +CTTAGC +CTTAGG +CTTTAA +CTTTAT +CTTTAC +CTTTAG +CTTTTA +CTTTTT +CTTTTC +CTTTTG +CTTTCA +CTTTCT +CTTTCC +CTTTCG +CTTTGA +CTTTGT +CTTTGC +CTTTGG +CTTCAA +CTTCAT +CTTCAC +CTTCAG +CTTCTA +CTTCTT +CTTCTC +CTTCTG +CTTCCA +CTTCCT +CTTCCC +CTTCCG +CTTCGA +CTTCGT +CTTCGC +CTTCGG +CTTGAA +CTTGAT +CTTGAC +CTTGAG +CTTGTA +CTTGTT +CTTGTC +CTTGTG +CTTGCA +CTTGCT +CTTGCC +CTTGCG +CTTGGA +CTTGGT +CTTGGC +CTTGGG +CTCAAA +CTCAAT +CTCAAC +CTCAAG +CTCATA +CTCATT +CTCATC +CTCATG +CTCACA +CTCACT +CTCACC +CTCACG +CTCAGA +CTCAGT +CTCAGC +CTCAGG +CTCTAA +CTCTAT +CTCTAC +CTCTAG +CTCTTA +CTCTTT +CTCTTC +CTCTTG +CTCTCA +CTCTCT +CTCTCC +CTCTCG +CTCTGA +CTCTGT +CTCTGC +CTCTGG +CTCCAA +CTCCAT +CTCCAC +CTCCAG +CTCCTA +CTCCTT +CTCCTC +CTCCTG +CTCCCA +CTCCCT +CTCCCC +CTCCCG +CTCCGA +CTCCGT +CTCCGC +CTCCGG +CTCGAA +CTCGAT +CTCGAC +CTCGAG +CTCGTA 
+CTCGTT +CTCGTC +CTCGTG +CTCGCA +CTCGCT +CTCGCC +CTCGCG +CTCGGA +CTCGGT +CTCGGC +CTCGGG +CTGAAA +CTGAAT +CTGAAC +CTGAAG +CTGATA +CTGATT +CTGATC +CTGATG +CTGACA +CTGACT +CTGACC +CTGACG +CTGAGA +CTGAGT +CTGAGC +CTGAGG +CTGTAA +CTGTAT +CTGTAC +CTGTAG +CTGTTA +CTGTTT +CTGTTC +CTGTTG +CTGTCA +CTGTCT +CTGTCC +CTGTCG +CTGTGA +CTGTGT +CTGTGC +CTGTGG +CTGCAA +CTGCAT +CTGCAC +CTGCAG +CTGCTA +CTGCTT +CTGCTC +CTGCTG +CTGCCA +CTGCCT +CTGCCC +CTGCCG +CTGCGA +CTGCGT +CTGCGC +CTGCGG +CTGGAA +CTGGAT +CTGGAC +CTGGAG +CTGGTA +CTGGTT +CTGGTC +CTGGTG +CTGGCA +CTGGCT +CTGGCC +CTGGCG +CTGGGA +CTGGGT +CTGGGC +CTGGGG +CCAAAA +CCAAAT +CCAAAC +CCAAAG +CCAATA +CCAATT +CCAATC +CCAATG +CCAACA +CCAACT +CCAACC +CCAACG +CCAAGA +CCAAGT +CCAAGC +CCAAGG +CCATAA +CCATAT +CCATAC +CCATAG +CCATTA +CCATTT +CCATTC +CCATTG +CCATCA +CCATCT +CCATCC +CCATCG +CCATGA +CCATGT +CCATGC +CCATGG +CCACAA +CCACAT +CCACAC +CCACAG +CCACTA +CCACTT +CCACTC +CCACTG +CCACCA +CCACCT +CCACCC +CCACCG +CCACGA +CCACGT +CCACGC +CCACGG +CCAGAA +CCAGAT +CCAGAC +CCAGAG +CCAGTA +CCAGTT +CCAGTC +CCAGTG +CCAGCA +CCAGCT +CCAGCC +CCAGCG +CCAGGA +CCAGGT +CCAGGC +CCAGGG +CCTAAA +CCTAAT +CCTAAC +CCTAAG +CCTATA +CCTATT +CCTATC +CCTATG +CCTACA +CCTACT +CCTACC +CCTACG +CCTAGA +CCTAGT +CCTAGC +CCTAGG +CCTTAA +CCTTAT +CCTTAC +CCTTAG +CCTTTA +CCTTTT +CCTTTC +CCTTTG +CCTTCA +CCTTCT +CCTTCC +CCTTCG +CCTTGA +CCTTGT +CCTTGC +CCTTGG +CCTCAA +CCTCAT +CCTCAC +CCTCAG +CCTCTA +CCTCTT +CCTCTC +CCTCTG +CCTCCA +CCTCCT +CCTCCC +CCTCCG +CCTCGA +CCTCGT +CCTCGC +CCTCGG +CCTGAA +CCTGAT +CCTGAC +CCTGAG +CCTGTA +CCTGTT +CCTGTC +CCTGTG +CCTGCA +CCTGCT +CCTGCC +CCTGCG +CCTGGA +CCTGGT +CCTGGC +CCTGGG +CCCAAA +CCCAAT +CCCAAC +CCCAAG +CCCATA +CCCATT +CCCATC +CCCATG +CCCACA +CCCACT +CCCACC +CCCACG +CCCAGA +CCCAGT +CCCAGC +CCCAGG +CCCTAA +CCCTAT +CCCTAC +CCCTAG +CCCTTA +CCCTTT +CCCTTC +CCCTTG +CCCTCA +CCCTCT +CCCTCC +CCCTCG +CCCTGA +CCCTGT +CCCTGC +CCCTGG +CCCCAA +CCCCAT +CCCCAC +CCCCAG +CCCCTA +CCCCTT +CCCCTC +CCCCTG +CCCCCA +CCCCCT +CCCCCC +CCCCCG +CCCCGA +CCCCGT +CCCCGC 
+CCCCGG +CCCGAA +CCCGAT +CCCGAC +CCCGAG +CCCGTA +CCCGTT +CCCGTC +CCCGTG +CCCGCA +CCCGCT +CCCGCC +CCCGCG +CCCGGA +CCCGGT +CCCGGC +CCCGGG +CCGAAA +CCGAAT +CCGAAC +CCGAAG +CCGATA +CCGATT +CCGATC +CCGATG +CCGACA +CCGACT +CCGACC +CCGACG +CCGAGA +CCGAGT +CCGAGC +CCGAGG +CCGTAA +CCGTAT +CCGTAC +CCGTAG +CCGTTA +CCGTTT +CCGTTC +CCGTTG +CCGTCA +CCGTCT +CCGTCC +CCGTCG +CCGTGA +CCGTGT +CCGTGC +CCGTGG +CCGCAA +CCGCAT +CCGCAC +CCGCAG +CCGCTA +CCGCTT +CCGCTC +CCGCTG +CCGCCA +CCGCCT +CCGCCC +CCGCCG +CCGCGA +CCGCGT +CCGCGC +CCGCGG +CCGGAA +CCGGAT +CCGGAC +CCGGAG +CCGGTA +CCGGTT +CCGGTC +CCGGTG +CCGGCA +CCGGCT +CCGGCC +CCGGCG +CCGGGA +CCGGGT +CCGGGC +CCGGGG +CGAAAA +CGAAAT +CGAAAC +CGAAAG +CGAATA +CGAATT +CGAATC +CGAATG +CGAACA +CGAACT +CGAACC +CGAACG +CGAAGA +CGAAGT +CGAAGC +CGAAGG +CGATAA +CGATAT +CGATAC +CGATAG +CGATTA +CGATTT +CGATTC +CGATTG +CGATCA +CGATCT +CGATCC +CGATCG +CGATGA +CGATGT +CGATGC +CGATGG +CGACAA +CGACAT +CGACAC +CGACAG +CGACTA +CGACTT +CGACTC +CGACTG +CGACCA +CGACCT +CGACCC +CGACCG +CGACGA +CGACGT +CGACGC +CGACGG +CGAGAA +CGAGAT +CGAGAC +CGAGAG +CGAGTA +CGAGTT +CGAGTC +CGAGTG +CGAGCA +CGAGCT +CGAGCC +CGAGCG +CGAGGA +CGAGGT +CGAGGC +CGAGGG +CGTAAA +CGTAAT +CGTAAC +CGTAAG +CGTATA +CGTATT +CGTATC +CGTATG +CGTACA +CGTACT +CGTACC +CGTACG +CGTAGA +CGTAGT +CGTAGC +CGTAGG +CGTTAA +CGTTAT +CGTTAC +CGTTAG +CGTTTA +CGTTTT +CGTTTC +CGTTTG +CGTTCA +CGTTCT +CGTTCC +CGTTCG +CGTTGA +CGTTGT +CGTTGC +CGTTGG +CGTCAA +CGTCAT +CGTCAC +CGTCAG +CGTCTA +CGTCTT +CGTCTC +CGTCTG +CGTCCA +CGTCCT +CGTCCC +CGTCCG +CGTCGA +CGTCGT +CGTCGC +CGTCGG +CGTGAA +CGTGAT +CGTGAC +CGTGAG +CGTGTA +CGTGTT +CGTGTC +CGTGTG +CGTGCA +CGTGCT +CGTGCC +CGTGCG +CGTGGA +CGTGGT +CGTGGC +CGTGGG +CGCAAA +CGCAAT +CGCAAC +CGCAAG +CGCATA +CGCATT +CGCATC +CGCATG +CGCACA +CGCACT +CGCACC +CGCACG +CGCAGA +CGCAGT +CGCAGC +CGCAGG +CGCTAA +CGCTAT +CGCTAC +CGCTAG +CGCTTA +CGCTTT +CGCTTC +CGCTTG +CGCTCA +CGCTCT +CGCTCC +CGCTCG +CGCTGA +CGCTGT +CGCTGC +CGCTGG +CGCCAA +CGCCAT +CGCCAC +CGCCAG +CGCCTA +CGCCTT +CGCCTC +CGCCTG +CGCCCA 
+CGCCCT +CGCCCC +CGCCCG +CGCCGA +CGCCGT +CGCCGC +CGCCGG +CGCGAA +CGCGAT +CGCGAC +CGCGAG +CGCGTA +CGCGTT +CGCGTC +CGCGTG +CGCGCA +CGCGCT +CGCGCC +CGCGCG +CGCGGA +CGCGGT +CGCGGC +CGCGGG +CGGAAA +CGGAAT +CGGAAC +CGGAAG +CGGATA +CGGATT +CGGATC +CGGATG +CGGACA +CGGACT +CGGACC +CGGACG +CGGAGA +CGGAGT +CGGAGC +CGGAGG +CGGTAA +CGGTAT +CGGTAC +CGGTAG +CGGTTA +CGGTTT +CGGTTC +CGGTTG +CGGTCA +CGGTCT +CGGTCC +CGGTCG +CGGTGA +CGGTGT +CGGTGC +CGGTGG +CGGCAA +CGGCAT +CGGCAC +CGGCAG +CGGCTA +CGGCTT +CGGCTC +CGGCTG +CGGCCA +CGGCCT +CGGCCC +CGGCCG +CGGCGA +CGGCGT +CGGCGC +CGGCGG +CGGGAA +CGGGAT +CGGGAC +CGGGAG +CGGGTA +CGGGTT +CGGGTC +CGGGTG +CGGGCA +CGGGCT +CGGGCC +CGGGCG +CGGGGA +CGGGGT +CGGGGC +CGGGGG +GAAAAA +GAAAAT +GAAAAC +GAAAAG +GAAATA +GAAATT +GAAATC +GAAATG +GAAACA +GAAACT +GAAACC +GAAACG +GAAAGA +GAAAGT +GAAAGC +GAAAGG +GAATAA +GAATAT +GAATAC +GAATAG +GAATTA +GAATTT +GAATTC +GAATTG +GAATCA +GAATCT +GAATCC +GAATCG +GAATGA +GAATGT +GAATGC +GAATGG +GAACAA +GAACAT +GAACAC +GAACAG +GAACTA +GAACTT +GAACTC +GAACTG +GAACCA +GAACCT +GAACCC +GAACCG +GAACGA +GAACGT +GAACGC +GAACGG +GAAGAA +GAAGAT +GAAGAC +GAAGAG +GAAGTA +GAAGTT +GAAGTC +GAAGTG +GAAGCA +GAAGCT +GAAGCC +GAAGCG +GAAGGA +GAAGGT +GAAGGC +GAAGGG +GATAAA +GATAAT +GATAAC +GATAAG +GATATA +GATATT +GATATC +GATATG +GATACA +GATACT +GATACC +GATACG +GATAGA +GATAGT +GATAGC +GATAGG +GATTAA +GATTAT +GATTAC +GATTAG +GATTTA +GATTTT +GATTTC +GATTTG +GATTCA +GATTCT +GATTCC +GATTCG +GATTGA +GATTGT +GATTGC +GATTGG +GATCAA +GATCAT +GATCAC +GATCAG +GATCTA +GATCTT +GATCTC +GATCTG +GATCCA +GATCCT +GATCCC +GATCCG +GATCGA +GATCGT +GATCGC +GATCGG +GATGAA +GATGAT +GATGAC +GATGAG +GATGTA +GATGTT +GATGTC +GATGTG +GATGCA +GATGCT +GATGCC +GATGCG +GATGGA +GATGGT +GATGGC +GATGGG +GACAAA +GACAAT +GACAAC +GACAAG +GACATA +GACATT +GACATC +GACATG +GACACA +GACACT +GACACC +GACACG +GACAGA +GACAGT +GACAGC +GACAGG +GACTAA +GACTAT +GACTAC +GACTAG +GACTTA +GACTTT +GACTTC +GACTTG +GACTCA +GACTCT +GACTCC +GACTCG +GACTGA +GACTGT +GACTGC +GACTGG +GACCAA +GACCAT +GACCAC 
+GACCAG +GACCTA +GACCTT +GACCTC +GACCTG +GACCCA +GACCCT +GACCCC +GACCCG +GACCGA +GACCGT +GACCGC +GACCGG +GACGAA +GACGAT +GACGAC +GACGAG +GACGTA +GACGTT +GACGTC +GACGTG +GACGCA +GACGCT +GACGCC +GACGCG +GACGGA +GACGGT +GACGGC +GACGGG +GAGAAA +GAGAAT +GAGAAC +GAGAAG +GAGATA +GAGATT +GAGATC +GAGATG +GAGACA +GAGACT +GAGACC +GAGACG +GAGAGA +GAGAGT +GAGAGC +GAGAGG +GAGTAA +GAGTAT +GAGTAC +GAGTAG +GAGTTA +GAGTTT +GAGTTC +GAGTTG +GAGTCA +GAGTCT +GAGTCC +GAGTCG +GAGTGA +GAGTGT +GAGTGC +GAGTGG +GAGCAA +GAGCAT +GAGCAC +GAGCAG +GAGCTA +GAGCTT +GAGCTC +GAGCTG +GAGCCA +GAGCCT +GAGCCC +GAGCCG +GAGCGA +GAGCGT +GAGCGC +GAGCGG +GAGGAA +GAGGAT +GAGGAC +GAGGAG +GAGGTA +GAGGTT +GAGGTC +GAGGTG +GAGGCA +GAGGCT +GAGGCC +GAGGCG +GAGGGA +GAGGGT +GAGGGC +GAGGGG +GTAAAA +GTAAAT +GTAAAC +GTAAAG +GTAATA +GTAATT +GTAATC +GTAATG +GTAACA +GTAACT +GTAACC +GTAACG +GTAAGA +GTAAGT +GTAAGC +GTAAGG +GTATAA +GTATAT +GTATAC +GTATAG +GTATTA +GTATTT +GTATTC +GTATTG +GTATCA +GTATCT +GTATCC +GTATCG +GTATGA +GTATGT +GTATGC +GTATGG +GTACAA +GTACAT +GTACAC +GTACAG +GTACTA +GTACTT +GTACTC +GTACTG +GTACCA +GTACCT +GTACCC +GTACCG +GTACGA +GTACGT +GTACGC +GTACGG +GTAGAA +GTAGAT +GTAGAC +GTAGAG +GTAGTA +GTAGTT +GTAGTC +GTAGTG +GTAGCA +GTAGCT +GTAGCC +GTAGCG +GTAGGA +GTAGGT +GTAGGC +GTAGGG +GTTAAA +GTTAAT +GTTAAC +GTTAAG +GTTATA +GTTATT +GTTATC +GTTATG +GTTACA +GTTACT +GTTACC +GTTACG +GTTAGA +GTTAGT +GTTAGC +GTTAGG +GTTTAA +GTTTAT +GTTTAC +GTTTAG +GTTTTA +GTTTTT +GTTTTC +GTTTTG +GTTTCA +GTTTCT +GTTTCC +GTTTCG +GTTTGA +GTTTGT +GTTTGC +GTTTGG +GTTCAA +GTTCAT +GTTCAC +GTTCAG +GTTCTA +GTTCTT +GTTCTC +GTTCTG +GTTCCA +GTTCCT +GTTCCC +GTTCCG +GTTCGA +GTTCGT +GTTCGC +GTTCGG +GTTGAA +GTTGAT +GTTGAC +GTTGAG +GTTGTA +GTTGTT +GTTGTC +GTTGTG +GTTGCA +GTTGCT +GTTGCC +GTTGCG +GTTGGA +GTTGGT +GTTGGC +GTTGGG +GTCAAA +GTCAAT +GTCAAC +GTCAAG +GTCATA +GTCATT +GTCATC +GTCATG +GTCACA +GTCACT +GTCACC +GTCACG +GTCAGA +GTCAGT +GTCAGC +GTCAGG +GTCTAA +GTCTAT +GTCTAC +GTCTAG +GTCTTA +GTCTTT +GTCTTC +GTCTTG +GTCTCA +GTCTCT +GTCTCC +GTCTCG +GTCTGA 
+GTCTGT +GTCTGC +GTCTGG +GTCCAA +GTCCAT +GTCCAC +GTCCAG +GTCCTA +GTCCTT +GTCCTC +GTCCTG +GTCCCA +GTCCCT +GTCCCC +GTCCCG +GTCCGA +GTCCGT +GTCCGC +GTCCGG +GTCGAA +GTCGAT +GTCGAC +GTCGAG +GTCGTA +GTCGTT +GTCGTC +GTCGTG +GTCGCA +GTCGCT +GTCGCC +GTCGCG +GTCGGA +GTCGGT +GTCGGC +GTCGGG +GTGAAA +GTGAAT +GTGAAC +GTGAAG +GTGATA +GTGATT +GTGATC +GTGATG +GTGACA +GTGACT +GTGACC +GTGACG +GTGAGA +GTGAGT +GTGAGC +GTGAGG +GTGTAA +GTGTAT +GTGTAC +GTGTAG +GTGTTA +GTGTTT +GTGTTC +GTGTTG +GTGTCA +GTGTCT +GTGTCC +GTGTCG +GTGTGA +GTGTGT +GTGTGC +GTGTGG +GTGCAA +GTGCAT +GTGCAC +GTGCAG +GTGCTA +GTGCTT +GTGCTC +GTGCTG +GTGCCA +GTGCCT +GTGCCC +GTGCCG +GTGCGA +GTGCGT +GTGCGC +GTGCGG +GTGGAA +GTGGAT +GTGGAC +GTGGAG +GTGGTA +GTGGTT +GTGGTC +GTGGTG +GTGGCA +GTGGCT +GTGGCC +GTGGCG +GTGGGA +GTGGGT +GTGGGC +GTGGGG +GCAAAA +GCAAAT +GCAAAC +GCAAAG +GCAATA +GCAATT +GCAATC +GCAATG +GCAACA +GCAACT +GCAACC +GCAACG +GCAAGA +GCAAGT +GCAAGC +GCAAGG +GCATAA +GCATAT +GCATAC +GCATAG +GCATTA +GCATTT +GCATTC +GCATTG +GCATCA +GCATCT +GCATCC +GCATCG +GCATGA +GCATGT +GCATGC +GCATGG +GCACAA +GCACAT +GCACAC +GCACAG +GCACTA +GCACTT +GCACTC +GCACTG +GCACCA +GCACCT +GCACCC +GCACCG +GCACGA +GCACGT +GCACGC +GCACGG +GCAGAA +GCAGAT +GCAGAC +GCAGAG +GCAGTA +GCAGTT +GCAGTC +GCAGTG +GCAGCA +GCAGCT +GCAGCC +GCAGCG +GCAGGA +GCAGGT +GCAGGC +GCAGGG +GCTAAA +GCTAAT +GCTAAC +GCTAAG +GCTATA +GCTATT +GCTATC +GCTATG +GCTACA +GCTACT +GCTACC +GCTACG +GCTAGA +GCTAGT +GCTAGC +GCTAGG +GCTTAA +GCTTAT +GCTTAC +GCTTAG +GCTTTA +GCTTTT +GCTTTC +GCTTTG +GCTTCA +GCTTCT +GCTTCC +GCTTCG +GCTTGA +GCTTGT +GCTTGC +GCTTGG +GCTCAA +GCTCAT +GCTCAC +GCTCAG +GCTCTA +GCTCTT +GCTCTC +GCTCTG +GCTCCA +GCTCCT +GCTCCC +GCTCCG +GCTCGA +GCTCGT +GCTCGC +GCTCGG +GCTGAA +GCTGAT +GCTGAC +GCTGAG +GCTGTA +GCTGTT +GCTGTC +GCTGTG +GCTGCA +GCTGCT +GCTGCC +GCTGCG +GCTGGA +GCTGGT +GCTGGC +GCTGGG +GCCAAA +GCCAAT +GCCAAC +GCCAAG +GCCATA +GCCATT +GCCATC +GCCATG +GCCACA +GCCACT +GCCACC +GCCACG +GCCAGA +GCCAGT +GCCAGC +GCCAGG +GCCTAA +GCCTAT +GCCTAC +GCCTAG +GCCTTA +GCCTTT +GCCTTC 
+GCCTTG +GCCTCA +GCCTCT +GCCTCC +GCCTCG +GCCTGA +GCCTGT +GCCTGC +GCCTGG +GCCCAA +GCCCAT +GCCCAC +GCCCAG +GCCCTA +GCCCTT +GCCCTC +GCCCTG +GCCCCA +GCCCCT +GCCCCC +GCCCCG +GCCCGA +GCCCGT +GCCCGC +GCCCGG +GCCGAA +GCCGAT +GCCGAC +GCCGAG +GCCGTA +GCCGTT +GCCGTC +GCCGTG +GCCGCA +GCCGCT +GCCGCC +GCCGCG +GCCGGA +GCCGGT +GCCGGC +GCCGGG +GCGAAA +GCGAAT +GCGAAC +GCGAAG +GCGATA +GCGATT +GCGATC +GCGATG +GCGACA +GCGACT +GCGACC +GCGACG +GCGAGA +GCGAGT +GCGAGC +GCGAGG +GCGTAA +GCGTAT +GCGTAC +GCGTAG +GCGTTA +GCGTTT +GCGTTC +GCGTTG +GCGTCA +GCGTCT +GCGTCC +GCGTCG +GCGTGA +GCGTGT +GCGTGC +GCGTGG +GCGCAA +GCGCAT +GCGCAC +GCGCAG +GCGCTA +GCGCTT +GCGCTC +GCGCTG +GCGCCA +GCGCCT +GCGCCC +GCGCCG +GCGCGA +GCGCGT +GCGCGC +GCGCGG +GCGGAA +GCGGAT +GCGGAC +GCGGAG +GCGGTA +GCGGTT +GCGGTC +GCGGTG +GCGGCA +GCGGCT +GCGGCC +GCGGCG +GCGGGA +GCGGGT +GCGGGC +GCGGGG +GGAAAA +GGAAAT +GGAAAC +GGAAAG +GGAATA +GGAATT +GGAATC +GGAATG +GGAACA +GGAACT +GGAACC +GGAACG +GGAAGA +GGAAGT +GGAAGC +GGAAGG +GGATAA +GGATAT +GGATAC +GGATAG +GGATTA +GGATTT +GGATTC +GGATTG +GGATCA +GGATCT +GGATCC +GGATCG +GGATGA +GGATGT +GGATGC +GGATGG +GGACAA +GGACAT +GGACAC +GGACAG +GGACTA +GGACTT +GGACTC +GGACTG +GGACCA +GGACCT +GGACCC +GGACCG +GGACGA +GGACGT +GGACGC +GGACGG +GGAGAA +GGAGAT +GGAGAC +GGAGAG +GGAGTA +GGAGTT +GGAGTC +GGAGTG +GGAGCA +GGAGCT +GGAGCC +GGAGCG +GGAGGA +GGAGGT +GGAGGC +GGAGGG +GGTAAA +GGTAAT +GGTAAC +GGTAAG +GGTATA +GGTATT +GGTATC +GGTATG +GGTACA +GGTACT +GGTACC +GGTACG +GGTAGA +GGTAGT +GGTAGC +GGTAGG +GGTTAA +GGTTAT +GGTTAC +GGTTAG +GGTTTA +GGTTTT +GGTTTC +GGTTTG +GGTTCA +GGTTCT +GGTTCC +GGTTCG +GGTTGA +GGTTGT +GGTTGC +GGTTGG +GGTCAA +GGTCAT +GGTCAC +GGTCAG +GGTCTA +GGTCTT +GGTCTC +GGTCTG +GGTCCA +GGTCCT +GGTCCC +GGTCCG +GGTCGA +GGTCGT +GGTCGC +GGTCGG +GGTGAA +GGTGAT +GGTGAC +GGTGAG +GGTGTA +GGTGTT +GGTGTC +GGTGTG +GGTGCA +GGTGCT +GGTGCC +GGTGCG +GGTGGA +GGTGGT +GGTGGC +GGTGGG +GGCAAA +GGCAAT +GGCAAC +GGCAAG +GGCATA +GGCATT +GGCATC +GGCATG +GGCACA +GGCACT +GGCACC +GGCACG +GGCAGA +GGCAGT +GGCAGC +GGCAGG +GGCTAA 
+GGCTAT +GGCTAC +GGCTAG +GGCTTA +GGCTTT +GGCTTC +GGCTTG +GGCTCA +GGCTCT +GGCTCC +GGCTCG +GGCTGA +GGCTGT +GGCTGC +GGCTGG +GGCCAA +GGCCAT +GGCCAC +GGCCAG +GGCCTA +GGCCTT +GGCCTC +GGCCTG +GGCCCA +GGCCCT +GGCCCC +GGCCCG +GGCCGA +GGCCGT +GGCCGC +GGCCGG +GGCGAA +GGCGAT +GGCGAC +GGCGAG +GGCGTA +GGCGTT +GGCGTC +GGCGTG +GGCGCA +GGCGCT +GGCGCC +GGCGCG +GGCGGA +GGCGGT +GGCGGC +GGCGGG +GGGAAA +GGGAAT +GGGAAC +GGGAAG +GGGATA +GGGATT +GGGATC +GGGATG +GGGACA +GGGACT +GGGACC +GGGACG +GGGAGA +GGGAGT +GGGAGC +GGGAGG +GGGTAA +GGGTAT +GGGTAC +GGGTAG +GGGTTA +GGGTTT +GGGTTC +GGGTTG +GGGTCA +GGGTCT +GGGTCC +GGGTCG +GGGTGA +GGGTGT +GGGTGC +GGGTGG +GGGCAA +GGGCAT +GGGCAC +GGGCAG +GGGCTA +GGGCTT +GGGCTC +GGGCTG +GGGCCA +GGGCCT +GGGCCC +GGGCCG +GGGCGA +GGGCGT +GGGCGC +GGGCGG +GGGGAA +GGGGAT +GGGGAC +GGGGAG +GGGGTA +GGGGTT +GGGGTC +GGGGTG +GGGGCA +GGGGCT +GGGGCC +GGGGCG +GGGGGA +GGGGGT +GGGGGC +GGGGGG