From a02e9b442e605e1815eb525d791e11930b9507b4 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Thu, 4 Jun 2026 23:06:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: AI-Sweden-Models/gpt-sw3-6.7b-v2-translator Source: Original Platform --- .gitattributes | 35 +++ README.md | 80 +++++++ config.json | 38 +++ generation_config.json | 8 + model-00001-of-00003.safetensors | 3 + model-00002-of-00003.safetensors | 3 + model-00003-of-00003.safetensors | 3 + model.safetensors.index.json | 395 +++++++++++++++++++++++++++++++ special_tokens_map.json | 30 +++ spiece.model | 3 + tokenizer_config.json | 49 ++++ 11 files changed, 647 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00003.safetensors create mode 100644 model-00002-of-00003.safetensors create mode 100644 model-00003-of-00003.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 spiece.model create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..d7501b1 --- /dev/null +++ b/README.md @@ -0,0 +1,80 @@ +--- +base_model: AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct +language: +- sv +- da +- 'no' +- en +pipeline_tag: text-generation +inference: + parameters: + temperature: 0.7 +tags: +- translation +--- +# Model Card for gpt-sw3-6.7b-v2-translator +The `gpt-sw3-6.7b-v2-translator` is a finetuned version of `gpt-sw3-6.7b-v2-instruct` on a carefully selected translation pair dataset that was gathered by AI Sweden. + + +## Intended usage: +Translate text data from English to Swedish, or Swedish to English. + + +## How to use: +```python +import torch +from transformers import pipeline, StoppingCriteriaList, StoppingCriteria + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +# (Optional) - define a stopping criteria +# We ideally want the model to stop generate once the response from the Bot is generated +class StopOnTokenCriteria(StoppingCriteria): + def __init__(self, stop_token_id): + self.stop_token_id = stop_token_id + + def __call__(self, input_ids, scores, **kwargs): + return input_ids[0, -1] == self.stop_token_id + + +pipe = pipeline( + task="text-generation", + model="AI-Sweden-Models/gpt-sw3-6.7b-v2-translator", + device=device +) + +stop_on_token_criteria = StopOnTokenCriteria(stop_token_id=pipe.tokenizer.bos_token_id) +text = "I like to eat ice cream in the summer." + +# This will translate English to Swedish +# To translate from Swedish to English the prompt would be: +# prompt = f"<|endoftext|>User: Översätt till Engelska från Svenska\n{text}Bot:" + +prompt = f"<|endoftext|>User: Översätt till Svenska från Engelska\n{text}Bot:" + +input_tokens = pipe.tokenizer(prompt, return_tensors="pt").input_ids.to(device) +max_model_length = 2048 +dynamic_max_length = max_model_length - input_tokens.shape[1] + +response = pipe( + prompt, + max_length=dynamic_max_length, + truncation=True, + stopping_criteria=StoppingCriteriaList([stop_on_token_criteria]) +) + +print(response[0]["generated_text"].split("Bot: ")[-1]) +``` +```python +>>> "Jag tycker om att äta glass på sommaren." +``` + +## Training & Data: +The training was done on 1 NVIDIA DGX using DeepSpeed ZeRO 3 for three epochs on roughly 4GB of carefully selected translation data. It is a full finetune of all of the model parameters. + +| Epoch | Training Loss | Evaluation Loss | +|-------|---------------|-----------------| +| 1 | 1.309 | 1.281 | +| 2 | 1.161 | 1.242 | +| 3 | 1.053 | 1.219 | \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..dc7f8b0 --- /dev/null +++ b/config.json @@ -0,0 +1,38 @@ +{ + "_name_or_path": "/data/models/gpt-sw3-translation-7b", + "activation_function": "gelu", + "apply_query_key_layer_scaling": true, + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 1, + "embd_pdrop": 0.1, + "eos_token_id": 1, + "initializer_range": 0.01, + "layer_norm_epsilon": 1e-05, + "max_length": 2048, + "model_type": "gpt2", + "n_ctx": 2048, + "n_embd": 4096, + "n_head": 32, + "n_inner": 16384, + "n_layer": 32, + "n_positions": 2048, + "normalize_attention_scores": true, + "pad_token_id": 0, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "tokenizer_class": "GPTSw3Tokenizer", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "vocab_size": 64000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..4463e47 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 1, + "max_length": 2048, + "pad_token_id": 0, + "transformers_version": "4.40.0.dev0" +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..3a073ca --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fadb1cee8c2566dc4eba3b749da23508569ce8725c633d56347a12792bb4a2 +size 4971452464 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..532e708 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa0638c1ddd676dd976774c16abf1d8e3f54be5b9f4af397ed27a1fdbf7743f9 +size 4967399072 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000..9e29b73 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be6e3549c8d4a262a058d196f40e1a04958ca3f955aad5f5d80226d5850c65de +size 3490581120 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..7fae6c9 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,395 @@ +{ + "metadata": { + "total_size": 13429391360 + }, + "weight_map": { + "transformer.h.0.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.0.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.0.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.0.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.0.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.0.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.0.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.0.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.1.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.1.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.1.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.1.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.1.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.1.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.1.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.1.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.10.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.10.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.10.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.10.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.10.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.10.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.10.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.10.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.11.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.11.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.11.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.11.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.11.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.11.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.11.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.11.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.11.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.11.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.11.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.11.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.12.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.12.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.12.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.12.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.12.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.12.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.12.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.12.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.12.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.12.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.12.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.12.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.13.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.13.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.13.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.13.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.13.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.13.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.13.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.13.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.13.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.13.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.13.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.13.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.14.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.14.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.14.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.14.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.14.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.14.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.14.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.14.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.14.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.14.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.14.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.14.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.15.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.15.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.15.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.15.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.15.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.15.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.15.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.15.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.15.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.15.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.15.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.15.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.16.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.16.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.16.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.16.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.16.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.16.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.16.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.16.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.16.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.16.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.16.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.16.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.17.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.17.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.17.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.17.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.17.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.17.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.17.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.17.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.17.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.17.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.17.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.17.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.18.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.18.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.18.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.18.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.18.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.18.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.18.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.18.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.18.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.18.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.18.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.18.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.19.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.19.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.19.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.19.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.19.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.19.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.19.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.19.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.19.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.19.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.19.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.19.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.2.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.2.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.2.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.2.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.2.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.2.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.2.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.2.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.20.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.20.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.20.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.20.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.20.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.20.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.20.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.20.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.20.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.20.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.20.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.20.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.21.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.21.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.21.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.21.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.21.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.21.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.21.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.21.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.21.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.21.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.21.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.21.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.22.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.22.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.22.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.22.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.22.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.22.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.22.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.22.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.22.mlp.c_fc.bias": "model-00002-of-00003.safetensors", + "transformer.h.22.mlp.c_fc.weight": "model-00002-of-00003.safetensors", + "transformer.h.22.mlp.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.22.mlp.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.23.attn.c_attn.bias": "model-00002-of-00003.safetensors", + "transformer.h.23.attn.c_attn.weight": "model-00002-of-00003.safetensors", + "transformer.h.23.attn.c_proj.bias": "model-00002-of-00003.safetensors", + "transformer.h.23.attn.c_proj.weight": "model-00002-of-00003.safetensors", + "transformer.h.23.ln_1.bias": "model-00002-of-00003.safetensors", + "transformer.h.23.ln_1.weight": "model-00002-of-00003.safetensors", + "transformer.h.23.ln_2.bias": "model-00002-of-00003.safetensors", + "transformer.h.23.ln_2.weight": "model-00002-of-00003.safetensors", + "transformer.h.23.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.23.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.23.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.23.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.24.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.24.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.24.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.24.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.24.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.24.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.24.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.24.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.24.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.24.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.24.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.24.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.25.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.25.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.25.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.25.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.25.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.25.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.25.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.25.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.25.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.25.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.25.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.25.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.26.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.26.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.26.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.26.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.26.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.26.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.26.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.26.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.26.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.26.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.26.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.26.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.27.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.27.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.27.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.27.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.27.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.27.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.27.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.27.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.27.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.27.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.27.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.27.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.28.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.28.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.28.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.28.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.28.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.28.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.28.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.28.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.28.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.28.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.28.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.28.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.29.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.29.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.29.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.29.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.29.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.29.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.29.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.29.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.29.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.29.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.29.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.29.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.3.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.3.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.3.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.3.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.3.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.3.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.3.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.3.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.30.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.30.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.30.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.30.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.30.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.30.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.30.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.30.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.30.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.30.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.30.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.30.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.31.attn.c_attn.bias": "model-00003-of-00003.safetensors", + "transformer.h.31.attn.c_attn.weight": "model-00003-of-00003.safetensors", + "transformer.h.31.attn.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.31.attn.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.31.ln_1.bias": "model-00003-of-00003.safetensors", + "transformer.h.31.ln_1.weight": "model-00003-of-00003.safetensors", + "transformer.h.31.ln_2.bias": "model-00003-of-00003.safetensors", + "transformer.h.31.ln_2.weight": "model-00003-of-00003.safetensors", + "transformer.h.31.mlp.c_fc.bias": "model-00003-of-00003.safetensors", + "transformer.h.31.mlp.c_fc.weight": "model-00003-of-00003.safetensors", + "transformer.h.31.mlp.c_proj.bias": "model-00003-of-00003.safetensors", + "transformer.h.31.mlp.c_proj.weight": "model-00003-of-00003.safetensors", + "transformer.h.4.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.4.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.4.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.4.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.4.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.4.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.4.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.4.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.5.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.5.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.5.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.5.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.5.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.5.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.5.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.5.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.6.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.6.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.6.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.6.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.6.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.6.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.6.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.6.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.7.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.7.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.7.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.7.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.7.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.7.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.7.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.7.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.8.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.8.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.8.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.8.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.8.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.8.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.8.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.8.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.9.attn.c_attn.bias": "model-00001-of-00003.safetensors", + "transformer.h.9.attn.c_attn.weight": "model-00001-of-00003.safetensors", + "transformer.h.9.attn.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.9.attn.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.h.9.ln_1.bias": "model-00001-of-00003.safetensors", + "transformer.h.9.ln_1.weight": "model-00001-of-00003.safetensors", + "transformer.h.9.ln_2.bias": "model-00001-of-00003.safetensors", + "transformer.h.9.ln_2.weight": "model-00001-of-00003.safetensors", + "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00003.safetensors", + "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00003.safetensors", + "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00003.safetensors", + "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00003.safetensors", + "transformer.ln_f.bias": "model-00003-of-00003.safetensors", + "transformer.ln_f.weight": "model-00003-of-00003.safetensors", + "transformer.wpe.weight": "model-00001-of-00003.safetensors", + "transformer.wte.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..045e3c1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/spiece.model b/spiece.model new file mode 100644 index 0000000..491f2fd --- /dev/null +++ b/spiece.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4bda2cd84ff0ac659cda40e746c55f47ee3e57cf18471670ad26998c28be52d +size 1071955 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..93f8efb --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,49 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "eos_token": "<|endoftext|>", + "keep_accents": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "left", + "remove_space": false, + "sp_model_kwargs": {}, + "tokenizer_class": "GPTSw3Tokenizer", + "truncation_side": "left", + "unk_token": "" +}