初始化项目,由ModelHub XC社区提供模型
Model: anjajar/baby_goldfish_new Source: Original Platform
This commit is contained in:
35
.gitattributes
vendored
Normal file
35
.gitattributes
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
54
README.md
Normal file
54
README.md
Normal file
@@ -0,0 +1,54 @@
|
||||
---
|
||||
library_name: transformers
|
||||
base_model: gpt_small_config.json
|
||||
tags:
|
||||
- generated_from_trainer
|
||||
model-index:
|
||||
- name: baby_goldfish_new
|
||||
results: []
|
||||
---
|
||||
|
||||
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
||||
should probably proofread and complete it, then remove this comment. -->
|
||||
|
||||
# baby_goldfish_new
|
||||
|
||||
This model was trained from the `gpt_small_config.json` configuration, which is a local configuration file rather than a Hugging Face Hub model, so no base-model link is available. The training dataset was not recorded.
|
||||
|
||||
## Model description
|
||||
|
||||
More information needed
|
||||
|
||||
## Intended uses & limitations
|
||||
|
||||
More information needed
|
||||
|
||||
## Training and evaluation data
|
||||
|
||||
More information needed
|
||||
|
||||
## Training procedure
|
||||
|
||||
### Training hyperparameters
|
||||
|
||||
The following hyperparameters were used during training:
|
||||
- learning_rate: 0.0001
|
||||
- train_batch_size: 8
|
||||
- eval_batch_size: 8
|
||||
- seed: 43
|
||||
- optimizer: ADAMW_TORCH_FUSED (fused PyTorch AdamW) with betas=(0.9, 0.999) and epsilon=1e-06; no additional optimizer arguments
|
||||
- lr_scheduler_type: linear
|
||||
- lr_scheduler_warmup_steps: 387
|
||||
- num_epochs: 10
|
||||
- mixed_precision_training: Native AMP
|
||||
|
||||
### Training results
|
||||
|
||||
|
||||
|
||||
### Framework versions
|
||||
|
||||
- Transformers 4.57.3
|
||||
- Pytorch 2.9.1+cu128
|
||||
- Datasets 4.4.1
|
||||
- Tokenizers 0.22.1
|
||||
34
config.json
Normal file
34
config.json
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"activation_function": "gelu",
|
||||
"architectures": [
|
||||
"GPT2LMHeadModel"
|
||||
],
|
||||
"attn_pdrop": 0.1,
|
||||
"bos_token_id": 50000,
|
||||
"dtype": "float32",
|
||||
"embd_pdrop": 0.1,
|
||||
"eos_token_id": 50001,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_epsilon": 1e-05,
|
||||
"model_type": "gpt2",
|
||||
"n_ctx": 512,
|
||||
"n_embd": 512,
|
||||
"n_head": 8,
|
||||
"n_inner": 2048,
|
||||
"n_layer": 4,
|
||||
"n_positions": 512,
|
||||
"pad_token_id": 50002,
|
||||
"prefix": "[CLS]",
|
||||
"reorder_and_upcast_attn": false,
|
||||
"resid_pdrop": 0.1,
|
||||
"scale_attn_by_inverse_layer_idx": false,
|
||||
"scale_attn_weights": true,
|
||||
"summary_activation": null,
|
||||
"summary_first_dropout": 0.1,
|
||||
"summary_proj_to_labels": true,
|
||||
"summary_type": "cls_index",
|
||||
"summary_use_proj": true,
|
||||
"transformers_version": "4.57.3",
|
||||
"use_cache": true,
|
||||
"vocab_size": 51200
|
||||
}
|
||||
7
generation_config.json
Normal file
7
generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 50000,
|
||||
"eos_token_id": 50001,
|
||||
"pad_token_id": 50002,
|
||||
"transformers_version": "4.57.3"
|
||||
}
|
||||
34
last-checkpoint/config.json
Normal file
34
last-checkpoint/config.json
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"activation_function": "gelu",
|
||||
"architectures": [
|
||||
"GPT2LMHeadModel"
|
||||
],
|
||||
"attn_pdrop": 0.1,
|
||||
"bos_token_id": 50000,
|
||||
"dtype": "float32",
|
||||
"embd_pdrop": 0.1,
|
||||
"eos_token_id": 50001,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_epsilon": 1e-05,
|
||||
"model_type": "gpt2",
|
||||
"n_ctx": 512,
|
||||
"n_embd": 512,
|
||||
"n_head": 8,
|
||||
"n_inner": 2048,
|
||||
"n_layer": 4,
|
||||
"n_positions": 512,
|
||||
"pad_token_id": 50002,
|
||||
"prefix": "[CLS]",
|
||||
"reorder_and_upcast_attn": false,
|
||||
"resid_pdrop": 0.1,
|
||||
"scale_attn_by_inverse_layer_idx": false,
|
||||
"scale_attn_weights": true,
|
||||
"summary_activation": null,
|
||||
"summary_first_dropout": 0.1,
|
||||
"summary_proj_to_labels": true,
|
||||
"summary_type": "cls_index",
|
||||
"summary_use_proj": true,
|
||||
"transformers_version": "4.57.3",
|
||||
"use_cache": true,
|
||||
"vocab_size": 51200
|
||||
}
|
||||
7
last-checkpoint/generation_config.json
Normal file
7
last-checkpoint/generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 50000,
|
||||
"eos_token_id": 50001,
|
||||
"pad_token_id": 50002,
|
||||
"transformers_version": "4.57.3"
|
||||
}
|
||||
3
last-checkpoint/model.safetensors
Normal file
3
last-checkpoint/model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:045584847064d2eb5dc229653586e0327ef2a4dfbb5e56b830944e62cb173418
|
||||
size 156353584
|
||||
3
last-checkpoint/optimizer.pt
Normal file
3
last-checkpoint/optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ce56a644b43a957d344559794c779a3624f1264caad1d97458aedabe7db4314e
|
||||
size 312741835
|
||||
3
last-checkpoint/rng_state.pth
Normal file
3
last-checkpoint/rng_state.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e8425ffa539cd7299756c17d804955693f3af2c98cd1bdd1a958b1cbe0c28550
|
||||
size 14645
|
||||
3
last-checkpoint/scaler.pt
Normal file
3
last-checkpoint/scaler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5c4faeb7590b14fb868d259d539b68ad8bbe516624c9eb07f67bcb75d3c509da
|
||||
size 1383
|
||||
3
last-checkpoint/scheduler.pt
Normal file
3
last-checkpoint/scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:609cf1da746d897de6995be7207b46d4bd7fddf35a26103e7fae174d3583b315
|
||||
size 1465
|
||||
1243
last-checkpoint/special_tokens_map.json
Normal file
1243
last-checkpoint/special_tokens_map.json
Normal file
File diff suppressed because it is too large
Load Diff
210940
last-checkpoint/tokenizer.json
Normal file
210940
last-checkpoint/tokenizer.json
Normal file
File diff suppressed because one or more lines are too long
10829
last-checkpoint/tokenizer_config.json
Normal file
10829
last-checkpoint/tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
937
last-checkpoint/trainer_state.json
Normal file
937
last-checkpoint/trainer_state.json
Normal file
@@ -0,0 +1,937 @@
|
||||
{
|
||||
"best_global_step": null,
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 10.0,
|
||||
"eval_steps": 500,
|
||||
"global_step": 38780,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"epoch": 0.07735946364105209,
|
||||
"grad_norm": 1.558766484260559,
|
||||
"learning_rate": 7.726098191214471e-05,
|
||||
"loss": 7.285,
|
||||
"step": 300
|
||||
},
|
||||
{
|
||||
"epoch": 0.15471892728210418,
|
||||
"grad_norm": 0.9385259747505188,
|
||||
"learning_rate": 9.944781600812648e-05,
|
||||
"loss": 4.7972,
|
||||
"step": 600
|
||||
},
|
||||
{
|
||||
"epoch": 0.23207839092315627,
|
||||
"grad_norm": 0.9864380359649658,
|
||||
"learning_rate": 9.866642356679604e-05,
|
||||
"loss": 4.5563,
|
||||
"step": 900
|
||||
},
|
||||
{
|
||||
"epoch": 0.30943785456420836,
|
||||
"grad_norm": 0.971341073513031,
|
||||
"learning_rate": 9.788503112546558e-05,
|
||||
"loss": 4.4545,
|
||||
"step": 1200
|
||||
},
|
||||
{
|
||||
"epoch": 0.3867973182052604,
|
||||
"grad_norm": 0.8991674184799194,
|
||||
"learning_rate": 9.710363868413513e-05,
|
||||
"loss": 4.3616,
|
||||
"step": 1500
|
||||
},
|
||||
{
|
||||
"epoch": 0.46415678184631254,
|
||||
"grad_norm": 0.9576361775398254,
|
||||
"learning_rate": 9.632224624280469e-05,
|
||||
"loss": 4.2695,
|
||||
"step": 1800
|
||||
},
|
||||
{
|
||||
"epoch": 0.5415162454873647,
|
||||
"grad_norm": 0.9536678791046143,
|
||||
"learning_rate": 9.554085380147423e-05,
|
||||
"loss": 4.2254,
|
||||
"step": 2100
|
||||
},
|
||||
{
|
||||
"epoch": 0.6188757091284167,
|
||||
"grad_norm": 0.892548680305481,
|
||||
"learning_rate": 9.475946136014378e-05,
|
||||
"loss": 4.1759,
|
||||
"step": 2400
|
||||
},
|
||||
{
|
||||
"epoch": 0.6962351727694688,
|
||||
"grad_norm": 0.9262155294418335,
|
||||
"learning_rate": 9.397806891881334e-05,
|
||||
"loss": 4.1328,
|
||||
"step": 2700
|
||||
},
|
||||
{
|
||||
"epoch": 0.7735946364105208,
|
||||
"grad_norm": 1.055438756942749,
|
||||
"learning_rate": 9.319667647748288e-05,
|
||||
"loss": 4.0732,
|
||||
"step": 3000
|
||||
},
|
||||
{
|
||||
"epoch": 0.850954100051573,
|
||||
"grad_norm": 1.0588972568511963,
|
||||
"learning_rate": 9.241528403615243e-05,
|
||||
"loss": 4.0574,
|
||||
"step": 3300
|
||||
},
|
||||
{
|
||||
"epoch": 0.9283135636926251,
|
||||
"grad_norm": 1.344167947769165,
|
||||
"learning_rate": 9.163389159482197e-05,
|
||||
"loss": 4.0472,
|
||||
"step": 3600
|
||||
},
|
||||
{
|
||||
"epoch": 1.0056730273336771,
|
||||
"grad_norm": 0.9573405981063843,
|
||||
"learning_rate": 9.085249915349152e-05,
|
||||
"loss": 3.972,
|
||||
"step": 3900
|
||||
},
|
||||
{
|
||||
"epoch": 1.0830324909747293,
|
||||
"grad_norm": 1.0597045421600342,
|
||||
"learning_rate": 9.007110671216108e-05,
|
||||
"loss": 3.8933,
|
||||
"step": 4200
|
||||
},
|
||||
{
|
||||
"epoch": 1.1603919546157813,
|
||||
"grad_norm": 1.1895560026168823,
|
||||
"learning_rate": 8.928971427083062e-05,
|
||||
"loss": 3.8657,
|
||||
"step": 4500
|
||||
},
|
||||
{
|
||||
"epoch": 1.2377514182568334,
|
||||
"grad_norm": 1.1971007585525513,
|
||||
"learning_rate": 8.850832182950017e-05,
|
||||
"loss": 3.8486,
|
||||
"step": 4800
|
||||
},
|
||||
{
|
||||
"epoch": 1.3151108818978856,
|
||||
"grad_norm": 1.2342840433120728,
|
||||
"learning_rate": 8.772692938816972e-05,
|
||||
"loss": 3.8417,
|
||||
"step": 5100
|
||||
},
|
||||
{
|
||||
"epoch": 1.3924703455389376,
|
||||
"grad_norm": 1.213428258895874,
|
||||
"learning_rate": 8.694553694683926e-05,
|
||||
"loss": 3.8048,
|
||||
"step": 5400
|
||||
},
|
||||
{
|
||||
"epoch": 1.4698298091799897,
|
||||
"grad_norm": 1.191662073135376,
|
||||
"learning_rate": 8.616414450550882e-05,
|
||||
"loss": 3.7818,
|
||||
"step": 5700
|
||||
},
|
||||
{
|
||||
"epoch": 1.5471892728210417,
|
||||
"grad_norm": 1.3016968965530396,
|
||||
"learning_rate": 8.538275206417838e-05,
|
||||
"loss": 3.7365,
|
||||
"step": 6000
|
||||
},
|
||||
{
|
||||
"epoch": 1.6245487364620939,
|
||||
"grad_norm": 1.179246187210083,
|
||||
"learning_rate": 8.460135962284792e-05,
|
||||
"loss": 3.7605,
|
||||
"step": 6300
|
||||
},
|
||||
{
|
||||
"epoch": 1.701908200103146,
|
||||
"grad_norm": 1.2382755279541016,
|
||||
"learning_rate": 8.381996718151747e-05,
|
||||
"loss": 3.6887,
|
||||
"step": 6600
|
||||
},
|
||||
{
|
||||
"epoch": 1.7792676637441982,
|
||||
"grad_norm": 1.209956169128418,
|
||||
"learning_rate": 8.303857474018702e-05,
|
||||
"loss": 3.7204,
|
||||
"step": 6900
|
||||
},
|
||||
{
|
||||
"epoch": 1.8566271273852502,
|
||||
"grad_norm": 1.119544267654419,
|
||||
"learning_rate": 8.225718229885656e-05,
|
||||
"loss": 3.6682,
|
||||
"step": 7200
|
||||
},
|
||||
{
|
||||
"epoch": 1.933986591026302,
|
||||
"grad_norm": 1.2890771627426147,
|
||||
"learning_rate": 8.147578985752612e-05,
|
||||
"loss": 3.6434,
|
||||
"step": 7500
|
||||
},
|
||||
{
|
||||
"epoch": 2.0113460546673543,
|
||||
"grad_norm": 1.2189580202102661,
|
||||
"learning_rate": 8.069439741619567e-05,
|
||||
"loss": 3.6477,
|
||||
"step": 7800
|
||||
},
|
||||
{
|
||||
"epoch": 2.0887055183084065,
|
||||
"grad_norm": 1.4200156927108765,
|
||||
"learning_rate": 7.991300497486521e-05,
|
||||
"loss": 3.5883,
|
||||
"step": 8100
|
||||
},
|
||||
{
|
||||
"epoch": 2.1660649819494586,
|
||||
"grad_norm": 1.1501333713531494,
|
||||
"learning_rate": 7.913161253353476e-05,
|
||||
"loss": 3.6023,
|
||||
"step": 8400
|
||||
},
|
||||
{
|
||||
"epoch": 2.2434244455905104,
|
||||
"grad_norm": 1.321439504623413,
|
||||
"learning_rate": 7.83502200922043e-05,
|
||||
"loss": 3.5765,
|
||||
"step": 8700
|
||||
},
|
||||
{
|
||||
"epoch": 2.3207839092315625,
|
||||
"grad_norm": 1.4532771110534668,
|
||||
"learning_rate": 7.756882765087386e-05,
|
||||
"loss": 3.5858,
|
||||
"step": 9000
|
||||
},
|
||||
{
|
||||
"epoch": 2.3981433728726147,
|
||||
"grad_norm": 1.2922136783599854,
|
||||
"learning_rate": 7.67874352095434e-05,
|
||||
"loss": 3.5483,
|
||||
"step": 9300
|
||||
},
|
||||
{
|
||||
"epoch": 2.475502836513667,
|
||||
"grad_norm": 1.459169864654541,
|
||||
"learning_rate": 7.600604276821297e-05,
|
||||
"loss": 3.551,
|
||||
"step": 9600
|
||||
},
|
||||
{
|
||||
"epoch": 2.552862300154719,
|
||||
"grad_norm": 1.3106615543365479,
|
||||
"learning_rate": 7.522465032688251e-05,
|
||||
"loss": 3.5216,
|
||||
"step": 9900
|
||||
},
|
||||
{
|
||||
"epoch": 2.630221763795771,
|
||||
"grad_norm": 1.4244039058685303,
|
||||
"learning_rate": 7.444325788555206e-05,
|
||||
"loss": 3.5251,
|
||||
"step": 10200
|
||||
},
|
||||
{
|
||||
"epoch": 2.707581227436823,
|
||||
"grad_norm": 1.3957465887069702,
|
||||
"learning_rate": 7.366186544422162e-05,
|
||||
"loss": 3.5245,
|
||||
"step": 10500
|
||||
},
|
||||
{
|
||||
"epoch": 2.784940691077875,
|
||||
"grad_norm": 1.4246965646743774,
|
||||
"learning_rate": 7.288047300289116e-05,
|
||||
"loss": 3.4938,
|
||||
"step": 10800
|
||||
},
|
||||
{
|
||||
"epoch": 2.8623001547189273,
|
||||
"grad_norm": 1.3009408712387085,
|
||||
"learning_rate": 7.209908056156071e-05,
|
||||
"loss": 3.4972,
|
||||
"step": 11100
|
||||
},
|
||||
{
|
||||
"epoch": 2.9396596183599795,
|
||||
"grad_norm": 1.2788194417953491,
|
||||
"learning_rate": 7.131768812023025e-05,
|
||||
"loss": 3.4835,
|
||||
"step": 11400
|
||||
},
|
||||
{
|
||||
"epoch": 3.0170190820010316,
|
||||
"grad_norm": 1.415262222290039,
|
||||
"learning_rate": 7.05362956788998e-05,
|
||||
"loss": 3.4686,
|
||||
"step": 11700
|
||||
},
|
||||
{
|
||||
"epoch": 3.0943785456420834,
|
||||
"grad_norm": 1.3552271127700806,
|
||||
"learning_rate": 6.975490323756934e-05,
|
||||
"loss": 3.4434,
|
||||
"step": 12000
|
||||
},
|
||||
{
|
||||
"epoch": 3.1717380092831355,
|
||||
"grad_norm": 1.2953003644943237,
|
||||
"learning_rate": 6.89735107962389e-05,
|
||||
"loss": 3.406,
|
||||
"step": 12300
|
||||
},
|
||||
{
|
||||
"epoch": 3.2490974729241877,
|
||||
"grad_norm": 1.2616957426071167,
|
||||
"learning_rate": 6.819211835490845e-05,
|
||||
"loss": 3.4219,
|
||||
"step": 12600
|
||||
},
|
||||
{
|
||||
"epoch": 3.32645693656524,
|
||||
"grad_norm": 1.3086093664169312,
|
||||
"learning_rate": 6.7410725913578e-05,
|
||||
"loss": 3.4327,
|
||||
"step": 12900
|
||||
},
|
||||
{
|
||||
"epoch": 3.403816400206292,
|
||||
"grad_norm": 1.5225331783294678,
|
||||
"learning_rate": 6.662933347224755e-05,
|
||||
"loss": 3.3881,
|
||||
"step": 13200
|
||||
},
|
||||
{
|
||||
"epoch": 3.4811758638473442,
|
||||
"grad_norm": 1.3017733097076416,
|
||||
"learning_rate": 6.58479410309171e-05,
|
||||
"loss": 3.4253,
|
||||
"step": 13500
|
||||
},
|
||||
{
|
||||
"epoch": 3.558535327488396,
|
||||
"grad_norm": 1.4945634603500366,
|
||||
"learning_rate": 6.506654858958666e-05,
|
||||
"loss": 3.3739,
|
||||
"step": 13800
|
||||
},
|
||||
{
|
||||
"epoch": 3.635894791129448,
|
||||
"grad_norm": 1.3506596088409424,
|
||||
"learning_rate": 6.42851561482562e-05,
|
||||
"loss": 3.3795,
|
||||
"step": 14100
|
||||
},
|
||||
{
|
||||
"epoch": 3.7132542547705003,
|
||||
"grad_norm": 1.3715941905975342,
|
||||
"learning_rate": 6.350376370692575e-05,
|
||||
"loss": 3.3621,
|
||||
"step": 14400
|
||||
},
|
||||
{
|
||||
"epoch": 3.7906137184115525,
|
||||
"grad_norm": 1.4353686571121216,
|
||||
"learning_rate": 6.27223712655953e-05,
|
||||
"loss": 3.3625,
|
||||
"step": 14700
|
||||
},
|
||||
{
|
||||
"epoch": 3.867973182052604,
|
||||
"grad_norm": 1.4907252788543701,
|
||||
"learning_rate": 6.194097882426484e-05,
|
||||
"loss": 3.3694,
|
||||
"step": 15000
|
||||
},
|
||||
{
|
||||
"epoch": 3.9453326456936564,
|
||||
"grad_norm": 1.3906782865524292,
|
||||
"learning_rate": 6.11595863829344e-05,
|
||||
"loss": 3.3778,
|
||||
"step": 15300
|
||||
},
|
||||
{
|
||||
"epoch": 4.0226921093347086,
|
||||
"grad_norm": 1.4113860130310059,
|
||||
"learning_rate": 6.0378193941603944e-05,
|
||||
"loss": 3.3418,
|
||||
"step": 15600
|
||||
},
|
||||
{
|
||||
"epoch": 4.100051572975761,
|
||||
"grad_norm": 1.371813416481018,
|
||||
"learning_rate": 5.959680150027349e-05,
|
||||
"loss": 3.2831,
|
||||
"step": 15900
|
||||
},
|
||||
{
|
||||
"epoch": 4.177411036616813,
|
||||
"grad_norm": 1.433017611503601,
|
||||
"learning_rate": 5.881540905894304e-05,
|
||||
"loss": 3.2937,
|
||||
"step": 16200
|
||||
},
|
||||
{
|
||||
"epoch": 4.254770500257865,
|
||||
"grad_norm": 1.454952597618103,
|
||||
"learning_rate": 5.803401661761259e-05,
|
||||
"loss": 3.2925,
|
||||
"step": 16500
|
||||
},
|
||||
{
|
||||
"epoch": 4.332129963898917,
|
||||
"grad_norm": 1.4268256425857544,
|
||||
"learning_rate": 5.725262417628213e-05,
|
||||
"loss": 3.3171,
|
||||
"step": 16800
|
||||
},
|
||||
{
|
||||
"epoch": 4.409489427539969,
|
||||
"grad_norm": 1.4231845140457153,
|
||||
"learning_rate": 5.647123173495169e-05,
|
||||
"loss": 3.3303,
|
||||
"step": 17100
|
||||
},
|
||||
{
|
||||
"epoch": 4.486848891181021,
|
||||
"grad_norm": 1.358296275138855,
|
||||
"learning_rate": 5.568983929362124e-05,
|
||||
"loss": 3.3273,
|
||||
"step": 17400
|
||||
},
|
||||
{
|
||||
"epoch": 4.564208354822073,
|
||||
"grad_norm": 1.4314409494400024,
|
||||
"learning_rate": 5.490844685229078e-05,
|
||||
"loss": 3.3027,
|
||||
"step": 17700
|
||||
},
|
||||
{
|
||||
"epoch": 4.641567818463125,
|
||||
"grad_norm": 1.447662353515625,
|
||||
"learning_rate": 5.4127054410960335e-05,
|
||||
"loss": 3.2779,
|
||||
"step": 18000
|
||||
},
|
||||
{
|
||||
"epoch": 4.718927282104177,
|
||||
"grad_norm": 1.498307466506958,
|
||||
"learning_rate": 5.334566196962988e-05,
|
||||
"loss": 3.2733,
|
||||
"step": 18300
|
||||
},
|
||||
{
|
||||
"epoch": 4.796286745745229,
|
||||
"grad_norm": 1.3249318599700928,
|
||||
"learning_rate": 5.256426952829944e-05,
|
||||
"loss": 3.2159,
|
||||
"step": 18600
|
||||
},
|
||||
{
|
||||
"epoch": 4.873646209386282,
|
||||
"grad_norm": 1.7372560501098633,
|
||||
"learning_rate": 5.1782877086968985e-05,
|
||||
"loss": 3.2769,
|
||||
"step": 18900
|
||||
},
|
||||
{
|
||||
"epoch": 4.951005673027334,
|
||||
"grad_norm": 1.475892186164856,
|
||||
"learning_rate": 5.100148464563853e-05,
|
||||
"loss": 3.2724,
|
||||
"step": 19200
|
||||
},
|
||||
{
|
||||
"epoch": 5.028365136668386,
|
||||
"grad_norm": 1.5225096940994263,
|
||||
"learning_rate": 5.0220092204308076e-05,
|
||||
"loss": 3.2132,
|
||||
"step": 19500
|
||||
},
|
||||
{
|
||||
"epoch": 5.105724600309438,
|
||||
"grad_norm": 1.5637928247451782,
|
||||
"learning_rate": 4.943869976297763e-05,
|
||||
"loss": 3.2313,
|
||||
"step": 19800
|
||||
},
|
||||
{
|
||||
"epoch": 5.18308406395049,
|
||||
"grad_norm": 1.5115944147109985,
|
||||
"learning_rate": 4.865730732164718e-05,
|
||||
"loss": 3.2163,
|
||||
"step": 20100
|
||||
},
|
||||
{
|
||||
"epoch": 5.260443527591542,
|
||||
"grad_norm": 1.4446969032287598,
|
||||
"learning_rate": 4.7875914880316726e-05,
|
||||
"loss": 3.1997,
|
||||
"step": 20400
|
||||
},
|
||||
{
|
||||
"epoch": 5.337802991232594,
|
||||
"grad_norm": 1.4487448930740356,
|
||||
"learning_rate": 4.709452243898628e-05,
|
||||
"loss": 3.2236,
|
||||
"step": 20700
|
||||
},
|
||||
{
|
||||
"epoch": 5.415162454873646,
|
||||
"grad_norm": 1.5380080938339233,
|
||||
"learning_rate": 4.6313129997655824e-05,
|
||||
"loss": 3.2076,
|
||||
"step": 21000
|
||||
},
|
||||
{
|
||||
"epoch": 5.492521918514698,
|
||||
"grad_norm": 1.4626458883285522,
|
||||
"learning_rate": 4.5531737556325376e-05,
|
||||
"loss": 3.2204,
|
||||
"step": 21300
|
||||
},
|
||||
{
|
||||
"epoch": 5.56988138215575,
|
||||
"grad_norm": 1.6070873737335205,
|
||||
"learning_rate": 4.475034511499492e-05,
|
||||
"loss": 3.182,
|
||||
"step": 21600
|
||||
},
|
||||
{
|
||||
"epoch": 5.647240845796802,
|
||||
"grad_norm": 1.5365498065948486,
|
||||
"learning_rate": 4.3968952673664474e-05,
|
||||
"loss": 3.1846,
|
||||
"step": 21900
|
||||
},
|
||||
{
|
||||
"epoch": 5.724600309437855,
|
||||
"grad_norm": 1.6350524425506592,
|
||||
"learning_rate": 4.3187560232334026e-05,
|
||||
"loss": 3.2233,
|
||||
"step": 22200
|
||||
},
|
||||
{
|
||||
"epoch": 5.801959773078907,
|
||||
"grad_norm": 1.5178848505020142,
|
||||
"learning_rate": 4.240616779100357e-05,
|
||||
"loss": 3.2046,
|
||||
"step": 22500
|
||||
},
|
||||
{
|
||||
"epoch": 5.879319236719959,
|
||||
"grad_norm": 1.5043169260025024,
|
||||
"learning_rate": 4.162477534967312e-05,
|
||||
"loss": 3.16,
|
||||
"step": 22800
|
||||
},
|
||||
{
|
||||
"epoch": 5.956678700361011,
|
||||
"grad_norm": 1.371469259262085,
|
||||
"learning_rate": 4.084338290834267e-05,
|
||||
"loss": 3.2134,
|
||||
"step": 23100
|
||||
},
|
||||
{
|
||||
"epoch": 6.034038164002063,
|
||||
"grad_norm": 1.660897970199585,
|
||||
"learning_rate": 4.0061990467012215e-05,
|
||||
"loss": 3.1417,
|
||||
"step": 23400
|
||||
},
|
||||
{
|
||||
"epoch": 6.111397627643115,
|
||||
"grad_norm": 1.6934055089950562,
|
||||
"learning_rate": 3.928059802568177e-05,
|
||||
"loss": 3.1623,
|
||||
"step": 23700
|
||||
},
|
||||
{
|
||||
"epoch": 6.188757091284167,
|
||||
"grad_norm": 1.6035997867584229,
|
||||
"learning_rate": 3.849920558435132e-05,
|
||||
"loss": 3.1501,
|
||||
"step": 24000
|
||||
},
|
||||
{
|
||||
"epoch": 6.266116554925219,
|
||||
"grad_norm": 1.618349313735962,
|
||||
"learning_rate": 3.7717813143020865e-05,
|
||||
"loss": 3.1305,
|
||||
"step": 24300
|
||||
},
|
||||
{
|
||||
"epoch": 6.343476018566271,
|
||||
"grad_norm": 1.519572377204895,
|
||||
"learning_rate": 3.693642070169042e-05,
|
||||
"loss": 3.1529,
|
||||
"step": 24600
|
||||
},
|
||||
{
|
||||
"epoch": 6.420835482207323,
|
||||
"grad_norm": 1.5830146074295044,
|
||||
"learning_rate": 3.615502826035996e-05,
|
||||
"loss": 3.1571,
|
||||
"step": 24900
|
||||
},
|
||||
{
|
||||
"epoch": 6.498194945848375,
|
||||
"grad_norm": 1.6157386302947998,
|
||||
"learning_rate": 3.537363581902951e-05,
|
||||
"loss": 3.1564,
|
||||
"step": 25200
|
||||
},
|
||||
{
|
||||
"epoch": 6.575554409489428,
|
||||
"grad_norm": 1.5344434976577759,
|
||||
"learning_rate": 3.459224337769906e-05,
|
||||
"loss": 3.1638,
|
||||
"step": 25500
|
||||
},
|
||||
{
|
||||
"epoch": 6.65291387313048,
|
||||
"grad_norm": 1.6386032104492188,
|
||||
"learning_rate": 3.381085093636861e-05,
|
||||
"loss": 3.0942,
|
||||
"step": 25800
|
||||
},
|
||||
{
|
||||
"epoch": 6.730273336771532,
|
||||
"grad_norm": 1.5561423301696777,
|
||||
"learning_rate": 3.302945849503816e-05,
|
||||
"loss": 3.121,
|
||||
"step": 26100
|
||||
},
|
||||
{
|
||||
"epoch": 6.807632800412584,
|
||||
"grad_norm": 1.6447923183441162,
|
||||
"learning_rate": 3.224806605370771e-05,
|
||||
"loss": 3.1108,
|
||||
"step": 26400
|
||||
},
|
||||
{
|
||||
"epoch": 6.884992264053636,
|
||||
"grad_norm": 1.6027878522872925,
|
||||
"learning_rate": 3.1466673612377256e-05,
|
||||
"loss": 3.1331,
|
||||
"step": 26700
|
||||
},
|
||||
{
|
||||
"epoch": 6.9623517276946885,
|
||||
"grad_norm": 1.6786209344863892,
|
||||
"learning_rate": 3.068528117104681e-05,
|
||||
"loss": 3.1515,
|
||||
"step": 27000
|
||||
},
|
||||
{
|
||||
"epoch": 7.03971119133574,
|
||||
"grad_norm": 1.725610613822937,
|
||||
"learning_rate": 2.9903888729716357e-05,
|
||||
"loss": 3.1025,
|
||||
"step": 27300
|
||||
},
|
||||
{
|
||||
"epoch": 7.117070654976792,
|
||||
"grad_norm": 1.6194796562194824,
|
||||
"learning_rate": 2.9122496288385903e-05,
|
||||
"loss": 3.0819,
|
||||
"step": 27600
|
||||
},
|
||||
{
|
||||
"epoch": 7.194430118617844,
|
||||
"grad_norm": 1.7126758098602295,
|
||||
"learning_rate": 2.8341103847055455e-05,
|
||||
"loss": 3.1056,
|
||||
"step": 27900
|
||||
},
|
||||
{
|
||||
"epoch": 7.271789582258896,
|
||||
"grad_norm": 1.610686182975769,
|
||||
"learning_rate": 2.7559711405725004e-05,
|
||||
"loss": 3.0932,
|
||||
"step": 28200
|
||||
},
|
||||
{
|
||||
"epoch": 7.349149045899948,
|
||||
"grad_norm": 1.6700507402420044,
|
||||
"learning_rate": 2.677831896439455e-05,
|
||||
"loss": 3.0938,
|
||||
"step": 28500
|
||||
},
|
||||
{
|
||||
"epoch": 7.426508509541001,
|
||||
"grad_norm": 1.5000895261764526,
|
||||
"learning_rate": 2.59969265230641e-05,
|
||||
"loss": 3.0911,
|
||||
"step": 28800
|
||||
},
|
||||
{
|
||||
"epoch": 7.503867973182053,
|
||||
"grad_norm": 1.6568007469177246,
|
||||
"learning_rate": 2.521553408173365e-05,
|
||||
"loss": 3.0938,
|
||||
"step": 29100
|
||||
},
|
||||
{
|
||||
"epoch": 7.581227436823105,
|
||||
"grad_norm": 1.7494336366653442,
|
||||
"learning_rate": 2.44341416404032e-05,
|
||||
"loss": 3.0711,
|
||||
"step": 29400
|
||||
},
|
||||
{
|
||||
"epoch": 7.658586900464157,
|
||||
"grad_norm": 1.7158912420272827,
|
||||
"learning_rate": 2.3652749199072748e-05,
|
||||
"loss": 3.072,
|
||||
"step": 29700
|
||||
},
|
||||
{
|
||||
"epoch": 7.735946364105208,
|
||||
"grad_norm": 1.7721878290176392,
|
||||
"learning_rate": 2.2871356757742297e-05,
|
||||
"loss": 3.1069,
|
||||
"step": 30000
|
||||
},
|
||||
{
|
||||
"epoch": 7.813305827746261,
|
||||
"grad_norm": 1.5379910469055176,
|
||||
"learning_rate": 2.2089964316411846e-05,
|
||||
"loss": 3.0882,
|
||||
"step": 30300
|
||||
},
|
||||
{
|
||||
"epoch": 7.890665291387313,
|
||||
"grad_norm": 1.6254152059555054,
|
||||
"learning_rate": 2.1308571875081395e-05,
|
||||
"loss": 3.0506,
|
||||
"step": 30600
|
||||
},
|
||||
{
|
||||
"epoch": 7.968024755028365,
|
||||
"grad_norm": 1.6591140031814575,
|
||||
"learning_rate": 2.0527179433750944e-05,
|
||||
"loss": 3.0912,
|
||||
"step": 30900
|
||||
},
|
||||
{
|
||||
"epoch": 8.045384218669417,
|
||||
"grad_norm": 1.4908177852630615,
|
||||
"learning_rate": 1.9745786992420496e-05,
|
||||
"loss": 3.0567,
|
||||
"step": 31200
|
||||
},
|
||||
{
|
||||
"epoch": 8.12274368231047,
|
||||
"grad_norm": 1.6893351078033447,
|
||||
"learning_rate": 1.896439455109004e-05,
|
||||
"loss": 3.0538,
|
||||
"step": 31500
|
||||
},
|
||||
{
|
||||
"epoch": 8.200103145951521,
|
||||
"grad_norm": 1.6335694789886475,
|
||||
"learning_rate": 1.818300210975959e-05,
|
||||
"loss": 3.0596,
|
||||
"step": 31800
|
||||
},
|
||||
{
|
||||
"epoch": 8.277462609592574,
|
||||
"grad_norm": 1.814844012260437,
|
||||
"learning_rate": 1.7401609668429143e-05,
|
||||
"loss": 3.0789,
|
||||
"step": 32100
|
||||
},
|
||||
{
|
||||
"epoch": 8.354822073233626,
|
||||
"grad_norm": 1.666052222251892,
|
||||
"learning_rate": 1.662021722709869e-05,
|
||||
"loss": 3.0435,
|
||||
"step": 32400
|
||||
},
|
||||
{
|
||||
"epoch": 8.432181536874678,
|
||||
"grad_norm": 1.8534607887268066,
|
||||
"learning_rate": 1.5838824785768237e-05,
|
||||
"loss": 3.0542,
|
||||
"step": 32700
|
||||
},
|
||||
{
|
||||
"epoch": 8.50954100051573,
|
||||
"grad_norm": 1.8089135885238647,
|
||||
"learning_rate": 1.5057432344437788e-05,
|
||||
"loss": 3.0435,
|
||||
"step": 33000
|
||||
},
|
||||
{
|
||||
"epoch": 8.586900464156782,
|
||||
"grad_norm": 1.5717253684997559,
|
||||
"learning_rate": 1.4276039903107338e-05,
|
||||
"loss": 3.0323,
|
||||
"step": 33300
|
||||
},
|
||||
{
|
||||
"epoch": 8.664259927797834,
|
||||
"grad_norm": 1.681136131286621,
|
||||
"learning_rate": 1.3494647461776889e-05,
|
||||
"loss": 3.0528,
|
||||
"step": 33600
|
||||
},
|
||||
{
|
||||
"epoch": 8.741619391438887,
|
||||
"grad_norm": 1.700218915939331,
|
||||
"learning_rate": 1.2713255020446434e-05,
|
||||
"loss": 3.0454,
|
||||
"step": 33900
|
||||
},
|
||||
{
|
||||
"epoch": 8.818978855079939,
|
||||
"grad_norm": 1.8672676086425781,
|
||||
"learning_rate": 1.1931862579115985e-05,
|
||||
"loss": 3.0757,
|
||||
"step": 34200
|
||||
},
|
||||
{
|
||||
"epoch": 8.896338318720991,
|
||||
"grad_norm": 1.7094194889068604,
|
||||
"learning_rate": 1.1150470137785534e-05,
|
||||
"loss": 3.0514,
|
||||
"step": 34500
|
||||
},
|
||||
{
|
||||
"epoch": 8.973697782362041,
|
||||
"grad_norm": 1.7016539573669434,
|
||||
"learning_rate": 1.0369077696455083e-05,
|
||||
"loss": 3.022,
|
||||
"step": 34800
|
||||
},
|
||||
{
|
||||
"epoch": 9.051057246003094,
|
||||
"grad_norm": 1.7859755754470825,
|
||||
"learning_rate": 9.587685255124633e-06,
|
||||
"loss": 3.0189,
|
||||
"step": 35100
|
||||
},
|
||||
{
|
||||
"epoch": 9.128416709644146,
|
||||
"grad_norm": 1.6786860227584839,
|
||||
"learning_rate": 8.80629281379418e-06,
|
||||
"loss": 3.0062,
|
||||
"step": 35400
|
||||
},
|
||||
{
|
||||
"epoch": 9.205776173285198,
|
||||
"grad_norm": 1.7441751956939697,
|
||||
"learning_rate": 8.024900372463731e-06,
|
||||
"loss": 3.0036,
|
||||
"step": 35700
|
||||
},
|
||||
{
|
||||
"epoch": 9.28313563692625,
|
||||
"grad_norm": 1.6931071281433105,
|
||||
"learning_rate": 7.243507931133279e-06,
|
||||
"loss": 3.0179,
|
||||
"step": 36000
|
||||
},
|
||||
{
|
||||
"epoch": 9.360495100567302,
|
||||
"grad_norm": 1.5787148475646973,
|
||||
"learning_rate": 6.46211548980283e-06,
|
||||
"loss": 3.045,
|
||||
"step": 36300
|
||||
},
|
||||
{
|
||||
"epoch": 9.437854564208354,
|
||||
"grad_norm": 1.8229496479034424,
|
||||
"learning_rate": 5.680723048472378e-06,
|
||||
"loss": 3.0025,
|
||||
"step": 36600
|
||||
},
|
||||
{
|
||||
"epoch": 9.515214027849407,
|
||||
"grad_norm": 1.8122637271881104,
|
||||
"learning_rate": 4.899330607141927e-06,
|
||||
"loss": 3.0239,
|
||||
"step": 36900
|
||||
},
|
||||
{
|
||||
"epoch": 9.592573491490459,
|
||||
"grad_norm": 1.5085257291793823,
|
||||
"learning_rate": 4.117938165811476e-06,
|
||||
"loss": 3.0226,
|
||||
"step": 37200
|
||||
},
|
||||
{
|
||||
"epoch": 9.669932955131511,
|
||||
"grad_norm": 1.8228789567947388,
|
||||
"learning_rate": 3.336545724481025e-06,
|
||||
"loss": 3.0286,
|
||||
"step": 37500
|
||||
},
|
||||
{
|
||||
"epoch": 9.747292418772563,
|
||||
"grad_norm": 1.5136455297470093,
|
||||
"learning_rate": 2.5551532831505747e-06,
|
||||
"loss": 3.0184,
|
||||
"step": 37800
|
||||
},
|
||||
{
|
||||
"epoch": 9.824651882413615,
|
||||
"grad_norm": 1.7498648166656494,
|
||||
"learning_rate": 1.7737608418201238e-06,
|
||||
"loss": 3.002,
|
||||
"step": 38100
|
||||
},
|
||||
{
|
||||
"epoch": 9.902011346054667,
|
||||
"grad_norm": 1.625130534172058,
|
||||
"learning_rate": 9.923684004896727e-07,
|
||||
"loss": 3.0423,
|
||||
"step": 38400
|
||||
},
|
||||
{
|
||||
"epoch": 9.97937080969572,
|
||||
"grad_norm": 1.782974362373352,
|
||||
"learning_rate": 2.1097595915922174e-07,
|
||||
"loss": 3.023,
|
||||
"step": 38700
|
||||
}
|
||||
],
|
||||
"logging_steps": 300,
|
||||
"max_steps": 38780,
|
||||
"num_input_tokens_seen": 0,
|
||||
"num_train_epochs": 10,
|
||||
"save_steps": 300,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 1.20181986164736e+16,
|
||||
"train_batch_size": 8,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
||||
3
last-checkpoint/training_args.bin
Normal file
3
last-checkpoint/training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:eb41a6a22dfdaf4e03f114cbb5893d8e2bed45157fad3f8de27661c34562afea
|
||||
size 5841
|
||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:045584847064d2eb5dc229653586e0327ef2a4dfbb5e56b830944e62cb173418
|
||||
size 156353584
|
||||
1243
special_tokens_map.json
Normal file
1243
special_tokens_map.json
Normal file
File diff suppressed because it is too large
Load Diff
210940
tokenizer.json
Normal file
210940
tokenizer.json
Normal file
File diff suppressed because one or more lines are too long
10829
tokenizer_config.json
Normal file
10829
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:eb41a6a22dfdaf4e03f114cbb5893d8e2bed45157fad3f8de27661c34562afea
|
||||
size 5841
|
||||
Reference in New Issue
Block a user