初始化项目,由ModelHub XC社区提供模型

Model: anjajar/baby_goldfish_new
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-15 17:57:17 +08:00
commit 065cf68cba
21 changed files with 447156 additions and 0 deletions

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

54
README.md Normal file
View File

@@ -0,0 +1,54 @@
---
library_name: transformers
base_model: gpt_small_config.json
tags:
- generated_from_trainer
model-index:
- name: baby_goldfish_new
results: []
---
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->
# baby_goldfish_new
This model was trained starting from `gpt_small_config.json` (a model configuration file, not a model hosted on the Hugging Face Hub); the training dataset was not recorded by the Trainer.
## Model description
More information needed
## Intended uses & limitations
More information needed
## Training and evaluation data
More information needed
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 8
- eval_batch_size: 8
- seed: 43
- optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9, 0.999), epsilon=1e-06, and no additional optimizer arguments
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 387
- num_epochs: 10
- mixed_precision_training: Native AMP
### Training results
### Framework versions
- Transformers 4.57.3
- Pytorch 2.9.1+cu128
- Datasets 4.4.1
- Tokenizers 0.22.1

34
config.json Normal file
View File

@@ -0,0 +1,34 @@
{
"activation_function": "gelu",
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bos_token_id": 50000,
"dtype": "float32",
"embd_pdrop": 0.1,
"eos_token_id": 50001,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_ctx": 512,
"n_embd": 512,
"n_head": 8,
"n_inner": 2048,
"n_layer": 4,
"n_positions": 512,
"pad_token_id": 50002,
"prefix": "[CLS]",
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"transformers_version": "4.57.3",
"use_cache": true,
"vocab_size": 51200
}

7
generation_config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 50000,
"eos_token_id": 50001,
"pad_token_id": 50002,
"transformers_version": "4.57.3"
}

View File

@@ -0,0 +1,34 @@
{
"activation_function": "gelu",
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bos_token_id": 50000,
"dtype": "float32",
"embd_pdrop": 0.1,
"eos_token_id": 50001,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_ctx": 512,
"n_embd": 512,
"n_head": 8,
"n_inner": 2048,
"n_layer": 4,
"n_positions": 512,
"pad_token_id": 50002,
"prefix": "[CLS]",
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"transformers_version": "4.57.3",
"use_cache": true,
"vocab_size": 51200
}

View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 50000,
"eos_token_id": 50001,
"pad_token_id": 50002,
"transformers_version": "4.57.3"
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:045584847064d2eb5dc229653586e0327ef2a4dfbb5e56b830944e62cb173418
size 156353584

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce56a644b43a957d344559794c779a3624f1264caad1d97458aedabe7db4314e
size 312741835

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8425ffa539cd7299756c17d804955693f3af2c98cd1bdd1a958b1cbe0c28550
size 14645

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c4faeb7590b14fb868d259d539b68ad8bbe516624c9eb07f67bcb75d3c509da
size 1383

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:609cf1da746d897de6995be7207b46d4bd7fddf35a26103e7fae174d3583b315
size 1465

File diff suppressed because it is too large Load Diff

210940
last-checkpoint/tokenizer.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,937 @@
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 38780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07735946364105209,
"grad_norm": 1.558766484260559,
"learning_rate": 7.726098191214471e-05,
"loss": 7.285,
"step": 300
},
{
"epoch": 0.15471892728210418,
"grad_norm": 0.9385259747505188,
"learning_rate": 9.944781600812648e-05,
"loss": 4.7972,
"step": 600
},
{
"epoch": 0.23207839092315627,
"grad_norm": 0.9864380359649658,
"learning_rate": 9.866642356679604e-05,
"loss": 4.5563,
"step": 900
},
{
"epoch": 0.30943785456420836,
"grad_norm": 0.971341073513031,
"learning_rate": 9.788503112546558e-05,
"loss": 4.4545,
"step": 1200
},
{
"epoch": 0.3867973182052604,
"grad_norm": 0.8991674184799194,
"learning_rate": 9.710363868413513e-05,
"loss": 4.3616,
"step": 1500
},
{
"epoch": 0.46415678184631254,
"grad_norm": 0.9576361775398254,
"learning_rate": 9.632224624280469e-05,
"loss": 4.2695,
"step": 1800
},
{
"epoch": 0.5415162454873647,
"grad_norm": 0.9536678791046143,
"learning_rate": 9.554085380147423e-05,
"loss": 4.2254,
"step": 2100
},
{
"epoch": 0.6188757091284167,
"grad_norm": 0.892548680305481,
"learning_rate": 9.475946136014378e-05,
"loss": 4.1759,
"step": 2400
},
{
"epoch": 0.6962351727694688,
"grad_norm": 0.9262155294418335,
"learning_rate": 9.397806891881334e-05,
"loss": 4.1328,
"step": 2700
},
{
"epoch": 0.7735946364105208,
"grad_norm": 1.055438756942749,
"learning_rate": 9.319667647748288e-05,
"loss": 4.0732,
"step": 3000
},
{
"epoch": 0.850954100051573,
"grad_norm": 1.0588972568511963,
"learning_rate": 9.241528403615243e-05,
"loss": 4.0574,
"step": 3300
},
{
"epoch": 0.9283135636926251,
"grad_norm": 1.344167947769165,
"learning_rate": 9.163389159482197e-05,
"loss": 4.0472,
"step": 3600
},
{
"epoch": 1.0056730273336771,
"grad_norm": 0.9573405981063843,
"learning_rate": 9.085249915349152e-05,
"loss": 3.972,
"step": 3900
},
{
"epoch": 1.0830324909747293,
"grad_norm": 1.0597045421600342,
"learning_rate": 9.007110671216108e-05,
"loss": 3.8933,
"step": 4200
},
{
"epoch": 1.1603919546157813,
"grad_norm": 1.1895560026168823,
"learning_rate": 8.928971427083062e-05,
"loss": 3.8657,
"step": 4500
},
{
"epoch": 1.2377514182568334,
"grad_norm": 1.1971007585525513,
"learning_rate": 8.850832182950017e-05,
"loss": 3.8486,
"step": 4800
},
{
"epoch": 1.3151108818978856,
"grad_norm": 1.2342840433120728,
"learning_rate": 8.772692938816972e-05,
"loss": 3.8417,
"step": 5100
},
{
"epoch": 1.3924703455389376,
"grad_norm": 1.213428258895874,
"learning_rate": 8.694553694683926e-05,
"loss": 3.8048,
"step": 5400
},
{
"epoch": 1.4698298091799897,
"grad_norm": 1.191662073135376,
"learning_rate": 8.616414450550882e-05,
"loss": 3.7818,
"step": 5700
},
{
"epoch": 1.5471892728210417,
"grad_norm": 1.3016968965530396,
"learning_rate": 8.538275206417838e-05,
"loss": 3.7365,
"step": 6000
},
{
"epoch": 1.6245487364620939,
"grad_norm": 1.179246187210083,
"learning_rate": 8.460135962284792e-05,
"loss": 3.7605,
"step": 6300
},
{
"epoch": 1.701908200103146,
"grad_norm": 1.2382755279541016,
"learning_rate": 8.381996718151747e-05,
"loss": 3.6887,
"step": 6600
},
{
"epoch": 1.7792676637441982,
"grad_norm": 1.209956169128418,
"learning_rate": 8.303857474018702e-05,
"loss": 3.7204,
"step": 6900
},
{
"epoch": 1.8566271273852502,
"grad_norm": 1.119544267654419,
"learning_rate": 8.225718229885656e-05,
"loss": 3.6682,
"step": 7200
},
{
"epoch": 1.933986591026302,
"grad_norm": 1.2890771627426147,
"learning_rate": 8.147578985752612e-05,
"loss": 3.6434,
"step": 7500
},
{
"epoch": 2.0113460546673543,
"grad_norm": 1.2189580202102661,
"learning_rate": 8.069439741619567e-05,
"loss": 3.6477,
"step": 7800
},
{
"epoch": 2.0887055183084065,
"grad_norm": 1.4200156927108765,
"learning_rate": 7.991300497486521e-05,
"loss": 3.5883,
"step": 8100
},
{
"epoch": 2.1660649819494586,
"grad_norm": 1.1501333713531494,
"learning_rate": 7.913161253353476e-05,
"loss": 3.6023,
"step": 8400
},
{
"epoch": 2.2434244455905104,
"grad_norm": 1.321439504623413,
"learning_rate": 7.83502200922043e-05,
"loss": 3.5765,
"step": 8700
},
{
"epoch": 2.3207839092315625,
"grad_norm": 1.4532771110534668,
"learning_rate": 7.756882765087386e-05,
"loss": 3.5858,
"step": 9000
},
{
"epoch": 2.3981433728726147,
"grad_norm": 1.2922136783599854,
"learning_rate": 7.67874352095434e-05,
"loss": 3.5483,
"step": 9300
},
{
"epoch": 2.475502836513667,
"grad_norm": 1.459169864654541,
"learning_rate": 7.600604276821297e-05,
"loss": 3.551,
"step": 9600
},
{
"epoch": 2.552862300154719,
"grad_norm": 1.3106615543365479,
"learning_rate": 7.522465032688251e-05,
"loss": 3.5216,
"step": 9900
},
{
"epoch": 2.630221763795771,
"grad_norm": 1.4244039058685303,
"learning_rate": 7.444325788555206e-05,
"loss": 3.5251,
"step": 10200
},
{
"epoch": 2.707581227436823,
"grad_norm": 1.3957465887069702,
"learning_rate": 7.366186544422162e-05,
"loss": 3.5245,
"step": 10500
},
{
"epoch": 2.784940691077875,
"grad_norm": 1.4246965646743774,
"learning_rate": 7.288047300289116e-05,
"loss": 3.4938,
"step": 10800
},
{
"epoch": 2.8623001547189273,
"grad_norm": 1.3009408712387085,
"learning_rate": 7.209908056156071e-05,
"loss": 3.4972,
"step": 11100
},
{
"epoch": 2.9396596183599795,
"grad_norm": 1.2788194417953491,
"learning_rate": 7.131768812023025e-05,
"loss": 3.4835,
"step": 11400
},
{
"epoch": 3.0170190820010316,
"grad_norm": 1.415262222290039,
"learning_rate": 7.05362956788998e-05,
"loss": 3.4686,
"step": 11700
},
{
"epoch": 3.0943785456420834,
"grad_norm": 1.3552271127700806,
"learning_rate": 6.975490323756934e-05,
"loss": 3.4434,
"step": 12000
},
{
"epoch": 3.1717380092831355,
"grad_norm": 1.2953003644943237,
"learning_rate": 6.89735107962389e-05,
"loss": 3.406,
"step": 12300
},
{
"epoch": 3.2490974729241877,
"grad_norm": 1.2616957426071167,
"learning_rate": 6.819211835490845e-05,
"loss": 3.4219,
"step": 12600
},
{
"epoch": 3.32645693656524,
"grad_norm": 1.3086093664169312,
"learning_rate": 6.7410725913578e-05,
"loss": 3.4327,
"step": 12900
},
{
"epoch": 3.403816400206292,
"grad_norm": 1.5225331783294678,
"learning_rate": 6.662933347224755e-05,
"loss": 3.3881,
"step": 13200
},
{
"epoch": 3.4811758638473442,
"grad_norm": 1.3017733097076416,
"learning_rate": 6.58479410309171e-05,
"loss": 3.4253,
"step": 13500
},
{
"epoch": 3.558535327488396,
"grad_norm": 1.4945634603500366,
"learning_rate": 6.506654858958666e-05,
"loss": 3.3739,
"step": 13800
},
{
"epoch": 3.635894791129448,
"grad_norm": 1.3506596088409424,
"learning_rate": 6.42851561482562e-05,
"loss": 3.3795,
"step": 14100
},
{
"epoch": 3.7132542547705003,
"grad_norm": 1.3715941905975342,
"learning_rate": 6.350376370692575e-05,
"loss": 3.3621,
"step": 14400
},
{
"epoch": 3.7906137184115525,
"grad_norm": 1.4353686571121216,
"learning_rate": 6.27223712655953e-05,
"loss": 3.3625,
"step": 14700
},
{
"epoch": 3.867973182052604,
"grad_norm": 1.4907252788543701,
"learning_rate": 6.194097882426484e-05,
"loss": 3.3694,
"step": 15000
},
{
"epoch": 3.9453326456936564,
"grad_norm": 1.3906782865524292,
"learning_rate": 6.11595863829344e-05,
"loss": 3.3778,
"step": 15300
},
{
"epoch": 4.0226921093347086,
"grad_norm": 1.4113860130310059,
"learning_rate": 6.0378193941603944e-05,
"loss": 3.3418,
"step": 15600
},
{
"epoch": 4.100051572975761,
"grad_norm": 1.371813416481018,
"learning_rate": 5.959680150027349e-05,
"loss": 3.2831,
"step": 15900
},
{
"epoch": 4.177411036616813,
"grad_norm": 1.433017611503601,
"learning_rate": 5.881540905894304e-05,
"loss": 3.2937,
"step": 16200
},
{
"epoch": 4.254770500257865,
"grad_norm": 1.454952597618103,
"learning_rate": 5.803401661761259e-05,
"loss": 3.2925,
"step": 16500
},
{
"epoch": 4.332129963898917,
"grad_norm": 1.4268256425857544,
"learning_rate": 5.725262417628213e-05,
"loss": 3.3171,
"step": 16800
},
{
"epoch": 4.409489427539969,
"grad_norm": 1.4231845140457153,
"learning_rate": 5.647123173495169e-05,
"loss": 3.3303,
"step": 17100
},
{
"epoch": 4.486848891181021,
"grad_norm": 1.358296275138855,
"learning_rate": 5.568983929362124e-05,
"loss": 3.3273,
"step": 17400
},
{
"epoch": 4.564208354822073,
"grad_norm": 1.4314409494400024,
"learning_rate": 5.490844685229078e-05,
"loss": 3.3027,
"step": 17700
},
{
"epoch": 4.641567818463125,
"grad_norm": 1.447662353515625,
"learning_rate": 5.4127054410960335e-05,
"loss": 3.2779,
"step": 18000
},
{
"epoch": 4.718927282104177,
"grad_norm": 1.498307466506958,
"learning_rate": 5.334566196962988e-05,
"loss": 3.2733,
"step": 18300
},
{
"epoch": 4.796286745745229,
"grad_norm": 1.3249318599700928,
"learning_rate": 5.256426952829944e-05,
"loss": 3.2159,
"step": 18600
},
{
"epoch": 4.873646209386282,
"grad_norm": 1.7372560501098633,
"learning_rate": 5.1782877086968985e-05,
"loss": 3.2769,
"step": 18900
},
{
"epoch": 4.951005673027334,
"grad_norm": 1.475892186164856,
"learning_rate": 5.100148464563853e-05,
"loss": 3.2724,
"step": 19200
},
{
"epoch": 5.028365136668386,
"grad_norm": 1.5225096940994263,
"learning_rate": 5.0220092204308076e-05,
"loss": 3.2132,
"step": 19500
},
{
"epoch": 5.105724600309438,
"grad_norm": 1.5637928247451782,
"learning_rate": 4.943869976297763e-05,
"loss": 3.2313,
"step": 19800
},
{
"epoch": 5.18308406395049,
"grad_norm": 1.5115944147109985,
"learning_rate": 4.865730732164718e-05,
"loss": 3.2163,
"step": 20100
},
{
"epoch": 5.260443527591542,
"grad_norm": 1.4446969032287598,
"learning_rate": 4.7875914880316726e-05,
"loss": 3.1997,
"step": 20400
},
{
"epoch": 5.337802991232594,
"grad_norm": 1.4487448930740356,
"learning_rate": 4.709452243898628e-05,
"loss": 3.2236,
"step": 20700
},
{
"epoch": 5.415162454873646,
"grad_norm": 1.5380080938339233,
"learning_rate": 4.6313129997655824e-05,
"loss": 3.2076,
"step": 21000
},
{
"epoch": 5.492521918514698,
"grad_norm": 1.4626458883285522,
"learning_rate": 4.5531737556325376e-05,
"loss": 3.2204,
"step": 21300
},
{
"epoch": 5.56988138215575,
"grad_norm": 1.6070873737335205,
"learning_rate": 4.475034511499492e-05,
"loss": 3.182,
"step": 21600
},
{
"epoch": 5.647240845796802,
"grad_norm": 1.5365498065948486,
"learning_rate": 4.3968952673664474e-05,
"loss": 3.1846,
"step": 21900
},
{
"epoch": 5.724600309437855,
"grad_norm": 1.6350524425506592,
"learning_rate": 4.3187560232334026e-05,
"loss": 3.2233,
"step": 22200
},
{
"epoch": 5.801959773078907,
"grad_norm": 1.5178848505020142,
"learning_rate": 4.240616779100357e-05,
"loss": 3.2046,
"step": 22500
},
{
"epoch": 5.879319236719959,
"grad_norm": 1.5043169260025024,
"learning_rate": 4.162477534967312e-05,
"loss": 3.16,
"step": 22800
},
{
"epoch": 5.956678700361011,
"grad_norm": 1.371469259262085,
"learning_rate": 4.084338290834267e-05,
"loss": 3.2134,
"step": 23100
},
{
"epoch": 6.034038164002063,
"grad_norm": 1.660897970199585,
"learning_rate": 4.0061990467012215e-05,
"loss": 3.1417,
"step": 23400
},
{
"epoch": 6.111397627643115,
"grad_norm": 1.6934055089950562,
"learning_rate": 3.928059802568177e-05,
"loss": 3.1623,
"step": 23700
},
{
"epoch": 6.188757091284167,
"grad_norm": 1.6035997867584229,
"learning_rate": 3.849920558435132e-05,
"loss": 3.1501,
"step": 24000
},
{
"epoch": 6.266116554925219,
"grad_norm": 1.618349313735962,
"learning_rate": 3.7717813143020865e-05,
"loss": 3.1305,
"step": 24300
},
{
"epoch": 6.343476018566271,
"grad_norm": 1.519572377204895,
"learning_rate": 3.693642070169042e-05,
"loss": 3.1529,
"step": 24600
},
{
"epoch": 6.420835482207323,
"grad_norm": 1.5830146074295044,
"learning_rate": 3.615502826035996e-05,
"loss": 3.1571,
"step": 24900
},
{
"epoch": 6.498194945848375,
"grad_norm": 1.6157386302947998,
"learning_rate": 3.537363581902951e-05,
"loss": 3.1564,
"step": 25200
},
{
"epoch": 6.575554409489428,
"grad_norm": 1.5344434976577759,
"learning_rate": 3.459224337769906e-05,
"loss": 3.1638,
"step": 25500
},
{
"epoch": 6.65291387313048,
"grad_norm": 1.6386032104492188,
"learning_rate": 3.381085093636861e-05,
"loss": 3.0942,
"step": 25800
},
{
"epoch": 6.730273336771532,
"grad_norm": 1.5561423301696777,
"learning_rate": 3.302945849503816e-05,
"loss": 3.121,
"step": 26100
},
{
"epoch": 6.807632800412584,
"grad_norm": 1.6447923183441162,
"learning_rate": 3.224806605370771e-05,
"loss": 3.1108,
"step": 26400
},
{
"epoch": 6.884992264053636,
"grad_norm": 1.6027878522872925,
"learning_rate": 3.1466673612377256e-05,
"loss": 3.1331,
"step": 26700
},
{
"epoch": 6.9623517276946885,
"grad_norm": 1.6786209344863892,
"learning_rate": 3.068528117104681e-05,
"loss": 3.1515,
"step": 27000
},
{
"epoch": 7.03971119133574,
"grad_norm": 1.725610613822937,
"learning_rate": 2.9903888729716357e-05,
"loss": 3.1025,
"step": 27300
},
{
"epoch": 7.117070654976792,
"grad_norm": 1.6194796562194824,
"learning_rate": 2.9122496288385903e-05,
"loss": 3.0819,
"step": 27600
},
{
"epoch": 7.194430118617844,
"grad_norm": 1.7126758098602295,
"learning_rate": 2.8341103847055455e-05,
"loss": 3.1056,
"step": 27900
},
{
"epoch": 7.271789582258896,
"grad_norm": 1.610686182975769,
"learning_rate": 2.7559711405725004e-05,
"loss": 3.0932,
"step": 28200
},
{
"epoch": 7.349149045899948,
"grad_norm": 1.6700507402420044,
"learning_rate": 2.677831896439455e-05,
"loss": 3.0938,
"step": 28500
},
{
"epoch": 7.426508509541001,
"grad_norm": 1.5000895261764526,
"learning_rate": 2.59969265230641e-05,
"loss": 3.0911,
"step": 28800
},
{
"epoch": 7.503867973182053,
"grad_norm": 1.6568007469177246,
"learning_rate": 2.521553408173365e-05,
"loss": 3.0938,
"step": 29100
},
{
"epoch": 7.581227436823105,
"grad_norm": 1.7494336366653442,
"learning_rate": 2.44341416404032e-05,
"loss": 3.0711,
"step": 29400
},
{
"epoch": 7.658586900464157,
"grad_norm": 1.7158912420272827,
"learning_rate": 2.3652749199072748e-05,
"loss": 3.072,
"step": 29700
},
{
"epoch": 7.735946364105208,
"grad_norm": 1.7721878290176392,
"learning_rate": 2.2871356757742297e-05,
"loss": 3.1069,
"step": 30000
},
{
"epoch": 7.813305827746261,
"grad_norm": 1.5379910469055176,
"learning_rate": 2.2089964316411846e-05,
"loss": 3.0882,
"step": 30300
},
{
"epoch": 7.890665291387313,
"grad_norm": 1.6254152059555054,
"learning_rate": 2.1308571875081395e-05,
"loss": 3.0506,
"step": 30600
},
{
"epoch": 7.968024755028365,
"grad_norm": 1.6591140031814575,
"learning_rate": 2.0527179433750944e-05,
"loss": 3.0912,
"step": 30900
},
{
"epoch": 8.045384218669417,
"grad_norm": 1.4908177852630615,
"learning_rate": 1.9745786992420496e-05,
"loss": 3.0567,
"step": 31200
},
{
"epoch": 8.12274368231047,
"grad_norm": 1.6893351078033447,
"learning_rate": 1.896439455109004e-05,
"loss": 3.0538,
"step": 31500
},
{
"epoch": 8.200103145951521,
"grad_norm": 1.6335694789886475,
"learning_rate": 1.818300210975959e-05,
"loss": 3.0596,
"step": 31800
},
{
"epoch": 8.277462609592574,
"grad_norm": 1.814844012260437,
"learning_rate": 1.7401609668429143e-05,
"loss": 3.0789,
"step": 32100
},
{
"epoch": 8.354822073233626,
"grad_norm": 1.666052222251892,
"learning_rate": 1.662021722709869e-05,
"loss": 3.0435,
"step": 32400
},
{
"epoch": 8.432181536874678,
"grad_norm": 1.8534607887268066,
"learning_rate": 1.5838824785768237e-05,
"loss": 3.0542,
"step": 32700
},
{
"epoch": 8.50954100051573,
"grad_norm": 1.8089135885238647,
"learning_rate": 1.5057432344437788e-05,
"loss": 3.0435,
"step": 33000
},
{
"epoch": 8.586900464156782,
"grad_norm": 1.5717253684997559,
"learning_rate": 1.4276039903107338e-05,
"loss": 3.0323,
"step": 33300
},
{
"epoch": 8.664259927797834,
"grad_norm": 1.681136131286621,
"learning_rate": 1.3494647461776889e-05,
"loss": 3.0528,
"step": 33600
},
{
"epoch": 8.741619391438887,
"grad_norm": 1.700218915939331,
"learning_rate": 1.2713255020446434e-05,
"loss": 3.0454,
"step": 33900
},
{
"epoch": 8.818978855079939,
"grad_norm": 1.8672676086425781,
"learning_rate": 1.1931862579115985e-05,
"loss": 3.0757,
"step": 34200
},
{
"epoch": 8.896338318720991,
"grad_norm": 1.7094194889068604,
"learning_rate": 1.1150470137785534e-05,
"loss": 3.0514,
"step": 34500
},
{
"epoch": 8.973697782362041,
"grad_norm": 1.7016539573669434,
"learning_rate": 1.0369077696455083e-05,
"loss": 3.022,
"step": 34800
},
{
"epoch": 9.051057246003094,
"grad_norm": 1.7859755754470825,
"learning_rate": 9.587685255124633e-06,
"loss": 3.0189,
"step": 35100
},
{
"epoch": 9.128416709644146,
"grad_norm": 1.6786860227584839,
"learning_rate": 8.80629281379418e-06,
"loss": 3.0062,
"step": 35400
},
{
"epoch": 9.205776173285198,
"grad_norm": 1.7441751956939697,
"learning_rate": 8.024900372463731e-06,
"loss": 3.0036,
"step": 35700
},
{
"epoch": 9.28313563692625,
"grad_norm": 1.6931071281433105,
"learning_rate": 7.243507931133279e-06,
"loss": 3.0179,
"step": 36000
},
{
"epoch": 9.360495100567302,
"grad_norm": 1.5787148475646973,
"learning_rate": 6.46211548980283e-06,
"loss": 3.045,
"step": 36300
},
{
"epoch": 9.437854564208354,
"grad_norm": 1.8229496479034424,
"learning_rate": 5.680723048472378e-06,
"loss": 3.0025,
"step": 36600
},
{
"epoch": 9.515214027849407,
"grad_norm": 1.8122637271881104,
"learning_rate": 4.899330607141927e-06,
"loss": 3.0239,
"step": 36900
},
{
"epoch": 9.592573491490459,
"grad_norm": 1.5085257291793823,
"learning_rate": 4.117938165811476e-06,
"loss": 3.0226,
"step": 37200
},
{
"epoch": 9.669932955131511,
"grad_norm": 1.8228789567947388,
"learning_rate": 3.336545724481025e-06,
"loss": 3.0286,
"step": 37500
},
{
"epoch": 9.747292418772563,
"grad_norm": 1.5136455297470093,
"learning_rate": 2.5551532831505747e-06,
"loss": 3.0184,
"step": 37800
},
{
"epoch": 9.824651882413615,
"grad_norm": 1.7498648166656494,
"learning_rate": 1.7737608418201238e-06,
"loss": 3.002,
"step": 38100
},
{
"epoch": 9.902011346054667,
"grad_norm": 1.625130534172058,
"learning_rate": 9.923684004896727e-07,
"loss": 3.0423,
"step": 38400
},
{
"epoch": 9.97937080969572,
"grad_norm": 1.782974362373352,
"learning_rate": 2.1097595915922174e-07,
"loss": 3.023,
"step": 38700
}
],
"logging_steps": 300,
"max_steps": 38780,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.20181986164736e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb41a6a22dfdaf4e03f114cbb5893d8e2bed45157fad3f8de27661c34562afea
size 5841

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:045584847064d2eb5dc229653586e0327ef2a4dfbb5e56b830944e62cb173418
size 156353584

1243
special_tokens_map.json Normal file

File diff suppressed because it is too large Load Diff

210940
tokenizer.json Normal file

File diff suppressed because one or more lines are too long

10829
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb41a6a22dfdaf4e03f114cbb5893d8e2bed45157fad3f8de27661c34562afea
size 5841