初始化项目,由ModelHub XC社区提供模型

Model: anjajar/baby_goldfish_large_new
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-15 17:54:52 +08:00
commit e1061fd1d5
21 changed files with 447114 additions and 0 deletions

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

54
README.md Normal file
View File

@@ -0,0 +1,54 @@
---
library_name: transformers
base_model: gpt_base_config.json
tags:
- generated_from_trainer
model-index:
- name: baby_goldfish_large_new
results: []
---
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->
# baby_goldfish_large_new
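This model was trained starting from `gpt_base_config.json` (this appears to be a local configuration file rather than a model hosted on the Hugging Face Hub, so the auto-generated Hub link has been dropped). The training dataset was not recorded.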
## Model description
More information needed
## Intended uses & limitations
More information needed
## Training and evaluation data
More information needed
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 8
- eval_batch_size: 8
- seed: 43
- optimizer: AdamW, fused PyTorch implementation (`OptimizerNames.ADAMW_TORCH_FUSED`), with betas=(0.9, 0.999) and epsilon=1e-06; no additional optimizer arguments
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 370
- num_epochs: 10
- mixed_precision_training: Native AMP
### Training results
### Framework versions
- Transformers 4.57.3
- Pytorch 2.9.1+cu128
- Datasets 4.4.1
- Tokenizers 0.22.1

34
config.json Normal file
View File

@@ -0,0 +1,34 @@
{
"activation_function": "gelu",
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bos_token_id": 50000,
"dtype": "float32",
"embd_pdrop": 0.1,
"eos_token_id": 50001,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_ctx": 512,
"n_embd": 768,
"n_head": 12,
"n_inner": 3072,
"n_layer": 12,
"n_positions": 512,
"pad_token_id": 50002,
"prefix": "[CLS]",
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"transformers_version": "4.57.3",
"use_cache": true,
"vocab_size": 51200
}

7
generation_config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 50000,
"eos_token_id": 50001,
"pad_token_id": 50002,
"transformers_version": "4.57.3"
}

View File

@@ -0,0 +1,34 @@
{
"activation_function": "gelu",
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bos_token_id": 50000,
"dtype": "float32",
"embd_pdrop": 0.1,
"eos_token_id": 50001,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_ctx": 512,
"n_embd": 768,
"n_head": 12,
"n_inner": 3072,
"n_layer": 12,
"n_positions": 512,
"pad_token_id": 50002,
"prefix": "[CLS]",
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"transformers_version": "4.57.3",
"use_cache": true,
"vocab_size": 51200
}

View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 50000,
"eos_token_id": 50001,
"pad_token_id": 50002,
"transformers_version": "4.57.3"
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb714a54e7436386cbe4a6dd08d96875ccf875b3b479c198da494f3d4c279318
size 499098240

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5cbd356aa7c72514b5cd5e47f061b8047d8a4daac4248848a1064c1d17f62ea7
size 998292875

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:36749db9fd0cc32a73244ea2a1cf9b08a71aa3e0ade1991dd0abd26aeb8596ff
size 14645

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:becbfb948a617a6400f03ed7d70ce73773a652f3d6de4b768b318aed59db3053
size 1383

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:37882662793e589338bcedd1bf979ae6a790c5499d3ceec610dbfa1b3cbdd0ff
size 1465

File diff suppressed because it is too large Load Diff

210940
last-checkpoint/tokenizer.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,895 @@
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.970278303161308,
"eval_steps": 500,
"global_step": 36900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0810591731964334,
"grad_norm": 1.3518340587615967,
"learning_rate": 8.08108108108108e-05,
"loss": 6.5209,
"step": 300
},
{
"epoch": 0.1621183463928668,
"grad_norm": 1.4490776062011719,
"learning_rate": 9.9375e-05,
"loss": 4.8248,
"step": 600
},
{
"epoch": 0.2431775195893002,
"grad_norm": 0.9239162802696228,
"learning_rate": 9.855622270742358e-05,
"loss": 4.6051,
"step": 900
},
{
"epoch": 0.3242366927857336,
"grad_norm": 0.8636089563369751,
"learning_rate": 9.773744541484717e-05,
"loss": 4.5204,
"step": 1200
},
{
"epoch": 0.405295865982167,
"grad_norm": 0.8884899020195007,
"learning_rate": 9.691866812227075e-05,
"loss": 4.4294,
"step": 1500
},
{
"epoch": 0.4863550391786004,
"grad_norm": 0.9425375461578369,
"learning_rate": 9.609989082969433e-05,
"loss": 4.3436,
"step": 1800
},
{
"epoch": 0.5674142123750338,
"grad_norm": 0.9191689491271973,
"learning_rate": 9.528111353711791e-05,
"loss": 4.3269,
"step": 2100
},
{
"epoch": 0.6484733855714672,
"grad_norm": 0.8771488666534424,
"learning_rate": 9.446233624454149e-05,
"loss": 4.2743,
"step": 2400
},
{
"epoch": 0.7295325587679006,
"grad_norm": 0.9781838059425354,
"learning_rate": 9.364355895196507e-05,
"loss": 4.1491,
"step": 2700
},
{
"epoch": 0.810591731964334,
"grad_norm": 1.0138261318206787,
"learning_rate": 9.282478165938865e-05,
"loss": 4.047,
"step": 3000
},
{
"epoch": 0.8916509051607674,
"grad_norm": 1.063523769378662,
"learning_rate": 9.200600436681222e-05,
"loss": 4.0248,
"step": 3300
},
{
"epoch": 0.9727100783572008,
"grad_norm": 0.9346688389778137,
"learning_rate": 9.118722707423582e-05,
"loss": 3.9609,
"step": 3600
},
{
"epoch": 1.053769251553634,
"grad_norm": 0.9149222373962402,
"learning_rate": 9.036844978165939e-05,
"loss": 3.9169,
"step": 3900
},
{
"epoch": 1.1348284247500675,
"grad_norm": 1.1430439949035645,
"learning_rate": 8.954967248908297e-05,
"loss": 3.8597,
"step": 4200
},
{
"epoch": 1.215887597946501,
"grad_norm": 0.9831075072288513,
"learning_rate": 8.873089519650656e-05,
"loss": 3.8311,
"step": 4500
},
{
"epoch": 1.2969467711429343,
"grad_norm": 1.0454624891281128,
"learning_rate": 8.791211790393013e-05,
"loss": 3.7607,
"step": 4800
},
{
"epoch": 1.3780059443393677,
"grad_norm": 0.9663439393043518,
"learning_rate": 8.709334061135371e-05,
"loss": 3.7721,
"step": 5100
},
{
"epoch": 1.4590651175358011,
"grad_norm": 1.133966326713562,
"learning_rate": 8.627456331877731e-05,
"loss": 3.7393,
"step": 5400
},
{
"epoch": 1.5401242907322346,
"grad_norm": 0.9640753865242004,
"learning_rate": 8.545578602620087e-05,
"loss": 3.7126,
"step": 5700
},
{
"epoch": 1.621183463928668,
"grad_norm": 1.0320396423339844,
"learning_rate": 8.463700873362446e-05,
"loss": 3.6677,
"step": 6000
},
{
"epoch": 1.7022426371251014,
"grad_norm": 0.991076648235321,
"learning_rate": 8.381823144104804e-05,
"loss": 3.6452,
"step": 6300
},
{
"epoch": 1.7833018103215346,
"grad_norm": 1.1129027605056763,
"learning_rate": 8.299945414847162e-05,
"loss": 3.6468,
"step": 6600
},
{
"epoch": 1.864360983517968,
"grad_norm": 1.1750997304916382,
"learning_rate": 8.21806768558952e-05,
"loss": 3.6052,
"step": 6900
},
{
"epoch": 1.9454201567144014,
"grad_norm": 1.077567219734192,
"learning_rate": 8.136189956331878e-05,
"loss": 3.5974,
"step": 7200
},
{
"epoch": 2.026479329910835,
"grad_norm": 1.1307553052902222,
"learning_rate": 8.054312227074236e-05,
"loss": 3.5258,
"step": 7500
},
{
"epoch": 2.107538503107268,
"grad_norm": 0.9726724028587341,
"learning_rate": 7.972434497816595e-05,
"loss": 3.473,
"step": 7800
},
{
"epoch": 2.1885976763037016,
"grad_norm": 1.080590844154358,
"learning_rate": 7.890556768558953e-05,
"loss": 3.4878,
"step": 8100
},
{
"epoch": 2.269656849500135,
"grad_norm": 1.0957530736923218,
"learning_rate": 7.808679039301311e-05,
"loss": 3.473,
"step": 8400
},
{
"epoch": 2.3507160226965684,
"grad_norm": 1.1584407091140747,
"learning_rate": 7.726801310043669e-05,
"loss": 3.4178,
"step": 8700
},
{
"epoch": 2.431775195893002,
"grad_norm": 1.1033746004104614,
"learning_rate": 7.644923580786027e-05,
"loss": 3.4216,
"step": 9000
},
{
"epoch": 2.5128343690894352,
"grad_norm": 1.035995364189148,
"learning_rate": 7.563045851528384e-05,
"loss": 3.4284,
"step": 9300
},
{
"epoch": 2.5938935422858687,
"grad_norm": 1.1373919248580933,
"learning_rate": 7.481168122270743e-05,
"loss": 3.419,
"step": 9600
},
{
"epoch": 2.674952715482302,
"grad_norm": 1.0167981386184692,
"learning_rate": 7.3992903930131e-05,
"loss": 3.3995,
"step": 9900
},
{
"epoch": 2.7560118886787355,
"grad_norm": 1.1506949663162231,
"learning_rate": 7.317412663755458e-05,
"loss": 3.4456,
"step": 10200
},
{
"epoch": 2.837071061875169,
"grad_norm": 1.281085729598999,
"learning_rate": 7.235534934497818e-05,
"loss": 3.3682,
"step": 10500
},
{
"epoch": 2.9181302350716023,
"grad_norm": 1.1729835271835327,
"learning_rate": 7.153657205240175e-05,
"loss": 3.3507,
"step": 10800
},
{
"epoch": 2.9991894082680357,
"grad_norm": 1.1416144371032715,
"learning_rate": 7.071779475982533e-05,
"loss": 3.3374,
"step": 11100
},
{
"epoch": 3.080248581464469,
"grad_norm": 1.2445170879364014,
"learning_rate": 6.989901746724891e-05,
"loss": 3.2347,
"step": 11400
},
{
"epoch": 3.1613077546609025,
"grad_norm": 1.1831103563308716,
"learning_rate": 6.908024017467249e-05,
"loss": 3.2341,
"step": 11700
},
{
"epoch": 3.242366927857336,
"grad_norm": 1.1428508758544922,
"learning_rate": 6.826146288209607e-05,
"loss": 3.2382,
"step": 12000
},
{
"epoch": 3.3234261010537693,
"grad_norm": 1.2155035734176636,
"learning_rate": 6.744268558951965e-05,
"loss": 3.2109,
"step": 12300
},
{
"epoch": 3.4044852742502028,
"grad_norm": 1.1355677843093872,
"learning_rate": 6.662390829694324e-05,
"loss": 3.2283,
"step": 12600
},
{
"epoch": 3.485544447446636,
"grad_norm": 1.2202376127243042,
"learning_rate": 6.580513100436682e-05,
"loss": 3.2252,
"step": 12900
},
{
"epoch": 3.5666036206430696,
"grad_norm": 1.2496229410171509,
"learning_rate": 6.49863537117904e-05,
"loss": 3.2408,
"step": 13200
},
{
"epoch": 3.647662793839503,
"grad_norm": 1.1879061460494995,
"learning_rate": 6.416757641921398e-05,
"loss": 3.1999,
"step": 13500
},
{
"epoch": 3.7287219670359364,
"grad_norm": 1.2078286409378052,
"learning_rate": 6.334879912663756e-05,
"loss": 3.184,
"step": 13800
},
{
"epoch": 3.80978114023237,
"grad_norm": 1.157354712486267,
"learning_rate": 6.253002183406114e-05,
"loss": 3.1967,
"step": 14100
},
{
"epoch": 3.8908403134288028,
"grad_norm": 1.0799527168273926,
"learning_rate": 6.171124454148471e-05,
"loss": 3.1845,
"step": 14400
},
{
"epoch": 3.9718994866252366,
"grad_norm": 1.162443995475769,
"learning_rate": 6.0892467248908306e-05,
"loss": 3.1956,
"step": 14700
},
{
"epoch": 4.05295865982167,
"grad_norm": 1.362560749053955,
"learning_rate": 6.007368995633188e-05,
"loss": 3.0933,
"step": 15000
},
{
"epoch": 4.134017833018103,
"grad_norm": 1.3539903163909912,
"learning_rate": 5.9254912663755455e-05,
"loss": 3.0665,
"step": 15300
},
{
"epoch": 4.215077006214536,
"grad_norm": 1.2945058345794678,
"learning_rate": 5.843613537117904e-05,
"loss": 3.0822,
"step": 15600
},
{
"epoch": 4.29613617941097,
"grad_norm": 1.2873772382736206,
"learning_rate": 5.7617358078602625e-05,
"loss": 3.032,
"step": 15900
},
{
"epoch": 4.377195352607403,
"grad_norm": 1.3318586349487305,
"learning_rate": 5.67985807860262e-05,
"loss": 3.0344,
"step": 16200
},
{
"epoch": 4.458254525803837,
"grad_norm": 1.3891690969467163,
"learning_rate": 5.597980349344979e-05,
"loss": 3.0648,
"step": 16500
},
{
"epoch": 4.53931369900027,
"grad_norm": 1.3675804138183594,
"learning_rate": 5.516102620087337e-05,
"loss": 3.0373,
"step": 16800
},
{
"epoch": 4.620372872196704,
"grad_norm": 1.3631969690322876,
"learning_rate": 5.4342248908296944e-05,
"loss": 3.0216,
"step": 17100
},
{
"epoch": 4.701432045393137,
"grad_norm": 1.534043550491333,
"learning_rate": 5.352347161572052e-05,
"loss": 3.068,
"step": 17400
},
{
"epoch": 4.782491218589571,
"grad_norm": 1.3604155778884888,
"learning_rate": 5.2704694323144114e-05,
"loss": 3.04,
"step": 17700
},
{
"epoch": 4.863550391786004,
"grad_norm": 1.4917387962341309,
"learning_rate": 5.188591703056769e-05,
"loss": 3.0478,
"step": 18000
},
{
"epoch": 4.9446095649824375,
"grad_norm": 1.4303983449935913,
"learning_rate": 5.106713973799126e-05,
"loss": 3.012,
"step": 18300
},
{
"epoch": 5.0256687381788705,
"grad_norm": 1.5453460216522217,
"learning_rate": 5.024836244541485e-05,
"loss": 2.9692,
"step": 18600
},
{
"epoch": 5.106727911375304,
"grad_norm": 1.4777169227600098,
"learning_rate": 4.942958515283843e-05,
"loss": 2.9011,
"step": 18900
},
{
"epoch": 5.187787084571737,
"grad_norm": 1.5853521823883057,
"learning_rate": 4.861080786026201e-05,
"loss": 2.9105,
"step": 19200
},
{
"epoch": 5.268846257768171,
"grad_norm": 1.4681776762008667,
"learning_rate": 4.779203056768559e-05,
"loss": 2.8882,
"step": 19500
},
{
"epoch": 5.349905430964604,
"grad_norm": 1.3525965213775635,
"learning_rate": 4.697325327510918e-05,
"loss": 2.9123,
"step": 19800
},
{
"epoch": 5.430964604161037,
"grad_norm": 1.4700337648391724,
"learning_rate": 4.615447598253275e-05,
"loss": 2.8784,
"step": 20100
},
{
"epoch": 5.512023777357471,
"grad_norm": 1.5051637887954712,
"learning_rate": 4.533569868995633e-05,
"loss": 2.9082,
"step": 20400
},
{
"epoch": 5.593082950553905,
"grad_norm": 1.4689549207687378,
"learning_rate": 4.4516921397379915e-05,
"loss": 2.9082,
"step": 20700
},
{
"epoch": 5.674142123750338,
"grad_norm": 1.5370858907699585,
"learning_rate": 4.3698144104803496e-05,
"loss": 2.9034,
"step": 21000
},
{
"epoch": 5.755201296946771,
"grad_norm": 1.5931947231292725,
"learning_rate": 4.287936681222708e-05,
"loss": 2.8862,
"step": 21300
},
{
"epoch": 5.836260470143205,
"grad_norm": 1.4349807500839233,
"learning_rate": 4.206058951965066e-05,
"loss": 2.891,
"step": 21600
},
{
"epoch": 5.917319643339638,
"grad_norm": 1.625874400138855,
"learning_rate": 4.124181222707424e-05,
"loss": 2.87,
"step": 21900
},
{
"epoch": 5.998378816536071,
"grad_norm": 1.4641919136047363,
"learning_rate": 4.0423034934497815e-05,
"loss": 2.8981,
"step": 22200
},
{
"epoch": 6.079437989732504,
"grad_norm": 1.3958481550216675,
"learning_rate": 3.96042576419214e-05,
"loss": 2.7289,
"step": 22500
},
{
"epoch": 6.160497162928938,
"grad_norm": 1.586595058441162,
"learning_rate": 3.8785480349344985e-05,
"loss": 2.7348,
"step": 22800
},
{
"epoch": 6.241556336125371,
"grad_norm": 1.7938852310180664,
"learning_rate": 3.796670305676856e-05,
"loss": 2.7383,
"step": 23100
},
{
"epoch": 6.322615509321805,
"grad_norm": 1.7327919006347656,
"learning_rate": 3.714792576419214e-05,
"loss": 2.7897,
"step": 23400
},
{
"epoch": 6.403674682518238,
"grad_norm": 1.5519119501113892,
"learning_rate": 3.632914847161572e-05,
"loss": 2.7947,
"step": 23700
},
{
"epoch": 6.484733855714672,
"grad_norm": 1.7375255823135376,
"learning_rate": 3.5510371179039304e-05,
"loss": 2.79,
"step": 24000
},
{
"epoch": 6.565793028911105,
"grad_norm": 1.8126753568649292,
"learning_rate": 3.4691593886462886e-05,
"loss": 2.7754,
"step": 24300
},
{
"epoch": 6.646852202107539,
"grad_norm": 1.8342634439468384,
"learning_rate": 3.387281659388647e-05,
"loss": 2.7628,
"step": 24600
},
{
"epoch": 6.727911375303972,
"grad_norm": 1.7573124170303345,
"learning_rate": 3.305403930131005e-05,
"loss": 2.7771,
"step": 24900
},
{
"epoch": 6.8089705485004055,
"grad_norm": 1.7912206649780273,
"learning_rate": 3.2235262008733623e-05,
"loss": 2.7588,
"step": 25200
},
{
"epoch": 6.8900297216968385,
"grad_norm": 1.5853058099746704,
"learning_rate": 3.1416484716157205e-05,
"loss": 2.7534,
"step": 25500
},
{
"epoch": 6.971088894893272,
"grad_norm": 1.6281930208206177,
"learning_rate": 3.0597707423580786e-05,
"loss": 2.7513,
"step": 25800
},
{
"epoch": 7.052148068089705,
"grad_norm": 1.8934043645858765,
"learning_rate": 2.9778930131004368e-05,
"loss": 2.7019,
"step": 26100
},
{
"epoch": 7.133207241286139,
"grad_norm": 1.7691494226455688,
"learning_rate": 2.896015283842795e-05,
"loss": 2.6608,
"step": 26400
},
{
"epoch": 7.214266414482572,
"grad_norm": 1.8065195083618164,
"learning_rate": 2.8141375545851527e-05,
"loss": 2.6537,
"step": 26700
},
{
"epoch": 7.295325587679006,
"grad_norm": 1.902382254600525,
"learning_rate": 2.7322598253275112e-05,
"loss": 2.6414,
"step": 27000
},
{
"epoch": 7.376384760875439,
"grad_norm": 1.7119675874710083,
"learning_rate": 2.6503820960698687e-05,
"loss": 2.6422,
"step": 27300
},
{
"epoch": 7.457443934071873,
"grad_norm": 1.9427614212036133,
"learning_rate": 2.5685043668122272e-05,
"loss": 2.6449,
"step": 27600
},
{
"epoch": 7.538503107268306,
"grad_norm": 1.9298570156097412,
"learning_rate": 2.4866266375545853e-05,
"loss": 2.642,
"step": 27900
},
{
"epoch": 7.61956228046474,
"grad_norm": 1.762351393699646,
"learning_rate": 2.404748908296943e-05,
"loss": 2.6455,
"step": 28200
},
{
"epoch": 7.700621453661173,
"grad_norm": 1.7794419527053833,
"learning_rate": 2.3228711790393016e-05,
"loss": 2.6509,
"step": 28500
},
{
"epoch": 7.781680626857606,
"grad_norm": 1.777873158454895,
"learning_rate": 2.2409934497816594e-05,
"loss": 2.6572,
"step": 28800
},
{
"epoch": 7.862739800054039,
"grad_norm": 1.815291166305542,
"learning_rate": 2.1591157205240176e-05,
"loss": 2.6432,
"step": 29100
},
{
"epoch": 7.943798973250473,
"grad_norm": 1.8740344047546387,
"learning_rate": 2.0772379912663757e-05,
"loss": 2.663,
"step": 29400
},
{
"epoch": 8.024858146446906,
"grad_norm": 1.8893027305603027,
"learning_rate": 1.9953602620087335e-05,
"loss": 2.5881,
"step": 29700
},
{
"epoch": 8.10591731964334,
"grad_norm": 1.9792145490646362,
"learning_rate": 1.913482532751092e-05,
"loss": 2.5475,
"step": 30000
},
{
"epoch": 8.186976492839772,
"grad_norm": 1.8607609272003174,
"learning_rate": 1.83160480349345e-05,
"loss": 2.5636,
"step": 30300
},
{
"epoch": 8.268035666036207,
"grad_norm": 1.8906886577606201,
"learning_rate": 1.749727074235808e-05,
"loss": 2.5569,
"step": 30600
},
{
"epoch": 8.34909483923264,
"grad_norm": 2.0039448738098145,
"learning_rate": 1.667849344978166e-05,
"loss": 2.553,
"step": 30900
},
{
"epoch": 8.430154012429073,
"grad_norm": 1.9279450178146362,
"learning_rate": 1.585971615720524e-05,
"loss": 2.5327,
"step": 31200
},
{
"epoch": 8.511213185625508,
"grad_norm": 2.061372756958008,
"learning_rate": 1.5040938864628823e-05,
"loss": 2.5668,
"step": 31500
},
{
"epoch": 8.59227235882194,
"grad_norm": 2.1464438438415527,
"learning_rate": 1.4222161572052402e-05,
"loss": 2.5658,
"step": 31800
},
{
"epoch": 8.673331532018373,
"grad_norm": 2.0610902309417725,
"learning_rate": 1.3403384279475984e-05,
"loss": 2.5592,
"step": 32100
},
{
"epoch": 8.754390705214806,
"grad_norm": 2.092325448989868,
"learning_rate": 1.2584606986899564e-05,
"loss": 2.543,
"step": 32400
},
{
"epoch": 8.83544987841124,
"grad_norm": 2.0462098121643066,
"learning_rate": 1.1765829694323145e-05,
"loss": 2.5544,
"step": 32700
},
{
"epoch": 8.916509051607674,
"grad_norm": 2.0339772701263428,
"learning_rate": 1.0947052401746725e-05,
"loss": 2.5651,
"step": 33000
},
{
"epoch": 8.997568224804107,
"grad_norm": 2.069972276687622,
"learning_rate": 1.0128275109170306e-05,
"loss": 2.5323,
"step": 33300
},
{
"epoch": 9.07862739800054,
"grad_norm": 2.0698232650756836,
"learning_rate": 9.309497816593888e-06,
"loss": 2.4645,
"step": 33600
},
{
"epoch": 9.159686571196973,
"grad_norm": 2.0408966541290283,
"learning_rate": 8.490720524017468e-06,
"loss": 2.4776,
"step": 33900
},
{
"epoch": 9.240745744393408,
"grad_norm": 2.258899688720703,
"learning_rate": 7.671943231441048e-06,
"loss": 2.5065,
"step": 34200
},
{
"epoch": 9.32180491758984,
"grad_norm": 1.9570540189743042,
"learning_rate": 6.853165938864629e-06,
"loss": 2.4528,
"step": 34500
},
{
"epoch": 9.402864090786274,
"grad_norm": 2.107398509979248,
"learning_rate": 6.03438864628821e-06,
"loss": 2.4907,
"step": 34800
},
{
"epoch": 9.483923263982707,
"grad_norm": 1.9422398805618286,
"learning_rate": 5.21561135371179e-06,
"loss": 2.4871,
"step": 35100
},
{
"epoch": 9.564982437179141,
"grad_norm": 2.015700340270996,
"learning_rate": 4.396834061135372e-06,
"loss": 2.476,
"step": 35400
},
{
"epoch": 9.646041610375574,
"grad_norm": 2.022306203842163,
"learning_rate": 3.5780567685589524e-06,
"loss": 2.4837,
"step": 35700
},
{
"epoch": 9.727100783572007,
"grad_norm": 2.170642375946045,
"learning_rate": 2.759279475982533e-06,
"loss": 2.4854,
"step": 36000
},
{
"epoch": 9.80815995676844,
"grad_norm": 2.0419552326202393,
"learning_rate": 1.9405021834061136e-06,
"loss": 2.4956,
"step": 36300
},
{
"epoch": 9.889219129964875,
"grad_norm": 2.17526912689209,
"learning_rate": 1.1217248908296945e-06,
"loss": 2.5115,
"step": 36600
},
{
"epoch": 9.970278303161308,
"grad_norm": 1.8749662637710571,
"learning_rate": 3.0294759825327515e-07,
"loss": 2.4465,
"step": 36900
}
],
"logging_steps": 300,
"max_steps": 37010,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.7119298076672e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3872bdb655c67bdf5919cf9e0b02424ebb9d7fb07e8fa96ab22b7bdf3fb19645
size 5905

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e80668711160fe88c57b19b95bce171c5525abdaf0bc18b28a1aaec9c8a69340
size 499098240

1243
special_tokens_map.json Normal file

File diff suppressed because it is too large Load Diff

210940
tokenizer.json Normal file

File diff suppressed because one or more lines are too long

10829
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3872bdb655c67bdf5919cf9e0b02424ebb9d7fb07e8fa96ab22b7bdf3fb19645
size 5905