Files
naima-dojo-741hz-v3/last-checkpoint/trainer_state.json
ModelHub XC 62f79733d7 初始化项目,由ModelHub XC社区提供模型
Model: misterJB/naima-dojo-741hz-v3
Source: Original Platform
2026-05-19 23:39:18 +08:00

675 lines
19 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9862844814301125,
"eval_steps": 500,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 5.028895375132561,
"epoch": 0.015410695022345508,
"grad_norm": 6.3125,
"learning_rate": 4.897959183673469e-06,
"loss": 7.5902880859375,
"mean_token_accuracy": 0.09856362253893167,
"num_tokens": 132392.0,
"step": 25
},
{
"entropy": 5.056147763133049,
"epoch": 0.030821390044691015,
"grad_norm": 3.28125,
"learning_rate": 1e-05,
"loss": 4.8427432250976565,
"mean_token_accuracy": 0.33738345025107264,
"num_tokens": 254164.0,
"step": 50
},
{
"entropy": 3.3333908554911615,
"epoch": 0.04623208506703652,
"grad_norm": 2.046875,
"learning_rate": 9.841168996188057e-06,
"loss": 3.2706063842773436,
"mean_token_accuracy": 0.5363785127736628,
"num_tokens": 383737.0,
"step": 75
},
{
"entropy": 2.5955499114096163,
"epoch": 0.06164278008938203,
"grad_norm": 1.6171875,
"learning_rate": 9.682337992376113e-06,
"loss": 2.6094720458984373,
"mean_token_accuracy": 0.631682768985629,
"num_tokens": 516457.0,
"step": 100
},
{
"entropy": 2.3073494301736357,
"epoch": 0.07705347511172754,
"grad_norm": 2.078125,
"learning_rate": 9.523506988564168e-06,
"loss": 2.3903713989257813,
"mean_token_accuracy": 0.6567921816185117,
"num_tokens": 636033.0,
"step": 125
},
{
"entropy": 2.3254229539632796,
"epoch": 0.09246417013407304,
"grad_norm": 1.7890625,
"learning_rate": 9.364675984752224e-06,
"loss": 2.4465567016601564,
"mean_token_accuracy": 0.6496502718515694,
"num_tokens": 777342.0,
"step": 150
},
{
"entropy": 2.2117018654197453,
"epoch": 0.10787486515641856,
"grad_norm": 1.484375,
"learning_rate": 9.20584498094028e-06,
"loss": 2.288543701171875,
"mean_token_accuracy": 0.664428948648274,
"num_tokens": 902453.0,
"step": 175
},
{
"entropy": 2.174446207880974,
"epoch": 0.12328556017876406,
"grad_norm": 1.5078125,
"learning_rate": 9.047013977128337e-06,
"loss": 2.327859649658203,
"mean_token_accuracy": 0.6647384916990995,
"num_tokens": 1039427.0,
"step": 200
},
{
"entropy": 2.0399726448208093,
"epoch": 0.13869625520110956,
"grad_norm": 1.640625,
"learning_rate": 8.888182973316391e-06,
"loss": 2.156089324951172,
"mean_token_accuracy": 0.6871329558640719,
"num_tokens": 1166840.0,
"step": 225
},
{
"entropy": 2.071619209870696,
"epoch": 0.15410695022345508,
"grad_norm": 2.203125,
"learning_rate": 8.729351969504447e-06,
"loss": 2.2405677795410157,
"mean_token_accuracy": 0.680448934994638,
"num_tokens": 1300789.0,
"step": 250
},
{
"entropy": 2.032237692028284,
"epoch": 0.1695176452458006,
"grad_norm": 1.78125,
"learning_rate": 8.570520965692504e-06,
"loss": 2.1679531860351564,
"mean_token_accuracy": 0.6796054230630397,
"num_tokens": 1432596.0,
"step": 275
},
{
"entropy": 1.9784896748512983,
"epoch": 0.18492834026814609,
"grad_norm": 1.109375,
"learning_rate": 8.41168996188056e-06,
"loss": 2.0713957214355467,
"mean_token_accuracy": 0.6908648996800184,
"num_tokens": 1559688.0,
"step": 300
},
{
"entropy": 1.9056549924612045,
"epoch": 0.2003390352904916,
"grad_norm": 1.171875,
"learning_rate": 8.252858958068616e-06,
"loss": 2.1153318786621096,
"mean_token_accuracy": 0.7020338359847664,
"num_tokens": 1688970.0,
"step": 325
},
{
"entropy": 1.9747070623934269,
"epoch": 0.21574973031283712,
"grad_norm": 1.8046875,
"learning_rate": 8.09402795425667e-06,
"loss": 2.1253143310546876,
"mean_token_accuracy": 0.6871407954767347,
"num_tokens": 1816182.0,
"step": 350
},
{
"entropy": 2.024077450931072,
"epoch": 0.2311604253351826,
"grad_norm": 1.09375,
"learning_rate": 7.935196950444729e-06,
"loss": 2.1842442321777344,
"mean_token_accuracy": 0.677086523026228,
"num_tokens": 1950644.0,
"step": 375
},
{
"entropy": 1.9184869919717311,
"epoch": 0.24657112035752812,
"grad_norm": 1.3203125,
"learning_rate": 7.776365946632783e-06,
"loss": 2.0085203552246096,
"mean_token_accuracy": 0.694020996466279,
"num_tokens": 2079928.0,
"step": 400
},
{
"entropy": 1.8541498044878244,
"epoch": 0.26198181537987364,
"grad_norm": 1.09375,
"learning_rate": 7.617534942820839e-06,
"loss": 2.0481745910644533,
"mean_token_accuracy": 0.7019787009432912,
"num_tokens": 2218227.0,
"step": 425
},
{
"entropy": 1.9318342459201814,
"epoch": 0.27739251040221913,
"grad_norm": 1.7734375,
"learning_rate": 7.458703939008896e-06,
"loss": 1.995413818359375,
"mean_token_accuracy": 0.6925960695371032,
"num_tokens": 2350679.0,
"step": 450
},
{
"entropy": 1.7835957117378711,
"epoch": 0.29280320542456467,
"grad_norm": 1.2734375,
"learning_rate": 7.299872935196951e-06,
"loss": 1.928093719482422,
"mean_token_accuracy": 0.7172103912383317,
"num_tokens": 2472981.0,
"step": 475
},
{
"entropy": 1.9411964005231857,
"epoch": 0.30821390044691016,
"grad_norm": 1.609375,
"learning_rate": 7.141041931385007e-06,
"loss": 2.106062774658203,
"mean_token_accuracy": 0.6849398523569107,
"num_tokens": 2614781.0,
"step": 500
},
{
"entropy": 1.7948928633891046,
"epoch": 0.32362459546925565,
"grad_norm": 1.859375,
"learning_rate": 6.982210927573063e-06,
"loss": 1.9581819152832032,
"mean_token_accuracy": 0.70925975356251,
"num_tokens": 2743217.0,
"step": 525
},
{
"entropy": 1.720666101127863,
"epoch": 0.3390352904916012,
"grad_norm": 1.5703125,
"learning_rate": 6.823379923761118e-06,
"loss": 1.8939352416992188,
"mean_token_accuracy": 0.7219540763273835,
"num_tokens": 2872564.0,
"step": 550
},
{
"entropy": 1.765952904894948,
"epoch": 0.3544459855139467,
"grad_norm": 1.1640625,
"learning_rate": 6.6645489199491745e-06,
"loss": 1.8741084289550782,
"mean_token_accuracy": 0.7158531962707638,
"num_tokens": 3003409.0,
"step": 575
},
{
"entropy": 1.7450720983743668,
"epoch": 0.36985668053629217,
"grad_norm": 1.53125,
"learning_rate": 6.505717916137231e-06,
"loss": 1.8735758972167968,
"mean_token_accuracy": 0.7193968405947089,
"num_tokens": 3134051.0,
"step": 600
},
{
"entropy": 1.7807146763801576,
"epoch": 0.3852673755586377,
"grad_norm": 1.5078125,
"learning_rate": 6.346886912325286e-06,
"loss": 1.89009521484375,
"mean_token_accuracy": 0.7120489033311606,
"num_tokens": 3272289.0,
"step": 625
},
{
"entropy": 1.688743471726775,
"epoch": 0.4006780705809832,
"grad_norm": 1.5390625,
"learning_rate": 6.188055908513342e-06,
"loss": 1.8537098693847656,
"mean_token_accuracy": 0.7283642463758588,
"num_tokens": 3395473.0,
"step": 650
},
{
"entropy": 1.681125262901187,
"epoch": 0.4160887656033287,
"grad_norm": 1.21875,
"learning_rate": 6.029224904701399e-06,
"loss": 1.7891111755371094,
"mean_token_accuracy": 0.7292297334969043,
"num_tokens": 3524237.0,
"step": 675
},
{
"entropy": 1.7346787237748504,
"epoch": 0.43149946062567424,
"grad_norm": 1.625,
"learning_rate": 5.870393900889454e-06,
"loss": 1.9180752563476562,
"mean_token_accuracy": 0.716539504416287,
"num_tokens": 3660040.0,
"step": 700
},
{
"entropy": 1.7547597530111672,
"epoch": 0.4469101556480197,
"grad_norm": 1.7109375,
"learning_rate": 5.71156289707751e-06,
"loss": 1.8909840393066406,
"mean_token_accuracy": 0.7133365147560835,
"num_tokens": 3791154.0,
"step": 725
},
{
"entropy": 1.792205568253994,
"epoch": 0.4623208506703652,
"grad_norm": 1.203125,
"learning_rate": 5.552731893265566e-06,
"loss": 1.93789306640625,
"mean_token_accuracy": 0.7044468146562576,
"num_tokens": 3930320.0,
"step": 750
},
{
"entropy": 1.801283170208335,
"epoch": 0.47773154569271076,
"grad_norm": 1.3125,
"learning_rate": 5.393900889453621e-06,
"loss": 1.971471405029297,
"mean_token_accuracy": 0.702786465510726,
"num_tokens": 4061353.0,
"step": 775
},
{
"entropy": 1.7702496079355479,
"epoch": 0.49314224071505625,
"grad_norm": 1.515625,
"learning_rate": 5.235069885641678e-06,
"loss": 1.8937255859375,
"mean_token_accuracy": 0.7105545987561345,
"num_tokens": 4188252.0,
"step": 800
},
{
"entropy": 1.746090711504221,
"epoch": 0.5085529357374018,
"grad_norm": 1.3671875,
"learning_rate": 5.076238881829734e-06,
"loss": 1.8904119873046874,
"mean_token_accuracy": 0.710063861683011,
"num_tokens": 4319900.0,
"step": 825
},
{
"entropy": 1.6968115794286132,
"epoch": 0.5239636307597473,
"grad_norm": 1.40625,
"learning_rate": 4.91740787801779e-06,
"loss": 1.8843421936035156,
"mean_token_accuracy": 0.7204603585228324,
"num_tokens": 4452384.0,
"step": 850
},
{
"entropy": 1.7424921029433609,
"epoch": 0.5393743257820928,
"grad_norm": 1.0625,
"learning_rate": 4.758576874205845e-06,
"loss": 1.8412220764160157,
"mean_token_accuracy": 0.7172071708366274,
"num_tokens": 4583222.0,
"step": 875
},
{
"entropy": 1.7446832180023193,
"epoch": 0.5547850208044383,
"grad_norm": 1.7109375,
"learning_rate": 4.599745870393902e-06,
"loss": 1.9152328491210937,
"mean_token_accuracy": 0.7119575057178735,
"num_tokens": 4715126.0,
"step": 900
},
{
"entropy": 1.7576309859752655,
"epoch": 0.5701957158267837,
"grad_norm": 1.96875,
"learning_rate": 4.440914866581957e-06,
"loss": 1.8862844848632812,
"mean_token_accuracy": 0.7049576634168625,
"num_tokens": 4850794.0,
"step": 925
},
{
"entropy": 1.7258573825657368,
"epoch": 0.5856064108491293,
"grad_norm": 1.1875,
"learning_rate": 4.282083862770013e-06,
"loss": 1.818639678955078,
"mean_token_accuracy": 0.716189993545413,
"num_tokens": 4981104.0,
"step": 950
},
{
"entropy": 1.6470270904898643,
"epoch": 0.6010171058714748,
"grad_norm": 1.2421875,
"learning_rate": 4.123252858958069e-06,
"loss": 1.7824436950683593,
"mean_token_accuracy": 0.7320361129194498,
"num_tokens": 5114019.0,
"step": 975
},
{
"entropy": 1.62755079947412,
"epoch": 0.6164278008938203,
"grad_norm": 1.015625,
"learning_rate": 3.964421855146125e-06,
"loss": 1.7079803466796875,
"mean_token_accuracy": 0.7364445444941521,
"num_tokens": 5243964.0,
"step": 1000
},
{
"entropy": 1.7295116788893938,
"epoch": 0.6318384959161658,
"grad_norm": 1.3125,
"learning_rate": 3.8055908513341803e-06,
"loss": 1.844159393310547,
"mean_token_accuracy": 0.712419720813632,
"num_tokens": 5377738.0,
"step": 1025
},
{
"entropy": 1.6804203514009715,
"epoch": 0.6472491909385113,
"grad_norm": 1.2734375,
"learning_rate": 3.6467598475222366e-06,
"loss": 1.8214646911621093,
"mean_token_accuracy": 0.7212847074493766,
"num_tokens": 5510981.0,
"step": 1050
},
{
"entropy": 1.5993686743080615,
"epoch": 0.6626598859608568,
"grad_norm": 1.2890625,
"learning_rate": 3.4879288437102924e-06,
"loss": 1.6954243469238282,
"mean_token_accuracy": 0.7390070861950516,
"num_tokens": 5633201.0,
"step": 1075
},
{
"entropy": 1.667136338762939,
"epoch": 0.6780705809832024,
"grad_norm": 1.4375,
"learning_rate": 3.3290978398983487e-06,
"loss": 1.7823049926757812,
"mean_token_accuracy": 0.7207983901910484,
"num_tokens": 5759874.0,
"step": 1100
},
{
"entropy": 1.678079522177577,
"epoch": 0.6934812760055479,
"grad_norm": 1.25,
"learning_rate": 3.170266836086404e-06,
"loss": 1.7977125549316406,
"mean_token_accuracy": 0.7210700345411897,
"num_tokens": 5883476.0,
"step": 1125
},
{
"entropy": 1.6764129892736674,
"epoch": 0.7088919710278934,
"grad_norm": 1.2578125,
"learning_rate": 3.0114358322744603e-06,
"loss": 1.8482670593261719,
"mean_token_accuracy": 0.7182181442528963,
"num_tokens": 6011086.0,
"step": 1150
},
{
"entropy": 1.7141736481338739,
"epoch": 0.7243026660502389,
"grad_norm": 2.078125,
"learning_rate": 2.852604828462516e-06,
"loss": 1.8182452392578126,
"mean_token_accuracy": 0.7161370900273323,
"num_tokens": 6146546.0,
"step": 1175
},
{
"entropy": 1.641965696439147,
"epoch": 0.7397133610725843,
"grad_norm": 1.375,
"learning_rate": 2.693773824650572e-06,
"loss": 1.779376220703125,
"mean_token_accuracy": 0.7270625644922256,
"num_tokens": 6281309.0,
"step": 1200
},
{
"entropy": 1.6535117710381746,
"epoch": 0.7551240560949298,
"grad_norm": 1.5703125,
"learning_rate": 2.534942820838628e-06,
"loss": 1.762202606201172,
"mean_token_accuracy": 0.7227025451511144,
"num_tokens": 6407111.0,
"step": 1225
},
{
"entropy": 1.5914642249792814,
"epoch": 0.7705347511172754,
"grad_norm": 1.2265625,
"learning_rate": 2.376111817026684e-06,
"loss": 1.7256475830078124,
"mean_token_accuracy": 0.7357470904290676,
"num_tokens": 6537257.0,
"step": 1250
},
{
"entropy": 1.7124161531031132,
"epoch": 0.7859454461396209,
"grad_norm": 1.2890625,
"learning_rate": 2.21728081321474e-06,
"loss": 1.8616783142089843,
"mean_token_accuracy": 0.7149319493025541,
"num_tokens": 6662719.0,
"step": 1275
},
{
"entropy": 1.6728860459476709,
"epoch": 0.8013561411619664,
"grad_norm": 1.0390625,
"learning_rate": 2.0584498094027953e-06,
"loss": 1.8268055725097656,
"mean_token_accuracy": 0.7221832738444209,
"num_tokens": 6798456.0,
"step": 1300
},
{
"entropy": 1.6639322647452355,
"epoch": 0.8167668361843119,
"grad_norm": 1.421875,
"learning_rate": 1.8996188055908516e-06,
"loss": 1.7712481689453126,
"mean_token_accuracy": 0.72425989869982,
"num_tokens": 6928111.0,
"step": 1325
},
{
"entropy": 1.6321870504319669,
"epoch": 0.8321775312066574,
"grad_norm": 1.4609375,
"learning_rate": 1.7407878017789074e-06,
"loss": 1.8233981323242188,
"mean_token_accuracy": 0.728203468695283,
"num_tokens": 7062876.0,
"step": 1350
},
{
"entropy": 1.8206283743306995,
"epoch": 0.847588226229003,
"grad_norm": 1.3671875,
"learning_rate": 1.5819567979669634e-06,
"loss": 1.9631840515136718,
"mean_token_accuracy": 0.7005566702410578,
"num_tokens": 7198189.0,
"step": 1375
},
{
"entropy": 1.6183030263334512,
"epoch": 0.8629989212513485,
"grad_norm": 1.171875,
"learning_rate": 1.4231257941550193e-06,
"loss": 1.7469532775878907,
"mean_token_accuracy": 0.7299554903805255,
"num_tokens": 7331299.0,
"step": 1400
},
{
"entropy": 1.6894471324980258,
"epoch": 0.878409616273694,
"grad_norm": 0.95703125,
"learning_rate": 1.2642947903430749e-06,
"loss": 1.819949951171875,
"mean_token_accuracy": 0.7225298710912466,
"num_tokens": 7468273.0,
"step": 1425
},
{
"entropy": 1.6117998372018336,
"epoch": 0.8938203112960394,
"grad_norm": 1.3125,
"learning_rate": 1.105463786531131e-06,
"loss": 1.7713958740234375,
"mean_token_accuracy": 0.7342760527133941,
"num_tokens": 7598957.0,
"step": 1450
},
{
"entropy": 1.7378646701574325,
"epoch": 0.9092310063183849,
"grad_norm": 1.109375,
"learning_rate": 9.466327827191868e-07,
"loss": 1.8703489685058594,
"mean_token_accuracy": 0.7118765298649669,
"num_tokens": 7731099.0,
"step": 1475
},
{
"entropy": 1.6067133033648133,
"epoch": 0.9246417013407304,
"grad_norm": 0.9453125,
"learning_rate": 7.878017789072427e-07,
"loss": 1.7261012268066407,
"mean_token_accuracy": 0.7332322986423969,
"num_tokens": 7858562.0,
"step": 1500
},
{
"entropy": 1.6211172859743237,
"epoch": 0.940052396363076,
"grad_norm": 1.1875,
"learning_rate": 6.289707750952986e-07,
"loss": 1.801383514404297,
"mean_token_accuracy": 0.7289227614179253,
"num_tokens": 7990016.0,
"step": 1525
},
{
"entropy": 1.6162557833641769,
"epoch": 0.9554630913854215,
"grad_norm": 1.125,
"learning_rate": 4.7013977128335456e-07,
"loss": 1.7632601928710938,
"mean_token_accuracy": 0.7340098781138659,
"num_tokens": 8128392.0,
"step": 1550
},
{
"entropy": 1.638201398998499,
"epoch": 0.970873786407767,
"grad_norm": 1.0546875,
"learning_rate": 3.1130876747141044e-07,
"loss": 1.7828684997558595,
"mean_token_accuracy": 0.7265522088482976,
"num_tokens": 8255611.0,
"step": 1575
},
{
"entropy": 1.742434518635273,
"epoch": 0.9862844814301125,
"grad_norm": 1.0546875,
"learning_rate": 1.5247776365946635e-07,
"loss": 1.8525875854492186,
"mean_token_accuracy": 0.7128269827365875,
"num_tokens": 8385670.0,
"step": 1600
}
],
"logging_steps": 25,
"max_steps": 1623,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0231669797680909e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}