Files
Llama-3.1-8B-coding/checkpoint-556/trainer_state.json
ModelHub XC 39e6955dd1 初始化项目,由ModelHub XC社区提供模型
Model: mremila/Llama-3.1-8B-coding
Source: Original Platform
2026-04-25 21:02:20 +08:00

585 lines
17 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 556,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.4901377972215415,
"epoch": 0.017992690469496767,
"grad_norm": 2.2936885356903076,
"learning_rate": 2.647058823529412e-06,
"loss": 1.6211814880371094,
"mean_token_accuracy": 0.6704532062634826,
"num_tokens": 1320132.0,
"step": 10
},
{
"entropy": 1.276751457154751,
"epoch": 0.03598538093899353,
"grad_norm": 0.6068552732467651,
"learning_rate": 4.9814471243042675e-06,
"loss": 1.2583621978759765,
"mean_token_accuracy": 0.7108760349452495,
"num_tokens": 2628713.0,
"step": 20
},
{
"entropy": 1.1512113714590668,
"epoch": 0.0539780714084903,
"grad_norm": 0.0513942688703537,
"learning_rate": 4.888682745825603e-06,
"loss": 1.1380729675292969,
"mean_token_accuracy": 0.7266728295013308,
"num_tokens": 3970015.0,
"step": 30
},
{
"entropy": 1.1279350664466619,
"epoch": 0.07197076187798707,
"grad_norm": 0.054850462824106216,
"learning_rate": 4.795918367346939e-06,
"loss": 1.0975475311279297,
"mean_token_accuracy": 0.7291999347507954,
"num_tokens": 5293538.0,
"step": 40
},
{
"entropy": 1.115758778527379,
"epoch": 0.08996345234748383,
"grad_norm": 0.04872556030750275,
"learning_rate": 4.7031539888682745e-06,
"loss": 1.0612051010131835,
"mean_token_accuracy": 0.7381280666217208,
"num_tokens": 6620811.0,
"step": 50
},
{
"entropy": 1.1105864774435759,
"epoch": 0.1079561428169806,
"grad_norm": 0.04739998281002045,
"learning_rate": 4.610389610389611e-06,
"loss": 1.0470812797546387,
"mean_token_accuracy": 0.7379071025177837,
"num_tokens": 7936154.0,
"step": 60
},
{
"entropy": 1.0977919282391668,
"epoch": 0.12594883328647738,
"grad_norm": 0.040797509253025055,
"learning_rate": 4.517625231910946e-06,
"loss": 1.0206071853637695,
"mean_token_accuracy": 0.741416247934103,
"num_tokens": 9258443.0,
"step": 70
},
{
"entropy": 1.0766226774081589,
"epoch": 0.14394152375597413,
"grad_norm": 0.04117418825626373,
"learning_rate": 4.424860853432282e-06,
"loss": 1.0037202835083008,
"mean_token_accuracy": 0.7426602357998491,
"num_tokens": 10559451.0,
"step": 80
},
{
"entropy": 1.0392444429919123,
"epoch": 0.1619342142254709,
"grad_norm": 0.03727104142308235,
"learning_rate": 4.332096474953618e-06,
"loss": 0.9694362640380859,
"mean_token_accuracy": 0.7481566898524761,
"num_tokens": 11859629.0,
"step": 90
},
{
"entropy": 1.030888595432043,
"epoch": 0.17992690469496767,
"grad_norm": 0.0377194844186306,
"learning_rate": 4.239332096474954e-06,
"loss": 0.9774051666259765,
"mean_token_accuracy": 0.7471670845523477,
"num_tokens": 13170517.0,
"step": 100
},
{
"entropy": 0.9908933199942112,
"epoch": 0.19791959516446445,
"grad_norm": 0.03397062420845032,
"learning_rate": 4.14656771799629e-06,
"loss": 0.9399270057678223,
"mean_token_accuracy": 0.7530543757602572,
"num_tokens": 14480196.0,
"step": 110
},
{
"entropy": 0.9985105341300369,
"epoch": 0.2159122856339612,
"grad_norm": 0.038795359432697296,
"learning_rate": 4.053803339517626e-06,
"loss": 0.9471940994262695,
"mean_token_accuracy": 0.7546697033569216,
"num_tokens": 15807230.0,
"step": 120
},
{
"entropy": 0.9790718862786889,
"epoch": 0.23390497610345798,
"grad_norm": 0.03815858066082001,
"learning_rate": 3.961038961038962e-06,
"loss": 0.925960922241211,
"mean_token_accuracy": 0.7591136118397117,
"num_tokens": 17157655.0,
"step": 130
},
{
"entropy": 0.9843037761747837,
"epoch": 0.25189766657295476,
"grad_norm": 0.03516776114702225,
"learning_rate": 3.868274582560297e-06,
"loss": 0.9341155052185058,
"mean_token_accuracy": 0.7569106232374907,
"num_tokens": 18481580.0,
"step": 140
},
{
"entropy": 0.9783661976456642,
"epoch": 0.2698903570424515,
"grad_norm": 0.034192971885204315,
"learning_rate": 3.7755102040816327e-06,
"loss": 0.918891716003418,
"mean_token_accuracy": 0.7582158392295242,
"num_tokens": 19792039.0,
"step": 150
},
{
"entropy": 0.9883000548928976,
"epoch": 0.28788304751194826,
"grad_norm": 0.03616062551736832,
"learning_rate": 3.6827458256029685e-06,
"loss": 0.9350194931030273,
"mean_token_accuracy": 0.7552345667034388,
"num_tokens": 21132002.0,
"step": 160
},
{
"entropy": 0.962575543858111,
"epoch": 0.305875737981445,
"grad_norm": 0.031624436378479004,
"learning_rate": 3.5899814471243043e-06,
"loss": 0.9099706649780274,
"mean_token_accuracy": 0.7614207146689296,
"num_tokens": 22456610.0,
"step": 170
},
{
"entropy": 0.981575589068234,
"epoch": 0.3238684284509418,
"grad_norm": 0.03008902259171009,
"learning_rate": 3.49721706864564e-06,
"loss": 0.9275808334350586,
"mean_token_accuracy": 0.7563599238172174,
"num_tokens": 23784860.0,
"step": 180
},
{
"entropy": 0.9543529843911529,
"epoch": 0.3418611189204386,
"grad_norm": 0.03235575929284096,
"learning_rate": 3.404452690166976e-06,
"loss": 0.9126798629760742,
"mean_token_accuracy": 0.7603292245417833,
"num_tokens": 25106610.0,
"step": 190
},
{
"entropy": 0.9536242228001356,
"epoch": 0.35985380938993533,
"grad_norm": 0.033603642135858536,
"learning_rate": 3.311688311688312e-06,
"loss": 0.9094326019287109,
"mean_token_accuracy": 0.7603268170729279,
"num_tokens": 26404730.0,
"step": 200
},
{
"entropy": 0.9402435509487986,
"epoch": 0.3778464998594321,
"grad_norm": 0.029900604858994484,
"learning_rate": 3.218923933209648e-06,
"loss": 0.8853635787963867,
"mean_token_accuracy": 0.7637220246717333,
"num_tokens": 27746430.0,
"step": 210
},
{
"entropy": 0.9270002828910947,
"epoch": 0.3958391903289289,
"grad_norm": 0.03154909983277321,
"learning_rate": 3.1261595547309838e-06,
"loss": 0.8845057487487793,
"mean_token_accuracy": 0.7643253333866596,
"num_tokens": 29091240.0,
"step": 220
},
{
"entropy": 0.9196253689005971,
"epoch": 0.41383188079842564,
"grad_norm": 0.028953028842806816,
"learning_rate": 3.0333951762523196e-06,
"loss": 0.880043888092041,
"mean_token_accuracy": 0.7643528375774622,
"num_tokens": 30412544.0,
"step": 230
},
{
"entropy": 0.9138461783528328,
"epoch": 0.4318245712679224,
"grad_norm": 0.028740836307406425,
"learning_rate": 2.9406307977736554e-06,
"loss": 0.8804447174072265,
"mean_token_accuracy": 0.7650679206475616,
"num_tokens": 31721248.0,
"step": 240
},
{
"entropy": 0.9258439548313617,
"epoch": 0.44981726173741915,
"grad_norm": 0.027906838804483414,
"learning_rate": 2.8478664192949912e-06,
"loss": 0.8891608238220214,
"mean_token_accuracy": 0.7623051449656486,
"num_tokens": 33030621.0,
"step": 250
},
{
"entropy": 0.9231391252949834,
"epoch": 0.46780995220691596,
"grad_norm": 0.027720769867300987,
"learning_rate": 2.7551020408163266e-06,
"loss": 0.9020990371704102,
"mean_token_accuracy": 0.7595951380208135,
"num_tokens": 34328254.0,
"step": 260
},
{
"entropy": 0.9248277079313993,
"epoch": 0.4858026426764127,
"grad_norm": 0.028005970641970634,
"learning_rate": 2.6623376623376624e-06,
"loss": 0.8968218803405762,
"mean_token_accuracy": 0.7620166089385748,
"num_tokens": 35639568.0,
"step": 270
},
{
"entropy": 0.9164260600693523,
"epoch": 0.5037953331459095,
"grad_norm": 0.025676406919956207,
"learning_rate": 2.5695732838589982e-06,
"loss": 0.894569206237793,
"mean_token_accuracy": 0.7612657260149718,
"num_tokens": 36947904.0,
"step": 280
},
{
"entropy": 0.9089541524648667,
"epoch": 0.5217880236154062,
"grad_norm": 0.028434382751584053,
"learning_rate": 2.476808905380334e-06,
"loss": 0.8868412017822266,
"mean_token_accuracy": 0.763394633680582,
"num_tokens": 38281521.0,
"step": 290
},
{
"entropy": 0.9049528720788658,
"epoch": 0.539780714084903,
"grad_norm": 0.02663426101207733,
"learning_rate": 2.38404452690167e-06,
"loss": 0.8812618255615234,
"mean_token_accuracy": 0.7641567781567573,
"num_tokens": 39595803.0,
"step": 300
},
{
"entropy": 0.900223555136472,
"epoch": 0.5577734045543997,
"grad_norm": 0.026907267048954964,
"learning_rate": 2.2912801484230057e-06,
"loss": 0.8773960113525391,
"mean_token_accuracy": 0.7646851245313883,
"num_tokens": 40918054.0,
"step": 310
},
{
"entropy": 0.9072908268310129,
"epoch": 0.5757660950238965,
"grad_norm": 0.033084969967603683,
"learning_rate": 2.1985157699443415e-06,
"loss": 0.8849006652832031,
"mean_token_accuracy": 0.7633785914629698,
"num_tokens": 42245476.0,
"step": 320
},
{
"entropy": 0.9075088860467077,
"epoch": 0.5937587854933933,
"grad_norm": 0.029511412605643272,
"learning_rate": 2.1057513914656773e-06,
"loss": 0.8799509048461914,
"mean_token_accuracy": 0.7644402593374252,
"num_tokens": 43592571.0,
"step": 330
},
{
"entropy": 0.897929747030139,
"epoch": 0.61175147596289,
"grad_norm": 0.027747338637709618,
"learning_rate": 2.012987012987013e-06,
"loss": 0.8784950256347657,
"mean_token_accuracy": 0.7654943082481622,
"num_tokens": 44949762.0,
"step": 340
},
{
"entropy": 0.8959064597263933,
"epoch": 0.6297441664323868,
"grad_norm": 0.02585972286760807,
"learning_rate": 1.920222634508349e-06,
"loss": 0.8677197456359863,
"mean_token_accuracy": 0.7666845623403787,
"num_tokens": 46266907.0,
"step": 350
},
{
"entropy": 0.9085025515407323,
"epoch": 0.6477368569018837,
"grad_norm": 0.026946574449539185,
"learning_rate": 1.8274582560296848e-06,
"loss": 0.8925327301025391,
"mean_token_accuracy": 0.7623184407129884,
"num_tokens": 47577598.0,
"step": 360
},
{
"entropy": 0.8742405578494072,
"epoch": 0.6657295473713803,
"grad_norm": 0.026929043233394623,
"learning_rate": 1.7346938775510206e-06,
"loss": 0.8524269104003906,
"mean_token_accuracy": 0.7705512259155511,
"num_tokens": 48888300.0,
"step": 370
},
{
"entropy": 0.9005698974244296,
"epoch": 0.6837222378408772,
"grad_norm": 0.027014046907424927,
"learning_rate": 1.6419294990723564e-06,
"loss": 0.8712619781494141,
"mean_token_accuracy": 0.7643290877342224,
"num_tokens": 50229069.0,
"step": 380
},
{
"entropy": 0.8819140480831266,
"epoch": 0.701714928310374,
"grad_norm": 0.028174864128232002,
"learning_rate": 1.5491651205936922e-06,
"loss": 0.8646106719970703,
"mean_token_accuracy": 0.7674408122897148,
"num_tokens": 51578947.0,
"step": 390
},
{
"entropy": 0.8925842920318245,
"epoch": 0.7197076187798707,
"grad_norm": 0.027017617598176003,
"learning_rate": 1.456400742115028e-06,
"loss": 0.8714614868164062,
"mean_token_accuracy": 0.7669254776090384,
"num_tokens": 52930805.0,
"step": 400
},
{
"entropy": 0.889844935759902,
"epoch": 0.7377003092493675,
"grad_norm": 0.02721812203526497,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.8674912452697754,
"mean_token_accuracy": 0.7662461360916495,
"num_tokens": 54224294.0,
"step": 410
},
{
"entropy": 0.8719520575366915,
"epoch": 0.7556929997188642,
"grad_norm": 0.028012819588184357,
"learning_rate": 1.2708719851576994e-06,
"loss": 0.8511224746704101,
"mean_token_accuracy": 0.7702083302661776,
"num_tokens": 55540584.0,
"step": 420
},
{
"entropy": 0.8898111075162888,
"epoch": 0.773685690188361,
"grad_norm": 0.02642475627362728,
"learning_rate": 1.1781076066790352e-06,
"loss": 0.8730297088623047,
"mean_token_accuracy": 0.7653367448598146,
"num_tokens": 56827841.0,
"step": 430
},
{
"entropy": 0.8857162812724709,
"epoch": 0.7916783806578578,
"grad_norm": 0.02740148827433586,
"learning_rate": 1.0853432282003713e-06,
"loss": 0.8713733673095703,
"mean_token_accuracy": 0.7659575197845697,
"num_tokens": 58130682.0,
"step": 440
},
{
"entropy": 0.8843438906595111,
"epoch": 0.8096710711273545,
"grad_norm": 0.025668496266007423,
"learning_rate": 9.925788497217069e-07,
"loss": 0.8760784149169922,
"mean_token_accuracy": 0.7651905825361609,
"num_tokens": 59444140.0,
"step": 450
},
{
"entropy": 0.876284147053957,
"epoch": 0.8276637615968513,
"grad_norm": 0.026019152253866196,
"learning_rate": 8.998144712430428e-07,
"loss": 0.8590941429138184,
"mean_token_accuracy": 0.7688775883987546,
"num_tokens": 60778522.0,
"step": 460
},
{
"entropy": 0.8704025126062334,
"epoch": 0.8456564520663481,
"grad_norm": 0.024385536089539528,
"learning_rate": 8.070500927643786e-07,
"loss": 0.8481533050537109,
"mean_token_accuracy": 0.7709953064098954,
"num_tokens": 62138075.0,
"step": 470
},
{
"entropy": 0.886689430475235,
"epoch": 0.8636491425358448,
"grad_norm": 0.027147600427269936,
"learning_rate": 7.142857142857143e-07,
"loss": 0.8655129432678222,
"mean_token_accuracy": 0.7670928187668323,
"num_tokens": 63450349.0,
"step": 480
},
{
"entropy": 0.8841921042650938,
"epoch": 0.8816418330053416,
"grad_norm": 0.025846796110272408,
"learning_rate": 6.215213358070501e-07,
"loss": 0.8744302749633789,
"mean_token_accuracy": 0.7654220588505268,
"num_tokens": 64770576.0,
"step": 490
},
{
"entropy": 0.8944361335597932,
"epoch": 0.8996345234748383,
"grad_norm": 0.025025852024555206,
"learning_rate": 5.287569573283859e-07,
"loss": 0.8789453506469727,
"mean_token_accuracy": 0.7639346193522215,
"num_tokens": 66113087.0,
"step": 500
},
{
"entropy": 0.8843724082224071,
"epoch": 0.9176272139443351,
"grad_norm": 0.02651493437588215,
"learning_rate": 4.359925788497217e-07,
"loss": 0.8675421714782715,
"mean_token_accuracy": 0.7664000844582916,
"num_tokens": 67464302.0,
"step": 510
},
{
"entropy": 0.8899071650579572,
"epoch": 0.9356199044138319,
"grad_norm": 0.025058092549443245,
"learning_rate": 3.4322820037105757e-07,
"loss": 0.879638385772705,
"mean_token_accuracy": 0.7650359075516462,
"num_tokens": 68809443.0,
"step": 520
},
{
"entropy": 0.8678001549094916,
"epoch": 0.9536125948833286,
"grad_norm": 0.025574836879968643,
"learning_rate": 2.5046382189239333e-07,
"loss": 0.8517162322998046,
"mean_token_accuracy": 0.7706384485587477,
"num_tokens": 70130884.0,
"step": 530
},
{
"entropy": 0.8980348063632846,
"epoch": 0.9716052853528254,
"grad_norm": 0.02690030448138714,
"learning_rate": 1.5769944341372915e-07,
"loss": 0.8926727294921875,
"mean_token_accuracy": 0.7621918022632599,
"num_tokens": 71446103.0,
"step": 540
},
{
"entropy": 0.8809462685137988,
"epoch": 0.9895979758223222,
"grad_norm": 0.02480347640812397,
"learning_rate": 6.493506493506495e-08,
"loss": 0.8590832710266113,
"mean_token_accuracy": 0.7687337175011635,
"num_tokens": 72793622.0,
"step": 550
}
],
"logging_steps": 10,
"max_steps": 556,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0599989240114708e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}