{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.919056052845723,
|
|
"eval_steps": 500,
|
|
"global_step": 1200,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.9638157235458493,
|
|
"epoch": 0.007658800440381025,
|
|
"grad_norm": 8.468399047851562,
|
|
"learning_rate": 1.125e-06,
|
|
"loss": 1.000040054321289,
|
|
"mean_token_accuracy": 0.784000868536532,
|
|
"num_tokens": 2256601.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.8869377555325627,
|
|
"epoch": 0.01531760088076205,
|
|
"grad_norm": 0.4561779201030731,
|
|
"learning_rate": 2.375e-06,
|
|
"loss": 0.8214498519897461,
|
|
"mean_token_accuracy": 0.8017587121576071,
|
|
"num_tokens": 4548350.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.6845712042413652,
|
|
"epoch": 0.022976401321143074,
|
|
"grad_norm": 0.07862971723079681,
|
|
"learning_rate": 3.625e-06,
|
|
"loss": 0.6381561279296875,
|
|
"mean_token_accuracy": 0.8316645501181483,
|
|
"num_tokens": 6813579.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.619562475476414,
|
|
"epoch": 0.0306352017615241,
|
|
"grad_norm": 0.04751257970929146,
|
|
"learning_rate": 4.875e-06,
|
|
"loss": 0.5589711666107178,
|
|
"mean_token_accuracy": 0.8444838780909777,
|
|
"num_tokens": 9161357.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.5984557962045074,
|
|
"epoch": 0.038294002201905125,
|
|
"grad_norm": 0.08734887838363647,
|
|
"learning_rate": 4.964454976303318e-06,
|
|
"loss": 0.5401619911193848,
|
|
"mean_token_accuracy": 0.8480381922796368,
|
|
"num_tokens": 11470288.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.5861514512449503,
|
|
"epoch": 0.04595280264228615,
|
|
"grad_norm": 0.0356341153383255,
|
|
"learning_rate": 4.924960505529227e-06,
|
|
"loss": 0.5160280227661133,
|
|
"mean_token_accuracy": 0.8562987703830004,
|
|
"num_tokens": 13763429.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.5815956988371909,
|
|
"epoch": 0.05361160308266718,
|
|
"grad_norm": 0.06630558520555496,
|
|
"learning_rate": 4.885466034755134e-06,
|
|
"loss": 0.5006961822509766,
|
|
"mean_token_accuracy": 0.8571984633803368,
|
|
"num_tokens": 16017906.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.5651088379323482,
|
|
"epoch": 0.0612704035230482,
|
|
"grad_norm": 0.026391323655843735,
|
|
"learning_rate": 4.845971563981043e-06,
|
|
"loss": 0.4796905517578125,
|
|
"mean_token_accuracy": 0.86026117708534,
|
|
"num_tokens": 18302237.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.5485505453310907,
|
|
"epoch": 0.06892920396342923,
|
|
"grad_norm": 0.16785100102424622,
|
|
"learning_rate": 4.806477093206952e-06,
|
|
"loss": 0.47562880516052247,
|
|
"mean_token_accuracy": 0.8600230842828751,
|
|
"num_tokens": 20613396.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.5333567421883345,
|
|
"epoch": 0.07658800440381025,
|
|
"grad_norm": 0.03617825359106064,
|
|
"learning_rate": 4.766982622432859e-06,
|
|
"loss": 0.467279052734375,
|
|
"mean_token_accuracy": 0.8615671737119556,
|
|
"num_tokens": 22894278.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.5182453896850348,
|
|
"epoch": 0.08424680484419128,
|
|
"grad_norm": 0.06548844277858734,
|
|
"learning_rate": 4.727488151658769e-06,
|
|
"loss": 0.458143424987793,
|
|
"mean_token_accuracy": 0.8627345286309719,
|
|
"num_tokens": 25206563.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.5145570897497237,
|
|
"epoch": 0.0919056052845723,
|
|
"grad_norm": 0.03264116495847702,
|
|
"learning_rate": 4.6879936808846766e-06,
|
|
"loss": 0.45707268714904786,
|
|
"mean_token_accuracy": 0.8632168389856816,
|
|
"num_tokens": 27484040.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.5025211286731064,
|
|
"epoch": 0.09956440572495333,
|
|
"grad_norm": 0.027719072997570038,
|
|
"learning_rate": 4.648499210110584e-06,
|
|
"loss": 0.4481193542480469,
|
|
"mean_token_accuracy": 0.8667424885556102,
|
|
"num_tokens": 29792202.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.49839433738961814,
|
|
"epoch": 0.10722320616533436,
|
|
"grad_norm": 0.06785497069358826,
|
|
"learning_rate": 4.609004739336494e-06,
|
|
"loss": 0.4417428016662598,
|
|
"mean_token_accuracy": 0.8677690284326672,
|
|
"num_tokens": 32107572.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.49580598613247273,
|
|
"epoch": 0.11488200660571538,
|
|
"grad_norm": 0.06671962141990662,
|
|
"learning_rate": 4.5695102685624015e-06,
|
|
"loss": 0.4398933410644531,
|
|
"mean_token_accuracy": 0.8677896987646818,
|
|
"num_tokens": 34407548.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.49476480865851047,
|
|
"epoch": 0.1225408070460964,
|
|
"grad_norm": 0.02333025634288788,
|
|
"learning_rate": 4.53001579778831e-06,
|
|
"loss": 0.4360370635986328,
|
|
"mean_token_accuracy": 0.8685005273669958,
|
|
"num_tokens": 36664288.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.48315772889181974,
|
|
"epoch": 0.13019960748647744,
|
|
"grad_norm": 0.022907257080078125,
|
|
"learning_rate": 4.490521327014219e-06,
|
|
"loss": 0.4286161422729492,
|
|
"mean_token_accuracy": 0.8704841218888759,
|
|
"num_tokens": 38965655.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.4817703261971474,
|
|
"epoch": 0.13785840792685847,
|
|
"grad_norm": 0.02129477635025978,
|
|
"learning_rate": 4.4510268562401265e-06,
|
|
"loss": 0.42794160842895507,
|
|
"mean_token_accuracy": 0.8699940867722035,
|
|
"num_tokens": 41234311.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.47914891233667734,
|
|
"epoch": 0.14551720836723947,
|
|
"grad_norm": 0.018593771383166313,
|
|
"learning_rate": 4.411532385466035e-06,
|
|
"loss": 0.42693591117858887,
|
|
"mean_token_accuracy": 0.869681833870709,
|
|
"num_tokens": 43517016.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.46877209544181825,
|
|
"epoch": 0.1531760088076205,
|
|
"grad_norm": 0.054788339883089066,
|
|
"learning_rate": 4.372037914691944e-06,
|
|
"loss": 0.4187319755554199,
|
|
"mean_token_accuracy": 0.8715709690004587,
|
|
"num_tokens": 45813790.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.4654896330088377,
|
|
"epoch": 0.16083480924800153,
|
|
"grad_norm": 0.028184032067656517,
|
|
"learning_rate": 4.332543443917852e-06,
|
|
"loss": 0.42058391571044923,
|
|
"mean_token_accuracy": 0.8709377760067583,
|
|
"num_tokens": 48144428.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.465091020707041,
|
|
"epoch": 0.16849360968838256,
|
|
"grad_norm": 0.020246045663952827,
|
|
"learning_rate": 4.29304897314376e-06,
|
|
"loss": 0.42343916893005373,
|
|
"mean_token_accuracy": 0.8702428733929992,
|
|
"num_tokens": 50411268.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.4618240131996572,
|
|
"epoch": 0.1761524101287636,
|
|
"grad_norm": 0.019168304279446602,
|
|
"learning_rate": 4.253554502369669e-06,
|
|
"loss": 0.42192907333374025,
|
|
"mean_token_accuracy": 0.8700458832085133,
|
|
"num_tokens": 52712022.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.4555769263766706,
|
|
"epoch": 0.1838112105691446,
|
|
"grad_norm": 0.028189843520522118,
|
|
"learning_rate": 4.214060031595577e-06,
|
|
"loss": 0.41624298095703127,
|
|
"mean_token_accuracy": 0.8716806696727872,
|
|
"num_tokens": 55014306.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.44512661090120675,
|
|
"epoch": 0.19147001100952563,
|
|
"grad_norm": 0.05628414452075958,
|
|
"learning_rate": 4.174565560821485e-06,
|
|
"loss": 0.4077006340026855,
|
|
"mean_token_accuracy": 0.8738998031243682,
|
|
"num_tokens": 57332962.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.4452826808206737,
|
|
"epoch": 0.19912881144990666,
|
|
"grad_norm": 0.03325086459517479,
|
|
"learning_rate": 4.135071090047394e-06,
|
|
"loss": 0.4117462158203125,
|
|
"mean_token_accuracy": 0.872441022284329,
|
|
"num_tokens": 59617425.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.4452002733945847,
|
|
"epoch": 0.2067876118902877,
|
|
"grad_norm": 0.025545459240674973,
|
|
"learning_rate": 4.095576619273302e-06,
|
|
"loss": 0.4101984977722168,
|
|
"mean_token_accuracy": 0.8726631047204136,
|
|
"num_tokens": 61923455.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.4457569817081094,
|
|
"epoch": 0.21444641233066872,
|
|
"grad_norm": 0.031541287899017334,
|
|
"learning_rate": 4.05608214849921e-06,
|
|
"loss": 0.41226825714111326,
|
|
"mean_token_accuracy": 0.8723130978643894,
|
|
"num_tokens": 64205792.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.43516338849440217,
|
|
"epoch": 0.22210521277104975,
|
|
"grad_norm": 0.0174368005245924,
|
|
"learning_rate": 4.0165876777251185e-06,
|
|
"loss": 0.40465373992919923,
|
|
"mean_token_accuracy": 0.8751402111724019,
|
|
"num_tokens": 66513837.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.4423691611737013,
|
|
"epoch": 0.22976401321143075,
|
|
"grad_norm": 0.017861908301711082,
|
|
"learning_rate": 3.977093206951027e-06,
|
|
"loss": 0.4081140041351318,
|
|
"mean_token_accuracy": 0.8730683302506804,
|
|
"num_tokens": 68807723.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.43804038101807236,
|
|
"epoch": 0.23742281365181178,
|
|
"grad_norm": 0.01862718164920807,
|
|
"learning_rate": 3.937598736176936e-06,
|
|
"loss": 0.4079318046569824,
|
|
"mean_token_accuracy": 0.8735693100839853,
|
|
"num_tokens": 71105215.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.4372365009970963,
|
|
"epoch": 0.2450816140921928,
|
|
"grad_norm": 0.01921224780380726,
|
|
"learning_rate": 3.898104265402844e-06,
|
|
"loss": 0.4054897308349609,
|
|
"mean_token_accuracy": 0.8743662687018514,
|
|
"num_tokens": 73443927.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.4320651748217642,
|
|
"epoch": 0.25274041453257384,
|
|
"grad_norm": 0.01882867142558098,
|
|
"learning_rate": 3.858609794628752e-06,
|
|
"loss": 0.40141096115112307,
|
|
"mean_token_accuracy": 0.8753820607438684,
|
|
"num_tokens": 75767699.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.43937554229050874,
|
|
"epoch": 0.2603992149729549,
|
|
"grad_norm": 0.017837367951869965,
|
|
"learning_rate": 3.819115323854661e-06,
|
|
"loss": 0.40896854400634763,
|
|
"mean_token_accuracy": 0.8731667961925268,
|
|
"num_tokens": 78059352.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.4369427585974336,
|
|
"epoch": 0.2680580154133359,
|
|
"grad_norm": 0.02532646618783474,
|
|
"learning_rate": 3.779620853080569e-06,
|
|
"loss": 0.40844316482543946,
|
|
"mean_token_accuracy": 0.8738381527364254,
|
|
"num_tokens": 80348013.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.4335848417133093,
|
|
"epoch": 0.27571681585371693,
|
|
"grad_norm": 0.031413592398166656,
|
|
"learning_rate": 3.7401263823064775e-06,
|
|
"loss": 0.4057870388031006,
|
|
"mean_token_accuracy": 0.8742154082283378,
|
|
"num_tokens": 82659712.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.43348568454384806,
|
|
"epoch": 0.2833756162940979,
|
|
"grad_norm": 0.018019968643784523,
|
|
"learning_rate": 3.7006319115323856e-06,
|
|
"loss": 0.4047835826873779,
|
|
"mean_token_accuracy": 0.8740507639944554,
|
|
"num_tokens": 84980573.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.4382829017937183,
|
|
"epoch": 0.29103441673447894,
|
|
"grad_norm": 0.03078663907945156,
|
|
"learning_rate": 3.661137440758294e-06,
|
|
"loss": 0.4107412338256836,
|
|
"mean_token_accuracy": 0.8724170258268714,
|
|
"num_tokens": 87201556.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.43776033921167257,
|
|
"epoch": 0.29869321717485997,
|
|
"grad_norm": 0.02386470139026642,
|
|
"learning_rate": 3.6216429699842024e-06,
|
|
"loss": 0.4104489326477051,
|
|
"mean_token_accuracy": 0.8728113612160087,
|
|
"num_tokens": 89478842.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.43476897608488796,
|
|
"epoch": 0.306352017615241,
|
|
"grad_norm": 0.01759020984172821,
|
|
"learning_rate": 3.5821484992101106e-06,
|
|
"loss": 0.40773825645446776,
|
|
"mean_token_accuracy": 0.8733439993113279,
|
|
"num_tokens": 91786832.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.4269565034657717,
|
|
"epoch": 0.31401081805562203,
|
|
"grad_norm": 0.02766292169690132,
|
|
"learning_rate": 3.5426540284360196e-06,
|
|
"loss": 0.40129985809326174,
|
|
"mean_token_accuracy": 0.8752415424212814,
|
|
"num_tokens": 94116280.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.43023997033014894,
|
|
"epoch": 0.32166961849600306,
|
|
"grad_norm": 0.02584155462682247,
|
|
"learning_rate": 3.5031595576619278e-06,
|
|
"loss": 0.4028194427490234,
|
|
"mean_token_accuracy": 0.8748508550226688,
|
|
"num_tokens": 96390327.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.4311048804782331,
|
|
"epoch": 0.3293284189363841,
|
|
"grad_norm": 0.030514976009726524,
|
|
"learning_rate": 3.463665086887836e-06,
|
|
"loss": 0.40514497756958007,
|
|
"mean_token_accuracy": 0.8740883070975543,
|
|
"num_tokens": 98693689.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.42613045433536173,
|
|
"epoch": 0.3369872193767651,
|
|
"grad_norm": 0.028530791401863098,
|
|
"learning_rate": 3.4241706161137446e-06,
|
|
"loss": 0.3992255449295044,
|
|
"mean_token_accuracy": 0.8751774175092578,
|
|
"num_tokens": 101002410.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.4248706246726215,
|
|
"epoch": 0.34464601981714615,
|
|
"grad_norm": 0.019795197993516922,
|
|
"learning_rate": 3.3846761453396527e-06,
|
|
"loss": 0.3998436450958252,
|
|
"mean_token_accuracy": 0.8754259610548616,
|
|
"num_tokens": 103306507.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.42447849148884415,
|
|
"epoch": 0.3523048202575272,
|
|
"grad_norm": 0.026115981861948967,
|
|
"learning_rate": 3.3451816745655613e-06,
|
|
"loss": 0.39901156425476075,
|
|
"mean_token_accuracy": 0.8751692553982139,
|
|
"num_tokens": 105628688.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.42756049148738384,
|
|
"epoch": 0.3599636206979082,
|
|
"grad_norm": 0.024642089381814003,
|
|
"learning_rate": 3.3056872037914695e-06,
|
|
"loss": 0.40213947296142577,
|
|
"mean_token_accuracy": 0.8749727945774793,
|
|
"num_tokens": 107893640.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.42482230020686984,
|
|
"epoch": 0.3676224211382892,
|
|
"grad_norm": 0.01740400120615959,
|
|
"learning_rate": 3.2661927330173777e-06,
|
|
"loss": 0.39930453300476076,
|
|
"mean_token_accuracy": 0.8750920739024878,
|
|
"num_tokens": 110182930.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.4245555128902197,
|
|
"epoch": 0.3752812215786702,
|
|
"grad_norm": 0.01773119531571865,
|
|
"learning_rate": 3.2266982622432863e-06,
|
|
"loss": 0.4016741752624512,
|
|
"mean_token_accuracy": 0.875009255297482,
|
|
"num_tokens": 112454989.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.42941147135570645,
|
|
"epoch": 0.38294002201905125,
|
|
"grad_norm": 0.024097498506307602,
|
|
"learning_rate": 3.1872037914691945e-06,
|
|
"loss": 0.4024195671081543,
|
|
"mean_token_accuracy": 0.8748315701261162,
|
|
"num_tokens": 114752265.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.4233192947693169,
|
|
"epoch": 0.3905988224594323,
|
|
"grad_norm": 0.020476436242461205,
|
|
"learning_rate": 3.147709320695103e-06,
|
|
"loss": 0.39432778358459475,
|
|
"mean_token_accuracy": 0.8766942717134952,
|
|
"num_tokens": 117042981.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.4223570663481951,
|
|
"epoch": 0.3982576228998133,
|
|
"grad_norm": 0.023389821872115135,
|
|
"learning_rate": 3.1082148499210112e-06,
|
|
"loss": 0.3961241006851196,
|
|
"mean_token_accuracy": 0.875825615786016,
|
|
"num_tokens": 119360637.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.42124154828488825,
|
|
"epoch": 0.40591642334019434,
|
|
"grad_norm": 0.024319512769579887,
|
|
"learning_rate": 3.0687203791469194e-06,
|
|
"loss": 0.3995026111602783,
|
|
"mean_token_accuracy": 0.8755284296348691,
|
|
"num_tokens": 121646498.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 0.4242598842829466,
|
|
"epoch": 0.4135752237805754,
|
|
"grad_norm": 0.021054713055491447,
|
|
"learning_rate": 3.029225908372828e-06,
|
|
"loss": 0.39807853698730467,
|
|
"mean_token_accuracy": 0.8752687338739633,
|
|
"num_tokens": 123920056.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 0.4243672636337578,
|
|
"epoch": 0.4212340242209564,
|
|
"grad_norm": 0.02523292973637581,
|
|
"learning_rate": 2.989731437598736e-06,
|
|
"loss": 0.4009854316711426,
|
|
"mean_token_accuracy": 0.875128398090601,
|
|
"num_tokens": 126207207.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 0.42228690376505257,
|
|
"epoch": 0.42889282466133744,
|
|
"grad_norm": 0.01712065190076828,
|
|
"learning_rate": 2.950236966824645e-06,
|
|
"loss": 0.3969071865081787,
|
|
"mean_token_accuracy": 0.8754786295816303,
|
|
"num_tokens": 128477832.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 0.4186239805072546,
|
|
"epoch": 0.43655162510171847,
|
|
"grad_norm": 0.0163181871175766,
|
|
"learning_rate": 2.910742496050553e-06,
|
|
"loss": 0.3907261848449707,
|
|
"mean_token_accuracy": 0.8775882260873914,
|
|
"num_tokens": 130769237.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 0.42039443040266633,
|
|
"epoch": 0.4442104255420995,
|
|
"grad_norm": 0.02180050127208233,
|
|
"learning_rate": 2.871248025276461e-06,
|
|
"loss": 0.397492790222168,
|
|
"mean_token_accuracy": 0.876148846000433,
|
|
"num_tokens": 133031465.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 0.4161387952044606,
|
|
"epoch": 0.45186922598248047,
|
|
"grad_norm": 0.017672181129455566,
|
|
"learning_rate": 2.83175355450237e-06,
|
|
"loss": 0.3924778699874878,
|
|
"mean_token_accuracy": 0.8767253663390875,
|
|
"num_tokens": 135322215.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 0.42231198167428374,
|
|
"epoch": 0.4595280264228615,
|
|
"grad_norm": 0.031108738854527473,
|
|
"learning_rate": 2.7922590837282783e-06,
|
|
"loss": 0.3990061283111572,
|
|
"mean_token_accuracy": 0.8758387329056859,
|
|
"num_tokens": 137590880.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 0.4186031956225634,
|
|
"epoch": 0.46718682686324253,
|
|
"grad_norm": 0.020063655450940132,
|
|
"learning_rate": 2.752764612954187e-06,
|
|
"loss": 0.3937615156173706,
|
|
"mean_token_accuracy": 0.8771176159381866,
|
|
"num_tokens": 139882957.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 0.4200515809468925,
|
|
"epoch": 0.47484562730362356,
|
|
"grad_norm": 0.01840071938931942,
|
|
"learning_rate": 2.713270142180095e-06,
|
|
"loss": 0.39367630481719973,
|
|
"mean_token_accuracy": 0.8769001543521882,
|
|
"num_tokens": 142150557.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 0.41964845787733795,
|
|
"epoch": 0.4825044277440046,
|
|
"grad_norm": 0.03402973338961601,
|
|
"learning_rate": 2.6737756714060033e-06,
|
|
"loss": 0.3966160535812378,
|
|
"mean_token_accuracy": 0.876146792806685,
|
|
"num_tokens": 144417758.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 0.41940504405647516,
|
|
"epoch": 0.4901632281843856,
|
|
"grad_norm": 0.01644454151391983,
|
|
"learning_rate": 2.634281200631912e-06,
|
|
"loss": 0.3938936710357666,
|
|
"mean_token_accuracy": 0.8767383242025971,
|
|
"num_tokens": 146708789.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 0.4258418914861977,
|
|
"epoch": 0.49782202862476665,
|
|
"grad_norm": 0.028102336451411247,
|
|
"learning_rate": 2.59478672985782e-06,
|
|
"loss": 0.40206151008605956,
|
|
"mean_token_accuracy": 0.8747876984998584,
|
|
"num_tokens": 149020891.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 0.4147974385879934,
|
|
"epoch": 0.5054808290651477,
|
|
"grad_norm": 0.022107699885964394,
|
|
"learning_rate": 2.5552922590837287e-06,
|
|
"loss": 0.3919835090637207,
|
|
"mean_token_accuracy": 0.8775576103478671,
|
|
"num_tokens": 151299185.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 0.41798324035480616,
|
|
"epoch": 0.5131396295055287,
|
|
"grad_norm": 0.016613123938441277,
|
|
"learning_rate": 2.515797788309637e-06,
|
|
"loss": 0.39252438545227053,
|
|
"mean_token_accuracy": 0.8769917484372854,
|
|
"num_tokens": 153586618.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 0.42186861447989943,
|
|
"epoch": 0.5207984299459097,
|
|
"grad_norm": 0.01724848710000515,
|
|
"learning_rate": 2.4763033175355454e-06,
|
|
"loss": 0.39810571670532224,
|
|
"mean_token_accuracy": 0.8758242284879089,
|
|
"num_tokens": 155856504.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 0.41550648426637055,
|
|
"epoch": 0.5284572303862908,
|
|
"grad_norm": 0.023920124396681786,
|
|
"learning_rate": 2.4368088467614536e-06,
|
|
"loss": 0.3909403085708618,
|
|
"mean_token_accuracy": 0.8782436966896057,
|
|
"num_tokens": 158153561.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 0.416356707457453,
|
|
"epoch": 0.5361160308266718,
|
|
"grad_norm": 0.025771064683794975,
|
|
"learning_rate": 2.397314375987362e-06,
|
|
"loss": 0.3935497522354126,
|
|
"mean_token_accuracy": 0.8770827081054449,
|
|
"num_tokens": 160414992.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 0.4169067163951695,
|
|
"epoch": 0.5437748312670528,
|
|
"grad_norm": 0.0343230739235878,
|
|
"learning_rate": 2.3578199052132704e-06,
|
|
"loss": 0.3934483528137207,
|
|
"mean_token_accuracy": 0.8766279637813568,
|
|
"num_tokens": 162719857.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 0.4152266987599432,
|
|
"epoch": 0.5514336317074339,
|
|
"grad_norm": 0.03518790379166603,
|
|
"learning_rate": 2.3183254344391786e-06,
|
|
"loss": 0.39179143905639646,
|
|
"mean_token_accuracy": 0.8775566022843122,
|
|
"num_tokens": 164998717.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 0.41512856343761084,
|
|
"epoch": 0.5590924321478149,
|
|
"grad_norm": 0.017510589212179184,
|
|
"learning_rate": 2.278830963665087e-06,
|
|
"loss": 0.3936178207397461,
|
|
"mean_token_accuracy": 0.8766550052911043,
|
|
"num_tokens": 167283423.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 0.41410760041326283,
|
|
"epoch": 0.5667512325881958,
|
|
"grad_norm": 0.028657181188464165,
|
|
"learning_rate": 2.2393364928909954e-06,
|
|
"loss": 0.3910004377365112,
|
|
"mean_token_accuracy": 0.8774017574265599,
|
|
"num_tokens": 169574664.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 0.41111645018681886,
|
|
"epoch": 0.5744100330285768,
|
|
"grad_norm": 0.017415538430213928,
|
|
"learning_rate": 2.1998420221169035e-06,
|
|
"loss": 0.3903531551361084,
|
|
"mean_token_accuracy": 0.878159393183887,
|
|
"num_tokens": 171918950.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 0.4121713091619313,
|
|
"epoch": 0.5820688334689579,
|
|
"grad_norm": 0.016998812556266785,
|
|
"learning_rate": 2.160347551342812e-06,
|
|
"loss": 0.38824028968811036,
|
|
"mean_token_accuracy": 0.8780525822192431,
|
|
"num_tokens": 174167500.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 0.41725681545212867,
|
|
"epoch": 0.5897276339093389,
|
|
"grad_norm": 0.02490777149796486,
|
|
"learning_rate": 2.1208530805687207e-06,
|
|
"loss": 0.3949615955352783,
|
|
"mean_token_accuracy": 0.8761186260730028,
|
|
"num_tokens": 176430133.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 0.41779537945985795,
|
|
"epoch": 0.5973864343497199,
|
|
"grad_norm": 0.023372486233711243,
|
|
"learning_rate": 2.081358609794629e-06,
|
|
"loss": 0.3959986925125122,
|
|
"mean_token_accuracy": 0.8762047516182065,
|
|
"num_tokens": 178715515.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 0.4114751876331866,
|
|
"epoch": 0.605045234790101,
|
|
"grad_norm": 0.01719413697719574,
|
|
"learning_rate": 2.0418641390205375e-06,
|
|
"loss": 0.3856500148773193,
|
|
"mean_token_accuracy": 0.8787943137809634,
|
|
"num_tokens": 180969485.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 0.41450852565467355,
|
|
"epoch": 0.612704035230482,
|
|
"grad_norm": 0.01740705594420433,
|
|
"learning_rate": 2.0023696682464457e-06,
|
|
"loss": 0.3912628650665283,
|
|
"mean_token_accuracy": 0.8774056326597929,
|
|
"num_tokens": 183230970.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 0.4187679937109351,
|
|
"epoch": 0.620362835670863,
|
|
"grad_norm": 0.021541906520724297,
|
|
"learning_rate": 1.962875197472354e-06,
|
|
"loss": 0.3923694610595703,
|
|
"mean_token_accuracy": 0.8767649749293923,
|
|
"num_tokens": 2285361.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 0.41270146872848273,
|
|
"epoch": 0.6280216361112441,
|
|
"grad_norm": 0.015697607770562172,
|
|
"learning_rate": 1.9233807266982625e-06,
|
|
"loss": 0.3898160457611084,
|
|
"mean_token_accuracy": 0.8781723350286483,
|
|
"num_tokens": 4587240.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 0.41292855991050603,
|
|
"epoch": 0.6356804365516251,
|
|
"grad_norm": 0.020294206216931343,
|
|
"learning_rate": 1.8838862559241708e-06,
|
|
"loss": 0.3890320062637329,
|
|
"mean_token_accuracy": 0.8778593957424163,
|
|
"num_tokens": 6872728.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 0.411221484746784,
|
|
"epoch": 0.6433392369920061,
|
|
"grad_norm": 0.046215225011110306,
|
|
"learning_rate": 1.8443917851500792e-06,
|
|
"loss": 0.3870939970016479,
|
|
"mean_token_accuracy": 0.8784448400139808,
|
|
"num_tokens": 9160881.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 0.41191457901149986,
|
|
"epoch": 0.6509980374323872,
|
|
"grad_norm": 0.01619116961956024,
|
|
"learning_rate": 1.8048973143759876e-06,
|
|
"loss": 0.3880185604095459,
|
|
"mean_token_accuracy": 0.8788302283734083,
|
|
"num_tokens": 11456890.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 0.4106498261913657,
|
|
"epoch": 0.6586568378727682,
|
|
"grad_norm": 0.016796967014670372,
|
|
"learning_rate": 1.7654028436018958e-06,
|
|
"loss": 0.38854246139526366,
|
|
"mean_token_accuracy": 0.8780328661203385,
|
|
"num_tokens": 13779035.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 0.41066021313890816,
|
|
"epoch": 0.6663156383131492,
|
|
"grad_norm": 0.10197632014751434,
|
|
"learning_rate": 1.7259083728278042e-06,
|
|
"loss": 0.3858454465866089,
|
|
"mean_token_accuracy": 0.878836939483881,
|
|
"num_tokens": 16050638.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 0.41092505119740963,
|
|
"epoch": 0.6739744387535302,
|
|
"grad_norm": 0.019850848242640495,
|
|
"learning_rate": 1.6864139020537126e-06,
|
|
"loss": 0.38851447105407716,
|
|
"mean_token_accuracy": 0.8778711641207337,
|
|
"num_tokens": 18346546.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 0.4060887537896633,
|
|
"epoch": 0.6816332391939113,
|
|
"grad_norm": 0.01596878468990326,
|
|
"learning_rate": 1.646919431279621e-06,
|
|
"loss": 0.38296055793762207,
|
|
"mean_token_accuracy": 0.8796854361891746,
|
|
"num_tokens": 20693646.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 0.41415045112371446,
|
|
"epoch": 0.6892920396342923,
|
|
"grad_norm": 0.020703142508864403,
|
|
"learning_rate": 1.6074249605055296e-06,
|
|
"loss": 0.39114315509796144,
|
|
"mean_token_accuracy": 0.8775241926312447,
|
|
"num_tokens": 22979888.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 0.4099827105179429,
|
|
"epoch": 0.6969508400746733,
|
|
"grad_norm": 0.018190376460552216,
|
|
"learning_rate": 1.5679304897314377e-06,
|
|
"loss": 0.3876938343048096,
|
|
"mean_token_accuracy": 0.8783342713490129,
|
|
"num_tokens": 25245863.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 0.4111726184375584,
|
|
"epoch": 0.7046096405150544,
|
|
"grad_norm": 0.03450653702020645,
|
|
"learning_rate": 1.5284360189573461e-06,
|
|
"loss": 0.38763377666473386,
|
|
"mean_token_accuracy": 0.8786343418061733,
|
|
"num_tokens": 27522191.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 0.41031082523986695,
|
|
"epoch": 0.7122684409554354,
|
|
"grad_norm": 0.019164785742759705,
|
|
"learning_rate": 1.4889415481832545e-06,
|
|
"loss": 0.38515076637268064,
|
|
"mean_token_accuracy": 0.8786031175404787,
|
|
"num_tokens": 29812189.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 0.4064248114824295,
|
|
"epoch": 0.7199272413958164,
|
|
"grad_norm": 0.01673167012631893,
|
|
"learning_rate": 1.449447077409163e-06,
|
|
"loss": 0.38536901473999025,
|
|
"mean_token_accuracy": 0.8789769830182195,
|
|
"num_tokens": 32068083.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 0.4080692335031927,
|
|
"epoch": 0.7275860418361974,
|
|
"grad_norm": 0.018068261444568634,
|
|
"learning_rate": 1.4099526066350713e-06,
|
|
"loss": 0.38577680587768554,
|
|
"mean_token_accuracy": 0.879298797622323,
|
|
"num_tokens": 34347274.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 0.4161804819479585,
|
|
"epoch": 0.7352448422765784,
|
|
"grad_norm": 0.018596794456243515,
|
|
"learning_rate": 1.3704581358609795e-06,
|
|
"loss": 0.3958181858062744,
|
|
"mean_token_accuracy": 0.8759849725291133,
|
|
"num_tokens": 36614119.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 0.4100917984731495,
|
|
"epoch": 0.7429036427169594,
|
|
"grad_norm": 0.022355731576681137,
|
|
"learning_rate": 1.3309636650868879e-06,
|
|
"loss": 0.38584303855895996,
|
|
"mean_token_accuracy": 0.8787054903805256,
|
|
"num_tokens": 38895883.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 0.40771132363006474,
|
|
"epoch": 0.7505624431573404,
|
|
"grad_norm": 0.0167247261852026,
|
|
"learning_rate": 1.2914691943127962e-06,
|
|
"loss": 0.3863053321838379,
|
|
"mean_token_accuracy": 0.8788028365001083,
|
|
"num_tokens": 41161264.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 0.4112453758716583,
|
|
"epoch": 0.7582212435977215,
|
|
"grad_norm": 0.017709029838442802,
|
|
"learning_rate": 1.2519747235387048e-06,
|
|
"loss": 0.38743517398834226,
|
|
"mean_token_accuracy": 0.8780813764780759,
|
|
"num_tokens": 43424897.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 0.40866071078926325,
|
|
"epoch": 0.7658800440381025,
|
|
"grad_norm": 0.03608441352844238,
|
|
"learning_rate": 1.212480252764613e-06,
|
|
"loss": 0.38464813232421874,
|
|
"mean_token_accuracy": 0.8790146630257368,
|
|
"num_tokens": 45688164.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 0.406084228400141,
|
|
"epoch": 0.7735388444784835,
|
|
"grad_norm": 0.01752273179590702,
|
|
"learning_rate": 1.1729857819905214e-06,
|
|
"loss": 0.3844747543334961,
|
|
"mean_token_accuracy": 0.8798089537769556,
|
|
"num_tokens": 48021969.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 0.4065625052899122,
|
|
"epoch": 0.7811976449188646,
|
|
"grad_norm": 0.015777474269270897,
|
|
"learning_rate": 1.1334913112164298e-06,
|
|
"loss": 0.3854344844818115,
|
|
"mean_token_accuracy": 0.8788379110395909,
|
|
"num_tokens": 50319637.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 0.41205244278535247,
|
|
"epoch": 0.7888564453592456,
|
|
"grad_norm": 0.02172328531742096,
|
|
"learning_rate": 1.0939968404423382e-06,
|
|
"loss": 0.3861358880996704,
|
|
"mean_token_accuracy": 0.8784137150272727,
|
|
"num_tokens": 52584409.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 0.4055585923604667,
|
|
"epoch": 0.7965152457996266,
|
|
"grad_norm": 0.015919683501124382,
|
|
"learning_rate": 1.0545023696682466e-06,
|
|
"loss": 0.38269662857055664,
|
|
"mean_token_accuracy": 0.8797270691022276,
|
|
"num_tokens": 54841155.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 0.4033643173985183,
|
|
"epoch": 0.8041740462400077,
|
|
"grad_norm": 0.016592318192124367,
|
|
"learning_rate": 1.015007898894155e-06,
|
|
"loss": 0.38209493160247804,
|
|
"mean_token_accuracy": 0.8799605475738644,
|
|
"num_tokens": 57158410.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 0.4045918888412416,
|
|
"epoch": 0.8118328466803887,
|
|
"grad_norm": 0.016041293740272522,
|
|
"learning_rate": 9.755134281200633e-07,
|
|
"loss": 0.3831462383270264,
|
|
"mean_token_accuracy": 0.879774154163897,
|
|
"num_tokens": 59496509.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 0.41076484909281136,
|
|
"epoch": 0.8194916471207697,
|
|
"grad_norm": 0.016471123322844505,
|
|
"learning_rate": 9.360189573459716e-07,
|
|
"loss": 0.38496901988983157,
|
|
"mean_token_accuracy": 0.8790599407628179,
|
|
"num_tokens": 61783974.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 0.40901572797447444,
|
|
"epoch": 0.8271504475611507,
|
|
"grad_norm": 0.01565743237733841,
|
|
"learning_rate": 8.9652448657188e-07,
|
|
"loss": 0.3854458570480347,
|
|
"mean_token_accuracy": 0.8787517255172134,
|
|
"num_tokens": 64100657.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 0.4073885683901608,
|
|
"epoch": 0.8348092480015318,
|
|
"grad_norm": 0.015450418926775455,
|
|
"learning_rate": 8.570300157977884e-07,
|
|
"loss": 0.38377454280853274,
|
|
"mean_token_accuracy": 0.8794623363763094,
|
|
"num_tokens": 66386805.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 0.4052654759958386,
|
|
"epoch": 0.8424680484419128,
|
|
"grad_norm": 0.015288250520825386,
|
|
"learning_rate": 8.175355450236967e-07,
|
|
"loss": 0.3812230348587036,
|
|
"mean_token_accuracy": 0.8800647355616092,
|
|
"num_tokens": 68721383.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 0.4113547313027084,
|
|
"epoch": 0.8501268488822938,
|
|
"grad_norm": 0.02288076840341091,
|
|
"learning_rate": 7.780410742496052e-07,
|
|
"loss": 0.3886786222457886,
|
|
"mean_token_accuracy": 0.8778966784477233,
|
|
"num_tokens": 70994578.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 0.4017532772384584,
|
|
"epoch": 0.8577856493226749,
|
|
"grad_norm": 0.03342736139893532,
|
|
"learning_rate": 7.385466034755135e-07,
|
|
"loss": 0.37719638347625734,
|
|
"mean_token_accuracy": 0.88170400056988,
|
|
"num_tokens": 73294302.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 0.4073382027447224,
|
|
"epoch": 0.8654444497630559,
|
|
"grad_norm": 0.015439708717167377,
|
|
"learning_rate": 6.990521327014219e-07,
|
|
"loss": 0.3865658283233643,
|
|
"mean_token_accuracy": 0.8787333536893129,
|
|
"num_tokens": 75596477.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 0.4074658391997218,
|
|
"epoch": 0.8731032502034369,
|
|
"grad_norm": 0.0176975317299366,
|
|
"learning_rate": 6.595576619273302e-07,
|
|
"loss": 0.3862590789794922,
|
|
"mean_token_accuracy": 0.8786328813061118,
|
|
"num_tokens": 77899557.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 0.40545356962829826,
|
|
"epoch": 0.880762050643818,
|
|
"grad_norm": 0.022836821153759956,
|
|
"learning_rate": 6.200631911532385e-07,
|
|
"loss": 0.3839853763580322,
|
|
"mean_token_accuracy": 0.8795729441568255,
|
|
"num_tokens": 80171019.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 0.4044990832917392,
|
|
"epoch": 0.888420851084199,
|
|
"grad_norm": 0.021449508145451546,
|
|
"learning_rate": 5.80568720379147e-07,
|
|
"loss": 0.38073570728302003,
|
|
"mean_token_accuracy": 0.8800560528412461,
|
|
"num_tokens": 82521264.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 0.40236521204933523,
|
|
"epoch": 0.8960796515245799,
|
|
"grad_norm": 0.02636878378689289,
|
|
"learning_rate": 5.410742496050553e-07,
|
|
"loss": 0.3819872379302979,
|
|
"mean_token_accuracy": 0.8800198381766677,
|
|
"num_tokens": 84803701.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 0.4063895161263645,
|
|
"epoch": 0.9037384519649609,
|
|
"grad_norm": 0.019358456134796143,
|
|
"learning_rate": 5.015797788309637e-07,
|
|
"loss": 0.38217973709106445,
|
|
"mean_token_accuracy": 0.8796799056231975,
|
|
"num_tokens": 87127376.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 0.40959922643378377,
|
|
"epoch": 0.911397252405342,
|
|
"grad_norm": 0.018155870959162712,
|
|
"learning_rate": 4.6208530805687207e-07,
|
|
"loss": 0.3856808662414551,
|
|
"mean_token_accuracy": 0.8791890177875757,
|
|
"num_tokens": 89417694.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 0.4059012939222157,
|
|
"epoch": 0.919056052845723,
|
|
"grad_norm": 0.019169267266988754,
|
|
"learning_rate": 4.225908372827804e-07,
|
|
"loss": 0.38498947620391843,
|
|
"mean_token_accuracy": 0.879179273173213,
|
|
"num_tokens": 91699734.0,
|
|
"step": 1200
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 1306,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 200,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.114433819648459e+18,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|