Files
gemma-3-1b-it-IFeval/train/log.json
ModelHub XC 12f0f58b49 初始化项目,由ModelHub XC社区提供模型
Model: kth8/gemma-3-1b-it-IFeval
Source: Original Platform
2026-05-23 23:00:56 +08:00

1426 lines
32 KiB
JSON

[
{
"loss": 3.2371,
"grad_norm": 4.275357246398926,
"learning_rate": 3.870967741935484e-05,
"epoch": 0.010810810810810811,
"step": 10
},
{
"loss": 2.0982,
"grad_norm": 1.6401941776275635,
"learning_rate": 8.172043010752689e-05,
"epoch": 0.021621621621621623,
"step": 20
},
{
"loss": 1.7814,
"grad_norm": 1.0990010499954224,
"learning_rate": 0.00012473118279569893,
"epoch": 0.032432432432432434,
"step": 30
},
{
"loss": 1.8768,
"grad_norm": 1.2032235860824585,
"learning_rate": 0.00016774193548387098,
"epoch": 0.043243243243243246,
"step": 40
},
{
"loss": 1.7967,
"grad_norm": 0.8312140107154846,
"learning_rate": 0.000210752688172043,
"epoch": 0.05405405405405406,
"step": 50
},
{
"loss": 1.7088,
"grad_norm": 0.7974215149879456,
"learning_rate": 0.00025376344086021504,
"epoch": 0.06486486486486487,
"step": 60
},
{
"loss": 1.7921,
"grad_norm": 0.8754441142082214,
"learning_rate": 0.0002967741935483871,
"epoch": 0.07567567567567568,
"step": 70
},
{
"loss": 1.7169,
"grad_norm": 0.9818633794784546,
"learning_rate": 0.00033978494623655914,
"epoch": 0.08648648648648649,
"step": 80
},
{
"loss": 1.8469,
"grad_norm": 1.0391249656677246,
"learning_rate": 0.0003827956989247312,
"epoch": 0.0972972972972973,
"step": 90
},
{
"loss": 1.763,
"grad_norm": 1.2312983274459839,
"learning_rate": 0.00039998849055034085,
"epoch": 0.10810810810810811,
"step": 100
},
{
"loss": 1.9223,
"grad_norm": 1.136441707611084,
"learning_rate": 0.00039991815982176333,
"epoch": 0.11891891891891893,
"step": 110
},
{
"loss": 1.9628,
"grad_norm": 0.9119946360588074,
"learning_rate": 0.0003997839149608889,
"epoch": 0.12972972972972974,
"step": 120
},
{
"eval_loss": 1.777016043663025,
"eval_runtime": 18.3656,
"eval_samples_per_second": 42.416,
"eval_steps_per_second": 10.618,
"epoch": 0.13297297297297297,
"step": 123
},
{
"loss": 1.6845,
"grad_norm": 1.4189170598983765,
"learning_rate": 0.00039958579888599896,
"epoch": 0.14054054054054055,
"step": 130
},
{
"loss": 1.8529,
"grad_norm": 1.0813618898391724,
"learning_rate": 0.00039932387493509636,
"epoch": 0.15135135135135136,
"step": 140
},
{
"loss": 1.7712,
"grad_norm": 1.0759323835372925,
"learning_rate": 0.00039899822684565697,
"epoch": 0.16216216216216217,
"step": 150
},
{
"loss": 1.7869,
"grad_norm": 1.0583269596099854,
"learning_rate": 0.00039860895872785806,
"epoch": 0.17297297297297298,
"step": 160
},
{
"loss": 1.8945,
"grad_norm": 1.1669530868530273,
"learning_rate": 0.0003981561950312943,
"epoch": 0.1837837837837838,
"step": 170
},
{
"loss": 1.8797,
"grad_norm": 1.0436373949050903,
"learning_rate": 0.0003976400805051915,
"epoch": 0.1945945945945946,
"step": 180
},
{
"loss": 1.9332,
"grad_norm": 0.8406238555908203,
"learning_rate": 0.00039706078015212907,
"epoch": 0.20540540540540542,
"step": 190
},
{
"loss": 1.7786,
"grad_norm": 1.1354097127914429,
"learning_rate": 0.0003964184791752895,
"epoch": 0.21621621621621623,
"step": 200
},
{
"loss": 1.8089,
"grad_norm": 1.4123671054840088,
"learning_rate": 0.0003957133829192479,
"epoch": 0.22702702702702704,
"step": 210
},
{
"loss": 1.7916,
"grad_norm": 0.9333382248878479,
"learning_rate": 0.00039494571680432364,
"epoch": 0.23783783783783785,
"step": 220
},
{
"loss": 1.7808,
"grad_norm": 1.0521595478057861,
"learning_rate": 0.0003941157262545123,
"epoch": 0.24864864864864866,
"step": 230
},
{
"loss": 1.7977,
"grad_norm": 1.2558528184890747,
"learning_rate": 0.00039322367661902426,
"epoch": 0.2594594594594595,
"step": 240
},
{
"eval_loss": 1.7805718183517456,
"eval_runtime": 10.1271,
"eval_samples_per_second": 76.922,
"eval_steps_per_second": 19.255,
"epoch": 0.26594594594594595,
"step": 246
},
{
"loss": 1.8482,
"grad_norm": 1.1523817777633667,
"learning_rate": 0.00039226985308745137,
"epoch": 0.2702702702702703,
"step": 250
},
{
"loss": 1.685,
"grad_norm": 0.9244216084480286,
"learning_rate": 0.00039125456059859175,
"epoch": 0.2810810810810811,
"step": 260
},
{
"loss": 1.8774,
"grad_norm": 1.1236217021942139,
"learning_rate": 0.0003901781237429604,
"epoch": 0.2918918918918919,
"step": 270
},
{
"loss": 1.9237,
"grad_norm": 1.112891435623169,
"learning_rate": 0.0003890408866590171,
"epoch": 0.3027027027027027,
"step": 280
},
{
"loss": 1.7769,
"grad_norm": 1.0233204364776611,
"learning_rate": 0.00038784321292314485,
"epoch": 0.31351351351351353,
"step": 290
},
{
"loss": 1.8192,
"grad_norm": 1.1586676836013794,
"learning_rate": 0.00038658548543341384,
"epoch": 0.32432432432432434,
"step": 300
},
{
"loss": 1.7718,
"grad_norm": 1.034834384918213,
"learning_rate": 0.00038526810628716854,
"epoch": 0.33513513513513515,
"step": 310
},
{
"loss": 1.6869,
"grad_norm": 1.1128815412521362,
"learning_rate": 0.0003838914966524765,
"epoch": 0.34594594594594597,
"step": 320
},
{
"loss": 1.8987,
"grad_norm": 1.048621654510498,
"learning_rate": 0.00038245609663348034,
"epoch": 0.3567567567567568,
"step": 330
},
{
"loss": 1.7818,
"grad_norm": 1.3258867263793945,
"learning_rate": 0.00038096236512969556,
"epoch": 0.3675675675675676,
"step": 340
},
{
"loss": 1.7062,
"grad_norm": 0.9586314558982849,
"learning_rate": 0.0003794107796893002,
"epoch": 0.3783783783783784,
"step": 350
},
{
"loss": 1.8131,
"grad_norm": 1.0099109411239624,
"learning_rate": 0.00037780183635646145,
"epoch": 0.3891891891891892,
"step": 360
},
{
"eval_loss": 1.7687468528747559,
"eval_runtime": 10.0798,
"eval_samples_per_second": 77.283,
"eval_steps_per_second": 19.346,
"epoch": 0.3989189189189189,
"step": 369
},
{
"loss": 1.8824,
"grad_norm": 1.201002597808838,
"learning_rate": 0.00037613604951274986,
"epoch": 0.4,
"step": 370
},
{
"loss": 1.8594,
"grad_norm": 0.9146278500556946,
"learning_rate": 0.0003744139517126908,
"epoch": 0.41081081081081083,
"step": 380
},
{
"loss": 1.7923,
"grad_norm": 1.1093569993972778,
"learning_rate": 0.00037263609351350583,
"epoch": 0.42162162162162165,
"step": 390
},
{
"loss": 1.9701,
"grad_norm": 0.9460511207580566,
"learning_rate": 0.0003708030432990989,
"epoch": 0.43243243243243246,
"step": 400
},
{
"loss": 1.7968,
"grad_norm": 1.1481722593307495,
"learning_rate": 0.0003689153870983431,
"epoch": 0.44324324324324327,
"step": 410
},
{
"loss": 1.7019,
"grad_norm": 1.1272804737091064,
"learning_rate": 0.00036697372839772634,
"epoch": 0.4540540540540541,
"step": 420
},
{
"loss": 1.7139,
"grad_norm": 0.8615907430648804,
"learning_rate": 0.000364978687948416,
"epoch": 0.4648648648648649,
"step": 430
},
{
"loss": 1.7769,
"grad_norm": 1.0832351446151733,
"learning_rate": 0.0003629309035678035,
"epoch": 0.4756756756756757,
"step": 440
},
{
"loss": 1.8117,
"grad_norm": 1.0243345499038696,
"learning_rate": 0.00036083102993559343,
"epoch": 0.4864864864864865,
"step": 450
},
{
"loss": 1.7035,
"grad_norm": 0.9396358728408813,
"learning_rate": 0.00035867973838450153,
"epoch": 0.4972972972972973,
"step": 460
},
{
"loss": 1.9568,
"grad_norm": 0.9557101130485535,
"learning_rate": 0.0003564777166856282,
"epoch": 0.5081081081081081,
"step": 470
},
{
"loss": 1.9079,
"grad_norm": 1.1307172775268555,
"learning_rate": 0.00035422566882857765,
"epoch": 0.518918918918919,
"step": 480
},
{
"loss": 1.8791,
"grad_norm": 1.2252289056777954,
"learning_rate": 0.0003519243147963909,
"epoch": 0.5297297297297298,
"step": 490
},
{
"eval_loss": 1.7642391920089722,
"eval_runtime": 10.1201,
"eval_samples_per_second": 76.976,
"eval_steps_per_second": 19.269,
"epoch": 0.5318918918918919,
"step": 492
},
{
"loss": 1.7498,
"grad_norm": 0.9916505813598633,
"learning_rate": 0.00034957439033536647,
"epoch": 0.5405405405405406,
"step": 500
},
{
"loss": 1.8562,
"grad_norm": 1.2275047302246094,
"learning_rate": 0.0003471766467198408,
"epoch": 0.5513513513513514,
"step": 510
},
{
"loss": 1.7812,
"grad_norm": 0.9753154516220093,
"learning_rate": 0.00034473185051200515,
"epoch": 0.5621621621621622,
"step": 520
},
{
"loss": 2.0087,
"grad_norm": 1.2194623947143555,
"learning_rate": 0.0003422407833168343,
"epoch": 0.572972972972973,
"step": 530
},
{
"loss": 1.8641,
"grad_norm": 1.1282182931900024,
"learning_rate": 0.00033970424153220637,
"epoch": 0.5837837837837838,
"step": 540
},
{
"loss": 1.8962,
"grad_norm": 1.3077672719955444,
"learning_rate": 0.0003371230360942931,
"epoch": 0.5945945945945946,
"step": 550
},
{
"loss": 1.7113,
"grad_norm": 1.1093400716781616,
"learning_rate": 0.0003344979922183026,
"epoch": 0.6054054054054054,
"step": 560
},
{
"loss": 1.8013,
"grad_norm": 1.0412172079086304,
"learning_rate": 0.0003318299491346565,
"epoch": 0.6162162162162163,
"step": 570
},
{
"loss": 1.8316,
"grad_norm": 1.1250932216644287,
"learning_rate": 0.00032911975982068706,
"epoch": 0.6270270270270271,
"step": 580
},
{
"loss": 1.7729,
"grad_norm": 0.971480131149292,
"learning_rate": 0.0003263682907279387,
"epoch": 0.6378378378378379,
"step": 590
},
{
"loss": 1.745,
"grad_norm": 1.1424800157546997,
"learning_rate": 0.00032357642150516265,
"epoch": 0.6486486486486487,
"step": 600
},
{
"loss": 1.6717,
"grad_norm": 1.3536049127578735,
"learning_rate": 0.00032074504471709146,
"epoch": 0.6594594594594595,
"step": 610
},
{
"eval_loss": 1.7533202171325684,
"eval_runtime": 10.0831,
"eval_samples_per_second": 77.258,
"eval_steps_per_second": 19.339,
"epoch": 0.6648648648648648,
"step": 615
},
{
"loss": 1.7822,
"grad_norm": 0.8749492168426514,
"learning_rate": 0.0003178750655590848,
"epoch": 0.6702702702702703,
"step": 620
},
{
"loss": 1.8368,
"grad_norm": 3.0736031532287598,
"learning_rate": 0.00031496740156773776,
"epoch": 0.6810810810810811,
"step": 630
},
{
"loss": 1.7322,
"grad_norm": 1.288352131843567,
"learning_rate": 0.00031202298232754186,
"epoch": 0.6918918918918919,
"step": 640
},
{
"loss": 1.8685,
"grad_norm": 1.0477159023284912,
"learning_rate": 0.00030904274917369686,
"epoch": 0.7027027027027027,
"step": 650
},
{
"loss": 1.7483,
"grad_norm": 0.9655544757843018,
"learning_rate": 0.0003060276548911634,
"epoch": 0.7135135135135136,
"step": 660
},
{
"loss": 1.8099,
"grad_norm": 1.1260396242141724,
"learning_rate": 0.00030297866341005684,
"epoch": 0.7243243243243244,
"step": 670
},
{
"loss": 1.6145,
"grad_norm": 1.1371850967407227,
"learning_rate": 0.0002998967494974774,
"epoch": 0.7351351351351352,
"step": 680
},
{
"loss": 1.8311,
"grad_norm": 0.9440209865570068,
"learning_rate": 0.0002967828984458751,
"epoch": 0.745945945945946,
"step": 690
},
{
"loss": 1.9393,
"grad_norm": 1.3496946096420288,
"learning_rate": 0.00029363810575805106,
"epoch": 0.7567567567567568,
"step": 700
},
{
"loss": 1.9767,
"grad_norm": 1.0028049945831299,
"learning_rate": 0.00029046337682889315,
"epoch": 0.7675675675675676,
"step": 710
},
{
"loss": 1.8328,
"grad_norm": 1.1777056455612183,
"learning_rate": 0.00028725972662395013,
"epoch": 0.7783783783783784,
"step": 720
},
{
"loss": 1.7484,
"grad_norm": 1.2826964855194092,
"learning_rate": 0.00028402817935494547,
"epoch": 0.7891891891891892,
"step": 730
},
{
"eval_loss": 1.7475706338882446,
"eval_runtime": 10.0081,
"eval_samples_per_second": 77.837,
"eval_steps_per_second": 19.484,
"epoch": 0.7978378378378378,
"step": 738
},
{
"loss": 1.9079,
"grad_norm": 1.1097257137298584,
"learning_rate": 0.00028076976815233546,
"epoch": 0.8,
"step": 740
},
{
"loss": 1.7847,
"grad_norm": 1.1187055110931396,
"learning_rate": 0.00027748553473501593,
"epoch": 0.8108108108108109,
"step": 750
},
{
"loss": 1.6747,
"grad_norm": 1.182005524635315,
"learning_rate": 0.00027417652907728274,
"epoch": 0.8216216216216217,
"step": 760
},
{
"loss": 1.7653,
"grad_norm": 0.9777538180351257,
"learning_rate": 0.000270843809073154,
"epoch": 0.8324324324324325,
"step": 770
},
{
"loss": 1.7749,
"grad_norm": 1.1285064220428467,
"learning_rate": 0.0002674884401981597,
"epoch": 0.8432432432432433,
"step": 780
},
{
"loss": 1.7904,
"grad_norm": 0.9783152937889099,
"learning_rate": 0.000264111495168707,
"epoch": 0.8540540540540541,
"step": 790
},
{
"loss": 1.6915,
"grad_norm": 1.107857346534729,
"learning_rate": 0.0002607140535991321,
"epoch": 0.8648648648648649,
"step": 800
},
{
"loss": 1.7857,
"grad_norm": 1.2584813833236694,
"learning_rate": 0.0002572972016565451,
"epoch": 0.8756756756756757,
"step": 810
},
{
"loss": 1.8468,
"grad_norm": 1.2436493635177612,
"learning_rate": 0.00025386203171358157,
"epoch": 0.8864864864864865,
"step": 820
},
{
"loss": 1.9164,
"grad_norm": 1.624140739440918,
"learning_rate": 0.00025040964199916856,
"epoch": 0.8972972972972973,
"step": 830
},
{
"loss": 1.8009,
"grad_norm": 1.0699501037597656,
"learning_rate": 0.0002469411362474199,
"epoch": 0.9081081081081082,
"step": 840
},
{
"loss": 1.6318,
"grad_norm": 0.9692312479019165,
"learning_rate": 0.0002434576233447703,
"epoch": 0.918918918918919,
"step": 850
},
{
"loss": 1.752,
"grad_norm": 0.9754092693328857,
"learning_rate": 0.000239960216975463,
"epoch": 0.9297297297297298,
"step": 860
},
{
"eval_loss": 1.7383391857147217,
"eval_runtime": 10.0527,
"eval_samples_per_second": 77.491,
"eval_steps_per_second": 19.398,
"epoch": 0.9308108108108109,
"step": 861
},
{
"loss": 1.9364,
"grad_norm": 1.0026895999908447,
"learning_rate": 0.00023645003526550292,
"epoch": 0.9405405405405406,
"step": 870
},
{
"loss": 1.8438,
"grad_norm": 1.269220232963562,
"learning_rate": 0.00023292820042519066,
"epoch": 0.9513513513513514,
"step": 880
},
{
"loss": 1.7952,
"grad_norm": 1.0278656482696533,
"learning_rate": 0.00022939583839034965,
"epoch": 0.9621621621621622,
"step": 890
},
{
"loss": 1.6568,
"grad_norm": 0.9819965958595276,
"learning_rate": 0.0002258540784623631,
"epoch": 0.972972972972973,
"step": 900
},
{
"loss": 1.8287,
"grad_norm": 1.1272140741348267,
"learning_rate": 0.00022230405294713465,
"epoch": 0.9837837837837838,
"step": 910
},
{
"loss": 1.7379,
"grad_norm": 1.1125059127807617,
"learning_rate": 0.0002187468967930883,
"epoch": 0.9945945945945946,
"step": 920
},
{
"loss": 1.7004,
"grad_norm": 1.0192606449127197,
"learning_rate": 0.000215183747228324,
"epoch": 1.0054054054054054,
"step": 930
},
{
"loss": 1.5612,
"grad_norm": 0.9857641458511353,
"learning_rate": 0.000211615743397044,
"epoch": 1.0162162162162163,
"step": 940
},
{
"loss": 1.2828,
"grad_norm": 1.0608668327331543,
"learning_rate": 0.00020804402599536661,
"epoch": 1.027027027027027,
"step": 950
},
{
"loss": 1.4035,
"grad_norm": 1.485253930091858,
"learning_rate": 0.0002044697369066443,
"epoch": 1.037837837837838,
"step": 960
},
{
"loss": 1.5254,
"grad_norm": 0.9453800320625305,
"learning_rate": 0.0002008940188364015,
"epoch": 1.0486486486486486,
"step": 970
},
{
"loss": 1.4442,
"grad_norm": 1.1382359266281128,
"learning_rate": 0.00019731801494701044,
"epoch": 1.0594594594594595,
"step": 980
},
{
"eval_loss": 1.7562943696975708,
"eval_runtime": 10.0133,
"eval_samples_per_second": 77.797,
"eval_steps_per_second": 19.474,
"epoch": 1.0637837837837838,
"step": 984
},
{
"loss": 1.6187,
"grad_norm": 1.2494144439697266,
"learning_rate": 0.0001937428684922197,
"epoch": 1.0702702702702702,
"step": 990
},
{
"loss": 1.6528,
"grad_norm": 0.9464777708053589,
"learning_rate": 0.00019016972245165526,
"epoch": 1.0810810810810811,
"step": 1000
},
{
"loss": 1.4064,
"grad_norm": 0.9740603566169739,
"learning_rate": 0.0001865997191654074,
"epoch": 1.0918918918918918,
"step": 1010
},
{
"loss": 1.3695,
"grad_norm": 1.2424192428588867,
"learning_rate": 0.00018303399996882325,
"epoch": 1.1027027027027028,
"step": 1020
},
{
"loss": 1.4764,
"grad_norm": 1.0215702056884766,
"learning_rate": 0.00017947370482762005,
"epoch": 1.1135135135135135,
"step": 1030
},
{
"loss": 1.5442,
"grad_norm": 1.0910210609436035,
"learning_rate": 0.00017591997197343657,
"epoch": 1.1243243243243244,
"step": 1040
},
{
"loss": 1.5187,
"grad_norm": 1.1207563877105713,
"learning_rate": 0.00017237393753993875,
"epoch": 1.135135135135135,
"step": 1050
},
{
"loss": 1.4571,
"grad_norm": 1.0761910676956177,
"learning_rate": 0.0001688367351995959,
"epoch": 1.145945945945946,
"step": 1060
},
{
"loss": 1.3436,
"grad_norm": 0.9719659090042114,
"learning_rate": 0.00016530949580124404,
"epoch": 1.1567567567567567,
"step": 1070
},
{
"loss": 1.5315,
"grad_norm": 1.0876080989837646,
"learning_rate": 0.00016179334700855189,
"epoch": 1.1675675675675676,
"step": 1080
},
{
"loss": 1.369,
"grad_norm": 1.1940348148345947,
"learning_rate": 0.0001582894129395051,
"epoch": 1.1783783783783783,
"step": 1090
},
{
"loss": 1.4257,
"grad_norm": 1.1275503635406494,
"learning_rate": 0.00015479881380702415,
"epoch": 1.1891891891891893,
"step": 1100
},
{
"eval_loss": 1.763828158378601,
"eval_runtime": 9.9983,
"eval_samples_per_second": 77.914,
"eval_steps_per_second": 19.503,
"epoch": 1.1967567567567567,
"step": 1107
},
{
"loss": 1.3932,
"grad_norm": 1.2186589241027832,
"learning_rate": 0.00015132266556083018,
"epoch": 1.2,
"step": 1110
},
{
"loss": 1.5195,
"grad_norm": 1.040711760520935,
"learning_rate": 0.00014786207953067492,
"epoch": 1.2108108108108109,
"step": 1120
},
{
"loss": 1.3596,
"grad_norm": 1.1564419269561768,
"learning_rate": 0.00014441816207104636,
"epoch": 1.2216216216216216,
"step": 1130
},
{
"loss": 1.539,
"grad_norm": 0.9457581639289856,
"learning_rate": 0.00014099201420746585,
"epoch": 1.2324324324324325,
"step": 1140
},
{
"loss": 1.4309,
"grad_norm": 1.2473818063735962,
"learning_rate": 0.00013758473128448837,
"epoch": 1.2432432432432432,
"step": 1150
},
{
"loss": 1.513,
"grad_norm": 1.0576856136322021,
"learning_rate": 0.0001341974026155195,
"epoch": 1.2540540540540541,
"step": 1160
},
{
"loss": 1.5212,
"grad_norm": 1.021657943725586,
"learning_rate": 0.00013083111113456025,
"epoch": 1.2648648648648648,
"step": 1170
},
{
"loss": 1.4125,
"grad_norm": 1.4797037839889526,
"learning_rate": 0.0001274869330499914,
"epoch": 1.2756756756756757,
"step": 1180
},
{
"loss": 1.4741,
"grad_norm": 1.4238656759262085,
"learning_rate": 0.00012416593750050803,
"epoch": 1.2864864864864864,
"step": 1190
},
{
"loss": 1.5072,
"grad_norm": 1.0679641962051392,
"learning_rate": 0.00012086918621331431,
"epoch": 1.2972972972972974,
"step": 1200
},
{
"loss": 1.3807,
"grad_norm": 1.5260353088378906,
"learning_rate": 0.00011759773316468794,
"epoch": 1.308108108108108,
"step": 1210
},
{
"loss": 1.4398,
"grad_norm": 1.005669355392456,
"learning_rate": 0.00011435262424302224,
"epoch": 1.318918918918919,
"step": 1220
},
{
"loss": 1.4314,
"grad_norm": 1.116134762763977,
"learning_rate": 0.00011113489691445385,
"epoch": 1.3297297297297297,
"step": 1230
},
{
"eval_loss": 1.7593566179275513,
"eval_runtime": 9.9147,
"eval_samples_per_second": 78.57,
"eval_steps_per_second": 19.668,
"epoch": 1.3297297297297297,
"step": 1230
},
{
"loss": 1.4174,
"grad_norm": 1.100644826889038,
"learning_rate": 0.00010794557989118352,
"epoch": 1.3405405405405406,
"step": 1240
},
{
"loss": 1.3901,
"grad_norm": 0.9467904567718506,
"learning_rate": 0.00010478569280259542,
"epoch": 1.3513513513513513,
"step": 1250
},
{
"loss": 1.5013,
"grad_norm": 1.2005168199539185,
"learning_rate": 0.00010165624586927987,
"epoch": 1.3621621621621622,
"step": 1260
},
{
"loss": 1.4634,
"grad_norm": 1.0398645401000977,
"learning_rate": 9.855823958006427e-05,
"epoch": 1.372972972972973,
"step": 1270
},
{
"loss": 1.4728,
"grad_norm": 1.1238207817077637,
"learning_rate": 9.549266437215549e-05,
"epoch": 1.3837837837837839,
"step": 1280
},
{
"loss": 1.453,
"grad_norm": 1.067688226699829,
"learning_rate": 9.246050031449569e-05,
"epoch": 1.3945945945945946,
"step": 1290
},
{
"loss": 1.432,
"grad_norm": 1.1034791469573975,
"learning_rate": 8.946271679443276e-05,
"epoch": 1.4054054054054055,
"step": 1300
},
{
"loss": 1.3956,
"grad_norm": 1.4038920402526855,
"learning_rate": 8.650027220780555e-05,
"epoch": 1.4162162162162162,
"step": 1310
},
{
"loss": 1.3489,
"grad_norm": 1.0994772911071777,
"learning_rate": 8.357411365254341e-05,
"epoch": 1.427027027027027,
"step": 1320
},
{
"loss": 1.3385,
"grad_norm": 1.1797088384628296,
"learning_rate": 8.068517662587798e-05,
"epoch": 1.4378378378378378,
"step": 1330
},
{
"loss": 1.3012,
"grad_norm": 1.1310184001922607,
"learning_rate": 7.783438472526257e-05,
"epoch": 1.4486486486486487,
"step": 1340
},
{
"loss": 1.4044,
"grad_norm": 1.3859984874725342,
"learning_rate": 7.502264935309742e-05,
"epoch": 1.4594594594594594,
"step": 1350
},
{
"eval_loss": 1.7528764009475708,
"eval_runtime": 10.0314,
"eval_samples_per_second": 77.656,
"eval_steps_per_second": 19.439,
"epoch": 1.4627027027027026,
"step": 1353
},
{
"loss": 1.5262,
"grad_norm": 1.2141237258911133,
"learning_rate": 7.225086942535244e-05,
"epoch": 1.4702702702702704,
"step": 1360
},
{
"loss": 1.4401,
"grad_norm": 1.1930843591690063,
"learning_rate": 6.95199310841829e-05,
"epoch": 1.481081081081081,
"step": 1370
},
{
"loss": 1.3915,
"grad_norm": 1.0784533023834229,
"learning_rate": 6.6830707414628e-05,
"epoch": 1.491891891891892,
"step": 1380
},
{
"loss": 1.5117,
"grad_norm": 1.2977006435394287,
"learning_rate": 6.41840581654848e-05,
"epoch": 1.5027027027027027,
"step": 1390
},
{
"loss": 1.5385,
"grad_norm": 1.091192603111267,
"learning_rate": 6.158082947444484e-05,
"epoch": 1.5135135135135136,
"step": 1400
},
{
"loss": 1.2558,
"grad_norm": 1.2064927816390991,
"learning_rate": 5.902185359758272e-05,
"epoch": 1.5243243243243243,
"step": 1410
},
{
"loss": 1.3401,
"grad_norm": 1.18263578414917,
"learning_rate": 5.6507948643282905e-05,
"epoch": 1.535135135135135,
"step": 1420
},
{
"loss": 1.4368,
"grad_norm": 1.0201761722564697,
"learning_rate": 5.4039918310688995e-05,
"epoch": 1.545945945945946,
"step": 1430
},
{
"loss": 1.5864,
"grad_norm": 1.0474286079406738,
"learning_rate": 5.1618551632759904e-05,
"epoch": 1.5567567567567568,
"step": 1440
},
{
"loss": 1.3581,
"grad_norm": 0.9824125170707703,
"learning_rate": 4.924462272401484e-05,
"epoch": 1.5675675675675675,
"step": 1450
},
{
"loss": 1.4015,
"grad_norm": 1.190414547920227,
"learning_rate": 4.6918890533048034e-05,
"epoch": 1.5783783783783782,
"step": 1460
},
{
"loss": 1.511,
"grad_norm": 1.0979052782058716,
"learning_rate": 4.464209859989146e-05,
"epoch": 1.5891891891891892,
"step": 1470
},
{
"eval_loss": 1.7506372928619385,
"eval_runtime": 10.0066,
"eval_samples_per_second": 77.849,
"eval_steps_per_second": 19.487,
"epoch": 1.5956756756756758,
"step": 1476
},
{
"loss": 1.419,
"grad_norm": 1.2529748678207397,
"learning_rate": 4.241497481830396e-05,
"epoch": 1.6,
"step": 1480
},
{
"loss": 1.2973,
"grad_norm": 1.1004537343978882,
"learning_rate": 4.023823120306269e-05,
"epoch": 1.6108108108108108,
"step": 1490
},
{
"loss": 1.4756,
"grad_norm": 1.2662088871002197,
"learning_rate": 3.811256366233098e-05,
"epoch": 1.6216216216216215,
"step": 1500
},
{
"loss": 1.3959,
"grad_norm": 1.118185043334961,
"learning_rate": 3.603865177517516e-05,
"epoch": 1.6324324324324324,
"step": 1510
},
{
"loss": 1.4682,
"grad_norm": 0.995052695274353,
"learning_rate": 3.4017158574302564e-05,
"epoch": 1.6432432432432433,
"step": 1520
},
{
"loss": 1.4508,
"grad_norm": 1.0658509731292725,
"learning_rate": 3.204873033408853e-05,
"epoch": 1.654054054054054,
"step": 1530
},
{
"loss": 1.3456,
"grad_norm": 1.1724168062210083,
"learning_rate": 3.013399636396195e-05,
"epoch": 1.6648648648648647,
"step": 1540
},
{
"loss": 1.4285,
"grad_norm": 0.971674919128418,
"learning_rate": 2.827356880721368e-05,
"epoch": 1.6756756756756757,
"step": 1550
},
{
"loss": 1.2316,
"grad_norm": 0.939606785774231,
"learning_rate": 2.6468042445293883e-05,
"epoch": 1.6864864864864866,
"step": 1560
},
{
"loss": 1.4636,
"grad_norm": 1.2107715606689453,
"learning_rate": 2.4717994507659147e-05,
"epoch": 1.6972972972972973,
"step": 1570
},
{
"loss": 1.3471,
"grad_norm": 1.3718624114990234,
"learning_rate": 2.3023984487231466e-05,
"epoch": 1.708108108108108,
"step": 1580
},
{
"loss": 1.4623,
"grad_norm": 1.218471646308899,
"learning_rate": 2.1386553961527666e-05,
"epoch": 1.718918918918919,
"step": 1590
},
{
"eval_loss": 1.749324917793274,
"eval_runtime": 10.0439,
"eval_samples_per_second": 77.56,
"eval_steps_per_second": 19.415,
"epoch": 1.7286486486486488,
"step": 1599
},
{
"loss": 1.3439,
"grad_norm": 1.4828319549560547,
"learning_rate": 1.9806226419516192e-05,
"epoch": 1.7297297297297298,
"step": 1600
},
{
"loss": 1.3913,
"grad_norm": 1.1505448818206787,
"learning_rate": 1.828350709425677e-05,
"epoch": 1.7405405405405405,
"step": 1610
},
{
"loss": 1.4736,
"grad_norm": 1.2893474102020264,
"learning_rate": 1.68188828013768e-05,
"epoch": 1.7513513513513512,
"step": 1620
},
{
"loss": 1.3173,
"grad_norm": 1.2402210235595703,
"learning_rate": 1.541282178343566e-05,
"epoch": 1.7621621621621621,
"step": 1630
},
{
"loss": 1.3873,
"grad_norm": 1.0396865606307983,
"learning_rate": 1.4065773560226913e-05,
"epoch": 1.772972972972973,
"step": 1640
},
{
"loss": 1.3416,
"grad_norm": 1.0871046781539917,
"learning_rate": 1.277816878506597e-05,
"epoch": 1.7837837837837838,
"step": 1650
},
{
"loss": 1.4252,
"grad_norm": 1.3051209449768066,
"learning_rate": 1.1550419107109722e-05,
"epoch": 1.7945945945945945,
"step": 1660
},
{
"loss": 1.2819,
"grad_norm": 1.1347006559371948,
"learning_rate": 1.0382917039751783e-05,
"epoch": 1.8054054054054054,
"step": 1670
},
{
"loss": 1.3439,
"grad_norm": 1.1445626020431519,
"learning_rate": 9.276035835135166e-06,
"epoch": 1.8162162162162163,
"step": 1680
},
{
"loss": 1.3387,
"grad_norm": 1.1986947059631348,
"learning_rate": 8.230129364823213e-06,
"epoch": 1.827027027027027,
"step": 1690
},
{
"loss": 1.3861,
"grad_norm": 1.0802196264266968,
"learning_rate": 7.245532006666178e-06,
"epoch": 1.8378378378378377,
"step": 1700
},
{
"loss": 1.4836,
"grad_norm": 1.322296142578125,
"learning_rate": 6.322558537900247e-06,
"epoch": 1.8486486486486486,
"step": 1710
},
{
"loss": 1.35,
"grad_norm": 1.2359806299209595,
"learning_rate": 5.46150403451271e-06,
"epoch": 1.8594594594594596,
"step": 1720
},
{
"eval_loss": 1.746907353401184,
"eval_runtime": 10.0635,
"eval_samples_per_second": 77.409,
"eval_steps_per_second": 19.377,
"epoch": 1.8616216216216217,
"step": 1722
},
{
"loss": 1.5115,
"grad_norm": 0.9085004329681396,
"learning_rate": 4.6626437769057955e-06,
"epoch": 1.8702702702702703,
"step": 1730
},
{
"loss": 1.3037,
"grad_norm": 1.031083583831787,
"learning_rate": 3.9262331618890256e-06,
"epoch": 1.881081081081081,
"step": 1740
},
{
"loss": 1.605,
"grad_norm": 1.240249752998352,
"learning_rate": 3.2525076210286e-06,
"epoch": 1.8918918918918919,
"step": 1750
},
{
"loss": 1.3975,
"grad_norm": 0.9747676849365234,
"learning_rate": 2.6416825453794646e-06,
"epoch": 1.9027027027027028,
"step": 1760
},
{
"loss": 1.3456,
"grad_norm": 1.1354199647903442,
"learning_rate": 2.093953216624556e-06,
"epoch": 1.9135135135135135,
"step": 1770
},
{
"loss": 1.5099,
"grad_norm": 1.3150416612625122,
"learning_rate": 1.609494744642892e-06,
"epoch": 1.9243243243243242,
"step": 1780
},
{
"loss": 1.3622,
"grad_norm": 1.1703237295150757,
"learning_rate": 1.188462011526692e-06,
"epoch": 1.9351351351351351,
"step": 1790
},
{
"loss": 1.3636,
"grad_norm": 1.1234310865402222,
"learning_rate": 8.309896220654034e-07,
"epoch": 1.945945945945946,
"step": 1800
},
{
"loss": 1.377,
"grad_norm": 1.2217568159103394,
"learning_rate": 5.371918607122827e-07,
"epoch": 1.9567567567567568,
"step": 1810
},
{
"loss": 1.4881,
"grad_norm": 1.1424119472503662,
"learning_rate": 3.0716265504753263e-07,
"epoch": 1.9675675675675675,
"step": 1820
},
{
"loss": 1.3919,
"grad_norm": 1.0163501501083374,
"learning_rate": 1.409755457494555e-07,
"epoch": 1.9783783783783784,
"step": 1830
},
{
"loss": 1.3928,
"grad_norm": 1.117492914199829,
"learning_rate": 3.868366308346083e-08,
"epoch": 1.9891891891891893,
"step": 1840
},
{
"eval_loss": 1.7465505599975586,
"eval_runtime": 9.9858,
"eval_samples_per_second": 78.011,
"eval_steps_per_second": 19.528,
"epoch": 1.9945945945945946,
"step": 1845
},
{
"loss": 1.3995,
"grad_norm": 1.4473432302474976,
"learning_rate": 3.1970991622998217e-10,
"epoch": 2.0,
"step": 1850
},
{
"train_runtime": 958.4314,
"train_samples_per_second": 30.878,
"train_steps_per_second": 1.93,
"total_flos": 9.431756938411008e+16,
"train_loss": 1.6259772120295344,
"epoch": 2.0,
"step": 1850
}
]