Files
biomed-Qwen2.5-VL-3B-Instruct/trainer_state.json
ModelHub XC 6bd44e5d39 初始化项目,由ModelHub XC社区提供模型
Model: AdaptLLM/biomed-Qwen2.5-VL-3B-Instruct
Source: Original Platform
2026-05-20 07:56:23 +08:00

2802 lines
68 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998731447418495,
"eval_steps": 500,
"global_step": 3941,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002537105163009007,
"grad_norm": 4.738877773284912,
"learning_rate": 2.278481012658228e-07,
"loss": 1.6626,
"step": 10
},
{
"epoch": 0.005074210326018014,
"grad_norm": 4.201930046081543,
"learning_rate": 4.810126582278482e-07,
"loss": 1.6508,
"step": 20
},
{
"epoch": 0.00761131548902702,
"grad_norm": 3.1523282527923584,
"learning_rate": 7.341772151898735e-07,
"loss": 1.6588,
"step": 30
},
{
"epoch": 0.010148420652036028,
"grad_norm": 2.514768123626709,
"learning_rate": 9.873417721518988e-07,
"loss": 1.6091,
"step": 40
},
{
"epoch": 0.012685525815045033,
"grad_norm": 1.9985918998718262,
"learning_rate": 1.240506329113924e-06,
"loss": 1.5742,
"step": 50
},
{
"epoch": 0.01522263097805404,
"grad_norm": 1.5602017641067505,
"learning_rate": 1.4936708860759495e-06,
"loss": 1.5291,
"step": 60
},
{
"epoch": 0.01775973614106305,
"grad_norm": 1.5359352827072144,
"learning_rate": 1.7468354430379747e-06,
"loss": 1.5007,
"step": 70
},
{
"epoch": 0.020296841304072055,
"grad_norm": 1.5024731159210205,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4645,
"step": 80
},
{
"epoch": 0.022833946467081062,
"grad_norm": 1.4507105350494385,
"learning_rate": 2.2531645569620258e-06,
"loss": 1.4484,
"step": 90
},
{
"epoch": 0.025371051630090066,
"grad_norm": 1.4768809080123901,
"learning_rate": 2.5063291139240508e-06,
"loss": 1.4516,
"step": 100
},
{
"epoch": 0.027908156793099072,
"grad_norm": 1.3820849657058716,
"learning_rate": 2.7594936708860766e-06,
"loss": 1.4131,
"step": 110
},
{
"epoch": 0.03044526195610808,
"grad_norm": 1.4740626811981201,
"learning_rate": 3.0126582278481016e-06,
"loss": 1.4007,
"step": 120
},
{
"epoch": 0.03298236711911709,
"grad_norm": 1.3979785442352295,
"learning_rate": 3.265822784810127e-06,
"loss": 1.3964,
"step": 130
},
{
"epoch": 0.0355194722821261,
"grad_norm": 1.3529947996139526,
"learning_rate": 3.518987341772152e-06,
"loss": 1.3681,
"step": 140
},
{
"epoch": 0.038056577445135104,
"grad_norm": 1.5106327533721924,
"learning_rate": 3.7721518987341775e-06,
"loss": 1.3842,
"step": 150
},
{
"epoch": 0.04059368260814411,
"grad_norm": 1.469246506690979,
"learning_rate": 4.025316455696203e-06,
"loss": 1.3481,
"step": 160
},
{
"epoch": 0.04313078777115312,
"grad_norm": 1.4268747568130493,
"learning_rate": 4.278481012658228e-06,
"loss": 1.3738,
"step": 170
},
{
"epoch": 0.045667892934162124,
"grad_norm": 1.364691138267517,
"learning_rate": 4.531645569620253e-06,
"loss": 1.3568,
"step": 180
},
{
"epoch": 0.04820499809717113,
"grad_norm": 1.5261390209197998,
"learning_rate": 4.784810126582279e-06,
"loss": 1.347,
"step": 190
},
{
"epoch": 0.05074210326018013,
"grad_norm": 1.4499975442886353,
"learning_rate": 5.037974683544305e-06,
"loss": 1.367,
"step": 200
},
{
"epoch": 0.05327920842318914,
"grad_norm": 1.506212592124939,
"learning_rate": 5.29113924050633e-06,
"loss": 1.3344,
"step": 210
},
{
"epoch": 0.055816313586198145,
"grad_norm": 1.4976462125778198,
"learning_rate": 5.544303797468355e-06,
"loss": 1.3454,
"step": 220
},
{
"epoch": 0.05835341874920715,
"grad_norm": 1.45829439163208,
"learning_rate": 5.79746835443038e-06,
"loss": 1.3444,
"step": 230
},
{
"epoch": 0.06089052391221616,
"grad_norm": 1.502082347869873,
"learning_rate": 6.050632911392406e-06,
"loss": 1.3264,
"step": 240
},
{
"epoch": 0.06342762907522517,
"grad_norm": 1.525439739227295,
"learning_rate": 6.303797468354431e-06,
"loss": 1.3259,
"step": 250
},
{
"epoch": 0.06596473423823418,
"grad_norm": 1.4980591535568237,
"learning_rate": 6.5569620253164564e-06,
"loss": 1.3338,
"step": 260
},
{
"epoch": 0.06850183940124319,
"grad_norm": 1.4034385681152344,
"learning_rate": 6.810126582278481e-06,
"loss": 1.3293,
"step": 270
},
{
"epoch": 0.0710389445642522,
"grad_norm": 1.3902912139892578,
"learning_rate": 7.0632911392405065e-06,
"loss": 1.3339,
"step": 280
},
{
"epoch": 0.0735760497272612,
"grad_norm": 1.546105146408081,
"learning_rate": 7.316455696202533e-06,
"loss": 1.3232,
"step": 290
},
{
"epoch": 0.07611315489027021,
"grad_norm": 1.396514654159546,
"learning_rate": 7.569620253164558e-06,
"loss": 1.3185,
"step": 300
},
{
"epoch": 0.07865026005327921,
"grad_norm": 1.400497555732727,
"learning_rate": 7.822784810126582e-06,
"loss": 1.3231,
"step": 310
},
{
"epoch": 0.08118736521628822,
"grad_norm": 1.4383889436721802,
"learning_rate": 8.075949367088608e-06,
"loss": 1.2916,
"step": 320
},
{
"epoch": 0.08372447037929723,
"grad_norm": 1.383995532989502,
"learning_rate": 8.329113924050633e-06,
"loss": 1.3045,
"step": 330
},
{
"epoch": 0.08626157554230623,
"grad_norm": 1.544356346130371,
"learning_rate": 8.582278481012659e-06,
"loss": 1.313,
"step": 340
},
{
"epoch": 0.08879868070531524,
"grad_norm": 1.5004216432571411,
"learning_rate": 8.835443037974685e-06,
"loss": 1.3128,
"step": 350
},
{
"epoch": 0.09133578586832425,
"grad_norm": 1.5500295162200928,
"learning_rate": 9.08860759493671e-06,
"loss": 1.2921,
"step": 360
},
{
"epoch": 0.09387289103133326,
"grad_norm": 1.3721230030059814,
"learning_rate": 9.341772151898735e-06,
"loss": 1.2989,
"step": 370
},
{
"epoch": 0.09640999619434226,
"grad_norm": 1.5480691194534302,
"learning_rate": 9.59493670886076e-06,
"loss": 1.307,
"step": 380
},
{
"epoch": 0.09894710135735126,
"grad_norm": 1.4069089889526367,
"learning_rate": 9.848101265822785e-06,
"loss": 1.2914,
"step": 390
},
{
"epoch": 0.10148420652036026,
"grad_norm": 1.5373787879943848,
"learning_rate": 9.999968603457859e-06,
"loss": 1.302,
"step": 400
},
{
"epoch": 0.10402131168336927,
"grad_norm": 1.411720871925354,
"learning_rate": 9.999615396887012e-06,
"loss": 1.2963,
"step": 410
},
{
"epoch": 0.10655841684637828,
"grad_norm": 1.4174768924713135,
"learning_rate": 9.998869765883566e-06,
"loss": 1.2673,
"step": 420
},
{
"epoch": 0.10909552200938728,
"grad_norm": 1.492872714996338,
"learning_rate": 9.997731768972785e-06,
"loss": 1.3116,
"step": 430
},
{
"epoch": 0.11163262717239629,
"grad_norm": 1.487380027770996,
"learning_rate": 9.996201495477102e-06,
"loss": 1.2903,
"step": 440
},
{
"epoch": 0.1141697323354053,
"grad_norm": 1.4736219644546509,
"learning_rate": 9.994279065509094e-06,
"loss": 1.295,
"step": 450
},
{
"epoch": 0.1167068374984143,
"grad_norm": 1.4441871643066406,
"learning_rate": 9.991964629962067e-06,
"loss": 1.293,
"step": 460
},
{
"epoch": 0.11924394266142331,
"grad_norm": 1.4621552228927612,
"learning_rate": 9.989258370498208e-06,
"loss": 1.2746,
"step": 470
},
{
"epoch": 0.12178104782443232,
"grad_norm": 1.4845929145812988,
"learning_rate": 9.986160499534318e-06,
"loss": 1.3113,
"step": 480
},
{
"epoch": 0.12431815298744132,
"grad_norm": 1.4002912044525146,
"learning_rate": 9.982671260225156e-06,
"loss": 1.2872,
"step": 490
},
{
"epoch": 0.12685525815045035,
"grad_norm": 1.380839228630066,
"learning_rate": 9.97879092644434e-06,
"loss": 1.2743,
"step": 500
},
{
"epoch": 0.12939236331345935,
"grad_norm": 1.5403680801391602,
"learning_rate": 9.974519802762853e-06,
"loss": 1.2862,
"step": 510
},
{
"epoch": 0.13192946847646836,
"grad_norm": 1.4719021320343018,
"learning_rate": 9.969858224425138e-06,
"loss": 1.2733,
"step": 520
},
{
"epoch": 0.13446657363947737,
"grad_norm": 1.5172195434570312,
"learning_rate": 9.96480655732279e-06,
"loss": 1.2961,
"step": 530
},
{
"epoch": 0.13700367880248637,
"grad_norm": 1.3394590616226196,
"learning_rate": 9.959365197965824e-06,
"loss": 1.2742,
"step": 540
},
{
"epoch": 0.13954078396549538,
"grad_norm": 1.4464937448501587,
"learning_rate": 9.953534573451568e-06,
"loss": 1.268,
"step": 550
},
{
"epoch": 0.1420778891285044,
"grad_norm": 1.4036401510238647,
"learning_rate": 9.947315141431126e-06,
"loss": 1.2636,
"step": 560
},
{
"epoch": 0.1446149942915134,
"grad_norm": 1.3706105947494507,
"learning_rate": 9.940707390073465e-06,
"loss": 1.2728,
"step": 570
},
{
"epoch": 0.1471520994545224,
"grad_norm": 1.322920560836792,
"learning_rate": 9.933711838027096e-06,
"loss": 1.2585,
"step": 580
},
{
"epoch": 0.1496892046175314,
"grad_norm": 1.4396326541900635,
"learning_rate": 9.926329034379361e-06,
"loss": 1.2752,
"step": 590
},
{
"epoch": 0.15222630978054041,
"grad_norm": 1.389569878578186,
"learning_rate": 9.918559558613344e-06,
"loss": 1.284,
"step": 600
},
{
"epoch": 0.15476341494354942,
"grad_norm": 1.5533502101898193,
"learning_rate": 9.910404020562377e-06,
"loss": 1.2732,
"step": 610
},
{
"epoch": 0.15730052010655843,
"grad_norm": 1.3960537910461426,
"learning_rate": 9.901863060362176e-06,
"loss": 1.2756,
"step": 620
},
{
"epoch": 0.15983762526956743,
"grad_norm": 1.3757154941558838,
"learning_rate": 9.8929373484006e-06,
"loss": 1.2502,
"step": 630
},
{
"epoch": 0.16237473043257644,
"grad_norm": 1.467374324798584,
"learning_rate": 9.883627585265032e-06,
"loss": 1.2528,
"step": 640
},
{
"epoch": 0.16491183559558545,
"grad_norm": 1.4819854497909546,
"learning_rate": 9.873934501687381e-06,
"loss": 1.2602,
"step": 650
},
{
"epoch": 0.16744894075859446,
"grad_norm": 1.3049192428588867,
"learning_rate": 9.863858858486736e-06,
"loss": 1.2522,
"step": 660
},
{
"epoch": 0.16998604592160346,
"grad_norm": 1.367477536201477,
"learning_rate": 9.853401446509641e-06,
"loss": 1.2655,
"step": 670
},
{
"epoch": 0.17252315108461247,
"grad_norm": 1.394960880279541,
"learning_rate": 9.842563086568024e-06,
"loss": 1.273,
"step": 680
},
{
"epoch": 0.17506025624762148,
"grad_norm": 1.375187873840332,
"learning_rate": 9.831344629374778e-06,
"loss": 1.2805,
"step": 690
},
{
"epoch": 0.17759736141063048,
"grad_norm": 1.4008384943008423,
"learning_rate": 9.81974695547697e-06,
"loss": 1.2516,
"step": 700
},
{
"epoch": 0.1801344665736395,
"grad_norm": 1.398503065109253,
"learning_rate": 9.807770975186743e-06,
"loss": 1.2578,
"step": 710
},
{
"epoch": 0.1826715717366485,
"grad_norm": 1.3208105564117432,
"learning_rate": 9.795417628509857e-06,
"loss": 1.2591,
"step": 720
},
{
"epoch": 0.1852086768996575,
"grad_norm": 1.3660458326339722,
"learning_rate": 9.78268788507191e-06,
"loss": 1.2741,
"step": 730
},
{
"epoch": 0.1877457820626665,
"grad_norm": 1.3826677799224854,
"learning_rate": 9.769582744042224e-06,
"loss": 1.2588,
"step": 740
},
{
"epoch": 0.19028288722567552,
"grad_norm": 1.572285771369934,
"learning_rate": 9.756103234055432e-06,
"loss": 1.2609,
"step": 750
},
{
"epoch": 0.19281999238868452,
"grad_norm": 1.4226349592208862,
"learning_rate": 9.742250413130728e-06,
"loss": 1.2472,
"step": 760
},
{
"epoch": 0.19535709755169353,
"grad_norm": 1.3697686195373535,
"learning_rate": 9.728025368588829e-06,
"loss": 1.2492,
"step": 770
},
{
"epoch": 0.1978942027147025,
"grad_norm": 1.4662617444992065,
"learning_rate": 9.713429216966624e-06,
"loss": 1.2438,
"step": 780
},
{
"epoch": 0.20043130787771152,
"grad_norm": 1.425715446472168,
"learning_rate": 9.698463103929542e-06,
"loss": 1.2668,
"step": 790
},
{
"epoch": 0.20296841304072052,
"grad_norm": 1.380704402923584,
"learning_rate": 9.68312820418163e-06,
"loss": 1.2439,
"step": 800
},
{
"epoch": 0.20550551820372953,
"grad_norm": 1.4010827541351318,
"learning_rate": 9.667425721373333e-06,
"loss": 1.2606,
"step": 810
},
{
"epoch": 0.20804262336673854,
"grad_norm": 1.3978463411331177,
"learning_rate": 9.651356888007041e-06,
"loss": 1.2584,
"step": 820
},
{
"epoch": 0.21057972852974755,
"grad_norm": 1.4867024421691895,
"learning_rate": 9.634922965340334e-06,
"loss": 1.243,
"step": 830
},
{
"epoch": 0.21311683369275655,
"grad_norm": 1.3792587518692017,
"learning_rate": 9.618125243286989e-06,
"loss": 1.2341,
"step": 840
},
{
"epoch": 0.21565393885576556,
"grad_norm": 1.3928625583648682,
"learning_rate": 9.60096504031573e-06,
"loss": 1.2482,
"step": 850
},
{
"epoch": 0.21819104401877457,
"grad_norm": 1.4658421277999878,
"learning_rate": 9.58344370334675e-06,
"loss": 1.2737,
"step": 860
},
{
"epoch": 0.22072814918178357,
"grad_norm": 1.35547935962677,
"learning_rate": 9.565562607645974e-06,
"loss": 1.2433,
"step": 870
},
{
"epoch": 0.22326525434479258,
"grad_norm": 1.3512929677963257,
"learning_rate": 9.547323156717133e-06,
"loss": 1.2451,
"step": 880
},
{
"epoch": 0.2258023595078016,
"grad_norm": 1.371601939201355,
"learning_rate": 9.52872678219158e-06,
"loss": 1.2412,
"step": 890
},
{
"epoch": 0.2283394646708106,
"grad_norm": 1.4480026960372925,
"learning_rate": 9.50977494371594e-06,
"loss": 1.2487,
"step": 900
},
{
"epoch": 0.2308765698338196,
"grad_norm": 1.3161512613296509,
"learning_rate": 9.490469128837525e-06,
"loss": 1.2355,
"step": 910
},
{
"epoch": 0.2334136749968286,
"grad_norm": 1.4747735261917114,
"learning_rate": 9.470810852887586e-06,
"loss": 1.2314,
"step": 920
},
{
"epoch": 0.23595078015983761,
"grad_norm": 1.3615195751190186,
"learning_rate": 9.450801658862371e-06,
"loss": 1.249,
"step": 930
},
{
"epoch": 0.23848788532284662,
"grad_norm": 1.3905844688415527,
"learning_rate": 9.430443117302006e-06,
"loss": 1.2357,
"step": 940
},
{
"epoch": 0.24102499048585563,
"grad_norm": 1.4403789043426514,
"learning_rate": 9.409736826167233e-06,
"loss": 1.2482,
"step": 950
},
{
"epoch": 0.24356209564886463,
"grad_norm": 1.5014008283615112,
"learning_rate": 9.388684410713977e-06,
"loss": 1.2437,
"step": 960
},
{
"epoch": 0.24609920081187364,
"grad_norm": 1.383191466331482,
"learning_rate": 9.367287523365782e-06,
"loss": 1.237,
"step": 970
},
{
"epoch": 0.24863630597488265,
"grad_norm": 1.4577914476394653,
"learning_rate": 9.345547843584108e-06,
"loss": 1.2366,
"step": 980
},
{
"epoch": 0.25117341113789166,
"grad_norm": 1.3893483877182007,
"learning_rate": 9.323467077736513e-06,
"loss": 1.2432,
"step": 990
},
{
"epoch": 0.2537105163009007,
"grad_norm": 1.331141710281372,
"learning_rate": 9.301046958962707e-06,
"loss": 1.2438,
"step": 1000
},
{
"epoch": 0.25624762146390967,
"grad_norm": 1.5160998106002808,
"learning_rate": 9.278289247038537e-06,
"loss": 1.2404,
"step": 1010
},
{
"epoch": 0.2587847266269187,
"grad_norm": 1.2425806522369385,
"learning_rate": 9.255195728237837e-06,
"loss": 1.2361,
"step": 1020
},
{
"epoch": 0.2613218317899277,
"grad_norm": 1.325697660446167,
"learning_rate": 9.231768215192243e-06,
"loss": 1.2344,
"step": 1030
},
{
"epoch": 0.2638589369529367,
"grad_norm": 1.4259663820266724,
"learning_rate": 9.2080085467489e-06,
"loss": 1.258,
"step": 1040
},
{
"epoch": 0.2663960421159457,
"grad_norm": 1.4262241125106812,
"learning_rate": 9.183918587826142e-06,
"loss": 1.2518,
"step": 1050
},
{
"epoch": 0.26893314727895473,
"grad_norm": 1.4553781747817993,
"learning_rate": 9.159500229267103e-06,
"loss": 1.2387,
"step": 1060
},
{
"epoch": 0.2714702524419637,
"grad_norm": 1.3143069744110107,
"learning_rate": 9.134755387691315e-06,
"loss": 1.2474,
"step": 1070
},
{
"epoch": 0.27400735760497275,
"grad_norm": 1.3108036518096924,
"learning_rate": 9.109686005344258e-06,
"loss": 1.2362,
"step": 1080
},
{
"epoch": 0.2765444627679817,
"grad_norm": 1.4027985334396362,
"learning_rate": 9.084294049944919e-06,
"loss": 1.2344,
"step": 1090
},
{
"epoch": 0.27908156793099076,
"grad_norm": 1.3254168033599854,
"learning_rate": 9.05858151453134e-06,
"loss": 1.2294,
"step": 1100
},
{
"epoch": 0.28161867309399974,
"grad_norm": 1.3602770566940308,
"learning_rate": 9.032550417304189e-06,
"loss": 1.2408,
"step": 1110
},
{
"epoch": 0.2841557782570088,
"grad_norm": 1.3525891304016113,
"learning_rate": 9.006202801468342e-06,
"loss": 1.2436,
"step": 1120
},
{
"epoch": 0.28669288342001775,
"grad_norm": 1.3206712007522583,
"learning_rate": 8.979540735072512e-06,
"loss": 1.2103,
"step": 1130
},
{
"epoch": 0.2892299885830268,
"grad_norm": 1.3212580680847168,
"learning_rate": 8.952566310846931e-06,
"loss": 1.2184,
"step": 1140
},
{
"epoch": 0.29176709374603577,
"grad_norm": 1.4156090021133423,
"learning_rate": 8.925281646039078e-06,
"loss": 1.2323,
"step": 1150
},
{
"epoch": 0.2943041989090448,
"grad_norm": 1.3827259540557861,
"learning_rate": 8.897688882247515e-06,
"loss": 1.2226,
"step": 1160
},
{
"epoch": 0.2968413040720538,
"grad_norm": 1.375603199005127,
"learning_rate": 8.869790185253766e-06,
"loss": 1.2241,
"step": 1170
},
{
"epoch": 0.2993784092350628,
"grad_norm": 1.4006825685501099,
"learning_rate": 8.841587744852339e-06,
"loss": 1.2405,
"step": 1180
},
{
"epoch": 0.3019155143980718,
"grad_norm": 1.3298841714859009,
"learning_rate": 8.813083774678841e-06,
"loss": 1.2296,
"step": 1190
},
{
"epoch": 0.30445261956108083,
"grad_norm": 1.3826628923416138,
"learning_rate": 8.784280512036235e-06,
"loss": 1.2272,
"step": 1200
},
{
"epoch": 0.3069897247240898,
"grad_norm": 1.3651957511901855,
"learning_rate": 8.755180217719218e-06,
"loss": 1.225,
"step": 1210
},
{
"epoch": 0.30952682988709884,
"grad_norm": 1.4205344915390015,
"learning_rate": 8.72578517583679e-06,
"loss": 1.2351,
"step": 1220
},
{
"epoch": 0.3120639350501078,
"grad_norm": 1.350210189819336,
"learning_rate": 8.696097693632944e-06,
"loss": 1.2146,
"step": 1230
},
{
"epoch": 0.31460104021311686,
"grad_norm": 1.391353726387024,
"learning_rate": 8.666120101305596e-06,
"loss": 1.2444,
"step": 1240
},
{
"epoch": 0.31713814537612584,
"grad_norm": 1.4733567237854004,
"learning_rate": 8.635854751823666e-06,
"loss": 1.2427,
"step": 1250
},
{
"epoch": 0.31967525053913487,
"grad_norm": 1.3797138929367065,
"learning_rate": 8.60530402074241e-06,
"loss": 1.2236,
"step": 1260
},
{
"epoch": 0.32221235570214385,
"grad_norm": 1.2921204566955566,
"learning_rate": 8.574470306016936e-06,
"loss": 1.2375,
"step": 1270
},
{
"epoch": 0.3247494608651529,
"grad_norm": 1.3980249166488647,
"learning_rate": 8.543356027814009e-06,
"loss": 1.2176,
"step": 1280
},
{
"epoch": 0.32728656602816186,
"grad_norm": 1.3124829530715942,
"learning_rate": 8.511963628322076e-06,
"loss": 1.2289,
"step": 1290
},
{
"epoch": 0.3298236711911709,
"grad_norm": 1.3660489320755005,
"learning_rate": 8.480295571559581e-06,
"loss": 1.2222,
"step": 1300
},
{
"epoch": 0.3323607763541799,
"grad_norm": 1.3039889335632324,
"learning_rate": 8.448354343181568e-06,
"loss": 1.23,
"step": 1310
},
{
"epoch": 0.3348978815171889,
"grad_norm": 1.2941962480545044,
"learning_rate": 8.416142450284565e-06,
"loss": 1.234,
"step": 1320
},
{
"epoch": 0.3374349866801979,
"grad_norm": 1.3144862651824951,
"learning_rate": 8.383662421209813e-06,
"loss": 1.2291,
"step": 1330
},
{
"epoch": 0.3399720918432069,
"grad_norm": 1.4170352220535278,
"learning_rate": 8.350916805344812e-06,
"loss": 1.2501,
"step": 1340
},
{
"epoch": 0.3425091970062159,
"grad_norm": 1.2927628755569458,
"learning_rate": 8.317908172923207e-06,
"loss": 1.2057,
"step": 1350
},
{
"epoch": 0.34504630216922494,
"grad_norm": 1.3344128131866455,
"learning_rate": 8.28463911482306e-06,
"loss": 1.2244,
"step": 1360
},
{
"epoch": 0.3475834073322339,
"grad_norm": 1.3674781322479248,
"learning_rate": 8.251112242363488e-06,
"loss": 1.241,
"step": 1370
},
{
"epoch": 0.35012051249524295,
"grad_norm": 1.3904838562011719,
"learning_rate": 8.217330187099689e-06,
"loss": 1.2063,
"step": 1380
},
{
"epoch": 0.35265761765825193,
"grad_norm": 1.3588337898254395,
"learning_rate": 8.183295600616399e-06,
"loss": 1.2127,
"step": 1390
},
{
"epoch": 0.35519472282126097,
"grad_norm": 1.4805495738983154,
"learning_rate": 8.149011154319763e-06,
"loss": 1.224,
"step": 1400
},
{
"epoch": 0.35773182798426995,
"grad_norm": 1.3868420124053955,
"learning_rate": 8.114479539227653e-06,
"loss": 1.2399,
"step": 1410
},
{
"epoch": 0.360268933147279,
"grad_norm": 1.3122705221176147,
"learning_rate": 8.079703465758447e-06,
"loss": 1.216,
"step": 1420
},
{
"epoch": 0.36280603831028796,
"grad_norm": 1.3743106126785278,
"learning_rate": 8.044685663518289e-06,
"loss": 1.2258,
"step": 1430
},
{
"epoch": 0.365343143473297,
"grad_norm": 1.274947166442871,
"learning_rate": 8.009428881086836e-06,
"loss": 1.2159,
"step": 1440
},
{
"epoch": 0.367880248636306,
"grad_norm": 1.3252506256103516,
"learning_rate": 7.97393588580152e-06,
"loss": 1.2097,
"step": 1450
},
{
"epoch": 0.370417353799315,
"grad_norm": 1.349714756011963,
"learning_rate": 7.93820946354034e-06,
"loss": 1.2118,
"step": 1460
},
{
"epoch": 0.372954458962324,
"grad_norm": 1.353014349937439,
"learning_rate": 7.902252418503198e-06,
"loss": 1.2293,
"step": 1470
},
{
"epoch": 0.375491564125333,
"grad_norm": 1.3702679872512817,
"learning_rate": 7.86606757299178e-06,
"loss": 1.2096,
"step": 1480
},
{
"epoch": 0.378028669288342,
"grad_norm": 1.4039068222045898,
"learning_rate": 7.829657767188052e-06,
"loss": 1.2264,
"step": 1490
},
{
"epoch": 0.38056577445135104,
"grad_norm": 1.3884947299957275,
"learning_rate": 7.793025858931317e-06,
"loss": 1.2283,
"step": 1500
},
{
"epoch": 0.38310287961436,
"grad_norm": 1.306620717048645,
"learning_rate": 7.756174723493908e-06,
"loss": 1.2325,
"step": 1510
},
{
"epoch": 0.38563998477736905,
"grad_norm": 1.3600883483886719,
"learning_rate": 7.719107253355494e-06,
"loss": 1.2324,
"step": 1520
},
{
"epoch": 0.38817708994037803,
"grad_norm": 1.2543104887008667,
"learning_rate": 7.68182635797606e-06,
"loss": 1.1939,
"step": 1530
},
{
"epoch": 0.39071419510338706,
"grad_norm": 1.2916842699050903,
"learning_rate": 7.644334963567542e-06,
"loss": 1.2105,
"step": 1540
},
{
"epoch": 0.39325130026639604,
"grad_norm": 1.3755340576171875,
"learning_rate": 7.606636012864126e-06,
"loss": 1.226,
"step": 1550
},
{
"epoch": 0.395788405429405,
"grad_norm": 1.3301304578781128,
"learning_rate": 7.568732464891293e-06,
"loss": 1.2194,
"step": 1560
},
{
"epoch": 0.39832551059241406,
"grad_norm": 1.4263029098510742,
"learning_rate": 7.530627294733549e-06,
"loss": 1.2152,
"step": 1570
},
{
"epoch": 0.40086261575542304,
"grad_norm": 1.3354136943817139,
"learning_rate": 7.492323493300912e-06,
"loss": 1.2028,
"step": 1580
},
{
"epoch": 0.40339972091843207,
"grad_norm": 1.3600828647613525,
"learning_rate": 7.453824067094152e-06,
"loss": 1.2132,
"step": 1590
},
{
"epoch": 0.40593682608144105,
"grad_norm": 1.3342976570129395,
"learning_rate": 7.4151320379688105e-06,
"loss": 1.2235,
"step": 1600
},
{
"epoch": 0.4084739312444501,
"grad_norm": 1.3055970668792725,
"learning_rate": 7.376250442898006e-06,
"loss": 1.2121,
"step": 1610
},
{
"epoch": 0.41101103640745906,
"grad_norm": 1.2475143671035767,
"learning_rate": 7.33718233373407e-06,
"loss": 1.213,
"step": 1620
},
{
"epoch": 0.4135481415704681,
"grad_norm": 1.3556421995162964,
"learning_rate": 7.297930776968989e-06,
"loss": 1.2219,
"step": 1630
},
{
"epoch": 0.4160852467334771,
"grad_norm": 1.4067039489746094,
"learning_rate": 7.258498853493729e-06,
"loss": 1.2248,
"step": 1640
},
{
"epoch": 0.4186223518964861,
"grad_norm": 1.3992860317230225,
"learning_rate": 7.2188896583563984e-06,
"loss": 1.2041,
"step": 1650
},
{
"epoch": 0.4211594570594951,
"grad_norm": 1.267992615699768,
"learning_rate": 7.179106300519329e-06,
"loss": 1.232,
"step": 1660
},
{
"epoch": 0.4236965622225041,
"grad_norm": 1.3517013788223267,
"learning_rate": 7.13915190261504e-06,
"loss": 1.2012,
"step": 1670
},
{
"epoch": 0.4262336673855131,
"grad_norm": 1.3722771406173706,
"learning_rate": 7.099029600701144e-06,
"loss": 1.2013,
"step": 1680
},
{
"epoch": 0.42877077254852214,
"grad_norm": 1.3306978940963745,
"learning_rate": 7.0587425440141955e-06,
"loss": 1.2057,
"step": 1690
},
{
"epoch": 0.4313078777115311,
"grad_norm": 1.2793058156967163,
"learning_rate": 7.0182938947225025e-06,
"loss": 1.2094,
"step": 1700
},
{
"epoch": 0.43384498287454015,
"grad_norm": 1.2533907890319824,
"learning_rate": 6.977686827677926e-06,
"loss": 1.22,
"step": 1710
},
{
"epoch": 0.43638208803754913,
"grad_norm": 1.3793444633483887,
"learning_rate": 6.936924530166682e-06,
"loss": 1.2301,
"step": 1720
},
{
"epoch": 0.43891919320055817,
"grad_norm": 1.2866994142532349,
"learning_rate": 6.896010201659173e-06,
"loss": 1.2108,
"step": 1730
},
{
"epoch": 0.44145629836356715,
"grad_norm": 1.3020964860916138,
"learning_rate": 6.854947053558849e-06,
"loss": 1.2133,
"step": 1740
},
{
"epoch": 0.4439934035265762,
"grad_norm": 1.266717791557312,
"learning_rate": 6.8137383089501526e-06,
"loss": 1.2067,
"step": 1750
},
{
"epoch": 0.44653050868958516,
"grad_norm": 1.3352110385894775,
"learning_rate": 6.772387202345528e-06,
"loss": 1.2128,
"step": 1760
},
{
"epoch": 0.4490676138525942,
"grad_norm": 1.2845940589904785,
"learning_rate": 6.730896979431543e-06,
"loss": 1.2168,
"step": 1770
},
{
"epoch": 0.4516047190156032,
"grad_norm": 1.3710917234420776,
"learning_rate": 6.689270896814139e-06,
"loss": 1.2091,
"step": 1780
},
{
"epoch": 0.4541418241786122,
"grad_norm": 1.351404070854187,
"learning_rate": 6.647512221763005e-06,
"loss": 1.2047,
"step": 1790
},
{
"epoch": 0.4566789293416212,
"grad_norm": 1.3901602029800415,
"learning_rate": 6.6056242319551315e-06,
"loss": 1.2074,
"step": 1800
},
{
"epoch": 0.4592160345046302,
"grad_norm": 1.3269678354263306,
"learning_rate": 6.563610215217551e-06,
"loss": 1.2012,
"step": 1810
},
{
"epoch": 0.4617531396676392,
"grad_norm": 1.4395819902420044,
"learning_rate": 6.5214734692692594e-06,
"loss": 1.2121,
"step": 1820
},
{
"epoch": 0.46429024483064824,
"grad_norm": 1.3422843217849731,
"learning_rate": 6.479217301462386e-06,
"loss": 1.2072,
"step": 1830
},
{
"epoch": 0.4668273499936572,
"grad_norm": 1.389143466949463,
"learning_rate": 6.43684502852259e-06,
"loss": 1.2005,
"step": 1840
},
{
"epoch": 0.46936445515666625,
"grad_norm": 1.2481592893600464,
"learning_rate": 6.394359976288729e-06,
"loss": 1.2026,
"step": 1850
},
{
"epoch": 0.47190156031967523,
"grad_norm": 1.2201452255249023,
"learning_rate": 6.3517654794518156e-06,
"loss": 1.2086,
"step": 1860
},
{
"epoch": 0.47443866548268426,
"grad_norm": 1.2975594997406006,
"learning_rate": 6.309064881293265e-06,
"loss": 1.2118,
"step": 1870
},
{
"epoch": 0.47697577064569324,
"grad_norm": 1.3062618970870972,
"learning_rate": 6.266261533422487e-06,
"loss": 1.2117,
"step": 1880
},
{
"epoch": 0.4795128758087023,
"grad_norm": 1.356292963027954,
"learning_rate": 6.223358795513812e-06,
"loss": 1.2045,
"step": 1890
},
{
"epoch": 0.48204998097171126,
"grad_norm": 1.259065866470337,
"learning_rate": 6.18036003504278e-06,
"loss": 1.1995,
"step": 1900
},
{
"epoch": 0.4845870861347203,
"grad_norm": 1.3219462633132935,
"learning_rate": 6.1372686270218385e-06,
"loss": 1.1936,
"step": 1910
},
{
"epoch": 0.48712419129772927,
"grad_norm": 1.2755314111709595,
"learning_rate": 6.094087953735423e-06,
"loss": 1.2122,
"step": 1920
},
{
"epoch": 0.4896612964607383,
"grad_norm": 1.2812877893447876,
"learning_rate": 6.050821404474483e-06,
"loss": 1.1939,
"step": 1930
},
{
"epoch": 0.4921984016237473,
"grad_norm": 1.308876633644104,
"learning_rate": 6.00747237527045e-06,
"loss": 1.2163,
"step": 1940
},
{
"epoch": 0.4947355067867563,
"grad_norm": 1.3260457515716553,
"learning_rate": 5.964044268628688e-06,
"loss": 1.2022,
"step": 1950
},
{
"epoch": 0.4972726119497653,
"grad_norm": 1.3721497058868408,
"learning_rate": 5.920540493261415e-06,
"loss": 1.2128,
"step": 1960
},
{
"epoch": 0.49980971711277433,
"grad_norm": 1.4000312089920044,
"learning_rate": 5.8769644638201635e-06,
"loss": 1.2014,
"step": 1970
},
{
"epoch": 0.5023468222757833,
"grad_norm": 1.31642746925354,
"learning_rate": 5.8333196006277536e-06,
"loss": 1.1962,
"step": 1980
},
{
"epoch": 0.5048839274387923,
"grad_norm": 1.224902629852295,
"learning_rate": 5.789609329409826e-06,
"loss": 1.2015,
"step": 1990
},
{
"epoch": 0.5074210326018014,
"grad_norm": 1.3817135095596313,
"learning_rate": 5.7458370810259635e-06,
"loss": 1.1935,
"step": 2000
},
{
"epoch": 0.5099581377648104,
"grad_norm": 1.268218755722046,
"learning_rate": 5.702006291200389e-06,
"loss": 1.1894,
"step": 2010
},
{
"epoch": 0.5124952429278193,
"grad_norm": 1.4027165174484253,
"learning_rate": 5.6581204002523e-06,
"loss": 1.1883,
"step": 2020
},
{
"epoch": 0.5150323480908283,
"grad_norm": 1.2554658651351929,
"learning_rate": 5.614182852825835e-06,
"loss": 1.1995,
"step": 2030
},
{
"epoch": 0.5175694532538374,
"grad_norm": 1.4268817901611328,
"learning_rate": 5.570197097619688e-06,
"loss": 1.2145,
"step": 2040
},
{
"epoch": 0.5201065584168464,
"grad_norm": 1.2476046085357666,
"learning_rate": 5.526166587116436e-06,
"loss": 1.1952,
"step": 2050
},
{
"epoch": 0.5226436635798554,
"grad_norm": 1.3912453651428223,
"learning_rate": 5.4820947773115374e-06,
"loss": 1.2126,
"step": 2060
},
{
"epoch": 0.5251807687428643,
"grad_norm": 1.3362759351730347,
"learning_rate": 5.437985127442065e-06,
"loss": 1.1981,
"step": 2070
},
{
"epoch": 0.5277178739058734,
"grad_norm": 1.243674874305725,
"learning_rate": 5.393841099715205e-06,
"loss": 1.1944,
"step": 2080
},
{
"epoch": 0.5302549790688824,
"grad_norm": 1.3977802991867065,
"learning_rate": 5.349666159036482e-06,
"loss": 1.1924,
"step": 2090
},
{
"epoch": 0.5327920842318914,
"grad_norm": 1.330768346786499,
"learning_rate": 5.305463772737812e-06,
"loss": 1.1907,
"step": 2100
},
{
"epoch": 0.5353291893949004,
"grad_norm": 1.3184082508087158,
"learning_rate": 5.261237410305344e-06,
"loss": 1.1979,
"step": 2110
},
{
"epoch": 0.5378662945579095,
"grad_norm": 1.3445682525634766,
"learning_rate": 5.2169905431071356e-06,
"loss": 1.2007,
"step": 2120
},
{
"epoch": 0.5404033997209184,
"grad_norm": 1.2852333784103394,
"learning_rate": 5.172726644120678e-06,
"loss": 1.187,
"step": 2130
},
{
"epoch": 0.5429405048839274,
"grad_norm": 1.33376944065094,
"learning_rate": 5.128449187660309e-06,
"loss": 1.1913,
"step": 2140
},
{
"epoch": 0.5454776100469364,
"grad_norm": 1.3172907829284668,
"learning_rate": 5.084161649104502e-06,
"loss": 1.1996,
"step": 2150
},
{
"epoch": 0.5480147152099455,
"grad_norm": 1.3981866836547852,
"learning_rate": 5.039867504623084e-06,
"loss": 1.1792,
"step": 2160
},
{
"epoch": 0.5505518203729545,
"grad_norm": 1.2657582759857178,
"learning_rate": 4.995570230904386e-06,
"loss": 1.1744,
"step": 2170
},
{
"epoch": 0.5530889255359634,
"grad_norm": 1.2834333181381226,
"learning_rate": 4.951273304882358e-06,
"loss": 1.1934,
"step": 2180
},
{
"epoch": 0.5556260306989724,
"grad_norm": 1.2566810846328735,
"learning_rate": 4.906980203463659e-06,
"loss": 1.1934,
"step": 2190
},
{
"epoch": 0.5581631358619815,
"grad_norm": 1.310981035232544,
"learning_rate": 4.862694403254747e-06,
"loss": 1.1952,
"step": 2200
},
{
"epoch": 0.5607002410249905,
"grad_norm": 1.2907445430755615,
"learning_rate": 4.818419380289009e-06,
"loss": 1.213,
"step": 2210
},
{
"epoch": 0.5632373461879995,
"grad_norm": 1.3687422275543213,
"learning_rate": 4.774158609753908e-06,
"loss": 1.1969,
"step": 2220
},
{
"epoch": 0.5657744513510085,
"grad_norm": 1.2513952255249023,
"learning_rate": 4.729915565718223e-06,
"loss": 1.1855,
"step": 2230
},
{
"epoch": 0.5683115565140175,
"grad_norm": 1.3049182891845703,
"learning_rate": 4.685693720859369e-06,
"loss": 1.1888,
"step": 2240
},
{
"epoch": 0.5708486616770265,
"grad_norm": 1.2568814754486084,
"learning_rate": 4.641496546190813e-06,
"loss": 1.1858,
"step": 2250
},
{
"epoch": 0.5733857668400355,
"grad_norm": 1.2858439683914185,
"learning_rate": 4.597327510789635e-06,
"loss": 1.18,
"step": 2260
},
{
"epoch": 0.5759228720030445,
"grad_norm": 1.3341573476791382,
"learning_rate": 4.553190081524242e-06,
"loss": 1.1904,
"step": 2270
},
{
"epoch": 0.5784599771660536,
"grad_norm": 1.2678639888763428,
"learning_rate": 4.5090877227822424e-06,
"loss": 1.1908,
"step": 2280
},
{
"epoch": 0.5809970823290626,
"grad_norm": 1.3060572147369385,
"learning_rate": 4.46502389619853e-06,
"loss": 1.202,
"step": 2290
},
{
"epoch": 0.5835341874920715,
"grad_norm": 1.2779840230941772,
"learning_rate": 4.421002060383569e-06,
"loss": 1.1926,
"step": 2300
},
{
"epoch": 0.5860712926550805,
"grad_norm": 1.3469980955123901,
"learning_rate": 4.3770256706519375e-06,
"loss": 1.1777,
"step": 2310
},
{
"epoch": 0.5886083978180896,
"grad_norm": 1.2928153276443481,
"learning_rate": 4.3330981787511006e-06,
"loss": 1.1779,
"step": 2320
},
{
"epoch": 0.5911455029810986,
"grad_norm": 1.424657940864563,
"learning_rate": 4.289223032590491e-06,
"loss": 1.2134,
"step": 2330
},
{
"epoch": 0.5936826081441076,
"grad_norm": 1.3630969524383545,
"learning_rate": 4.245403675970877e-06,
"loss": 1.1821,
"step": 2340
},
{
"epoch": 0.5962197133071165,
"grad_norm": 1.3015477657318115,
"learning_rate": 4.201643548314051e-06,
"loss": 1.1874,
"step": 2350
},
{
"epoch": 0.5987568184701256,
"grad_norm": 1.2937594652175903,
"learning_rate": 4.157946084392871e-06,
"loss": 1.2015,
"step": 2360
},
{
"epoch": 0.6012939236331346,
"grad_norm": 1.1649646759033203,
"learning_rate": 4.114314714061659e-06,
"loss": 1.1787,
"step": 2370
},
{
"epoch": 0.6038310287961436,
"grad_norm": 1.2585970163345337,
"learning_rate": 4.0707528619869976e-06,
"loss": 1.1739,
"step": 2380
},
{
"epoch": 0.6063681339591526,
"grad_norm": 1.3575518131256104,
"learning_rate": 4.027263947378907e-06,
"loss": 1.1744,
"step": 2390
},
{
"epoch": 0.6089052391221617,
"grad_norm": 1.3373316526412964,
"learning_rate": 3.9838513837224814e-06,
"loss": 1.189,
"step": 2400
},
{
"epoch": 0.6114423442851706,
"grad_norm": 1.2326818704605103,
"learning_rate": 3.940518578509963e-06,
"loss": 1.1842,
"step": 2410
},
{
"epoch": 0.6139794494481796,
"grad_norm": 1.34890878200531,
"learning_rate": 3.8972689329732725e-06,
"loss": 1.1954,
"step": 2420
},
{
"epoch": 0.6165165546111886,
"grad_norm": 1.2541084289550781,
"learning_rate": 3.854105841817056e-06,
"loss": 1.1771,
"step": 2430
},
{
"epoch": 0.6190536597741977,
"grad_norm": 1.254408836364746,
"learning_rate": 3.811032692952227e-06,
"loss": 1.1655,
"step": 2440
},
{
"epoch": 0.6215907649372067,
"grad_norm": 1.2309529781341553,
"learning_rate": 3.7680528672300404e-06,
"loss": 1.1909,
"step": 2450
},
{
"epoch": 0.6241278701002156,
"grad_norm": 1.3115286827087402,
"learning_rate": 3.7251697381767373e-06,
"loss": 1.192,
"step": 2460
},
{
"epoch": 0.6266649752632246,
"grad_norm": 1.23496413230896,
"learning_rate": 3.6823866717287437e-06,
"loss": 1.1905,
"step": 2470
},
{
"epoch": 0.6292020804262337,
"grad_norm": 1.3248686790466309,
"learning_rate": 3.6397070259684793e-06,
"loss": 1.1864,
"step": 2480
},
{
"epoch": 0.6317391855892427,
"grad_norm": 1.256982445716858,
"learning_rate": 3.5971341508607814e-06,
"loss": 1.169,
"step": 2490
},
{
"epoch": 0.6342762907522517,
"grad_norm": 1.30142343044281,
"learning_rate": 3.5546713879899563e-06,
"loss": 1.1699,
"step": 2500
},
{
"epoch": 0.6368133959152606,
"grad_norm": 1.3655248880386353,
"learning_rate": 3.512322070297503e-06,
"loss": 1.1719,
"step": 2510
},
{
"epoch": 0.6393505010782697,
"grad_norm": 1.288802146911621,
"learning_rate": 3.4700895218205026e-06,
"loss": 1.1869,
"step": 2520
},
{
"epoch": 0.6418876062412787,
"grad_norm": 1.245276689529419,
"learning_rate": 3.4279770574307096e-06,
"loss": 1.1882,
"step": 2530
},
{
"epoch": 0.6444247114042877,
"grad_norm": 1.222338080406189,
"learning_rate": 3.385987982574372e-06,
"loss": 1.1746,
"step": 2540
},
{
"epoch": 0.6469618165672967,
"grad_norm": 1.2364776134490967,
"learning_rate": 3.3441255930127752e-06,
"loss": 1.1912,
"step": 2550
},
{
"epoch": 0.6494989217303058,
"grad_norm": 1.270403504371643,
"learning_rate": 3.3023931745635606e-06,
"loss": 1.1805,
"step": 2560
},
{
"epoch": 0.6520360268933147,
"grad_norm": 1.3970060348510742,
"learning_rate": 3.2607940028428154e-06,
"loss": 1.1913,
"step": 2570
},
{
"epoch": 0.6545731320563237,
"grad_norm": 1.3008099794387817,
"learning_rate": 3.2193313430079737e-06,
"loss": 1.1978,
"step": 2580
},
{
"epoch": 0.6571102372193327,
"grad_norm": 1.3095245361328125,
"learning_rate": 3.178008449501517e-06,
"loss": 1.1744,
"step": 2590
},
{
"epoch": 0.6596473423823418,
"grad_norm": 1.1827950477600098,
"learning_rate": 3.1368285657955464e-06,
"loss": 1.1779,
"step": 2600
},
{
"epoch": 0.6621844475453508,
"grad_norm": 1.2912861108779907,
"learning_rate": 3.0957949241371845e-06,
"loss": 1.197,
"step": 2610
},
{
"epoch": 0.6647215527083598,
"grad_norm": 1.2797883749008179,
"learning_rate": 3.0549107452948866e-06,
"loss": 1.1945,
"step": 2620
},
{
"epoch": 0.6672586578713687,
"grad_norm": 1.3769925832748413,
"learning_rate": 3.014179238305629e-06,
"loss": 1.1819,
"step": 2630
},
{
"epoch": 0.6697957630343778,
"grad_norm": 1.204916000366211,
"learning_rate": 2.9736036002230332e-06,
"loss": 1.1646,
"step": 2640
},
{
"epoch": 0.6723328681973868,
"grad_norm": 1.216291904449463,
"learning_rate": 2.933187015866431e-06,
"loss": 1.1929,
"step": 2650
},
{
"epoch": 0.6748699733603958,
"grad_norm": 1.273775339126587,
"learning_rate": 2.892932657570878e-06,
"loss": 1.1775,
"step": 2660
},
{
"epoch": 0.6774070785234048,
"grad_norm": 1.2574785947799683,
"learning_rate": 2.8528436849381518e-06,
"loss": 1.2057,
"step": 2670
},
{
"epoch": 0.6799441836864138,
"grad_norm": 1.2418912649154663,
"learning_rate": 2.8129232445887623e-06,
"loss": 1.1858,
"step": 2680
},
{
"epoch": 0.6824812888494228,
"grad_norm": 1.3044190406799316,
"learning_rate": 2.773174469914964e-06,
"loss": 1.1867,
"step": 2690
},
{
"epoch": 0.6850183940124318,
"grad_norm": 1.3013222217559814,
"learning_rate": 2.7336004808348094e-06,
"loss": 1.1737,
"step": 2700
},
{
"epoch": 0.6875554991754408,
"grad_norm": 1.2504340410232544,
"learning_rate": 2.6942043835472725e-06,
"loss": 1.1827,
"step": 2710
},
{
"epoch": 0.6900926043384499,
"grad_norm": 1.292549729347229,
"learning_rate": 2.654989270288435e-06,
"loss": 1.1844,
"step": 2720
},
{
"epoch": 0.6926297095014589,
"grad_norm": 1.2378226518630981,
"learning_rate": 2.615958219088776e-06,
"loss": 1.1827,
"step": 2730
},
{
"epoch": 0.6951668146644678,
"grad_norm": 1.1776105165481567,
"learning_rate": 2.577114293531571e-06,
"loss": 1.1764,
"step": 2740
},
{
"epoch": 0.6977039198274768,
"grad_norm": 1.2368700504302979,
"learning_rate": 2.538460542512435e-06,
"loss": 1.1788,
"step": 2750
},
{
"epoch": 0.7002410249904859,
"grad_norm": 1.3077671527862549,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.1786,
"step": 2760
},
{
"epoch": 0.7027781301534949,
"grad_norm": 1.2264658212661743,
"learning_rate": 2.461735684797794e-06,
"loss": 1.1891,
"step": 2770
},
{
"epoch": 0.7053152353165039,
"grad_norm": 1.411934494972229,
"learning_rate": 2.4236706003072733e-06,
"loss": 1.2021,
"step": 2780
},
{
"epoch": 0.7078523404795128,
"grad_norm": 1.3481911420822144,
"learning_rate": 2.385807734292097e-06,
"loss": 1.1687,
"step": 2790
},
{
"epoch": 0.7103894456425219,
"grad_norm": 1.2622945308685303,
"learning_rate": 2.3481500586436067e-06,
"loss": 1.1906,
"step": 2800
},
{
"epoch": 0.7129265508055309,
"grad_norm": 1.3974981307983398,
"learning_rate": 2.3107005291475653e-06,
"loss": 1.1894,
"step": 2810
},
{
"epoch": 0.7154636559685399,
"grad_norm": 1.2239971160888672,
"learning_rate": 2.273462085252146e-06,
"loss": 1.1596,
"step": 2820
},
{
"epoch": 0.7180007611315489,
"grad_norm": 1.3323861360549927,
"learning_rate": 2.236437649837223e-06,
"loss": 1.2045,
"step": 2830
},
{
"epoch": 0.720537866294558,
"grad_norm": 1.2548524141311646,
"learning_rate": 2.1996301289849474e-06,
"loss": 1.1791,
"step": 2840
},
{
"epoch": 0.7230749714575669,
"grad_norm": 1.3251192569732666,
"learning_rate": 2.1630424117516436e-06,
"loss": 1.174,
"step": 2850
},
{
"epoch": 0.7256120766205759,
"grad_norm": 1.2302052974700928,
"learning_rate": 2.126677369941047e-06,
"loss": 1.1498,
"step": 2860
},
{
"epoch": 0.7281491817835849,
"grad_norm": 1.2613223791122437,
"learning_rate": 2.0905378578788947e-06,
"loss": 1.1799,
"step": 2870
},
{
"epoch": 0.730686286946594,
"grad_norm": 1.2105897665023804,
"learning_rate": 2.0546267121888863e-06,
"loss": 1.169,
"step": 2880
},
{
"epoch": 0.733223392109603,
"grad_norm": 1.3086020946502686,
"learning_rate": 2.0189467515700283e-06,
"loss": 1.166,
"step": 2890
},
{
"epoch": 0.735760497272612,
"grad_norm": 1.2625070810317993,
"learning_rate": 1.9835007765754035e-06,
"loss": 1.1944,
"step": 2900
},
{
"epoch": 0.7382976024356209,
"grad_norm": 1.3192371129989624,
"learning_rate": 1.9482915693923442e-06,
"loss": 1.1712,
"step": 2910
},
{
"epoch": 0.74083470759863,
"grad_norm": 1.2234618663787842,
"learning_rate": 1.913321893624059e-06,
"loss": 1.1753,
"step": 2920
},
{
"epoch": 0.743371812761639,
"grad_norm": 1.3323945999145508,
"learning_rate": 1.878594494072713e-06,
"loss": 1.1681,
"step": 2930
},
{
"epoch": 0.745908917924648,
"grad_norm": 1.2558120489120483,
"learning_rate": 1.8441120965239912e-06,
"loss": 1.1796,
"step": 2940
},
{
"epoch": 0.748446023087657,
"grad_norm": 1.2716906070709229,
"learning_rate": 1.8098774075331383e-06,
"loss": 1.1894,
"step": 2950
},
{
"epoch": 0.750983128250666,
"grad_norm": 1.2781611680984497,
"learning_rate": 1.7758931142125308e-06,
"loss": 1.1855,
"step": 2960
},
{
"epoch": 0.753520233413675,
"grad_norm": 1.3427870273590088,
"learning_rate": 1.7421618840207576e-06,
"loss": 1.183,
"step": 2970
},
{
"epoch": 0.756057338576684,
"grad_norm": 1.2347464561462402,
"learning_rate": 1.7086863645532425e-06,
"loss": 1.1615,
"step": 2980
},
{
"epoch": 0.758594443739693,
"grad_norm": 1.2659286260604858,
"learning_rate": 1.6754691833344472e-06,
"loss": 1.1926,
"step": 2990
},
{
"epoch": 0.7611315489027021,
"grad_norm": 1.2952594757080078,
"learning_rate": 1.642512947611622e-06,
"loss": 1.1988,
"step": 3000
},
{
"epoch": 0.763668654065711,
"grad_norm": 1.2655588388442993,
"learning_rate": 1.6098202441501599e-06,
"loss": 1.1691,
"step": 3010
},
{
"epoch": 0.76620575922872,
"grad_norm": 1.2471306324005127,
"learning_rate": 1.5773936390305678e-06,
"loss": 1.1572,
"step": 3020
},
{
"epoch": 0.768742864391729,
"grad_norm": 1.2647178173065186,
"learning_rate": 1.5452356774470468e-06,
"loss": 1.1733,
"step": 3030
},
{
"epoch": 0.7712799695547381,
"grad_norm": 1.2472219467163086,
"learning_rate": 1.5133488835077204e-06,
"loss": 1.1772,
"step": 3040
},
{
"epoch": 0.7738170747177471,
"grad_norm": 1.2521233558654785,
"learning_rate": 1.4817357600365061e-06,
"loss": 1.172,
"step": 3050
},
{
"epoch": 0.7763541798807561,
"grad_norm": 1.3133138418197632,
"learning_rate": 1.4503987883766857e-06,
"loss": 1.1784,
"step": 3060
},
{
"epoch": 0.778891285043765,
"grad_norm": 1.2727947235107422,
"learning_rate": 1.4193404281961172e-06,
"loss": 1.1817,
"step": 3070
},
{
"epoch": 0.7814283902067741,
"grad_norm": 1.2805311679840088,
"learning_rate": 1.3885631172941932e-06,
"loss": 1.1841,
"step": 3080
},
{
"epoch": 0.7839654953697831,
"grad_norm": 1.2885172367095947,
"learning_rate": 1.3580692714104887e-06,
"loss": 1.162,
"step": 3090
},
{
"epoch": 0.7865026005327921,
"grad_norm": 1.3356949090957642,
"learning_rate": 1.3278612840351468e-06,
"loss": 1.1879,
"step": 3100
},
{
"epoch": 0.7890397056958011,
"grad_norm": 1.2000919580459595,
"learning_rate": 1.2979415262210089e-06,
"loss": 1.1772,
"step": 3110
},
{
"epoch": 0.79157681085881,
"grad_norm": 1.2145313024520874,
"learning_rate": 1.2683123463975144e-06,
"loss": 1.1662,
"step": 3120
},
{
"epoch": 0.7941139160218191,
"grad_norm": 1.2365864515304565,
"learning_rate": 1.2389760701863717e-06,
"loss": 1.1916,
"step": 3130
},
{
"epoch": 0.7966510211848281,
"grad_norm": 1.2178484201431274,
"learning_rate": 1.2099350002190063e-06,
"loss": 1.1686,
"step": 3140
},
{
"epoch": 0.7991881263478371,
"grad_norm": 1.2906898260116577,
"learning_rate": 1.1811914159558374e-06,
"loss": 1.1979,
"step": 3150
},
{
"epoch": 0.8017252315108461,
"grad_norm": 1.2861703634262085,
"learning_rate": 1.1527475735073574e-06,
"loss": 1.1937,
"step": 3160
},
{
"epoch": 0.8042623366738552,
"grad_norm": 1.2466925382614136,
"learning_rate": 1.1246057054570414e-06,
"loss": 1.1632,
"step": 3170
},
{
"epoch": 0.8067994418368641,
"grad_norm": 1.1871761083602905,
"learning_rate": 1.0967680206861198e-06,
"loss": 1.1587,
"step": 3180
},
{
"epoch": 0.8093365469998731,
"grad_norm": 1.2665520906448364,
"learning_rate": 1.069236704200195e-06,
"loss": 1.1679,
"step": 3190
},
{
"epoch": 0.8118736521628821,
"grad_norm": 1.197919487953186,
"learning_rate": 1.0420139169577393e-06,
"loss": 1.1652,
"step": 3200
},
{
"epoch": 0.8144107573258912,
"grad_norm": 1.2373968362808228,
"learning_rate": 1.01510179570048e-06,
"loss": 1.178,
"step": 3210
},
{
"epoch": 0.8169478624889002,
"grad_norm": 1.2911186218261719,
"learning_rate": 9.88502452785685e-07,
"loss": 1.1735,
"step": 3220
},
{
"epoch": 0.8194849676519091,
"grad_norm": 1.2003546953201294,
"learning_rate": 9.62217976020357e-07,
"loss": 1.1836,
"step": 3230
},
{
"epoch": 0.8220220728149181,
"grad_norm": 1.3649158477783203,
"learning_rate": 9.362504284973683e-07,
"loss": 1.1651,
"step": 3240
},
{
"epoch": 0.8245591779779272,
"grad_norm": 1.3181599378585815,
"learning_rate": 9.1060184843352e-07,
"loss": 1.1735,
"step": 3250
},
{
"epoch": 0.8270962831409362,
"grad_norm": 1.2371008396148682,
"learning_rate": 8.852742490095628e-07,
"loss": 1.1629,
"step": 3260
},
{
"epoch": 0.8296333883039452,
"grad_norm": 1.181353211402893,
"learning_rate": 8.602696182121812e-07,
"loss": 1.1722,
"step": 3270
},
{
"epoch": 0.8321704934669542,
"grad_norm": 1.314288854598999,
"learning_rate": 8.35589918677952e-07,
"loss": 1.1713,
"step": 3280
},
{
"epoch": 0.8347075986299632,
"grad_norm": 1.2208564281463623,
"learning_rate": 8.112370875393e-07,
"loss": 1.1801,
"step": 3290
},
{
"epoch": 0.8372447037929722,
"grad_norm": 1.2573906183242798,
"learning_rate": 7.872130362724422e-07,
"loss": 1.154,
"step": 3300
},
{
"epoch": 0.8397818089559812,
"grad_norm": 1.286879301071167,
"learning_rate": 7.635196505473652e-07,
"loss": 1.1759,
"step": 3310
},
{
"epoch": 0.8423189141189902,
"grad_norm": 1.2773866653442383,
"learning_rate": 7.401587900798091e-07,
"loss": 1.1746,
"step": 3320
},
{
"epoch": 0.8448560192819993,
"grad_norm": 1.3070263862609863,
"learning_rate": 7.171322884852988e-07,
"loss": 1.1866,
"step": 3330
},
{
"epoch": 0.8473931244450083,
"grad_norm": 1.2348638772964478,
"learning_rate": 6.944419531352236e-07,
"loss": 1.1816,
"step": 3340
},
{
"epoch": 0.8499302296080172,
"grad_norm": 1.1883289813995361,
"learning_rate": 6.720895650149744e-07,
"loss": 1.1795,
"step": 3350
},
{
"epoch": 0.8524673347710262,
"grad_norm": 1.245474934577942,
"learning_rate": 6.500768785841482e-07,
"loss": 1.1733,
"step": 3360
},
{
"epoch": 0.8550044399340353,
"grad_norm": 1.277593731880188,
"learning_rate": 6.284056216388451e-07,
"loss": 1.1731,
"step": 3370
},
{
"epoch": 0.8575415450970443,
"grad_norm": 1.2596774101257324,
"learning_rate": 6.070774951760505e-07,
"loss": 1.171,
"step": 3380
},
{
"epoch": 0.8600786502600533,
"grad_norm": 1.266883134841919,
"learning_rate": 5.860941732601166e-07,
"loss": 1.1668,
"step": 3390
},
{
"epoch": 0.8626157554230622,
"grad_norm": 1.2608847618103027,
"learning_rate": 5.654573028913735e-07,
"loss": 1.1704,
"step": 3400
},
{
"epoch": 0.8651528605860713,
"grad_norm": 1.311318278312683,
"learning_rate": 5.451685038768473e-07,
"loss": 1.1899,
"step": 3410
},
{
"epoch": 0.8676899657490803,
"grad_norm": 1.2083625793457031,
"learning_rate": 5.252293687031196e-07,
"loss": 1.1636,
"step": 3420
},
{
"epoch": 0.8702270709120893,
"grad_norm": 1.256255865097046,
"learning_rate": 5.05641462411336e-07,
"loss": 1.1651,
"step": 3430
},
{
"epoch": 0.8727641760750983,
"grad_norm": 1.261788010597229,
"learning_rate": 4.864063224743626e-07,
"loss": 1.1677,
"step": 3440
},
{
"epoch": 0.8753012812381074,
"grad_norm": 1.1607638597488403,
"learning_rate": 4.6752545867610963e-07,
"loss": 1.1722,
"step": 3450
},
{
"epoch": 0.8778383864011163,
"grad_norm": 1.1922293901443481,
"learning_rate": 4.4900035299302036e-07,
"loss": 1.1675,
"step": 3460
},
{
"epoch": 0.8803754915641253,
"grad_norm": 1.2432760000228882,
"learning_rate": 4.308324594777635e-07,
"loss": 1.1689,
"step": 3470
},
{
"epoch": 0.8829125967271343,
"grad_norm": 1.304062008857727,
"learning_rate": 4.130232041450866e-07,
"loss": 1.1684,
"step": 3480
},
{
"epoch": 0.8854497018901434,
"grad_norm": 1.1947314739227295,
"learning_rate": 3.9557398485989884e-07,
"loss": 1.1652,
"step": 3490
},
{
"epoch": 0.8879868070531524,
"grad_norm": 1.2069730758666992,
"learning_rate": 3.784861712275467e-07,
"loss": 1.1608,
"step": 3500
},
{
"epoch": 0.8905239122161613,
"grad_norm": 1.2010389566421509,
"learning_rate": 3.61761104486314e-07,
"loss": 1.1731,
"step": 3510
},
{
"epoch": 0.8930610173791703,
"grad_norm": 1.2449175119400024,
"learning_rate": 3.454000974021432e-07,
"loss": 1.1829,
"step": 3520
},
{
"epoch": 0.8955981225421794,
"grad_norm": 1.244471788406372,
"learning_rate": 3.294044341655983e-07,
"loss": 1.1629,
"step": 3530
},
{
"epoch": 0.8981352277051884,
"grad_norm": 1.2766717672348022,
"learning_rate": 3.1377537029107174e-07,
"loss": 1.1567,
"step": 3540
},
{
"epoch": 0.9006723328681974,
"grad_norm": 1.2156423330307007,
"learning_rate": 2.985141325182267e-07,
"loss": 1.177,
"step": 3550
},
{
"epoch": 0.9032094380312063,
"grad_norm": 1.2810441255569458,
"learning_rate": 2.836219187157202e-07,
"loss": 1.1757,
"step": 3560
},
{
"epoch": 0.9057465431942154,
"grad_norm": 1.312305212020874,
"learning_rate": 2.69099897787175e-07,
"loss": 1.1654,
"step": 3570
},
{
"epoch": 0.9082836483572244,
"grad_norm": 1.1842001676559448,
"learning_rate": 2.5494920957943314e-07,
"loss": 1.1597,
"step": 3580
},
{
"epoch": 0.9108207535202334,
"grad_norm": 1.239871859550476,
"learning_rate": 2.411709647930882e-07,
"loss": 1.1762,
"step": 3590
},
{
"epoch": 0.9133578586832424,
"grad_norm": 1.197737216949463,
"learning_rate": 2.2776624489530664e-07,
"loss": 1.1699,
"step": 3600
},
{
"epoch": 0.9158949638462515,
"grad_norm": 1.2735172510147095,
"learning_rate": 2.1473610203494032e-07,
"loss": 1.1742,
"step": 3610
},
{
"epoch": 0.9184320690092604,
"grad_norm": 1.2625423669815063,
"learning_rate": 2.0208155895994343e-07,
"loss": 1.1609,
"step": 3620
},
{
"epoch": 0.9209691741722694,
"grad_norm": 1.2696080207824707,
"learning_rate": 1.8980360893709582e-07,
"loss": 1.1742,
"step": 3630
},
{
"epoch": 0.9235062793352784,
"grad_norm": 1.2440428733825684,
"learning_rate": 1.7790321567404011e-07,
"loss": 1.1747,
"step": 3640
},
{
"epoch": 0.9260433844982875,
"grad_norm": 1.2741636037826538,
"learning_rate": 1.6638131324364094e-07,
"loss": 1.171,
"step": 3650
},
{
"epoch": 0.9285804896612965,
"grad_norm": 1.2880579233169556,
"learning_rate": 1.55238806010668e-07,
"loss": 1.1443,
"step": 3660
},
{
"epoch": 0.9311175948243055,
"grad_norm": 1.297038197517395,
"learning_rate": 1.444765685608096e-07,
"loss": 1.1733,
"step": 3670
},
{
"epoch": 0.9336546999873144,
"grad_norm": 1.208609938621521,
"learning_rate": 1.340954456320287e-07,
"loss": 1.1741,
"step": 3680
},
{
"epoch": 0.9361918051503235,
"grad_norm": 1.2683435678482056,
"learning_rate": 1.2409625204825802e-07,
"loss": 1.174,
"step": 3690
},
{
"epoch": 0.9387289103133325,
"grad_norm": 1.1996833086013794,
"learning_rate": 1.1447977265544141e-07,
"loss": 1.1777,
"step": 3700
},
{
"epoch": 0.9412660154763415,
"grad_norm": 1.2534565925598145,
"learning_rate": 1.052467622599329e-07,
"loss": 1.155,
"step": 3710
},
{
"epoch": 0.9438031206393505,
"grad_norm": 1.2183892726898193,
"learning_rate": 9.639794556925041e-08,
"loss": 1.1655,
"step": 3720
},
{
"epoch": 0.9463402258023595,
"grad_norm": 1.2513470649719238,
"learning_rate": 8.793401713519333e-08,
"loss": 1.1727,
"step": 3730
},
{
"epoch": 0.9488773309653685,
"grad_norm": 1.3019185066223145,
"learning_rate": 7.985564129932566e-08,
"loss": 1.175,
"step": 3740
},
{
"epoch": 0.9514144361283775,
"grad_norm": 1.2735040187835693,
"learning_rate": 7.216345214083264e-08,
"loss": 1.1796,
"step": 3750
},
{
"epoch": 0.9539515412913865,
"grad_norm": 1.2108937501907349,
"learning_rate": 6.485805342674901e-08,
"loss": 1.1478,
"step": 3760
},
{
"epoch": 0.9564886464543956,
"grad_norm": 1.3137885332107544,
"learning_rate": 5.7940018564570654e-08,
"loss": 1.1777,
"step": 3770
},
{
"epoch": 0.9590257516174046,
"grad_norm": 1.2065484523773193,
"learning_rate": 5.1409890557246876e-08,
"loss": 1.1749,
"step": 3780
},
{
"epoch": 0.9615628567804135,
"grad_norm": 1.3035470247268677,
"learning_rate": 4.526818196055938e-08,
"loss": 1.1795,
"step": 3790
},
{
"epoch": 0.9640999619434225,
"grad_norm": 1.2624789476394653,
"learning_rate": 3.951537484289114e-08,
"loss": 1.1865,
"step": 3800
},
{
"epoch": 0.9666370671064316,
"grad_norm": 1.2832140922546387,
"learning_rate": 3.4151920747390044e-08,
"loss": 1.1623,
"step": 3810
},
{
"epoch": 0.9691741722694406,
"grad_norm": 1.206804633140564,
"learning_rate": 2.9178240656523305e-08,
"loss": 1.1698,
"step": 3820
},
{
"epoch": 0.9717112774324496,
"grad_norm": 1.237695574760437,
"learning_rate": 2.4594724959037253e-08,
"loss": 1.1826,
"step": 3830
},
{
"epoch": 0.9742483825954585,
"grad_norm": 1.2522958517074585,
"learning_rate": 2.0401733419315727e-08,
"loss": 1.1679,
"step": 3840
},
{
"epoch": 0.9767854877584676,
"grad_norm": 1.209662914276123,
"learning_rate": 1.659959514913767e-08,
"loss": 1.1842,
"step": 3850
},
{
"epoch": 0.9793225929214766,
"grad_norm": 1.3333046436309814,
"learning_rate": 1.3188608581851114e-08,
"loss": 1.1629,
"step": 3860
},
{
"epoch": 0.9818596980844856,
"grad_norm": 1.2529042959213257,
"learning_rate": 1.016904144894304e-08,
"loss": 1.1779,
"step": 3870
},
{
"epoch": 0.9843968032474946,
"grad_norm": 1.2353804111480713,
"learning_rate": 7.541130759027848e-09,
"loss": 1.1728,
"step": 3880
},
{
"epoch": 0.9869339084105037,
"grad_norm": 1.1999486684799194,
"learning_rate": 5.305082779244464e-09,
"loss": 1.1607,
"step": 3890
},
{
"epoch": 0.9894710135735126,
"grad_norm": 1.1895602941513062,
"learning_rate": 3.4610730190648423e-09,
"loss": 1.1884,
"step": 3900
},
{
"epoch": 0.9920081187365216,
"grad_norm": 1.22995924949646,
"learning_rate": 2.0092462165194337e-09,
"loss": 1.1906,
"step": 3910
},
{
"epoch": 0.9945452238995306,
"grad_norm": 1.1689248085021973,
"learning_rate": 9.497163268351595e-10,
"loss": 1.1499,
"step": 3920
},
{
"epoch": 0.9970823290625397,
"grad_norm": 1.3178656101226807,
"learning_rate": 2.825665134920108e-10,
"loss": 1.173,
"step": 3930
},
{
"epoch": 0.9996194342255487,
"grad_norm": 1.2528467178344727,
"learning_rate": 7.849141696048002e-12,
"loss": 1.1774,
"step": 3940
},
{
"epoch": 0.9998731447418495,
"step": 3941,
"total_flos": 1.0333279192736596e+19,
"train_loss": 1.22370115647731,
"train_runtime": 28099.9428,
"train_samples_per_second": 17.953,
"train_steps_per_second": 0.14
}
],
"logging_steps": 10,
"max_steps": 3941,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0333279192736596e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}