<!-- Scrape-viewer residue preserved as a comment ("Files / 4430 lines / 243 KiB / HTML / Raw Permalink Normal View History") so the doctype is the first non-comment content and the page renders in standards mode. -->
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Training Report</title>
<!-- Inline stylesheet (single-file report): dark theme, collapsible payload sections. -->
<style>
body { margin: 0; font-family: system-ui, Segoe UI, Arial; background: #0b0f17; color: #e6e6e6; }
header { padding: 14px 18px; border-bottom: 1px solid rgba(255,255,255,0.10); }
.wrap { padding: 14px 18px; display: grid; gap: 14px; }
details { background: rgba(255,255,255,0.04); padding: 10px 12px; border-radius: 10px; }
summary { cursor: pointer; font-weight: 600; }
pre { margin: 8px 0 0; white-space: pre-wrap; word-wrap: break-word; }
.muted { opacity: .8; font-size: 12px; }
</style>
</head>
<body>
<header>
<!-- Real heading element for the report title (was a styled div); inline style reproduces the original 18px/700 rendering. -->
<h1 style="margin:0;font-size:18px;font-weight:700;">Training Report</h1>
<div class="muted">Single-file HTML (dashboard + run payload)</div>
</header>
<div class="wrap">
<details open><summary>Notes</summary><pre></pre></details>
<div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
<script charset="utf-8" src="https://cdn.plot.ly/plotly-2.35.2.min.js"></script> <div id="fbea6609-4680-44c8-9218-05ec099740b1" class="plotly-graph-div" style="height:900px; width:100%;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("fbea6609-4680-44c8-9218-05ec099740b1")) { Plotly.newPlot( "fbea6609-4680-44c8-9218-05ec099740b1", [{"mode":"lines","name":"train_loss (raw)","opacity":0.35,"x":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413
,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600],"y":[2.2041,2.6901,2.6774,2.4478,1.9988,2.0358,2.5824,2.3479,2.6387,2.6083,2.3015,2.7042,2.5386,2.5886,2.7168,2.1886,2.4191,2.2736,2.7336,2.5288,2.0214,2.3708,2.5341,2.4852,2.4959,2.243,2.1622,2.1827,2.3346,1.9959,2.5963,2.3312,2.2653,2.6019,2.2858,2.1479,2.3149,2.4299,2.1583,2.1631,2.0198,2.3086,2.1388,1.8651,1.8407,1.9392,2.4167,2.2915,1.7683,2.0858,2.1761,2.145,1.829,2.345,1.9989,2.3336,2.0979,2.4041,2.1076,2.2672,2.1081,2.034,2.1147,1.9639,1.7557,1.8861,2.272,1.7173,1.8545,2.0492,1.9203,2.1317,2.282,2.2023,2.3545,2.1984,1.9843,2.2899,1.8448,1.857,2.1387,1.9587,2.5776,2.1138,2.0632,2.1885,2.2515,2.3173,1.9536,1.9134,2.2808,2.2055,1.9373,1.7517,1.9707,1.9629,1.8158,2.5133,2.263,2.1035,1.8114,2.3733,1.929,1.9718,2.1026,1.7999,2.0349,2.2571,2.0794,2.1671,2.1292,2.2534,2.0333,2.1495,1.5522,2.2221,2.4227,2.3886,2.0651,1.9465,2.588,1.7602,2.2738,1.8145,1.7572,2.0206,2.1292,2.1207,2.1515,1.7264,1.6036,2.132,2.061,1.8765,1.8337,2.0664,1.9651,1.9243,2.0289,1.7206,1.7212,1.559,1.9442,1.8913,2.1248,2.1745,2.0572,1.8028,2.2202,1.8407,2.3419,2.0321,1.7054,1.9304,2.1154,2.0367,1.5693,1.726,1.7872,1.9831,2.0571,1.8674,1.7982,1.7992,2.2013,1.7908,2.1308,1.735,2.1536,1.86
<!-- Quick-look run summary (HTML-escaped JSON, display only): model/dataset ids, batch/LR/LoRA hyperparameters, and best-checkpoint metrics. Payload is data — do not edit by hand. -->
<details><summary>Run meta (quick)</summary><pre>{
&quot;model&quot;: &quot;unsloth/Phi-4-unsloth-bnb-4bit&quot;,
&quot;dataset&quot;: &quot;Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl&quot;,
&quot;examples_total&quot;: 2872,
&quot;examples_train&quot;: 1436,
&quot;examples_eval&quot;: 1436,
&quot;world_size&quot;: 1,
&quot;effective_batch_size&quot;: 8,
&quot;steps_per_epoch_approx&quot;: 179.5,
&quot;max_steps&quot;: 2000,
&quot;eval_steps&quot;: 50,
&quot;save_steps&quot;: 50,
&quot;learning_rate&quot;: 9.95267419777795e-06,
&quot;warmup_steps&quot;: 10,
&quot;lr_scheduler_type&quot;: &quot;linear&quot;,
&quot;weight_decay&quot;: 0.009206070410847844,
&quot;lora_r&quot;: 32,
&quot;lora_alpha&quot;: 64,
&quot;lora_dropout&quot;: 0.0,
&quot;best_checkpoint&quot;: &quot;outputs/continue_r1_from_350_20260112_073729/checkpoint-100&quot;,
&quot;LR_AUTO_ENABLED&quot;: true,
&quot;LR_AUTO_USE_N&quot;: &quot;train&quot;,
&quot;LR_AUTO_N_REF&quot;: 1436,
&quot;LR_AUTO_BASE&quot;: 1e-05,
&quot;LR_AUTO_MULT&quot;: 0.5,
&quot;LR_AUTO_FINAL&quot;: 5e-06,
&quot;best_step&quot;: 100,
&quot;best_eval_loss&quot;: 2.2380564212799072,
&quot;best_blended&quot;: 1.3520409573791146,
&quot;best_blended_step&quot;: 600
}</pre></details>
<!-- Verbatim snapshot of the training configuration at launch time (HTML-escaped JSON, display only). -->
<details><summary>config_snapshot</summary><pre>{
&quot;MODEL_NAME&quot;: &quot;unsloth/Phi-4-unsloth-bnb-4bit&quot;,
&quot;CHAT_TEMPLATE&quot;: &quot;phi-4&quot;,
&quot;MAX_SEQ_LENGTH&quot;: 2048,
&quot;LOAD_IN_4BIT&quot;: true,
&quot;DATASET_NAME&quot;: &quot;Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl&quot;,
&quot;DATASET_SPLIT&quot;: &quot;train&quot;,
&quot;PER_DEVICE_TRAIN_BATCH_SIZE&quot;: 2,
&quot;GRADIENT_ACCUMULATION_STEPS&quot;: 4,
&quot;WARMUP_STEPS&quot;: 10,
&quot;MAX_STEPS&quot;: 2000,
&quot;LEARNING_RATE&quot;: 9.95267419777795e-06,
&quot;WEIGHT_DECAY&quot;: 0.009206070410847844,
&quot;LR_SCHEDULER_TYPE&quot;: &quot;linear&quot;,
&quot;SEED&quot;: 3407,
&quot;PLOTLY_DARK_MODE&quot;: true,
&quot;PLOTLY_BASE_COLOR&quot;: &quot;#00CC96&quot;,
&quot;PLOTLY_EMA_SPAN&quot;: 25,
&quot;LR_AUTO_ENABLED&quot;: true,
&quot;LR_AUTO_USE_N&quot;: &quot;train&quot;,
&quot;LR_AUTO_N_REF&quot;: 1436,
&quot;LR_AUTO_BASE&quot;: 1e-05,
&quot;LR_AUTO_MULT&quot;: 0.5,
&quot;LR_AUTO_FINAL&quot;: 5e-06
}</pre></details>
<!-- Run manifest (HTML-escaped JSON, display only): model, dataset, training args, auto-LR settings, best checkpoint, and the companion Plotly HTML filename. -->
<details><summary>run_manifest</summary><pre>{
&quot;model_name&quot;: &quot;unsloth/Phi-4-unsloth-bnb-4bit&quot;,
&quot;dataset&quot;: {
&quot;name&quot;: &quot;Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl&quot;,
&quot;split&quot;: &quot;train&quot;
},
&quot;training&quot;: {
&quot;max_steps&quot;: 2000,
&quot;learning_rate&quot;: 9.95267419777795e-06,
&quot;per_device_train_batch_size&quot;: 2,
&quot;gradient_accumulation_steps&quot;: 4,
&quot;max_seq_length&quot;: 2048,
&quot;seed&quot;: 3407,
&quot;optimizer&quot;: &quot;adamw_8bit&quot;,
&quot;lr_scheduler_type&quot;: &quot;linear&quot;
},
&quot;auto_lr&quot;: {
&quot;enabled&quot;: true,
&quot;use_n&quot;: &quot;train&quot;,
&quot;n_ref&quot;: 1436,
&quot;base&quot;: 1e-05,
&quot;mult&quot;: 0.5,
&quot;final&quot;: 5e-06
},
&quot;best&quot;: {
&quot;checkpoint&quot;: &quot;/content/outputs/continue_r1_from_350_20260112_073729/checkpoint-100&quot;,
&quot;metric&quot;: 2.2380564212799072,
&quot;metric_name&quot;: &quot;eval_loss&quot;
},
&quot;plotly&quot;: {
&quot;html&quot;: &quot;training_loss_step.html&quot;
}
}</pre></details>
<details><summary>trainer.state.log_history</summary><pre>[
{
&quot;loss&quot;: 2.2041,
&quot;grad_norm&quot;: 4.026190757751465,
&quot;learning_rate&quot;: 0.0,
&quot;epoch&quot;: 0.005571030640668524,
&quot;step&quot;: 1
},
{
&quot;loss&quot;: 2.6901,
&quot;grad_norm&quot;: 1.616629719734192,
&quot;learning_rate&quot;: 1.4636285584967574e-07,
&quot;epoch&quot;: 0.011142061281337047,
&quot;step&quot;: 2
},
{
&quot;loss&quot;: 2.6774,
&quot;grad_norm&quot;: 13.836981773376465,
&quot;learning_rate&quot;: 2.927257116993515e-07,
&quot;epoch&quot;: 0.016713091922005572,
&quot;step&quot;: 3
},
{
&quot;loss&quot;: 2.4478,
&quot;grad_norm&quot;: 1.857710361480713,
&quot;learning_rate&quot;: 4.3908856754902726e-07,
&quot;epoch&quot;: 0.022284122562674095,
&quot;step&quot;: 4
},
{
&quot;loss&quot;: 1.9988,
&quot;grad_norm&quot;: 1.4818029403686523,
&quot;learning_rate&quot;: 5.85451423398703e-07,
&quot;epoch&quot;: 0.027855153203342618,
&quot;step&quot;: 5
},
{
&quot;loss&quot;: 2.0358,
&quot;grad_norm&quot;: 1.726440191268921,
&quot;learning_rate&quot;: 7.318142792483787e-07,
&quot;epoch&quot;: 0.033426183844011144,
&quot;step&quot;: 6
},
{
&quot;loss&quot;: 2.5824,
&quot;grad_norm&quot;: 2.0604233741760254,
&quot;learning_rate&quot;: 8.781771350980545e-07,
&quot;epoch&quot;: 0.03899721448467967,
&quot;step&quot;: 7
},
{
&quot;loss&quot;: 2.3479,
&quot;grad_norm&quot;: 1.7288694381713867,
&quot;learning_rate&quot;: 1.0245399909477302e-06,
&quot;epoch&quot;: 0.04456824512534819,
&quot;step&quot;: 8
},
{
&quot;loss&quot;: 2.6387,
&quot;grad_norm&quot;: 1.9069620370864868,
&quot;learning_rate&quot;: 1.170902846797406e-06,
&quot;epoch&quot;: 0.05013927576601671,
&quot;step&quot;: 9
},
{
&quot;loss&quot;: 2.6083,
&quot;grad_norm&quot;: 1.4719465970993042,
&quot;learning_rate&quot;: 1.3172657026470817e-06,
&quot;epoch&quot;: 0.055710306406685235,
&quot;step&quot;: 10
},
{
&quot;loss&quot;: 2.3015,
&quot;grad_norm&quot;: 1.6306267976760864,
&quot;learning_rate&quot;: 1.4636285584967574e-06,
&quot;epoch&quot;: 0.06128133704735376,
&quot;step&quot;: 11
},
{
&quot;loss&quot;: 2.7042,
&quot;grad_norm&quot;: 1.4724116325378418,
&quot;learning_rate&quot;: 1.6099914143464333e-06,
&quot;epoch&quot;: 0.06685236768802229,
&quot;step&quot;: 12
},
{
&quot;loss&quot;: 2.5386,
&quot;grad_norm&quot;: 1.5470020771026611,
&quot;learning_rate&quot;: 1.756354270196109e-06,
&quot;epoch&quot;: 0.07242339832869081,
&quot;step&quot;: 13
},
{
&quot;loss&quot;: 2.5886,
&quot;grad_norm&quot;: 2.022662401199341,
&quot;learning_rate&quot;: 1.9027171260457846e-06,
&quot;epoch&quot;: 0.07799442896935933,
&quot;step&quot;: 14
},
{
&quot;loss&quot;: 2.7168,
&quot;grad_norm&quot;: 1.8387386798858643,
&quot;learning_rate&quot;: 2.0490799818954605e-06,
&quot;epoch&quot;: 0.08356545961002786,
&quot;step&quot;: 15
},
{
&quot;loss&quot;: 2.1886,
&quot;grad_norm&quot;: 1.9359395503997803,
&quot;learning_rate&quot;: 2.195442837745136e-06,
&quot;epoch&quot;: 0.08913649025069638,
&quot;step&quot;: 16
},
{
&quot;loss&quot;: 2.4191,
&quot;grad_norm&quot;: 1.5662318468093872,
&quot;learning_rate&quot;: 2.341805693594812e-06,
&quot;epoch&quot;: 0.0947075208913649,
&quot;step&quot;: 17
},
{
&quot;loss&quot;: 2.2736,
&quot;grad_norm&quot;: 1.7207640409469604,
&quot;learning_rate&quot;: 2.4881685494444876e-06,
&quot;epoch&quot;: 0.10027855153203342,
&quot;step&quot;: 18
},
{
&quot;loss&quot;: 2.7336,
&quot;grad_norm&quot;: 1.6225577592849731,
&quot;learning_rate&quot;: 2.6345314052941634e-06,
&quot;epoch&quot;: 0.10584958217270195,
&quot;step&quot;: 19
},
{
&quot;loss&quot;: 2.5288,
&quot;grad_norm&quot;: 1.6348892450332642,
&quot;learning_rate&quot;: 2.780894261143839e-06,
&quot;epoch&quot;: 0.11142061281337047,
&quot;step&quot;: 20
},
{
&quot;loss&quot;: 2.0214,
&quot;grad_norm&quot;: 1.5059679746627808,
&quot;learning_rate&quot;: 2.927257116993515e-06,
&quot;epoch&quot;: 0.116991643454039,
&quot;step&quot;: 21
},
{
&quot;loss&quot;: 2.3708,
&quot;grad_norm&quot;: 1.3699105978012085,
&quot;learning_rate&quot;: 3.073619972843191e-06,
&quot;epoch&quot;: 0.12256267409470752,
&quot;step&quot;: 22
},
{
&quot;loss&quot;: 2.5341,
&quot;grad_norm&quot;: 2.241403341293335,
&quot;learning_rate&quot;: 3.2199828286928667e-06,
&quot;epoch&quot;: 0.12813370473537605,
&quot;step&quot;: 23
},
{
&quot;loss&quot;: 2.4852,
&quot;grad_norm&quot;: 1.7692517042160034,
&quot;learning_rate&quot;: 3.3663456845425424e-06,
&quot;epoch&quot;: 0.13370473537604458,
&quot;step&quot;: 24
},
{
&quot;loss&quot;: 2.4959,
&quot;grad_norm&quot;: 1.9559876918792725,
&quot;learning_rate&quot;: 3.512708540392218e-06,
&quot;epoch&quot;: 0.1392757660167131,
&quot;step&quot;: 25
},
{
&quot;loss&quot;: 2.243,
&quot;grad_norm&quot;: 1.7536145448684692,
&quot;learning_rate&quot;: 3.659071396241894e-06,
&quot;epoch&quot;: 0.14484679665738162,
&quot;step&quot;: 26
},
{
&quot;loss&quot;: 2.1622,
&quot;grad_norm&quot;: 1.8103671073913574,
&quot;learning_rate&quot;: 3.805434252091569e-06,
&quot;epoch&quot;: 0.15041782729805014,
&quot;step&quot;: 27
},
{
&quot;loss&quot;: 2.1827,
&quot;grad_norm&quot;: 1.5473895072937012,
&quot;learning_rate&quot;: 3.951797107941245e-06,
&quot;epoch&quot;: 0.15598885793871867,
&quot;step&quot;: 28
},
{
&quot;loss&quot;: 2.3346,
&quot;grad_norm&quot;: 1.7137048244476318,
&quot;learning_rate&quot;: 4.098159963790921e-06,
&quot;epoch&quot;: 0.1615598885793872,
&quot;step&quot;: 29
},
{
&quot;loss&quot;: 1.9959,
&quot;grad_norm&quot;: 1.7803088426589966,
&quot;learning_rate&quot;: 4.244522819640596e-06,
&quot;epoch&quot;: 0.1671309192200557,
&quot;step&quot;: 30
},
{
&quot;loss&quot;: 2.5963,
&quot;grad_norm&quot;: 1.3565785884857178,
&quot;learning_rate&quot;: 4.390885675490272e-06,
&quot;epoch&quot;: 0.17270194986072424,
&quot;step&quot;: 31
},
{
&quot;loss&quot;: 2.3312,
&quot;grad_norm&quot;: 1.4962913990020752,
&quot;learning_rate&quot;: 4.537248531339948e-06,
&quot;epoch&quot;: 0.17827298050139276,
&quot;step&quot;: 32
},
{
&quot;loss&quot;: 2.2653,
&quot;grad_norm&quot;: 1.4864503145217896,
&quot;learning_rate&quot;: 4.683611387189624e-06,
&quot;epoch&quot;: 0.18384401114206128,
&quot;step&quot;: 33
},
{
&quot;loss&quot;: 2.6019,
&quot;grad_norm&quot;: 2.2222468852996826,
&quot;learning_rate&quot;: 4.829974243039299e-06,
&quot;epoch&quot;: 0.1894150417827298,
&quot;step&quot;: 34
},
{
&quot;loss&quot;: 2.2858,
&quot;grad_norm&quot;: 1.6111881732940674,
&quot;learning_rate&quot;: 4.976337098888975e-06,
&quot;epoch&quot;: 0.19498607242339833,
&quot;step&quot;: 35
},
{
&quot;loss&quot;: 2.1479,
&quot;grad_norm&quot;: 2.307185173034668,
&quot;learning_rate&quot;: 5.1226999547386506e-06,
&quot;epoch&quot;: 0.20055710306406685,
&quot;step&quot;: 36
},
{
&quot;loss&quot;: 2.3149,
&quot;grad_norm&quot;: 1.9714685678482056,
&quot;learning_rate&quot;: 5.269062810588327e-06,
&quot;epoch&quot;: 0.20612813370473537,
&quot;step&quot;: 37
},
{
&quot;loss&quot;: 2.4299,
&quot;grad_norm&quot;: 1.6915550231933594,
&quot;learning_rate&quot;: 5.415425666438002e-06,
&quot;epoch&quot;: 0.2116991643454039,
&quot;step&quot;: 38
},
{
&quot;loss&quot;: 2.1583,
&quot;grad_norm&quot;: 1.9084649085998535,
&quot;learning_rate&quot;: 5.561788522287678e-06,
&quot;epoch&quot;: 0.21727019498607242,
&quot;step&quot;: 39
},
{
&quot;loss&quot;: 2.1631,
&quot;grad_norm&quot;: 1.882629632949829,
&quot;learning_rate&quot;: 5.7081513781373534e-06,
&quot;epoch&quot;: 0.22284122562674094,
&quot;step&quot;: 40
},
{
&quot;loss&quot;: 2.0198,
&quot;grad_norm&quot;: 1.335666537284851,
&quot;learning_rate&quot;: 5.85451423398703e-06,
&quot;epoch&quot;: 0.22841225626740946,
&quot;step&quot;: 41
},
{
&quot;loss&quot;: 2.3086,
&quot;grad_norm&quot;: 2.620265007019043,
&quot;learning_rate&quot;: 6.000877089836705e-06,
&quot;epoch&quot;: 0.233983286908078,
&quot;step&quot;: 42
},
{
&quot;loss&quot;: 2.1388,
&quot;grad_norm&quot;: 2.0138227939605713,
&quot;learning_rate&quot;: 6.147239945686382e-06,
&quot;epoch&quot;: 0.2395543175487465,
&quot;step&quot;: 43
},
{
&quot;loss&quot;: 1.8651,
&quot;grad_norm&quot;: 1.6108520030975342,
&quot;learning_rate&quot;: 6.293602801536056e-06,
&quot;epoch&quot;: 0.24512534818941503,
&quot;step&quot;: 44
},
{
&quot;loss&quot;: 1.8407,
&quot;grad_norm&quot;: 1.5935970544815063,
&quot;learning_rate&quot;: 6.439965657385733e-06,
&quot;epoch&quot;: 0.25069637883008355,
&quot;step&quot;: 45
},
{
&quot;loss&quot;: 1.9392,
&quot;grad_norm&quot;: 1.3794289827346802,
&quot;learning_rate&quot;: 6.586328513235409e-06,
&quot;epoch&quot;: 0.2562674094707521,
&quot;step&quot;: 46
},
{
&quot;loss&quot;: 2.4167,
&quot;grad_norm&quot;: 1.23729407787323,
&quot;learning_rate&quot;: 6.732691369085085e-06,
&quot;epoch&quot;: 0.2618384401114206,
&quot;step&quot;: 47
},
{
&quot;loss&quot;: 2.2915,
&quot;grad_norm&quot;: 1.4265947341918945,
&quot;learning_rate&quot;: 6.87905422493476e-06,
&quot;epoch&quot;: 0.26740947075208915,
&quot;step&quot;: 48
},
{
&quot;loss&quot;: 1.7683,
&quot;grad_norm&quot;: 1.696736216545105,
&quot;learning_rate&quot;: 7.025417080784436e-06,
&quot;epoch&quot;: 0.27298050139275765,
&quot;step&quot;: 49
},
{
&quot;loss&quot;: 2.0858,
&quot;grad_norm&quot;: 1.5071961879730225,
&quot;learning_rate&quot;: 7.1717799366341115e-06,
&quot;epoch&quot;: 0.2785515320334262,
&quot;step&quot;: 50
},
{
&quot;eval_loss&quot;: 2.315699815750122,
&quot;eval_runtime&quot;: 35.9132,
&quot;eval_samples_per_second&quot;: 39.985,
&quot;eval_steps_per_second&quot;: 2.005,
&quot;epoch&quot;: 0.2785515320334262,
&quot;step&quot;: 50
},
{
&quot;loss&quot;: 2.1761,
&quot;grad_norm&quot;: 1.5999361276626587,
&quot;learning_rate&quot;: 7.318142792483788e-06,
&quot;epoch&quot;: 0.2841225626740947,
&quot;step&quot;: 51
},
{
&quot;loss&quot;: 2.145,
&quot;grad_norm&quot;: 2.0915520191192627,
&quot;learning_rate&quot;: 7.464505648333463e-06,
&quot;epoch&quot;: 0.28969359331476324,
&quot;step&quot;: 52
},
{
&quot;loss&quot;: 1.829,
&quot;grad_norm&quot;: 4.090714931488037,
&quot;learning_rate&quot;: 7.610868504183138e-06,
&quot;epoch&quot;: 0.29526462395543174,
&quot;step&quot;: 53
},
{
&quot;loss&quot;: 2.345,
&quot;grad_norm&quot;: 1.6347575187683105,
&quot;learning_rate&quot;: 7.757231360032815e-06,
&quot;epoch&quot;: 0.3008356545961003,
&quot;step&quot;: 54
},
{
&quot;loss&quot;: 1.9989,
&quot;grad_norm&quot;: 1.5609041452407837,
&quot;learning_rate&quot;: 7.90359421588249e-06,
&quot;epoch&quot;: 0.3064066852367688,
&quot;step&quot;: 55
},
{
&quot;loss&quot;: 2.3336,
&quot;grad_norm&quot;: 1.6561325788497925,
&quot;learning_rate&quot;: 8.049957071732166e-06,
&quot;epoch&quot;: 0.31197771587743733,
&quot;step&quot;: 56
},
{
&quot;loss&quot;: 2.0979,
&quot;grad_norm&quot;: 1.6579258441925049,
&quot;learning_rate&quot;: 8.196319927581842e-06,
&quot;epoch&quot;: 0.31754874651810583,
&quot;step&quot;: 57
},
{
&quot;loss&quot;: 2.4041,
&quot;grad_norm&quot;: 1.8354761600494385,
&quot;learning_rate&quot;: 8.342682783431518e-06,
&quot;epoch&quot;: 0.3231197771587744,
&quot;step&quot;: 58
},
{
&quot;loss&quot;: 2.1076,
&quot;grad_norm&quot;: 1.7043126821517944,
&quot;learning_rate&quot;: 8.489045639281193e-06,
&quot;epoch&quot;: 0.3286908077994429,
&quot;step&quot;: 59
},
{
&quot;loss&quot;: 2.2672,
&quot;grad_norm&quot;: 1.5663846731185913,
&quot;learning_rate&quot;: 8.635408495130869e-06,
&quot;epoch&quot;: 0.3342618384401114,
&quot;step&quot;: 60
},
{
&quot;loss&quot;: 2.1081,
&quot;grad_norm&quot;: 1.8134770393371582,
&quot;learning_rate&quot;: 8.781771350980545e-06,
&quot;epoch&quot;: 0.3398328690807799,
&quot;step&quot;: 61
},
{
&quot;loss&quot;: 2.034,
&quot;grad_norm&quot;: 1.3617796897888184,
&quot;learning_rate&quot;: 8.928134206830221e-06,
&quot;epoch&quot;: 0.34540389972144847,
&quot;step&quot;: 62
},
{
&quot;loss&quot;: 2.1147,
&quot;grad_norm&quot;: 1.7525910139083862,
&quot;learning_rate&quot;: 9.074497062679895e-06,
&quot;epoch&quot;: 0.35097493036211697,
&quot;step&quot;: 63
},
{
&quot;loss&quot;: 1.9639,
&quot;grad_norm&quot;: 1.7186285257339478,
&quot;learning_rate&quot;: 9.220859918529572e-06,
&quot;epoch&quot;: 0.3565459610027855,
&quot;step&quot;: 64
},
{
&quot;loss&quot;: 1.7557,
&quot;grad_norm&quot;: 1.9141530990600586,
&quot;learning_rate&quot;: 9.367222774379248e-06,
&quot;epoch&quot;: 0.362116991643454,
&quot;step&quot;: 65
},
{
&quot;loss&quot;: 1.8861,
&quot;grad_norm&quot;: 1.696165680885315,
&quot;learning_rate&quot;: 9.513585630228924e-06,
&quot;epoch&quot;: 0.36768802228412256,
&quot;step&quot;: 66
},
{
&quot;loss&quot;: 2.272,
&quot;grad_norm&quot;: 1.24228036403656,
&quot;learning_rate&quot;: 9.659948486078598e-06,
&quot;epoch&quot;: 0.3732590529247911,
&quot;step&quot;: 67
},
{
&quot;loss&quot;: 1.7173,
&quot;grad_norm&quot;: 1.9760662317276,
&quot;learning_rate&quot;: 9.806311341928276e-06,
&quot;epoch&quot;: 0.3788300835654596,
&quot;step&quot;: 68
},
{
&quot;loss&quot;: 1.8545,
&quot;grad_norm&quot;: 1.3207972049713135,
&quot;learning_rate&quot;: 9.95267419777795e-06,
&quot;epoch&quot;: 0.38440111420612816,
&quot;step&quot;: 69
},
{
&quot;loss&quot;: 2.0492,
&quot;grad_norm&quot;: 1.5849637985229492,
&quot;learning_rate&quot;: 1.0099037053627625e-05,
&quot;epoch&quot;: 0.38997214484679665,
&quot;step&quot;: 70
},
{
&quot;loss&quot;: 1.9203,
&quot;grad_norm&quot;: 2.7006468772888184,
&quot;learning_rate&quot;: 1.0245399909477301e-05,
&quot;epoch&quot;: 0.3955431754874652,
&quot;step&quot;: 71
},
{
&quot;loss&quot;: 2.1317,
&quot;grad_norm&quot;: 1.9178322553634644,
&quot;learning_rate&quot;: 1.0391762765326979e-05,
&quot;epoch&quot;: 0.4011142061281337,
&quot;step&quot;: 72
},
{
&quot;loss&quot;: 2.282,
&quot;grad_norm&quot;: 1.5044149160385132,
&quot;learning_rate&quot;: 1.0538125621176653e-05,
&quot;epoch&quot;: 0.40668523676880225,
&quot;step&quot;: 73
},
{
&quot;loss&quot;: 2.2023,
&quot;grad_norm&quot;: 1.9386659860610962,
&quot;learning_rate&quot;: 1.068448847702633e-05,
&quot;epoch&quot;: 0.41225626740947074,
&quot;step&quot;: 74
},
{
&quot;loss&quot;: 2.3545,
&quot;grad_norm&quot;: 1.3408238887786865,
&quot;learning_rate&quot;: 1.0830851332876004e-05,
&quot;epoch&quot;: 0.4178272980501393,
&quot;step&quot;: 75
},
{
&quot;loss&quot;: 2.1984,
&quot;grad_norm&quot;: 2.221109390258789,
&quot;learning_rate&quot;: 1.0977214188725682e-05,
&quot;epoch&quot;: 0.4233983286908078,
&quot;step&quot;: 76
},
{
&quot;loss&quot;: 1.9843,
&quot;grad_norm&quot;: 1.7843296527862549,
&quot;learning_rate&quot;: 1.1123577044575356e-05,
&quot;epoch&quot;: 0.42896935933147634,
&quot;step&quot;: 77
},
{
&quot;loss&quot;: 2.2899,
&quot;grad_norm&quot;: 1.6259101629257202,
&quot;learning_rate&quot;: 1.1269939900425032e-05,
&quot;epoch&quot;: 0.43454038997214484,
&quot;step&quot;: 78
},
{
&quot;loss&quot;: 1.8448,
&quot;grad_norm&quot;: 1.718583345413208,
&quot;learning_rate&quot;: 1.1416302756274707e-05,
&quot;epoch&quot;: 0.4401114206128134,
&quot;step&quot;: 79
},
{
&quot;loss&quot;: 1.857,
&quot;grad_norm&quot;: 1.8396937847137451,
&quot;learning_rate&quot;: 1.1562665612124385e-05,
&quot;epoch&quot;: 0.4456824512534819,
&quot;step&quot;: 80
},
{
&quot;loss&quot;: 2.1387,
&quot;grad_norm&quot;: 1.808605670928955,
&quot;learning_rate&quot;: 1.170902846797406e-05,
&quot;epoch&quot;: 0.45125348189415043,
&quot;step&quot;: 81
},
{
&quot;loss&quot;: 1.9587,
&quot;grad_norm&quot;: 2.590714931488037,
&quot;learning_rate&quot;: 1.1855391323823735e-05,
&quot;epoch&quot;: 0.4568245125348189,
&quot;step&quot;: 82
},
{
&quot;loss&quot;: 2.5776,
&quot;grad_norm&quot;: 1.550307273864746,
&quot;learning_rate&quot;: 1.200175417967341e-05,
&quot;epoch&quot;: 0.4623955431754875,
&quot;step&quot;: 83
},
{
&quot;loss&quot;: 2.1138,
&quot;grad_norm&quot;: 1.7622662782669067,
&quot;learning_rate&quot;: 1.2148117035523088e-05,
&quot;epoch&quot;: 0.467966573816156,
&quot;step&quot;: 84
},
{
&quot;loss&quot;: 2.0632,
&quot;grad_norm&quot;: 2.1933865547180176,
&quot;learning_rate&quot;: 1.2294479891372764e-05,
&quot;epoch&quot;: 0.4735376044568245,
&quot;step&quot;: 85
},
{
&quot;loss&quot;: 2.1885,
&quot;grad_norm&quot;: 1.6188870668411255,
&quot;learning_rate&quot;: 1.2440842747222438e-05,
&quot;epoch&quot;: 0.479108635097493,
&quot;step&quot;: 86
},
{
&quot;loss&quot;: 2.2515,
&quot;grad_norm&quot;: 1.6533507108688354,
&quot;learning_rate&quot;: 1.2587205603072113e-05,
&quot;epoch&quot;: 0.48467966573816157,
&quot;step&quot;: 87
},
{
&quot;loss&quot;: 2.3173,
&quot;grad_norm&quot;: 1.295457363128662,
&quot;learning_rate&quot;: 1.2733568458921789e-05,
&quot;epoch&quot;: 0.49025069637883006,
&quot;step&quot;: 88
},
{
&quot;loss&quot;: 1.9536,
&quot;grad_norm&quot;: 1.5764713287353516,
&quot;learning_rate&quot;: 1.2879931314771467e-05,
&quot;epoch&quot;: 0.4958217270194986,
&quot;step&quot;: 89
},
{
&quot;loss&quot;: 1.9134,
&quot;grad_norm&quot;: 1.8399816751480103,
&quot;learning_rate&quot;: 1.3026294170621141e-05,
&quot;epoch&quot;: 0.5013927576601671,
&quot;step&quot;: 90
},
{
&quot;loss&quot;: 2.2808,
&quot;grad_norm&quot;: 1.7519652843475342,
&quot;learning_rate&quot;: 1.3172657026470817e-05,
&quot;epoch&quot;: 0.5069637883008357,
&quot;step&quot;: 91
},
{
&quot;loss&quot;: 2.2055,
&quot;grad_norm&quot;: 1.4549530744552612,
&quot;learning_rate&quot;: 1.3319019882320492e-05,
&quot;epoch&quot;: 0.5125348189415042,
&quot;step&quot;: 92
},
{
&quot;loss&quot;: 1.9373,
&quot;grad_norm&quot;: 2.0461559295654297,
&quot;learning_rate&quot;: 1.346538273817017e-05,
&quot;epoch&quot;: 0.5181058495821727,
&quot;step&quot;: 93
},
{
&quot;loss&quot;: 1.7517,
&quot;grad_norm&quot;: 1.5427114963531494,
&quot;learning_rate&quot;: 1.3611745594019844e-05,
&quot;epoch&quot;: 0.5236768802228412,
&quot;step&quot;: 94
},
{
&quot;loss&quot;: 1.9707,
&quot;grad_norm&quot;: 1.5442962646484375,
&quot;learning_rate&quot;: 1.375810844986952e-05,
&quot;epoch&quot;: 0.5292479108635098,
&quot;step&quot;: 95
},
{
&quot;loss&quot;: 1.9629,
&quot;grad_norm&quot;: 1.939523458480835,
&quot;learning_rate&quot;: 1.3904471305719195e-05,
&quot;epoch&quot;: 0.5348189415041783,
&quot;step&quot;: 96
},
{
&quot;loss&quot;: 1.8158,
&quot;grad_norm&quot;: 1.9389022588729858,
&quot;learning_rate&quot;: 1.4050834161568872e-05,
&quot;epoch&quot;: 0.5403899721448467,
&quot;step&quot;: 97
},
{
&quot;loss&quot;: 2.5133,
&quot;grad_norm&quot;: 1.9970468282699585,
&quot;learning_rate&quot;: 1.4197197017418547e-05,
&quot;epoch&quot;: 0.5459610027855153,
&quot;step&quot;: 98
},
{
&quot;loss&quot;: 2.263,
&quot;grad_norm&quot;: 1.5786551237106323,
&quot;learning_rate&quot;: 1.4343559873268223e-05,
&quot;epoch&quot;: 0.5515320334261838,
&quot;step&quot;: 99
},
{
&quot;loss&quot;: 2.1035,
&quot;grad_norm&quot;: 2.2139763832092285,
&quot;learning_rate&quot;: 1.4489922729117897e-05,
&quot;epoch&quot;: 0.5571030640668524,
&quot;step&quot;: 100
},
{
&quot;eval_loss&quot;: 2.2380564212799072,
&quot;eval_runtime&quot;: 35.9418,
&quot;eval_samples_per_second&quot;: 39.954,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 0.5571030640668524,
&quot;step&quot;: 100
},
{
&quot;loss&quot;: 1.8114,
&quot;grad_norm&quot;: 2.2652857303619385,
&quot;learning_rate&quot;: 1.4636285584967575e-05,
&quot;epoch&quot;: 0.5626740947075209,
&quot;step&quot;: 101
},
{
&quot;loss&quot;: 2.3733,
&quot;grad_norm&quot;: 1.688621997833252,
&quot;learning_rate&quot;: 1.4782648440817251e-05,
&quot;epoch&quot;: 0.5682451253481894,
&quot;step&quot;: 102
},
{
&quot;loss&quot;: 1.929,
&quot;grad_norm&quot;: 2.500704765319824,
&quot;learning_rate&quot;: 1.4929011296666926e-05,
&quot;epoch&quot;: 0.5738161559888579,
&quot;step&quot;: 103
},
{
&quot;loss&quot;: 1.9718,
&quot;grad_norm&quot;: 1.492704153060913,
&quot;learning_rate&quot;: 1.50753741525166e-05,
&quot;epoch&quot;: 0.5793871866295265,
&quot;step&quot;: 104
},
{
&quot;loss&quot;: 2.1026,
&quot;grad_norm&quot;: 1.6980139017105103,
&quot;learning_rate&quot;: 1.5221737008366276e-05,
&quot;epoch&quot;: 0.584958217270195,
&quot;step&quot;: 105
},
{
&quot;loss&quot;: 1.7999,
&quot;grad_norm&quot;: 1.7127199172973633,
&quot;learning_rate&quot;: 1.5368099864215953e-05,
&quot;epoch&quot;: 0.5905292479108635,
&quot;step&quot;: 106
},
{
&quot;loss&quot;: 2.0349,
&quot;grad_norm&quot;: 1.8260376453399658,
&quot;learning_rate&quot;: 1.551446272006563e-05,
&quot;epoch&quot;: 0.596100278551532,
&quot;step&quot;: 107
},
{
&quot;loss&quot;: 2.2571,
&quot;grad_norm&quot;: 1.8122572898864746,
&quot;learning_rate&quot;: 1.5660825575915305e-05,
&quot;epoch&quot;: 0.6016713091922006,
&quot;step&quot;: 108
},
{
&quot;loss&quot;: 2.0794,
&quot;grad_norm&quot;: 2.299410343170166,
&quot;learning_rate&quot;: 1.580718843176498e-05,
&quot;epoch&quot;: 0.6072423398328691,
&quot;step&quot;: 109
},
{
&quot;loss&quot;: 2.1671,
&quot;grad_norm&quot;: 1.4942196607589722,
&quot;learning_rate&quot;: 1.5953551287614657e-05,
&quot;epoch&quot;: 0.6128133704735376,
&quot;step&quot;: 110
},
{
&quot;loss&quot;: 2.1292,
&quot;grad_norm&quot;: 1.6794716119766235,
&quot;learning_rate&quot;: 1.609991414346433e-05,
&quot;epoch&quot;: 0.6183844011142061,
&quot;step&quot;: 111
},
{
&quot;loss&quot;: 2.2534,
&quot;grad_norm&quot;: 1.8196300268173218,
&quot;learning_rate&quot;: 1.6246276999314006e-05,
&quot;epoch&quot;: 0.6239554317548747,
&quot;step&quot;: 112
},
{
&quot;loss&quot;: 2.0333,
&quot;grad_norm&quot;: 1.5703504085540771,
&quot;learning_rate&quot;: 1.6392639855163684e-05,
&quot;epoch&quot;: 0.6295264623955432,
&quot;step&quot;: 113
},
{
&quot;loss&quot;: 2.1495,
&quot;grad_norm&quot;: 1.766376256942749,
&quot;learning_rate&quot;: 1.6539002711013358e-05,
&quot;epoch&quot;: 0.6350974930362117,
&quot;step&quot;: 114
},
{
&quot;loss&quot;: 1.5522,
&quot;grad_norm&quot;: 2.6598968505859375,
&quot;learning_rate&quot;: 1.6685365566863036e-05,
&quot;epoch&quot;: 0.6406685236768802,
&quot;step&quot;: 115
},
{
&quot;loss&quot;: 2.2221,
&quot;grad_norm&quot;: 10.731008529663086,
&quot;learning_rate&quot;: 1.683172842271271e-05,
&quot;epoch&quot;: 0.6462395543175488,
&quot;step&quot;: 116
},
{
&quot;loss&quot;: 2.4227,
&quot;grad_norm&quot;: 2.2150168418884277,
&quot;learning_rate&quot;: 1.6978091278562385e-05,
&quot;epoch&quot;: 0.6518105849582173,
&quot;step&quot;: 117
},
{
&quot;loss&quot;: 2.3886,
&quot;grad_norm&quot;: 2.283031940460205,
&quot;learning_rate&quot;: 1.7124454134412063e-05,
&quot;epoch&quot;: 0.6573816155988857,
&quot;step&quot;: 118
},
{
&quot;loss&quot;: 2.0651,
&quot;grad_norm&quot;: 2.6018834114074707,
&quot;learning_rate&quot;: 1.7270816990261737e-05,
&quot;epoch&quot;: 0.6629526462395543,
&quot;step&quot;: 119
},
{
&quot;loss&quot;: 1.9465,
&quot;grad_norm&quot;: 1.8486937284469604,
&quot;learning_rate&quot;: 1.7417179846111412e-05,
&quot;epoch&quot;: 0.6685236768802229,
&quot;step&quot;: 120
},
{
&quot;loss&quot;: 2.588,
&quot;grad_norm&quot;: 2.0970637798309326,
&quot;learning_rate&quot;: 1.756354270196109e-05,
&quot;epoch&quot;: 0.6740947075208914,
&quot;step&quot;: 121
},
{
&quot;loss&quot;: 1.7602,
&quot;grad_norm&quot;: 1.5886075496673584,
&quot;learning_rate&quot;: 1.7709905557810764e-05,
&quot;epoch&quot;: 0.6796657381615598,
&quot;step&quot;: 122
},
{
&quot;loss&quot;: 2.2738,
&quot;grad_norm&quot;: 2.4414422512054443,
&quot;learning_rate&quot;: 1.7856268413660442e-05,
&quot;epoch&quot;: 0.6852367688022284,
&quot;step&quot;: 123
},
{
&quot;loss&quot;: 1.8145,
&quot;grad_norm&quot;: 1.7890093326568604,
&quot;learning_rate&quot;: 1.8002631269510116e-05,
&quot;epoch&quot;: 0.6908077994428969,
&quot;step&quot;: 124
},
{
&quot;loss&quot;: 1.7572,
&quot;grad_norm&quot;: 1.7805349826812744,
&quot;learning_rate&quot;: 1.814899412535979e-05,
&quot;epoch&quot;: 0.6963788300835655,
&quot;step&quot;: 125
},
{
&quot;loss&quot;: 2.0206,
&quot;grad_norm&quot;: 1.9520258903503418,
&quot;learning_rate&quot;: 1.829535698120947e-05,
&quot;epoch&quot;: 0.7019498607242339,
&quot;step&quot;: 126
},
{
&quot;loss&quot;: 2.1292,
&quot;grad_norm&quot;: 1.6244016885757446,
&quot;learning_rate&quot;: 1.8441719837059143e-05,
&quot;epoch&quot;: 0.7075208913649025,
&quot;step&quot;: 127
},
{
&quot;loss&quot;: 2.1207,
&quot;grad_norm&quot;: 1.6681342124938965,
&quot;learning_rate&quot;: 1.858808269290882e-05,
&quot;epoch&quot;: 0.713091922005571,
&quot;step&quot;: 128
},
{
&quot;loss&quot;: 2.1515,
&quot;grad_norm&quot;: 2.1032838821411133,
&quot;learning_rate&quot;: 1.8734445548758495e-05,
&quot;epoch&quot;: 0.7186629526462396,
&quot;step&quot;: 129
},
{
&quot;loss&quot;: 1.7264,
&quot;grad_norm&quot;: 2.093341588973999,
&quot;learning_rate&quot;: 1.888080840460817e-05,
&quot;epoch&quot;: 0.724233983286908,
&quot;step&quot;: 130
},
{
&quot;loss&quot;: 1.6036,
&quot;grad_norm&quot;: 1.9431419372558594,
&quot;learning_rate&quot;: 1.9027171260457848e-05,
&quot;epoch&quot;: 0.7298050139275766,
&quot;step&quot;: 131
},
{
&quot;loss&quot;: 2.132,
&quot;grad_norm&quot;: 3.0380795001983643,
&quot;learning_rate&quot;: 1.9173534116307522e-05,
&quot;epoch&quot;: 0.7353760445682451,
&quot;step&quot;: 132
},
{
&quot;loss&quot;: 2.061,
&quot;grad_norm&quot;: 3.623516321182251,
&quot;learning_rate&quot;: 1.9319896972157197e-05,
&quot;epoch&quot;: 0.7409470752089137,
&quot;step&quot;: 133
},
{
&quot;loss&quot;: 1.8765,
&quot;grad_norm&quot;: 2.320667266845703,
&quot;learning_rate&quot;: 1.9466259828006874e-05,
&quot;epoch&quot;: 0.7465181058495822,
&quot;step&quot;: 134
},
{
&quot;loss&quot;: 1.8337,
&quot;grad_norm&quot;: 1.9040995836257935,
&quot;learning_rate&quot;: 1.9612622683856552e-05,
&quot;epoch&quot;: 0.7520891364902507,
&quot;step&quot;: 135
},
{
&quot;loss&quot;: 2.0664,
&quot;grad_norm&quot;: 1.8677185773849487,
&quot;learning_rate&quot;: 1.9758985539706227e-05,
&quot;epoch&quot;: 0.7576601671309192,
&quot;step&quot;: 136
},
{
&quot;loss&quot;: 1.9651,
&quot;grad_norm&quot;: 2.414144992828369,
&quot;learning_rate&quot;: 1.99053483955559e-05,
&quot;epoch&quot;: 0.7632311977715878,
&quot;step&quot;: 137
},
{
&quot;loss&quot;: 1.9243,
&quot;grad_norm&quot;: 2.5697357654571533,
&quot;learning_rate&quot;: 1.989220087349807e-05,
&quot;epoch&quot;: 0.7688022284122563,
&quot;step&quot;: 138
},
{
&quot;loss&quot;: 2.0289,
&quot;grad_norm&quot;: 2.384612560272217,
&quot;learning_rate&quot;: 1.987905335144024e-05,
&quot;epoch&quot;: 0.7743732590529248,
&quot;step&quot;: 139
},
{
&quot;loss&quot;: 1.7206,
&quot;grad_norm&quot;: 2.284605026245117,
&quot;learning_rate&quot;: 1.986590582938241e-05,
&quot;epoch&quot;: 0.7799442896935933,
&quot;step&quot;: 140
},
{
&quot;loss&quot;: 1.7212,
&quot;grad_norm&quot;: 2.3488142490386963,
&quot;learning_rate&quot;: 1.985275830732458e-05,
&quot;epoch&quot;: 0.7855153203342619,
&quot;step&quot;: 141
},
{
&quot;loss&quot;: 1.559,
&quot;grad_norm&quot;: 1.849543809890747,
&quot;learning_rate&quot;: 1.983961078526675e-05,
&quot;epoch&quot;: 0.7910863509749304,
&quot;step&quot;: 142
},
{
&quot;loss&quot;: 1.9442,
&quot;grad_norm&quot;: 2.343719720840454,
&quot;learning_rate&quot;: 1.9826463263208915e-05,
&quot;epoch&quot;: 0.7966573816155988,
&quot;step&quot;: 143
},
{
&quot;loss&quot;: 1.8913,
&quot;grad_norm&quot;: 2.6115176677703857,
&quot;learning_rate&quot;: 1.9813315741151084e-05,
&quot;epoch&quot;: 0.8022284122562674,
&quot;step&quot;: 144
},
{
&quot;loss&quot;: 2.1248,
&quot;grad_norm&quot;: 2.703418731689453,
&quot;learning_rate&quot;: 1.9800168219093254e-05,
&quot;epoch&quot;: 0.807799442896936,
&quot;step&quot;: 145
},
{
&quot;loss&quot;: 2.1745,
&quot;grad_norm&quot;: 2.379194736480713,
&quot;learning_rate&quot;: 1.9787020697035423e-05,
&quot;epoch&quot;: 0.8133704735376045,
&quot;step&quot;: 146
},
{
&quot;loss&quot;: 2.0572,
&quot;grad_norm&quot;: 2.4916770458221436,
&quot;learning_rate&quot;: 1.9773873174977593e-05,
&quot;epoch&quot;: 0.8189415041782729,
&quot;step&quot;: 147
},
{
&quot;loss&quot;: 1.8028,
&quot;grad_norm&quot;: 3.7550413608551025,
&quot;learning_rate&quot;: 1.9760725652919762e-05,
&quot;epoch&quot;: 0.8245125348189415,
&quot;step&quot;: 148
},
{
&quot;loss&quot;: 2.2202,
&quot;grad_norm&quot;: 1.704113483428955,
&quot;learning_rate&quot;: 1.974757813086193e-05,
&quot;epoch&quot;: 0.83008356545961,
&quot;step&quot;: 149
},
{
&quot;loss&quot;: 1.8407,
&quot;grad_norm&quot;: 2.14805269241333,
&quot;learning_rate&quot;: 1.9734430608804098e-05,
&quot;epoch&quot;: 0.8356545961002786,
&quot;step&quot;: 150
},
{
&quot;eval_loss&quot;: 2.264693021774292,
&quot;eval_runtime&quot;: 35.9452,
&quot;eval_samples_per_second&quot;: 39.95,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 0.8356545961002786,
&quot;step&quot;: 150
},
{
&quot;loss&quot;: 2.3419,
&quot;grad_norm&quot;: 2.3600826263427734,
&quot;learning_rate&quot;: 1.972128308674627e-05,
&quot;epoch&quot;: 0.841225626740947,
&quot;step&quot;: 151
},
{
&quot;loss&quot;: 2.0321,
&quot;grad_norm&quot;: 2.7362117767333984,
&quot;learning_rate&quot;: 1.970813556468844e-05,
&quot;epoch&quot;: 0.8467966573816156,
&quot;step&quot;: 152
},
{
&quot;loss&quot;: 1.7054,
&quot;grad_norm&quot;: 2.982322931289673,
&quot;learning_rate&quot;: 1.9694988042630607e-05,
&quot;epoch&quot;: 0.8523676880222841,
&quot;step&quot;: 153
},
{
&quot;loss&quot;: 1.9304,
&quot;grad_norm&quot;: 2.8210840225219727,
&quot;learning_rate&quot;: 1.9681840520572776e-05,
&quot;epoch&quot;: 0.8579387186629527,
&quot;step&quot;: 154
},
{
&quot;loss&quot;: 2.1154,
&quot;grad_norm&quot;: 2.412022113800049,
&quot;learning_rate&quot;: 1.9668692998514946e-05,
&quot;epoch&quot;: 0.8635097493036211,
&quot;step&quot;: 155
},
{
&quot;loss&quot;: 2.0367,
&quot;grad_norm&quot;: 2.439105987548828,
&quot;learning_rate&quot;: 1.9655545476457115e-05,
&quot;epoch&quot;: 0.8690807799442897,
&quot;step&quot;: 156
},
{
&quot;loss&quot;: 1.5693,
&quot;grad_norm&quot;: 2.276296854019165,
&quot;learning_rate&quot;: 1.9642397954399285e-05,
&quot;epoch&quot;: 0.8746518105849582,
&quot;step&quot;: 157
},
{
&quot;loss&quot;: 1.726,
&quot;grad_norm&quot;: 2.12568998336792,
&quot;learning_rate&quot;: 1.9629250432341454e-05,
&quot;epoch&quot;: 0.8802228412256268,
&quot;step&quot;: 158
},
{
&quot;loss&quot;: 1.7872,
&quot;grad_norm&quot;: 2.1106767654418945,
&quot;learning_rate&quot;: 1.9616102910283624e-05,
&quot;epoch&quot;: 0.8857938718662952,
&quot;step&quot;: 159
},
{
&quot;loss&quot;: 1.9831,
&quot;grad_norm&quot;: 1.9893423318862915,
&quot;learning_rate&quot;: 1.960295538822579e-05,
&quot;epoch&quot;: 0.8913649025069638,
&quot;step&quot;: 160
},
{
&quot;loss&quot;: 2.0571,
&quot;grad_norm&quot;: 2.222038984298706,
&quot;learning_rate&quot;: 1.958980786616796e-05,
&quot;epoch&quot;: 0.8969359331476323,
&quot;step&quot;: 161
},
{
&quot;loss&quot;: 1.8674,
&quot;grad_norm&quot;: 2.5205395221710205,
&quot;learning_rate&quot;: 1.957666034411013e-05,
&quot;epoch&quot;: 0.9025069637883009,
&quot;step&quot;: 162
},
{
&quot;loss&quot;: 1.7982,
&quot;grad_norm&quot;: 2.212405204772949,
&quot;learning_rate&quot;: 1.95635128220523e-05,
&quot;epoch&quot;: 0.9080779944289693,
&quot;step&quot;: 163
},
{
&quot;loss&quot;: 1.7992,
&quot;grad_norm&quot;: 2.304945468902588,
&quot;learning_rate&quot;: 1.9550365299994468e-05,
&quot;epoch&quot;: 0.9136490250696379,
&quot;step&quot;: 164
},
{
&quot;loss&quot;: 2.2013,
&quot;grad_norm&quot;: 2.8349928855895996,
&quot;learning_rate&quot;: 1.9537217777936638e-05,
&quot;epoch&quot;: 0.9192200557103064,
&quot;step&quot;: 165
},
{
&quot;loss&quot;: 1.7908,
&quot;grad_norm&quot;: 2.2040209770202637,
&quot;learning_rate&quot;: 1.9524070255878807e-05,
&quot;epoch&quot;: 0.924791086350975,
&quot;step&quot;: 166
},
{
&quot;loss&quot;: 2.1308,
&quot;grad_norm&quot;: 2.550541400909424,
&quot;learning_rate&quot;: 1.9510922733820977e-05,
&quot;epoch&quot;: 0.9303621169916435,
&quot;step&quot;: 167
},
{
&quot;loss&quot;: 1.735,
&quot;grad_norm&quot;: 2.9808292388916016,
&quot;learning_rate&quot;: 1.9497775211763146e-05,
&quot;epoch&quot;: 0.935933147632312,
&quot;step&quot;: 168
},
{
&quot;loss&quot;: 2.1536,
&quot;grad_norm&quot;: 2.4572677612304688,
&quot;learning_rate&quot;: 1.9484627689705316e-05,
&quot;epoch&quot;: 0.9415041782729805,
&quot;step&quot;: 169
},
{
&quot;loss&quot;: 1.8616,
&quot;grad_norm&quot;: 2.414435863494873,
&quot;learning_rate&quot;: 1.9471480167647482e-05,
&quot;epoch&quot;: 0.947075208913649,
&quot;step&quot;: 170
},
{
&quot;loss&quot;: 1.9501,
&quot;grad_norm&quot;: 2.490251064300537,
&quot;learning_rate&quot;: 1.945833264558965e-05,
&quot;epoch&quot;: 0.9526462395543176,
&quot;step&quot;: 171
},
{
&quot;loss&quot;: 1.7965,
&quot;grad_norm&quot;: 3.2512645721435547,
&quot;learning_rate&quot;: 1.944518512353182e-05,
&quot;epoch&quot;: 0.958217270194986,
&quot;step&quot;: 172
},
{
&quot;loss&quot;: 1.8903,
&quot;grad_norm&quot;: 2.0697317123413086,
&quot;learning_rate&quot;: 1.943203760147399e-05,
&quot;epoch&quot;: 0.9637883008356546,
&quot;step&quot;: 173
},
{
&quot;loss&quot;: 1.9153,
&quot;grad_norm&quot;: 2.869088888168335,
&quot;learning_rate&quot;: 1.941889007941616e-05,
&quot;epoch&quot;: 0.9693593314763231,
&quot;step&quot;: 174
},
{
&quot;loss&quot;: 2.0043,
&quot;grad_norm&quot;: 2.5188841819763184,
&quot;learning_rate&quot;: 1.940574255735833e-05,
&quot;epoch&quot;: 0.9749303621169917,
&quot;step&quot;: 175
},
{
&quot;loss&quot;: 2.0106,
&quot;grad_norm&quot;: 2.2558531761169434,
&quot;learning_rate&quot;: 1.93925950353005e-05,
&quot;epoch&quot;: 0.9805013927576601,
&quot;step&quot;: 176
},
{
&quot;loss&quot;: 2.0356,
&quot;grad_norm&quot;: 2.78887677192688,
&quot;learning_rate&quot;: 1.9379447513242665e-05,
&quot;epoch&quot;: 0.9860724233983287,
&quot;step&quot;: 177
},
{
&quot;loss&quot;: 1.4849,
&quot;grad_norm&quot;: 2.9200024604797363,
&quot;learning_rate&quot;: 1.9366299991184838e-05,
&quot;epoch&quot;: 0.9916434540389972,
&quot;step&quot;: 178
},
{
&quot;loss&quot;: 1.6636,
&quot;grad_norm&quot;: 2.443997621536255,
&quot;learning_rate&quot;: 1.9353152469127007e-05,
&quot;epoch&quot;: 0.9972144846796658,
&quot;step&quot;: 179
},
{
&quot;loss&quot;: 1.4097,
&quot;grad_norm&quot;: 3.399275779724121,
&quot;learning_rate&quot;: 1.9340004947069174e-05,
&quot;epoch&quot;: 1.0,
&quot;step&quot;: 180
},
{
&quot;loss&quot;: 1.7611,
&quot;grad_norm&quot;: 2.312861442565918,
&quot;learning_rate&quot;: 1.9326857425011343e-05,
&quot;epoch&quot;: 1.0055710306406684,
&quot;step&quot;: 181
},
{
&quot;loss&quot;: 2.0967,
&quot;grad_norm&quot;: 2.799191951751709,
&quot;learning_rate&quot;: 1.9313709902953513e-05,
&quot;epoch&quot;: 1.011142061281337,
&quot;step&quot;: 182
},
{
&quot;loss&quot;: 2.3108,
&quot;grad_norm&quot;: 2.4845213890075684,
&quot;learning_rate&quot;: 1.9300562380895682e-05,
&quot;epoch&quot;: 1.0167130919220055,
&quot;step&quot;: 183
},
{
&quot;loss&quot;: 1.6263,
&quot;grad_norm&quot;: 2.72027325630188,
&quot;learning_rate&quot;: 1.928741485883785e-05,
&quot;epoch&quot;: 1.0222841225626742,
&quot;step&quot;: 184
},
{
&quot;loss&quot;: 1.6228,
&quot;grad_norm&quot;: 3.2783589363098145,
&quot;learning_rate&quot;: 1.927426733678002e-05,
&quot;epoch&quot;: 1.0278551532033426,
&quot;step&quot;: 185
},
{
&quot;loss&quot;: 1.9384,
&quot;grad_norm&quot;: 2.455291986465454,
&quot;learning_rate&quot;: 1.926111981472219e-05,
&quot;epoch&quot;: 1.033426183844011,
&quot;step&quot;: 186
},
{
&quot;loss&quot;: 1.5646,
&quot;grad_norm&quot;: 2.2230939865112305,
&quot;learning_rate&quot;: 1.9247972292664357e-05,
&quot;epoch&quot;: 1.0389972144846797,
&quot;step&quot;: 187
},
{
&quot;loss&quot;: 1.8545,
&quot;grad_norm&quot;: 2.596928119659424,
&quot;learning_rate&quot;: 1.9234824770606526e-05,
&quot;epoch&quot;: 1.0445682451253482,
&quot;step&quot;: 188
},
{
&quot;loss&quot;: 1.8436,
&quot;grad_norm&quot;: 2.5703697204589844,
&quot;learning_rate&quot;: 1.9221677248548696e-05,
&quot;epoch&quot;: 1.0501392757660166,
&quot;step&quot;: 189
},
{
&quot;loss&quot;: 2.2574,
&quot;grad_norm&quot;: 3.021871566772461,
&quot;learning_rate&quot;: 1.920852972649087e-05,
&quot;epoch&quot;: 1.0557103064066853,
&quot;step&quot;: 190
},
{
&quot;loss&quot;: 1.8046,
&quot;grad_norm&quot;: 2.35603404045105,
&quot;learning_rate&quot;: 1.9195382204433035e-05,
&quot;epoch&quot;: 1.0612813370473537,
&quot;step&quot;: 191
},
{
&quot;loss&quot;: 1.6635,
&quot;grad_norm&quot;: 2.453967809677124,
&quot;learning_rate&quot;: 1.9182234682375204e-05,
&quot;epoch&quot;: 1.0668523676880224,
&quot;step&quot;: 192
},
{
&quot;loss&quot;: 1.9118,
&quot;grad_norm&quot;: 3.2305331230163574,
&quot;learning_rate&quot;: 1.9169087160317374e-05,
&quot;epoch&quot;: 1.0724233983286908,
&quot;step&quot;: 193
},
{
&quot;loss&quot;: 1.5529,
&quot;grad_norm&quot;: 2.248871326446533,
&quot;learning_rate&quot;: 1.9155939638259543e-05,
&quot;epoch&quot;: 1.0779944289693593,
&quot;step&quot;: 194
},
{
&quot;loss&quot;: 1.7932,
&quot;grad_norm&quot;: 3.0331363677978516,
&quot;learning_rate&quot;: 1.9142792116201713e-05,
&quot;epoch&quot;: 1.083565459610028,
&quot;step&quot;: 195
},
{
&quot;loss&quot;: 1.7632,
&quot;grad_norm&quot;: 3.543948173522949,
&quot;learning_rate&quot;: 1.9129644594143882e-05,
&quot;epoch&quot;: 1.0891364902506964,
&quot;step&quot;: 196
},
{
&quot;loss&quot;: 1.8788,
&quot;grad_norm&quot;: 3.4173591136932373,
&quot;learning_rate&quot;: 1.911649707208605e-05,
&quot;epoch&quot;: 1.0947075208913648,
&quot;step&quot;: 197
},
{
&quot;loss&quot;: 2.0881,
&quot;grad_norm&quot;: 3.4639406204223633,
&quot;learning_rate&quot;: 1.9103349550028218e-05,
&quot;epoch&quot;: 1.1002785515320335,
&quot;step&quot;: 198
},
{
&quot;loss&quot;: 1.9197,
&quot;grad_norm&quot;: 3.6082725524902344,
&quot;learning_rate&quot;: 1.9090202027970388e-05,
&quot;epoch&quot;: 1.105849582172702,
&quot;step&quot;: 199
},
{
&quot;loss&quot;: 1.4541,
&quot;grad_norm&quot;: 2.834181070327759,
&quot;learning_rate&quot;: 1.9077054505912557e-05,
&quot;epoch&quot;: 1.1114206128133706,
&quot;step&quot;: 200
},
{
&quot;eval_loss&quot;: 2.3124067783355713,
&quot;eval_runtime&quot;: 35.9422,
&quot;eval_samples_per_second&quot;: 39.953,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 1.1114206128133706,
&quot;step&quot;: 200
},
{
&quot;loss&quot;: 1.8766,
&quot;grad_norm&quot;: 2.44728422164917,
&quot;learning_rate&quot;: 1.9063906983854727e-05,
&quot;epoch&quot;: 1.116991643454039,
&quot;step&quot;: 201
},
{
&quot;loss&quot;: 1.8877,
&quot;grad_norm&quot;: 3.1577866077423096,
&quot;learning_rate&quot;: 1.9050759461796896e-05,
&quot;epoch&quot;: 1.1225626740947074,
&quot;step&quot;: 202
},
{
&quot;loss&quot;: 1.6045,
&quot;grad_norm&quot;: 3.5458521842956543,
&quot;learning_rate&quot;: 1.9037611939739066e-05,
&quot;epoch&quot;: 1.128133704735376,
&quot;step&quot;: 203
},
{
&quot;loss&quot;: 1.705,
&quot;grad_norm&quot;: 2.496349811553955,
&quot;learning_rate&quot;: 1.9024464417681232e-05,
&quot;epoch&quot;: 1.1337047353760445,
&quot;step&quot;: 204
},
{
&quot;loss&quot;: 1.6478,
&quot;grad_norm&quot;: 3.2897088527679443,
&quot;learning_rate&quot;: 1.9011316895623405e-05,
&quot;epoch&quot;: 1.1392757660167132,
&quot;step&quot;: 205
},
{
&quot;loss&quot;: 1.6703,
&quot;grad_norm&quot;: 3.1694509983062744,
&quot;learning_rate&quot;: 1.8998169373565574e-05,
&quot;epoch&quot;: 1.1448467966573816,
&quot;step&quot;: 206
},
{
&quot;loss&quot;: 2.0232,
&quot;grad_norm&quot;: 2.8644907474517822,
&quot;learning_rate&quot;: 1.8985021851507744e-05,
&quot;epoch&quot;: 1.15041782729805,
&quot;step&quot;: 207
},
{
&quot;loss&quot;: 1.581,
&quot;grad_norm&quot;: 2.930053472518921,
&quot;learning_rate&quot;: 1.897187432944991e-05,
&quot;epoch&quot;: 1.1559888579387188,
&quot;step&quot;: 208
},
{
&quot;loss&quot;: 1.6617,
&quot;grad_norm&quot;: 2.9067940711975098,
&quot;learning_rate&quot;: 1.895872680739208e-05,
&quot;epoch&quot;: 1.1615598885793872,
&quot;step&quot;: 209
},
{
&quot;loss&quot;: 2.173,
&quot;grad_norm&quot;: 3.746903419494629,
&quot;learning_rate&quot;: 1.894557928533425e-05,
&quot;epoch&quot;: 1.1671309192200556,
&quot;step&quot;: 210
},
{
&quot;loss&quot;: 1.5917,
&quot;grad_norm&quot;: 4.83465576171875,
&quot;learning_rate&quot;: 1.893243176327642e-05,
&quot;epoch&quot;: 1.1727019498607243,
&quot;step&quot;: 211
},
{
&quot;loss&quot;: 1.531,
&quot;grad_norm&quot;: 3.0352439880371094,
&quot;learning_rate&quot;: 1.8919284241218588e-05,
&quot;epoch&quot;: 1.1782729805013927,
&quot;step&quot;: 212
},
{
&quot;loss&quot;: 2.193,
&quot;grad_norm&quot;: 2.738152027130127,
&quot;learning_rate&quot;: 1.8906136719160758e-05,
&quot;epoch&quot;: 1.1838440111420612,
&quot;step&quot;: 213
},
{
&quot;loss&quot;: 1.9884,
&quot;grad_norm&quot;: 3.005979061126709,
&quot;learning_rate&quot;: 1.8892989197102927e-05,
&quot;epoch&quot;: 1.1894150417827298,
&quot;step&quot;: 214
},
{
&quot;loss&quot;: 1.8659,
&quot;grad_norm&quot;: 3.930433750152588,
&quot;learning_rate&quot;: 1.8879841675045093e-05,
&quot;epoch&quot;: 1.1949860724233983,
&quot;step&quot;: 215
},
{
&quot;loss&quot;: 2.191,
&quot;grad_norm&quot;: 3.3943190574645996,
&quot;learning_rate&quot;: 1.8866694152987263e-05,
&quot;epoch&quot;: 1.200557103064067,
&quot;step&quot;: 216
},
{
&quot;loss&quot;: 2.1538,
&quot;grad_norm&quot;: 3.4692654609680176,
&quot;learning_rate&quot;: 1.8853546630929436e-05,
&quot;epoch&quot;: 1.2061281337047354,
&quot;step&quot;: 217
},
{
&quot;loss&quot;: 1.9939,
&quot;grad_norm&quot;: 2.889341354370117,
&quot;learning_rate&quot;: 1.8840399108871602e-05,
&quot;epoch&quot;: 1.2116991643454038,
&quot;step&quot;: 218
},
{
&quot;loss&quot;: 2.1667,
&quot;grad_norm&quot;: 3.123650550842285,
&quot;learning_rate&quot;: 1.882725158681377e-05,
&quot;epoch&quot;: 1.2172701949860725,
&quot;step&quot;: 219
},
{
&quot;loss&quot;: 1.9743,
&quot;grad_norm&quot;: 2.6485071182250977,
&quot;learning_rate&quot;: 1.881410406475594e-05,
&quot;epoch&quot;: 1.222841225626741,
&quot;step&quot;: 220
},
{
&quot;loss&quot;: 1.5679,
&quot;grad_norm&quot;: 3.791811227798462,
&quot;learning_rate&quot;: 1.880095654269811e-05,
&quot;epoch&quot;: 1.2284122562674096,
&quot;step&quot;: 221
},
{
&quot;loss&quot;: 2.1841,
&quot;grad_norm&quot;: 3.286864757537842,
&quot;learning_rate&quot;: 1.878780902064028e-05,
&quot;epoch&quot;: 1.233983286908078,
&quot;step&quot;: 222
},
{
&quot;loss&quot;: 2.1165,
&quot;grad_norm&quot;: 2.930072784423828,
&quot;learning_rate&quot;: 1.877466149858245e-05,
&quot;epoch&quot;: 1.2395543175487465,
&quot;step&quot;: 223
},
{
&quot;loss&quot;: 1.7816,
&quot;grad_norm&quot;: 2.936857223510742,
&quot;learning_rate&quot;: 1.876151397652462e-05,
&quot;epoch&quot;: 1.2451253481894151,
&quot;step&quot;: 224
},
{
&quot;loss&quot;: 1.702,
&quot;grad_norm&quot;: 2.3516695499420166,
&quot;learning_rate&quot;: 1.8748366454466785e-05,
&quot;epoch&quot;: 1.2506963788300836,
&quot;step&quot;: 225
},
{
&quot;loss&quot;: 1.7222,
&quot;grad_norm&quot;: 3.2817559242248535,
&quot;learning_rate&quot;: 1.8735218932408955e-05,
&quot;epoch&quot;: 1.2562674094707522,
&quot;step&quot;: 226
},
{
&quot;loss&quot;: 1.7151,
&quot;grad_norm&quot;: 2.987518548965454,
&quot;learning_rate&quot;: 1.8722071410351124e-05,
&quot;epoch&quot;: 1.2618384401114207,
&quot;step&quot;: 227
},
{
&quot;loss&quot;: 2.0545,
&quot;grad_norm&quot;: 3.132258415222168,
&quot;learning_rate&quot;: 1.8708923888293294e-05,
&quot;epoch&quot;: 1.267409470752089,
&quot;step&quot;: 228
},
{
&quot;loss&quot;: 1.4732,
&quot;grad_norm&quot;: 3.2233877182006836,
&quot;learning_rate&quot;: 1.8695776366235463e-05,
&quot;epoch&quot;: 1.2729805013927575,
&quot;step&quot;: 229
},
{
&quot;loss&quot;: 1.7168,
&quot;grad_norm&quot;: 3.2920405864715576,
&quot;learning_rate&quot;: 1.8682628844177633e-05,
&quot;epoch&quot;: 1.2785515320334262,
&quot;step&quot;: 230
},
{
&quot;loss&quot;: 1.497,
&quot;grad_norm&quot;: 2.536219596862793,
&quot;learning_rate&quot;: 1.8669481322119802e-05,
&quot;epoch&quot;: 1.2841225626740946,
&quot;step&quot;: 231
},
{
&quot;loss&quot;: 1.8177,
&quot;grad_norm&quot;: 4.246109485626221,
&quot;learning_rate&quot;: 1.865633380006197e-05,
&quot;epoch&quot;: 1.2896935933147633,
&quot;step&quot;: 232
},
{
&quot;loss&quot;: 1.9449,
&quot;grad_norm&quot;: 2.6518428325653076,
&quot;learning_rate&quot;: 1.864318627800414e-05,
&quot;epoch&quot;: 1.2952646239554317,
&quot;step&quot;: 233
},
{
&quot;loss&quot;: 1.4673,
&quot;grad_norm&quot;: 3.7276058197021484,
&quot;learning_rate&quot;: 1.863003875594631e-05,
&quot;epoch&quot;: 1.3008356545961002,
&quot;step&quot;: 234
},
{
&quot;loss&quot;: 1.865,
&quot;grad_norm&quot;: 3.2901997566223145,
&quot;learning_rate&quot;: 1.8616891233888477e-05,
&quot;epoch&quot;: 1.3064066852367688,
&quot;step&quot;: 235
},
{
&quot;loss&quot;: 1.7449,
&quot;grad_norm&quot;: 2.6417624950408936,
&quot;learning_rate&quot;: 1.8603743711830646e-05,
&quot;epoch&quot;: 1.3119777158774373,
&quot;step&quot;: 236
},
{
&quot;loss&quot;: 2.0348,
&quot;grad_norm&quot;: 3.81978702545166,
&quot;learning_rate&quot;: 1.8590596189772816e-05,
&quot;epoch&quot;: 1.317548746518106,
&quot;step&quot;: 237
},
{
&quot;loss&quot;: 1.8765,
&quot;grad_norm&quot;: 2.615661382675171,
&quot;learning_rate&quot;: 1.8577448667714985e-05,
&quot;epoch&quot;: 1.3231197771587744,
&quot;step&quot;: 238
},
{
&quot;loss&quot;: 1.7559,
&quot;grad_norm&quot;: 3.2889416217803955,
&quot;learning_rate&quot;: 1.8564301145657155e-05,
&quot;epoch&quot;: 1.3286908077994428,
&quot;step&quot;: 239
},
{
&quot;loss&quot;: 1.9031,
&quot;grad_norm&quot;: 4.006824970245361,
&quot;learning_rate&quot;: 1.8551153623599324e-05,
&quot;epoch&quot;: 1.3342618384401115,
&quot;step&quot;: 240
},
{
&quot;loss&quot;: 1.6102,
&quot;grad_norm&quot;: 3.3491599559783936,
&quot;learning_rate&quot;: 1.8538006101541494e-05,
&quot;epoch&quot;: 1.33983286908078,
&quot;step&quot;: 241
},
{
&quot;loss&quot;: 2.0716,
&quot;grad_norm&quot;: 3.2669260501861572,
&quot;learning_rate&quot;: 1.852485857948366e-05,
&quot;epoch&quot;: 1.3454038997214486,
&quot;step&quot;: 242
},
{
&quot;loss&quot;: 2.0298,
&quot;grad_norm&quot;: 4.218564510345459,
&quot;learning_rate&quot;: 1.851171105742583e-05,
&quot;epoch&quot;: 1.350974930362117,
&quot;step&quot;: 243
},
{
&quot;loss&quot;: 1.9911,
&quot;grad_norm&quot;: 3.5515315532684326,
&quot;learning_rate&quot;: 1.8498563535368003e-05,
&quot;epoch&quot;: 1.3565459610027855,
&quot;step&quot;: 244
},
{
&quot;loss&quot;: 1.3477,
&quot;grad_norm&quot;: 4.0060343742370605,
&quot;learning_rate&quot;: 1.848541601331017e-05,
&quot;epoch&quot;: 1.362116991643454,
&quot;step&quot;: 245
},
{
&quot;loss&quot;: 1.4686,
&quot;grad_norm&quot;: 3.574927568435669,
&quot;learning_rate&quot;: 1.8472268491252338e-05,
&quot;epoch&quot;: 1.3676880222841226,
&quot;step&quot;: 246
},
{
&quot;loss&quot;: 1.617,
&quot;grad_norm&quot;: 3.4316840171813965,
&quot;learning_rate&quot;: 1.8459120969194508e-05,
&quot;epoch&quot;: 1.3732590529247912,
&quot;step&quot;: 247
},
{
&quot;loss&quot;: 1.6593,
&quot;grad_norm&quot;: 3.2629754543304443,
&quot;learning_rate&quot;: 1.8445973447136677e-05,
&quot;epoch&quot;: 1.3788300835654597,
&quot;step&quot;: 248
},
{
&quot;loss&quot;: 1.2608,
&quot;grad_norm&quot;: 3.133815050125122,
&quot;learning_rate&quot;: 1.8432825925078847e-05,
&quot;epoch&quot;: 1.384401114206128,
&quot;step&quot;: 249
},
{
&quot;loss&quot;: 1.8523,
&quot;grad_norm&quot;: 3.742141008377075,
&quot;learning_rate&quot;: 1.8419678403021016e-05,
&quot;epoch&quot;: 1.3899721448467965,
&quot;step&quot;: 250
},
{
&quot;eval_loss&quot;: 2.335228443145752,
&quot;eval_runtime&quot;: 35.9668,
&quot;eval_samples_per_second&quot;: 39.926,
&quot;eval_steps_per_second&quot;: 2.002,
&quot;epoch&quot;: 1.3899721448467965,
&quot;step&quot;: 250
},
{
&quot;loss&quot;: 1.7768,
&quot;grad_norm&quot;: 3.9163429737091064,
&quot;learning_rate&quot;: 1.8406530880963186e-05,
&quot;epoch&quot;: 1.3955431754874652,
&quot;step&quot;: 251
},
{
&quot;loss&quot;: 1.7455,
&quot;grad_norm&quot;: 3.3456947803497314,
&quot;learning_rate&quot;: 1.8393383358905352e-05,
&quot;epoch&quot;: 1.4011142061281336,
&quot;step&quot;: 252
},
{
&quot;loss&quot;: 1.7103,
&quot;grad_norm&quot;: 4.220420837402344,
&quot;learning_rate&quot;: 1.838023583684752e-05,
&quot;epoch&quot;: 1.4066852367688023,
&quot;step&quot;: 253
},
{
&quot;loss&quot;: 2.0054,
&quot;grad_norm&quot;: 4.233839511871338,
&quot;learning_rate&quot;: 1.836708831478969e-05,
&quot;epoch&quot;: 1.4122562674094707,
&quot;step&quot;: 254
},
{
&quot;loss&quot;: 1.7175,
&quot;grad_norm&quot;: 3.703934669494629,
&quot;learning_rate&quot;: 1.8353940792731864e-05,
&quot;epoch&quot;: 1.4178272980501392,
&quot;step&quot;: 255
},
{
&quot;loss&quot;: 1.7225,
&quot;grad_norm&quot;: 4.210822105407715,
&quot;learning_rate&quot;: 1.834079327067403e-05,
&quot;epoch&quot;: 1.4233983286908078,
&quot;step&quot;: 256
},
{
&quot;loss&quot;: 1.6882,
&quot;grad_norm&quot;: 3.8861896991729736,
&quot;learning_rate&quot;: 1.83276457486162e-05,
&quot;epoch&quot;: 1.4289693593314763,
&quot;step&quot;: 257
},
{
&quot;loss&quot;: 2.0721,
&quot;grad_norm&quot;: 4.4140424728393555,
&quot;learning_rate&quot;: 1.831449822655837e-05,
&quot;epoch&quot;: 1.434540389972145,
&quot;step&quot;: 258
},
{
&quot;loss&quot;: 1.6198,
&quot;grad_norm&quot;: 3.1098673343658447,
&quot;learning_rate&quot;: 1.830135070450054e-05,
&quot;epoch&quot;: 1.4401114206128134,
&quot;step&quot;: 259
},
{
&quot;loss&quot;: 1.9632,
&quot;grad_norm&quot;: 2.9485561847686768,
&quot;learning_rate&quot;: 1.8288203182442708e-05,
&quot;epoch&quot;: 1.4456824512534818,
&quot;step&quot;: 260
},
{
&quot;loss&quot;: 1.9262,
&quot;grad_norm&quot;: 3.842655658721924,
&quot;learning_rate&quot;: 1.8275055660384878e-05,
&quot;epoch&quot;: 1.4512534818941505,
&quot;step&quot;: 261
},
{
&quot;loss&quot;: 2.0807,
&quot;grad_norm&quot;: 4.122529983520508,
&quot;learning_rate&quot;: 1.8261908138327047e-05,
&quot;epoch&quot;: 1.456824512534819,
&quot;step&quot;: 262
},
{
&quot;loss&quot;: 2.0099,
&quot;grad_norm&quot;: 3.6181795597076416,
&quot;learning_rate&quot;: 1.8248760616269213e-05,
&quot;epoch&quot;: 1.4623955431754876,
&quot;step&quot;: 263
},
{
&quot;loss&quot;: 1.7435,
&quot;grad_norm&quot;: 3.9433975219726562,
&quot;learning_rate&quot;: 1.8235613094211383e-05,
&quot;epoch&quot;: 1.467966573816156,
&quot;step&quot;: 264
},
{
&quot;loss&quot;: 1.4648,
&quot;grad_norm&quot;: 5.496665000915527,
&quot;learning_rate&quot;: 1.8222465572153552e-05,
&quot;epoch&quot;: 1.4735376044568245,
&quot;step&quot;: 265
},
{
&quot;loss&quot;: 2.106,
&quot;grad_norm&quot;: 3.3920114040374756,
&quot;learning_rate&quot;: 1.8209318050095722e-05,
&quot;epoch&quot;: 1.479108635097493,
&quot;step&quot;: 266
},
{
&quot;loss&quot;: 1.4486,
&quot;grad_norm&quot;: 4.195888519287109,
&quot;learning_rate&quot;: 1.819617052803789e-05,
&quot;epoch&quot;: 1.4846796657381616,
&quot;step&quot;: 267
},
{
&quot;loss&quot;: 1.4996,
&quot;grad_norm&quot;: 3.5301265716552734,
&quot;learning_rate&quot;: 1.818302300598006e-05,
&quot;epoch&quot;: 1.49025069637883,
&quot;step&quot;: 268
},
{
&quot;loss&quot;: 1.8247,
&quot;grad_norm&quot;: 3.3157520294189453,
&quot;learning_rate&quot;: 1.8169875483922227e-05,
&quot;epoch&quot;: 1.4958217270194987,
&quot;step&quot;: 269
},
{
&quot;loss&quot;: 1.6092,
&quot;grad_norm&quot;: 4.3797383308410645,
&quot;learning_rate&quot;: 1.8156727961864397e-05,
&quot;epoch&quot;: 1.501392757660167,
&quot;step&quot;: 270
},
{
&quot;loss&quot;: 1.6071,
&quot;grad_norm&quot;: 3.3917229175567627,
&quot;learning_rate&quot;: 1.814358043980657e-05,
&quot;epoch&quot;: 1.5069637883008355,
&quot;step&quot;: 271
},
{
&quot;loss&quot;: 1.9553,
&quot;grad_norm&quot;: 3.171808958053589,
&quot;learning_rate&quot;: 1.813043291774874e-05,
&quot;epoch&quot;: 1.5125348189415042,
&quot;step&quot;: 272
},
{
&quot;loss&quot;: 1.8105,
&quot;grad_norm&quot;: 3.1904940605163574,
&quot;learning_rate&quot;: 1.8117285395690905e-05,
&quot;epoch&quot;: 1.5181058495821727,
&quot;step&quot;: 273
},
{
&quot;loss&quot;: 1.5718,
&quot;grad_norm&quot;: 3.7544777393341064,
&quot;learning_rate&quot;: 1.8104137873633075e-05,
&quot;epoch&quot;: 1.5236768802228413,
&quot;step&quot;: 274
},
{
&quot;loss&quot;: 1.9999,
&quot;grad_norm&quot;: 4.143693923950195,
&quot;learning_rate&quot;: 1.8090990351575244e-05,
&quot;epoch&quot;: 1.5292479108635098,
&quot;step&quot;: 275
},
{
&quot;loss&quot;: 2.0393,
&quot;grad_norm&quot;: 3.505359411239624,
&quot;learning_rate&quot;: 1.8077842829517414e-05,
&quot;epoch&quot;: 1.5348189415041782,
&quot;step&quot;: 276
},
{
&quot;loss&quot;: 1.6101,
&quot;grad_norm&quot;: 4.118677139282227,
&quot;learning_rate&quot;: 1.8064695307459583e-05,
&quot;epoch&quot;: 1.5403899721448466,
&quot;step&quot;: 277
},
{
&quot;loss&quot;: 1.6718,
&quot;grad_norm&quot;: 4.947996139526367,
&quot;learning_rate&quot;: 1.8051547785401753e-05,
&quot;epoch&quot;: 1.5459610027855153,
&quot;step&quot;: 278
},
{
&quot;loss&quot;: 2.2007,
&quot;grad_norm&quot;: 4.226828575134277,
&quot;learning_rate&quot;: 1.8038400263343922e-05,
&quot;epoch&quot;: 1.551532033426184,
&quot;step&quot;: 279
},
{
&quot;loss&quot;: 1.7025,
&quot;grad_norm&quot;: 4.085235118865967,
&quot;learning_rate&quot;: 1.802525274128609e-05,
&quot;epoch&quot;: 1.5571030640668524,
&quot;step&quot;: 280
},
{
&quot;loss&quot;: 1.7632,
&quot;grad_norm&quot;: 3.5451292991638184,
&quot;learning_rate&quot;: 1.8012105219228258e-05,
&quot;epoch&quot;: 1.5626740947075208,
&quot;step&quot;: 281
},
{
&quot;loss&quot;: 1.4975,
&quot;grad_norm&quot;: 5.2698540687561035,
&quot;learning_rate&quot;: 1.799895769717043e-05,
&quot;epoch&quot;: 1.5682451253481893,
&quot;step&quot;: 282
},
{
&quot;loss&quot;: 1.2189,
&quot;grad_norm&quot;: 3.662693738937378,
&quot;learning_rate&quot;: 1.7985810175112597e-05,
&quot;epoch&quot;: 1.573816155988858,
&quot;step&quot;: 283
},
{
&quot;loss&quot;: 2.1889,
&quot;grad_norm&quot;: 3.9369843006134033,
&quot;learning_rate&quot;: 1.7972662653054766e-05,
&quot;epoch&quot;: 1.5793871866295266,
&quot;step&quot;: 284
},
{
&quot;loss&quot;: 1.782,
&quot;grad_norm&quot;: 5.153691291809082,
&quot;learning_rate&quot;: 1.7959515130996936e-05,
&quot;epoch&quot;: 1.584958217270195,
&quot;step&quot;: 285
},
{
&quot;loss&quot;: 1.7055,
&quot;grad_norm&quot;: 3.5153331756591797,
&quot;learning_rate&quot;: 1.7946367608939105e-05,
&quot;epoch&quot;: 1.5905292479108635,
&quot;step&quot;: 286
},
{
&quot;loss&quot;: 2.0713,
&quot;grad_norm&quot;: 3.8740577697753906,
&quot;learning_rate&quot;: 1.7933220086881275e-05,
&quot;epoch&quot;: 1.596100278551532,
&quot;step&quot;: 287
},
{
&quot;loss&quot;: 1.6159,
&quot;grad_norm&quot;: 2.977501153945923,
&quot;learning_rate&quot;: 1.7920072564823445e-05,
&quot;epoch&quot;: 1.6016713091922006,
&quot;step&quot;: 288
},
{
&quot;loss&quot;: 2.0388,
&quot;grad_norm&quot;: 4.873539447784424,
&quot;learning_rate&quot;: 1.7906925042765614e-05,
&quot;epoch&quot;: 1.6072423398328692,
&quot;step&quot;: 289
},
{
&quot;loss&quot;: 1.7656,
&quot;grad_norm&quot;: 3.6297993659973145,
&quot;learning_rate&quot;: 1.789377752070778e-05,
&quot;epoch&quot;: 1.6128133704735377,
&quot;step&quot;: 290
},
{
&quot;loss&quot;: 1.9818,
&quot;grad_norm&quot;: 2.868178367614746,
&quot;learning_rate&quot;: 1.788062999864995e-05,
&quot;epoch&quot;: 1.6183844011142061,
&quot;step&quot;: 291
},
{
&quot;loss&quot;: 1.6421,
&quot;grad_norm&quot;: 4.532885551452637,
&quot;learning_rate&quot;: 1.786748247659212e-05,
&quot;epoch&quot;: 1.6239554317548746,
&quot;step&quot;: 292
},
{
&quot;loss&quot;: 1.653,
&quot;grad_norm&quot;: 5.63344669342041,
&quot;learning_rate&quot;: 1.785433495453429e-05,
&quot;epoch&quot;: 1.6295264623955432,
&quot;step&quot;: 293
},
{
&quot;loss&quot;: 1.8727,
&quot;grad_norm&quot;: 4.235146999359131,
&quot;learning_rate&quot;: 1.7841187432476458e-05,
&quot;epoch&quot;: 1.6350974930362117,
&quot;step&quot;: 294
},
{
&quot;loss&quot;: 1.3509,
&quot;grad_norm&quot;: 4.512764930725098,
&quot;learning_rate&quot;: 1.7828039910418628e-05,
&quot;epoch&quot;: 1.6406685236768803,
&quot;step&quot;: 295
},
{
&quot;loss&quot;: 1.7836,
&quot;grad_norm&quot;: 3.72898268699646,
&quot;learning_rate&quot;: 1.7814892388360797e-05,
&quot;epoch&quot;: 1.6462395543175488,
&quot;step&quot;: 296
},
{
&quot;loss&quot;: 1.6315,
&quot;grad_norm&quot;: 3.1936659812927246,
&quot;learning_rate&quot;: 1.7801744866302963e-05,
&quot;epoch&quot;: 1.6518105849582172,
&quot;step&quot;: 297
},
{
&quot;loss&quot;: 1.9805,
&quot;grad_norm&quot;: 3.1188321113586426,
&quot;learning_rate&quot;: 1.7788597344245136e-05,
&quot;epoch&quot;: 1.6573816155988856,
&quot;step&quot;: 298
},
{
&quot;loss&quot;: 1.8716,
&quot;grad_norm&quot;: 4.88875150680542,
&quot;learning_rate&quot;: 1.7775449822187306e-05,
&quot;epoch&quot;: 1.6629526462395543,
&quot;step&quot;: 299
},
{
&quot;loss&quot;: 1.4669,
&quot;grad_norm&quot;: 4.494915962219238,
&quot;learning_rate&quot;: 1.7762302300129472e-05,
&quot;epoch&quot;: 1.668523676880223,
&quot;step&quot;: 300
},
{
&quot;eval_loss&quot;: 2.3116097450256348,
&quot;eval_runtime&quot;: 35.9294,
&quot;eval_samples_per_second&quot;: 39.967,
&quot;eval_steps_per_second&quot;: 2.004,
&quot;epoch&quot;: 1.668523676880223,
&quot;step&quot;: 300
},
{
&quot;loss&quot;: 1.3418,
&quot;grad_norm&quot;: 4.365106582641602,
&quot;learning_rate&quot;: 1.774915477807164e-05,
&quot;epoch&quot;: 1.6740947075208914,
&quot;step&quot;: 301
},
{
&quot;loss&quot;: 1.4561,
&quot;grad_norm&quot;: 4.683363914489746,
&quot;learning_rate&quot;: 1.773600725601381e-05,
&quot;epoch&quot;: 1.6796657381615598,
&quot;step&quot;: 302
},
{
&quot;loss&quot;: 1.8321,
&quot;grad_norm&quot;: 4.195693492889404,
&quot;learning_rate&quot;: 1.772285973395598e-05,
&quot;epoch&quot;: 1.6852367688022283,
&quot;step&quot;: 303
},
{
&quot;loss&quot;: 1.8932,
&quot;grad_norm&quot;: 4.681265830993652,
&quot;learning_rate&quot;: 1.770971221189815e-05,
&quot;epoch&quot;: 1.690807799442897,
&quot;step&quot;: 304
},
{
&quot;loss&quot;: 2.0071,
&quot;grad_norm&quot;: 5.034351348876953,
&quot;learning_rate&quot;: 1.769656468984032e-05,
&quot;epoch&quot;: 1.6963788300835656,
&quot;step&quot;: 305
},
{
&quot;loss&quot;: 1.9824,
&quot;grad_norm&quot;: 3.9581334590911865,
&quot;learning_rate&quot;: 1.768341716778249e-05,
&quot;epoch&quot;: 1.701949860724234,
&quot;step&quot;: 306
},
{
&quot;loss&quot;: 2.2225,
&quot;grad_norm&quot;: 3.9467825889587402,
&quot;learning_rate&quot;: 1.7670269645724655e-05,
&quot;epoch&quot;: 1.7075208913649025,
&quot;step&quot;: 307
},
{
&quot;loss&quot;: 1.671,
&quot;grad_norm&quot;: 3.7253997325897217,
&quot;learning_rate&quot;: 1.7657122123666825e-05,
&quot;epoch&quot;: 1.713091922005571,
&quot;step&quot;: 308
},
{
&quot;loss&quot;: 1.7876,
&quot;grad_norm&quot;: 4.8212480545043945,
&quot;learning_rate&quot;: 1.7643974601608998e-05,
&quot;epoch&quot;: 1.7186629526462396,
&quot;step&quot;: 309
},
{
&quot;loss&quot;: 1.1102,
&quot;grad_norm&quot;: 4.235992431640625,
&quot;learning_rate&quot;: 1.7630827079551164e-05,
&quot;epoch&quot;: 1.724233983286908,
&quot;step&quot;: 310
},
{
&quot;loss&quot;: 1.7577,
&quot;grad_norm&quot;: 3.5870513916015625,
&quot;learning_rate&quot;: 1.7617679557493333e-05,
&quot;epoch&quot;: 1.7298050139275767,
&quot;step&quot;: 311
},
{
&quot;loss&quot;: 1.3948,
&quot;grad_norm&quot;: 4.27365779876709,
&quot;learning_rate&quot;: 1.7604532035435503e-05,
&quot;epoch&quot;: 1.7353760445682451,
&quot;step&quot;: 312
},
{
&quot;loss&quot;: 1.5507,
&quot;grad_norm&quot;: 4.927708625793457,
&quot;learning_rate&quot;: 1.7591384513377672e-05,
&quot;epoch&quot;: 1.7409470752089136,
&quot;step&quot;: 313
},
{
&quot;loss&quot;: 1.5299,
&quot;grad_norm&quot;: 4.702437877655029,
&quot;learning_rate&quot;: 1.7578236991319842e-05,
&quot;epoch&quot;: 1.7465181058495822,
&quot;step&quot;: 314
},
{
&quot;loss&quot;: 1.6187,
&quot;grad_norm&quot;: 4.205385684967041,
&quot;learning_rate&quot;: 1.756508946926201e-05,
&quot;epoch&quot;: 1.7520891364902507,
&quot;step&quot;: 315
},
{
&quot;loss&quot;: 1.6467,
&quot;grad_norm&quot;: 3.724274158477783,
&quot;learning_rate&quot;: 1.755194194720418e-05,
&quot;epoch&quot;: 1.7576601671309193,
&quot;step&quot;: 316
},
{
&quot;loss&quot;: 1.48,
&quot;grad_norm&quot;: 5.0788187980651855,
&quot;learning_rate&quot;: 1.7538794425146347e-05,
&quot;epoch&quot;: 1.7632311977715878,
&quot;step&quot;: 317
},
{
&quot;loss&quot;: 1.2413,
&quot;grad_norm&quot;: 4.211026191711426,
&quot;learning_rate&quot;: 1.7525646903088517e-05,
&quot;epoch&quot;: 1.7688022284122562,
&quot;step&quot;: 318
},
{
&quot;loss&quot;: 1.2792,
&quot;grad_norm&quot;: 4.383068561553955,
&quot;learning_rate&quot;: 1.7512499381030686e-05,
&quot;epoch&quot;: 1.7743732590529246,
&quot;step&quot;: 319
},
{
&quot;loss&quot;: 2.0635,
&quot;grad_norm&quot;: 5.2455668449401855,
&quot;learning_rate&quot;: 1.7499351858972856e-05,
&quot;epoch&quot;: 1.7799442896935933,
&quot;step&quot;: 320
},
{
&quot;loss&quot;: 1.9011,
&quot;grad_norm&quot;: 4.73854398727417,
&quot;learning_rate&quot;: 1.7486204336915025e-05,
&quot;epoch&quot;: 1.785515320334262,
&quot;step&quot;: 321
},
{
&quot;loss&quot;: 1.9017,
&quot;grad_norm&quot;: 5.136256217956543,
&quot;learning_rate&quot;: 1.7473056814857195e-05,
&quot;epoch&quot;: 1.7910863509749304,
&quot;step&quot;: 322
},
{
&quot;loss&quot;: 1.7304,
&quot;grad_norm&quot;: 5.707761764526367,
&quot;learning_rate&quot;: 1.7459909292799364e-05,
&quot;epoch&quot;: 1.7966573816155988,
&quot;step&quot;: 323
},
{
&quot;loss&quot;: 1.9703,
&quot;grad_norm&quot;: 4.81571102142334,
&quot;learning_rate&quot;: 1.744676177074153e-05,
&quot;epoch&quot;: 1.8022284122562673,
&quot;step&quot;: 324
},
{
&quot;loss&quot;: 1.6825,
&quot;grad_norm&quot;: 6.157602310180664,
&quot;learning_rate&quot;: 1.7433614248683703e-05,
&quot;epoch&quot;: 1.807799442896936,
&quot;step&quot;: 325
},
{
&quot;loss&quot;: 1.7945,
&quot;grad_norm&quot;: 5.200462818145752,
&quot;learning_rate&quot;: 1.7420466726625873e-05,
&quot;epoch&quot;: 1.8133704735376046,
&quot;step&quot;: 326
},
{
&quot;loss&quot;: 1.7701,
&quot;grad_norm&quot;: 5.342528820037842,
&quot;learning_rate&quot;: 1.7407319204568042e-05,
&quot;epoch&quot;: 1.818941504178273,
&quot;step&quot;: 327
},
{
&quot;loss&quot;: 1.8,
&quot;grad_norm&quot;: 4.419646739959717,
&quot;learning_rate&quot;: 1.739417168251021e-05,
&quot;epoch&quot;: 1.8245125348189415,
&quot;step&quot;: 328
},
{
&quot;loss&quot;: 1.3064,
&quot;grad_norm&quot;: 5.106484889984131,
&quot;learning_rate&quot;: 1.7381024160452378e-05,
&quot;epoch&quot;: 1.83008356545961,
&quot;step&quot;: 329
},
{
&quot;loss&quot;: 2.0357,
&quot;grad_norm&quot;: 4.221576690673828,
&quot;learning_rate&quot;: 1.7367876638394547e-05,
&quot;epoch&quot;: 1.8356545961002786,
&quot;step&quot;: 330
},
{
&quot;loss&quot;: 1.6015,
&quot;grad_norm&quot;: 6.323553562164307,
&quot;learning_rate&quot;: 1.7354729116336717e-05,
&quot;epoch&quot;: 1.841225626740947,
&quot;step&quot;: 331
},
{
&quot;loss&quot;: 1.5858,
&quot;grad_norm&quot;: 4.978970527648926,
&quot;learning_rate&quot;: 1.7341581594278887e-05,
&quot;epoch&quot;: 1.8467966573816157,
&quot;step&quot;: 332
},
{
&quot;loss&quot;: 1.9489,
&quot;grad_norm&quot;: 3.1882030963897705,
&quot;learning_rate&quot;: 1.7328434072221056e-05,
&quot;epoch&quot;: 1.8523676880222841,
&quot;step&quot;: 333
},
{
&quot;loss&quot;: 2.1722,
&quot;grad_norm&quot;: 4.047868251800537,
&quot;learning_rate&quot;: 1.7315286550163222e-05,
&quot;epoch&quot;: 1.8579387186629526,
&quot;step&quot;: 334
},
{
&quot;loss&quot;: 1.5027,
&quot;grad_norm&quot;: 4.2307448387146,
&quot;learning_rate&quot;: 1.730213902810539e-05,
&quot;epoch&quot;: 1.863509749303621,
&quot;step&quot;: 335
},
{
&quot;loss&quot;: 1.481,
&quot;grad_norm&quot;: 6.048774242401123,
&quot;learning_rate&quot;: 1.7288991506047565e-05,
&quot;epoch&quot;: 1.8690807799442897,
&quot;step&quot;: 336
},
{
&quot;loss&quot;: 1.8746,
&quot;grad_norm&quot;: 5.389241695404053,
&quot;learning_rate&quot;: 1.7275843983989734e-05,
&quot;epoch&quot;: 1.8746518105849583,
&quot;step&quot;: 337
},
{
&quot;loss&quot;: 2.0807,
&quot;grad_norm&quot;: 4.036198139190674,
&quot;learning_rate&quot;: 1.72626964619319e-05,
&quot;epoch&quot;: 1.8802228412256268,
&quot;step&quot;: 338
},
{
&quot;loss&quot;: 1.7448,
&quot;grad_norm&quot;: 5.005743503570557,
&quot;learning_rate&quot;: 1.724954893987407e-05,
&quot;epoch&quot;: 1.8857938718662952,
&quot;step&quot;: 339
},
{
&quot;loss&quot;: 1.9092,
&quot;grad_norm&quot;: 4.462837219238281,
&quot;learning_rate&quot;: 1.723640141781624e-05,
&quot;epoch&quot;: 1.8913649025069637,
&quot;step&quot;: 340
},
{
&quot;loss&quot;: 1.7032,
&quot;grad_norm&quot;: 4.945067405700684,
&quot;learning_rate&quot;: 1.722325389575841e-05,
&quot;epoch&quot;: 1.8969359331476323,
&quot;step&quot;: 341
},
{
&quot;loss&quot;: 1.9141,
&quot;grad_norm&quot;: 3.7232062816619873,
&quot;learning_rate&quot;: 1.721010637370058e-05,
&quot;epoch&quot;: 1.902506963788301,
&quot;step&quot;: 342
},
{
&quot;loss&quot;: 1.8258,
&quot;grad_norm&quot;: 3.8830628395080566,
&quot;learning_rate&quot;: 1.7196958851642748e-05,
&quot;epoch&quot;: 1.9080779944289694,
&quot;step&quot;: 343
},
{
&quot;loss&quot;: 1.7998,
&quot;grad_norm&quot;: 4.693456649780273,
&quot;learning_rate&quot;: 1.7183811329584917e-05,
&quot;epoch&quot;: 1.9136490250696379,
&quot;step&quot;: 344
},
{
&quot;loss&quot;: 2.0583,
&quot;grad_norm&quot;: 4.737421989440918,
&quot;learning_rate&quot;: 1.7170663807527083e-05,
&quot;epoch&quot;: 1.9192200557103063,
&quot;step&quot;: 345
},
{
&quot;loss&quot;: 1.494,
&quot;grad_norm&quot;: 2.78582501411438,
&quot;learning_rate&quot;: 1.7157516285469253e-05,
&quot;epoch&quot;: 1.924791086350975,
&quot;step&quot;: 346
},
{
&quot;loss&quot;: 1.7167,
&quot;grad_norm&quot;: 4.305075168609619,
&quot;learning_rate&quot;: 1.7144368763411423e-05,
&quot;epoch&quot;: 1.9303621169916436,
&quot;step&quot;: 347
},
{
&quot;loss&quot;: 1.7753,
&quot;grad_norm&quot;: 3.9957072734832764,
&quot;learning_rate&quot;: 1.7131221241353592e-05,
&quot;epoch&quot;: 1.935933147632312,
&quot;step&quot;: 348
},
{
&quot;loss&quot;: 1.8852,
&quot;grad_norm&quot;: 4.9537434577941895,
&quot;learning_rate&quot;: 1.711807371929576e-05,
&quot;epoch&quot;: 1.9415041782729805,
&quot;step&quot;: 349
},
{
&quot;loss&quot;: 1.8729,
&quot;grad_norm&quot;: 3.9404208660125732,
&quot;learning_rate&quot;: 1.710492619723793e-05,
&quot;epoch&quot;: 1.947075208913649,
&quot;step&quot;: 350
},
{
&quot;eval_loss&quot;: 2.3213632106781006,
&quot;eval_runtime&quot;: 35.9387,
&quot;eval_samples_per_second&quot;: 39.957,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 1.947075208913649,
&quot;step&quot;: 350
},
{
&quot;loss&quot;: 2.1419,
&quot;grad_norm&quot;: 3.202141046524048,
&quot;learning_rate&quot;: 1.70917786751801e-05,
&quot;epoch&quot;: 1.9526462395543176,
&quot;step&quot;: 351
},
{
&quot;loss&quot;: 1.83,
&quot;grad_norm&quot;: 4.432948112487793,
&quot;learning_rate&quot;: 1.707863115312227e-05,
&quot;epoch&quot;: 1.958217270194986,
&quot;step&quot;: 352
},
{
&quot;loss&quot;: 2.3556,
&quot;grad_norm&quot;: 5.213648796081543,
&quot;learning_rate&quot;: 1.706548363106444e-05,
&quot;epoch&quot;: 1.9637883008356547,
&quot;step&quot;: 353
},
{
&quot;loss&quot;: 2.1396,
&quot;grad_norm&quot;: 4.155479431152344,
&quot;learning_rate&quot;: 1.705233610900661e-05,
&quot;epoch&quot;: 1.9693593314763231,
&quot;step&quot;: 354
},
{
&quot;loss&quot;: 1.4222,
&quot;grad_norm&quot;: 5.146358013153076,
&quot;learning_rate&quot;: 1.7039188586948775e-05,
&quot;epoch&quot;: 1.9749303621169916,
&quot;step&quot;: 355
},
{
&quot;loss&quot;: 1.9362,
&quot;grad_norm&quot;: 3.264761447906494,
&quot;learning_rate&quot;: 1.7026041064890945e-05,
&quot;epoch&quot;: 1.98050139275766,
&quot;step&quot;: 356
},
{
&quot;loss&quot;: 1.9471,
&quot;grad_norm&quot;: 3.308243989944458,
&quot;learning_rate&quot;: 1.7012893542833114e-05,
&quot;epoch&quot;: 1.9860724233983287,
&quot;step&quot;: 357
},
{
&quot;loss&quot;: 2.0193,
&quot;grad_norm&quot;: 4.1630859375,
&quot;learning_rate&quot;: 1.6999746020775284e-05,
&quot;epoch&quot;: 1.9916434540389973,
&quot;step&quot;: 358
},
{
&quot;loss&quot;: 2.1048,
&quot;grad_norm&quot;: 4.196152210235596,
&quot;learning_rate&quot;: 1.6986598498717453e-05,
&quot;epoch&quot;: 1.9972144846796658,
&quot;step&quot;: 359
},
{
&quot;loss&quot;: 2.0755,
&quot;grad_norm&quot;: 4.194087028503418,
&quot;learning_rate&quot;: 1.6973450976659623e-05,
&quot;epoch&quot;: 2.0,
&quot;step&quot;: 360
},
{
&quot;loss&quot;: 1.4388,
&quot;grad_norm&quot;: 4.208454132080078,
&quot;learning_rate&quot;: 1.6960303454601792e-05,
&quot;epoch&quot;: 2.0055710306406684,
&quot;step&quot;: 361
},
{
&quot;loss&quot;: 1.7819,
&quot;grad_norm&quot;: 3.549447774887085,
&quot;learning_rate&quot;: 1.694715593254396e-05,
&quot;epoch&quot;: 2.011142061281337,
&quot;step&quot;: 362
},
{
&quot;loss&quot;: 1.5135,
&quot;grad_norm&quot;: 3.6767420768737793,
&quot;learning_rate&quot;: 1.693400841048613e-05,
&quot;epoch&quot;: 2.0167130919220058,
&quot;step&quot;: 363
},
{
&quot;loss&quot;: 1.7713,
&quot;grad_norm&quot;: 3.816209554672241,
&quot;learning_rate&quot;: 1.69208608884283e-05,
&quot;epoch&quot;: 2.022284122562674,
&quot;step&quot;: 364
},
{
&quot;loss&quot;: 1.6624,
&quot;grad_norm&quot;: 3.2220561504364014,
&quot;learning_rate&quot;: 1.6907713366370467e-05,
&quot;epoch&quot;: 2.0278551532033426,
&quot;step&quot;: 365
},
{
&quot;loss&quot;: 1.9059,
&quot;grad_norm&quot;: 3.4210987091064453,
&quot;learning_rate&quot;: 1.6894565844312637e-05,
&quot;epoch&quot;: 2.033426183844011,
&quot;step&quot;: 366
},
{
&quot;loss&quot;: 1.155,
&quot;grad_norm&quot;: 4.348776817321777,
&quot;learning_rate&quot;: 1.6881418322254806e-05,
&quot;epoch&quot;: 2.0389972144846795,
&quot;step&quot;: 367
},
{
&quot;loss&quot;: 1.4513,
&quot;grad_norm&quot;: 4.143118858337402,
&quot;learning_rate&quot;: 1.6868270800196976e-05,
&quot;epoch&quot;: 2.0445682451253484,
&quot;step&quot;: 368
},
{
&quot;loss&quot;: 1.8148,
&quot;grad_norm&quot;: 4.118925094604492,
&quot;learning_rate&quot;: 1.6855123278139145e-05,
&quot;epoch&quot;: 2.050139275766017,
&quot;step&quot;: 369
},
{
&quot;loss&quot;: 1.6325,
&quot;grad_norm&quot;: 4.060324668884277,
&quot;learning_rate&quot;: 1.6841975756081315e-05,
&quot;epoch&quot;: 2.0557103064066853,
&quot;step&quot;: 370
},
{
&quot;loss&quot;: 1.694,
&quot;grad_norm&quot;: 4.604481220245361,
&quot;learning_rate&quot;: 1.6828828234023484e-05,
&quot;epoch&quot;: 2.0612813370473537,
&quot;step&quot;: 371
},
{
&quot;loss&quot;: 1.4905,
&quot;grad_norm&quot;: 5.273688316345215,
&quot;learning_rate&quot;: 1.681568071196565e-05,
&quot;epoch&quot;: 2.066852367688022,
&quot;step&quot;: 372
},
{
&quot;loss&quot;: 1.1557,
&quot;grad_norm&quot;: 6.0254387855529785,
&quot;learning_rate&quot;: 1.680253318990782e-05,
&quot;epoch&quot;: 2.0724233983286906,
&quot;step&quot;: 373
},
{
&quot;loss&quot;: 0.999,
&quot;grad_norm&quot;: 5.017882823944092,
&quot;learning_rate&quot;: 1.678938566784999e-05,
&quot;epoch&quot;: 2.0779944289693595,
&quot;step&quot;: 374
},
{
&quot;loss&quot;: 1.4159,
&quot;grad_norm&quot;: 6.874935626983643,
&quot;learning_rate&quot;: 1.6776238145792162e-05,
&quot;epoch&quot;: 2.083565459610028,
&quot;step&quot;: 375
},
{
&quot;loss&quot;: 0.9789,
&quot;grad_norm&quot;: 6.245709419250488,
&quot;learning_rate&quot;: 1.676309062373433e-05,
&quot;epoch&quot;: 2.0891364902506964,
&quot;step&quot;: 376
},
{
&quot;loss&quot;: 1.3929,
&quot;grad_norm&quot;: 6.976832866668701,
&quot;learning_rate&quot;: 1.6749943101676498e-05,
&quot;epoch&quot;: 2.094707520891365,
&quot;step&quot;: 377
},
{
&quot;loss&quot;: 1.5721,
&quot;grad_norm&quot;: 7.426636695861816,
&quot;learning_rate&quot;: 1.6736795579618668e-05,
&quot;epoch&quot;: 2.1002785515320332,
&quot;step&quot;: 378
},
{
&quot;loss&quot;: 1.4603,
&quot;grad_norm&quot;: 8.876333236694336,
&quot;learning_rate&quot;: 1.6723648057560837e-05,
&quot;epoch&quot;: 2.105849582172702,
&quot;step&quot;: 379
},
{
&quot;loss&quot;: 1.2115,
&quot;grad_norm&quot;: 5.889682769775391,
&quot;learning_rate&quot;: 1.6710500535503007e-05,
&quot;epoch&quot;: 2.1114206128133706,
&quot;step&quot;: 380
},
{
&quot;loss&quot;: 1.1689,
&quot;grad_norm&quot;: 6.435322284698486,
&quot;learning_rate&quot;: 1.6697353013445176e-05,
&quot;epoch&quot;: 2.116991643454039,
&quot;step&quot;: 381
},
{
&quot;loss&quot;: 1.1904,
&quot;grad_norm&quot;: 6.061446666717529,
&quot;learning_rate&quot;: 1.6684205491387342e-05,
&quot;epoch&quot;: 2.1225626740947074,
&quot;step&quot;: 382
},
{
&quot;loss&quot;: 1.3799,
&quot;grad_norm&quot;: 7.56770658493042,
&quot;learning_rate&quot;: 1.6671057969329512e-05,
&quot;epoch&quot;: 2.128133704735376,
&quot;step&quot;: 383
},
{
&quot;loss&quot;: 1.5787,
&quot;grad_norm&quot;: 8.942233085632324,
&quot;learning_rate&quot;: 1.665791044727168e-05,
&quot;epoch&quot;: 2.1337047353760448,
&quot;step&quot;: 384
},
{
&quot;loss&quot;: 1.4084,
&quot;grad_norm&quot;: 7.448763847351074,
&quot;learning_rate&quot;: 1.664476292521385e-05,
&quot;epoch&quot;: 2.139275766016713,
&quot;step&quot;: 385
},
{
&quot;loss&quot;: 1.3685,
&quot;grad_norm&quot;: 5.792154312133789,
&quot;learning_rate&quot;: 1.663161540315602e-05,
&quot;epoch&quot;: 2.1448467966573816,
&quot;step&quot;: 386
},
{
&quot;loss&quot;: 1.5465,
&quot;grad_norm&quot;: 7.226157188415527,
&quot;learning_rate&quot;: 1.661846788109819e-05,
&quot;epoch&quot;: 2.15041782729805,
&quot;step&quot;: 387
},
{
&quot;loss&quot;: 1.1914,
&quot;grad_norm&quot;: 5.6042022705078125,
&quot;learning_rate&quot;: 1.660532035904036e-05,
&quot;epoch&quot;: 2.1559888579387185,
&quot;step&quot;: 388
},
{
&quot;loss&quot;: 1.6443,
&quot;grad_norm&quot;: 5.619427680969238,
&quot;learning_rate&quot;: 1.6592172836982525e-05,
&quot;epoch&quot;: 2.1615598885793874,
&quot;step&quot;: 389
},
{
&quot;loss&quot;: 1.5371,
&quot;grad_norm&quot;: 4.770148754119873,
&quot;learning_rate&quot;: 1.65790253149247e-05,
&quot;epoch&quot;: 2.167130919220056,
&quot;step&quot;: 390
},
{
&quot;loss&quot;: 1.5124,
&quot;grad_norm&quot;: 7.61703634262085,
&quot;learning_rate&quot;: 1.6565877792866868e-05,
&quot;epoch&quot;: 2.1727019498607243,
&quot;step&quot;: 391
},
{
&quot;loss&quot;: 1.6248,
&quot;grad_norm&quot;: 4.498234272003174,
&quot;learning_rate&quot;: 1.6552730270809037e-05,
&quot;epoch&quot;: 2.1782729805013927,
&quot;step&quot;: 392
},
{
&quot;loss&quot;: 1.4621,
&quot;grad_norm&quot;: 4.0563063621521,
&quot;learning_rate&quot;: 1.6539582748751204e-05,
&quot;epoch&quot;: 2.183844011142061,
&quot;step&quot;: 393
},
{
&quot;loss&quot;: 1.4315,
&quot;grad_norm&quot;: 6.069952964782715,
&quot;learning_rate&quot;: 1.6526435226693373e-05,
&quot;epoch&quot;: 2.1894150417827296,
&quot;step&quot;: 394
},
{
&quot;loss&quot;: 1.4308,
&quot;grad_norm&quot;: 6.728673458099365,
&quot;learning_rate&quot;: 1.6513287704635543e-05,
&quot;epoch&quot;: 2.1949860724233985,
&quot;step&quot;: 395
},
{
&quot;loss&quot;: 1.2975,
&quot;grad_norm&quot;: 14.551620483398438,
&quot;learning_rate&quot;: 1.6500140182577712e-05,
&quot;epoch&quot;: 2.200557103064067,
&quot;step&quot;: 396
},
{
&quot;loss&quot;: 1.4624,
&quot;grad_norm&quot;: 6.782831192016602,
&quot;learning_rate&quot;: 1.648699266051988e-05,
&quot;epoch&quot;: 2.2061281337047354,
&quot;step&quot;: 397
},
{
&quot;loss&quot;: 1.5891,
&quot;grad_norm&quot;: 6.513261795043945,
&quot;learning_rate&quot;: 1.647384513846205e-05,
&quot;epoch&quot;: 2.211699164345404,
&quot;step&quot;: 398
},
{
&quot;loss&quot;: 1.3152,
&quot;grad_norm&quot;: 6.3476433753967285,
&quot;learning_rate&quot;: 1.646069761640422e-05,
&quot;epoch&quot;: 2.2172701949860723,
&quot;step&quot;: 399
},
{
&quot;loss&quot;: 1.3129,
&quot;grad_norm&quot;: 4.936390399932861,
&quot;learning_rate&quot;: 1.6447550094346387e-05,
&quot;epoch&quot;: 2.222841225626741,
&quot;step&quot;: 400
},
{
&quot;eval_loss&quot;: 2.531832218170166,
&quot;eval_runtime&quot;: 35.95,
&quot;eval_samples_per_second&quot;: 39.944,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 2.222841225626741,
&quot;step&quot;: 400
},
{
&quot;loss&quot;: 1.2283,
&quot;grad_norm&quot;: 8.302631378173828,
&quot;learning_rate&quot;: 1.6434402572288556e-05,
&quot;epoch&quot;: 2.2284122562674096,
&quot;step&quot;: 401
},
{
&quot;loss&quot;: 1.1884,
&quot;grad_norm&quot;: 5.8890886306762695,
&quot;learning_rate&quot;: 1.642125505023073e-05,
&quot;epoch&quot;: 2.233983286908078,
&quot;step&quot;: 402
},
{
&quot;loss&quot;: 1.3971,
&quot;grad_norm&quot;: 6.417287349700928,
&quot;learning_rate&quot;: 1.6408107528172895e-05,
&quot;epoch&quot;: 2.2395543175487465,
&quot;step&quot;: 403
},
{
&quot;loss&quot;: 1.5501,
&quot;grad_norm&quot;: 6.351545810699463,
&quot;learning_rate&quot;: 1.6394960006115065e-05,
&quot;epoch&quot;: 2.245125348189415,
&quot;step&quot;: 404
},
{
&quot;loss&quot;: 1.1685,
&quot;grad_norm&quot;: 5.121798992156982,
&quot;learning_rate&quot;: 1.6381812484057234e-05,
&quot;epoch&quot;: 2.2506963788300833,
&quot;step&quot;: 405
},
{
&quot;loss&quot;: 1.3617,
&quot;grad_norm&quot;: 5.293002128601074,
&quot;learning_rate&quot;: 1.6368664961999404e-05,
&quot;epoch&quot;: 2.256267409470752,
&quot;step&quot;: 406
},
{
&quot;loss&quot;: 1.3164,
&quot;grad_norm&quot;: 6.6434431076049805,
&quot;learning_rate&quot;: 1.6355517439941573e-05,
&quot;epoch&quot;: 2.2618384401114207,
&quot;step&quot;: 407
},
{
&quot;loss&quot;: 1.4339,
&quot;grad_norm&quot;: 6.383541584014893,
&quot;learning_rate&quot;: 1.6342369917883743e-05,
&quot;epoch&quot;: 2.267409470752089,
&quot;step&quot;: 408
},
{
&quot;loss&quot;: 1.3699,
&quot;grad_norm&quot;: 5.989224433898926,
&quot;learning_rate&quot;: 1.6329222395825913e-05,
&quot;epoch&quot;: 2.2729805013927575,
&quot;step&quot;: 409
},
{
&quot;loss&quot;: 1.4938,
&quot;grad_norm&quot;: 6.49315881729126,
&quot;learning_rate&quot;: 1.631607487376808e-05,
&quot;epoch&quot;: 2.2785515320334264,
&quot;step&quot;: 410
},
{
&quot;loss&quot;: 1.0902,
&quot;grad_norm&quot;: 4.942923069000244,
&quot;learning_rate&quot;: 1.6302927351710248e-05,
&quot;epoch&quot;: 2.284122562674095,
&quot;step&quot;: 411
},
{
&quot;loss&quot;: 1.0282,
&quot;grad_norm&quot;: 5.219899654388428,
&quot;learning_rate&quot;: 1.6289779829652418e-05,
&quot;epoch&quot;: 2.2896935933147633,
&quot;step&quot;: 412
},
{
&quot;loss&quot;: 1.3465,
&quot;grad_norm&quot;: 5.91557502746582,
&quot;learning_rate&quot;: 1.6276632307594587e-05,
&quot;epoch&quot;: 2.2952646239554317,
&quot;step&quot;: 413
},
{
&quot;loss&quot;: 1.4312,
&quot;grad_norm&quot;: 7.332894325256348,
&quot;learning_rate&quot;: 1.6263484785536757e-05,
&quot;epoch&quot;: 2.3008356545961,
&quot;step&quot;: 414
},
{
&quot;loss&quot;: 1.1921,
&quot;grad_norm&quot;: 6.784351825714111,
&quot;learning_rate&quot;: 1.6250337263478926e-05,
&quot;epoch&quot;: 2.3064066852367686,
&quot;step&quot;: 415
},
{
&quot;loss&quot;: 1.3644,
&quot;grad_norm&quot;: 6.222668647766113,
&quot;learning_rate&quot;: 1.6237189741421096e-05,
&quot;epoch&quot;: 2.3119777158774375,
&quot;step&quot;: 416
},
{
&quot;loss&quot;: 1.3318,
&quot;grad_norm&quot;: 6.7379841804504395,
&quot;learning_rate&quot;: 1.6224042219363265e-05,
&quot;epoch&quot;: 2.317548746518106,
&quot;step&quot;: 417
},
{
&quot;loss&quot;: 1.3955,
&quot;grad_norm&quot;: 7.218482494354248,
&quot;learning_rate&quot;: 1.6210894697305435e-05,
&quot;epoch&quot;: 2.3231197771587744,
&quot;step&quot;: 418
},
{
&quot;loss&quot;: 1.5949,
&quot;grad_norm&quot;: 6.676080226898193,
&quot;learning_rate&quot;: 1.6197747175247604e-05,
&quot;epoch&quot;: 2.328690807799443,
&quot;step&quot;: 419
},
{
&quot;loss&quot;: 1.2428,
&quot;grad_norm&quot;: 6.974861145019531,
&quot;learning_rate&quot;: 1.618459965318977e-05,
&quot;epoch&quot;: 2.3342618384401113,
&quot;step&quot;: 420
},
{
&quot;loss&quot;: 1.2438,
&quot;grad_norm&quot;: 7.018064975738525,
&quot;learning_rate&quot;: 1.617145213113194e-05,
&quot;epoch&quot;: 2.33983286908078,
&quot;step&quot;: 421
},
{
&quot;loss&quot;: 1.4979,
&quot;grad_norm&quot;: 6.781156063079834,
&quot;learning_rate&quot;: 1.615830460907411e-05,
&quot;epoch&quot;: 2.3454038997214486,
&quot;step&quot;: 422
},
{
&quot;loss&quot;: 1.4914,
&quot;grad_norm&quot;: 6.291943550109863,
&quot;learning_rate&quot;: 1.614515708701628e-05,
&quot;epoch&quot;: 2.350974930362117,
&quot;step&quot;: 423
},
{
&quot;loss&quot;: 1.1937,
&quot;grad_norm&quot;: 6.769220352172852,
&quot;learning_rate&quot;: 1.613200956495845e-05,
&quot;epoch&quot;: 2.3565459610027855,
&quot;step&quot;: 424
},
{
&quot;loss&quot;: 1.4428,
&quot;grad_norm&quot;: 7.461434841156006,
&quot;learning_rate&quot;: 1.6118862042900618e-05,
&quot;epoch&quot;: 2.362116991643454,
&quot;step&quot;: 425
},
{
&quot;loss&quot;: 1.0756,
&quot;grad_norm&quot;: 5.971315860748291,
&quot;learning_rate&quot;: 1.6105714520842788e-05,
&quot;epoch&quot;: 2.3676880222841223,
&quot;step&quot;: 426
},
{
&quot;loss&quot;: 1.1709,
&quot;grad_norm&quot;: 6.632075786590576,
&quot;learning_rate&quot;: 1.6092566998784954e-05,
&quot;epoch&quot;: 2.3732590529247912,
&quot;step&quot;: 427
},
{
&quot;loss&quot;: 1.2953,
&quot;grad_norm&quot;: 6.03197717666626,
&quot;learning_rate&quot;: 1.6079419476727123e-05,
&quot;epoch&quot;: 2.3788300835654597,
&quot;step&quot;: 428
},
{
&quot;loss&quot;: 1.1653,
&quot;grad_norm&quot;: 7.393289089202881,
&quot;learning_rate&quot;: 1.6066271954669296e-05,
&quot;epoch&quot;: 2.384401114206128,
&quot;step&quot;: 429
},
{
&quot;loss&quot;: 1.542,
&quot;grad_norm&quot;: 9.518671989440918,
&quot;learning_rate&quot;: 1.6053124432611462e-05,
&quot;epoch&quot;: 2.3899721448467965,
&quot;step&quot;: 430
},
{
&quot;loss&quot;: 1.0957,
&quot;grad_norm&quot;: 7.086347579956055,
&quot;learning_rate&quot;: 1.6039976910553632e-05,
&quot;epoch&quot;: 2.3955431754874654,
&quot;step&quot;: 431
},
{
&quot;loss&quot;: 1.1408,
&quot;grad_norm&quot;: 5.21544885635376,
&quot;learning_rate&quot;: 1.60268293884958e-05,
&quot;epoch&quot;: 2.401114206128134,
&quot;step&quot;: 432
},
{
&quot;loss&quot;: 1.1708,
&quot;grad_norm&quot;: 7.537359237670898,
&quot;learning_rate&quot;: 1.601368186643797e-05,
&quot;epoch&quot;: 2.4066852367688023,
&quot;step&quot;: 433
},
{
&quot;loss&quot;: 1.101,
&quot;grad_norm&quot;: 4.926475524902344,
&quot;learning_rate&quot;: 1.600053434438014e-05,
&quot;epoch&quot;: 2.4122562674094707,
&quot;step&quot;: 434
},
{
&quot;loss&quot;: 1.3898,
&quot;grad_norm&quot;: 5.6016740798950195,
&quot;learning_rate&quot;: 1.598738682232231e-05,
&quot;epoch&quot;: 2.417827298050139,
&quot;step&quot;: 435
},
{
&quot;loss&quot;: 1.4717,
&quot;grad_norm&quot;: 7.16878604888916,
&quot;learning_rate&quot;: 1.597423930026448e-05,
&quot;epoch&quot;: 2.4233983286908076,
&quot;step&quot;: 436
},
{
&quot;loss&quot;: 1.6173,
&quot;grad_norm&quot;: 6.310802459716797,
&quot;learning_rate&quot;: 1.5961091778206646e-05,
&quot;epoch&quot;: 2.4289693593314765,
&quot;step&quot;: 437
},
{
&quot;loss&quot;: 1.6172,
&quot;grad_norm&quot;: 8.035069465637207,
&quot;learning_rate&quot;: 1.5947944256148815e-05,
&quot;epoch&quot;: 2.434540389972145,
&quot;step&quot;: 438
},
{
&quot;loss&quot;: 1.4479,
&quot;grad_norm&quot;: 7.806406497955322,
&quot;learning_rate&quot;: 1.5934796734090985e-05,
&quot;epoch&quot;: 2.4401114206128134,
&quot;step&quot;: 439
},
{
&quot;loss&quot;: 1.3459,
&quot;grad_norm&quot;: 5.882315635681152,
&quot;learning_rate&quot;: 1.5921649212033154e-05,
&quot;epoch&quot;: 2.445682451253482,
&quot;step&quot;: 440
},
{
&quot;loss&quot;: 1.2195,
&quot;grad_norm&quot;: 5.817505359649658,
&quot;learning_rate&quot;: 1.5908501689975324e-05,
&quot;epoch&quot;: 2.4512534818941503,
&quot;step&quot;: 441
},
{
&quot;loss&quot;: 1.3043,
&quot;grad_norm&quot;: 7.497400283813477,
&quot;learning_rate&quot;: 1.5895354167917493e-05,
&quot;epoch&quot;: 2.456824512534819,
&quot;step&quot;: 442
},
{
&quot;loss&quot;: 1.42,
&quot;grad_norm&quot;: 5.955392837524414,
&quot;learning_rate&quot;: 1.5882206645859663e-05,
&quot;epoch&quot;: 2.4623955431754876,
&quot;step&quot;: 443
},
{
&quot;loss&quot;: 1.4764,
&quot;grad_norm&quot;: 8.848158836364746,
&quot;learning_rate&quot;: 1.5869059123801832e-05,
&quot;epoch&quot;: 2.467966573816156,
&quot;step&quot;: 444
},
{
&quot;loss&quot;: 1.4508,
&quot;grad_norm&quot;: 6.384143829345703,
&quot;learning_rate&quot;: 1.5855911601744002e-05,
&quot;epoch&quot;: 2.4735376044568245,
&quot;step&quot;: 445
},
{
&quot;loss&quot;: 1.3499,
&quot;grad_norm&quot;: 7.251498699188232,
&quot;learning_rate&quot;: 1.584276407968617e-05,
&quot;epoch&quot;: 2.479108635097493,
&quot;step&quot;: 446
},
{
&quot;loss&quot;: 1.297,
&quot;grad_norm&quot;: 8.700945854187012,
&quot;learning_rate&quot;: 1.5829616557628337e-05,
&quot;epoch&quot;: 2.4846796657381613,
&quot;step&quot;: 447
},
{
&quot;loss&quot;: 1.1607,
&quot;grad_norm&quot;: 8.17098617553711,
&quot;learning_rate&quot;: 1.5816469035570507e-05,
&quot;epoch&quot;: 2.4902506963788302,
&quot;step&quot;: 448
},
{
&quot;loss&quot;: 1.5328,
&quot;grad_norm&quot;: 6.918285846710205,
&quot;learning_rate&quot;: 1.5803321513512676e-05,
&quot;epoch&quot;: 2.4958217270194987,
&quot;step&quot;: 449
},
{
&quot;loss&quot;: 1.6258,
&quot;grad_norm&quot;: 6.7390851974487305,
&quot;learning_rate&quot;: 1.5790173991454846e-05,
&quot;epoch&quot;: 2.501392757660167,
&quot;step&quot;: 450
},
{
&quot;eval_loss&quot;: 2.571645498275757,
&quot;eval_runtime&quot;: 35.9556,
&quot;eval_samples_per_second&quot;: 39.938,
&quot;eval_steps_per_second&quot;: 2.002,
&quot;epoch&quot;: 2.501392757660167,
&quot;step&quot;: 450
},
{
&quot;loss&quot;: 1.5923,
&quot;grad_norm&quot;: 6.522182941436768,
&quot;learning_rate&quot;: 1.5777026469397015e-05,
&quot;epoch&quot;: 2.5069637883008355,
&quot;step&quot;: 451
},
{
&quot;loss&quot;: 1.2816,
&quot;grad_norm&quot;: 5.984560489654541,
&quot;learning_rate&quot;: 1.5763878947339185e-05,
&quot;epoch&quot;: 2.5125348189415044,
&quot;step&quot;: 452
},
{
&quot;loss&quot;: 1.2029,
&quot;grad_norm&quot;: 8.060498237609863,
&quot;learning_rate&quot;: 1.5750731425281354e-05,
&quot;epoch&quot;: 2.518105849582173,
&quot;step&quot;: 453
},
{
&quot;loss&quot;: 1.2117,
&quot;grad_norm&quot;: 6.93899393081665,
&quot;learning_rate&quot;: 1.573758390322352e-05,
&quot;epoch&quot;: 2.5236768802228413,
&quot;step&quot;: 454
},
{
&quot;loss&quot;: 1.4347,
&quot;grad_norm&quot;: 6.21560525894165,
&quot;learning_rate&quot;: 1.572443638116569e-05,
&quot;epoch&quot;: 2.5292479108635098,
&quot;step&quot;: 455
},
{
&quot;loss&quot;: 1.3394,
&quot;grad_norm&quot;: 7.837366580963135,
&quot;learning_rate&quot;: 1.5711288859107863e-05,
&quot;epoch&quot;: 2.534818941504178,
&quot;step&quot;: 456
},
{
&quot;loss&quot;: 1.4262,
&quot;grad_norm&quot;: 7.609643936157227,
&quot;learning_rate&quot;: 1.5698141337050033e-05,
&quot;epoch&quot;: 2.5403899721448466,
&quot;step&quot;: 457
},
{
&quot;loss&quot;: 1.3738,
&quot;grad_norm&quot;: 6.487556457519531,
&quot;learning_rate&quot;: 1.56849938149922e-05,
&quot;epoch&quot;: 2.545961002785515,
&quot;step&quot;: 458
},
{
&quot;loss&quot;: 1.4021,
&quot;grad_norm&quot;: 6.344869136810303,
&quot;learning_rate&quot;: 1.5671846292934368e-05,
&quot;epoch&quot;: 2.551532033426184,
&quot;step&quot;: 459
},
{
&quot;loss&quot;: 1.3887,
&quot;grad_norm&quot;: 6.960203170776367,
&quot;learning_rate&quot;: 1.5658698770876538e-05,
&quot;epoch&quot;: 2.5571030640668524,
&quot;step&quot;: 460
},
{
&quot;loss&quot;: 1.2997,
&quot;grad_norm&quot;: 11.57795524597168,
&quot;learning_rate&quot;: 1.5645551248818707e-05,
&quot;epoch&quot;: 2.562674094707521,
&quot;step&quot;: 461
},
{
&quot;loss&quot;: 1.5967,
&quot;grad_norm&quot;: 6.889705181121826,
&quot;learning_rate&quot;: 1.5632403726760877e-05,
&quot;epoch&quot;: 2.5682451253481893,
&quot;step&quot;: 462
},
{
&quot;loss&quot;: 1.2643,
&quot;grad_norm&quot;: 8.502350807189941,
&quot;learning_rate&quot;: 1.5619256204703046e-05,
&quot;epoch&quot;: 2.573816155988858,
&quot;step&quot;: 463
},
{
&quot;loss&quot;: 1.3686,
&quot;grad_norm&quot;: 8.704366683959961,
&quot;learning_rate&quot;: 1.5606108682645216e-05,
&quot;epoch&quot;: 2.5793871866295266,
&quot;step&quot;: 464
},
{
&quot;loss&quot;: 0.9961,
&quot;grad_norm&quot;: 8.154948234558105,
&quot;learning_rate&quot;: 1.5592961160587382e-05,
&quot;epoch&quot;: 2.584958217270195,
&quot;step&quot;: 465
},
{
&quot;loss&quot;: 1.0603,
&quot;grad_norm&quot;: 5.729700088500977,
&quot;learning_rate&quot;: 1.557981363852955e-05,
&quot;epoch&quot;: 2.5905292479108635,
&quot;step&quot;: 466
},
{
&quot;loss&quot;: 1.6641,
&quot;grad_norm&quot;: 7.716269493103027,
&quot;learning_rate&quot;: 1.556666611647172e-05,
&quot;epoch&quot;: 2.596100278551532,
&quot;step&quot;: 467
},
{
&quot;loss&quot;: 1.2886,
&quot;grad_norm&quot;: 11.220166206359863,
&quot;learning_rate&quot;: 1.555351859441389e-05,
&quot;epoch&quot;: 2.6016713091922004,
&quot;step&quot;: 468
},
{
&quot;loss&quot;: 1.2922,
&quot;grad_norm&quot;: 7.163726329803467,
&quot;learning_rate&quot;: 1.554037107235606e-05,
&quot;epoch&quot;: 2.6072423398328692,
&quot;step&quot;: 469
},
{
&quot;loss&quot;: 1.1046,
&quot;grad_norm&quot;: 7.28581428527832,
&quot;learning_rate&quot;: 1.552722355029823e-05,
&quot;epoch&quot;: 2.6128133704735377,
&quot;step&quot;: 470
},
{
&quot;loss&quot;: 1.6142,
&quot;grad_norm&quot;: 9.65365219116211,
&quot;learning_rate&quot;: 1.5514076028240396e-05,
&quot;epoch&quot;: 2.618384401114206,
&quot;step&quot;: 471
},
{
&quot;loss&quot;: 1.5575,
&quot;grad_norm&quot;: 6.458492279052734,
&quot;learning_rate&quot;: 1.550092850618257e-05,
&quot;epoch&quot;: 2.6239554317548746,
&quot;step&quot;: 472
},
{
&quot;loss&quot;: 1.3655,
&quot;grad_norm&quot;: 7.325246810913086,
&quot;learning_rate&quot;: 1.5487780984124738e-05,
&quot;epoch&quot;: 2.6295264623955434,
&quot;step&quot;: 473
},
{
&quot;loss&quot;: 1.3344,
&quot;grad_norm&quot;: 7.81355619430542,
&quot;learning_rate&quot;: 1.5474633462066908e-05,
&quot;epoch&quot;: 2.635097493036212,
&quot;step&quot;: 474
},
{
&quot;loss&quot;: 1.2505,
&quot;grad_norm&quot;: 7.347303867340088,
&quot;learning_rate&quot;: 1.5461485940009074e-05,
&quot;epoch&quot;: 2.6406685236768803,
&quot;step&quot;: 475
},
{
&quot;loss&quot;: 1.1988,
&quot;grad_norm&quot;: 7.306774616241455,
&quot;learning_rate&quot;: 1.5448338417951243e-05,
&quot;epoch&quot;: 2.6462395543175488,
&quot;step&quot;: 476
},
{
&quot;loss&quot;: 1.4075,
&quot;grad_norm&quot;: 7.261951446533203,
&quot;learning_rate&quot;: 1.5435190895893413e-05,
&quot;epoch&quot;: 2.651810584958217,
&quot;step&quot;: 477
},
{
&quot;loss&quot;: 1.3235,
&quot;grad_norm&quot;: 8.138806343078613,
&quot;learning_rate&quot;: 1.5422043373835582e-05,
&quot;epoch&quot;: 2.6573816155988856,
&quot;step&quot;: 478
},
{
&quot;loss&quot;: 1.4297,
&quot;grad_norm&quot;: 7.515624046325684,
&quot;learning_rate&quot;: 1.5408895851777752e-05,
&quot;epoch&quot;: 2.662952646239554,
&quot;step&quot;: 479
},
{
&quot;loss&quot;: 1.0187,
&quot;grad_norm&quot;: 7.298752307891846,
&quot;learning_rate&quot;: 1.539574832971992e-05,
&quot;epoch&quot;: 2.668523676880223,
&quot;step&quot;: 480
},
{
&quot;loss&quot;: 1.1512,
&quot;grad_norm&quot;: 7.08530855178833,
&quot;learning_rate&quot;: 1.538260080766209e-05,
&quot;epoch&quot;: 2.6740947075208914,
&quot;step&quot;: 481
},
{
&quot;loss&quot;: 0.9209,
&quot;grad_norm&quot;: 8.528051376342773,
&quot;learning_rate&quot;: 1.5369453285604257e-05,
&quot;epoch&quot;: 2.67966573816156,
&quot;step&quot;: 482
},
{
&quot;loss&quot;: 1.6726,
&quot;grad_norm&quot;: 6.991207122802734,
&quot;learning_rate&quot;: 1.535630576354643e-05,
&quot;epoch&quot;: 2.6852367688022283,
&quot;step&quot;: 483
},
{
&quot;loss&quot;: 1.6101,
&quot;grad_norm&quot;: 6.910933971405029,
&quot;learning_rate&quot;: 1.53431582414886e-05,
&quot;epoch&quot;: 2.690807799442897,
&quot;step&quot;: 484
},
{
&quot;loss&quot;: 1.0596,
&quot;grad_norm&quot;: 6.858171463012695,
&quot;learning_rate&quot;: 1.5330010719430766e-05,
&quot;epoch&quot;: 2.6963788300835656,
&quot;step&quot;: 485
},
{
&quot;loss&quot;: 1.3009,
&quot;grad_norm&quot;: 7.1738409996032715,
&quot;learning_rate&quot;: 1.5316863197372935e-05,
&quot;epoch&quot;: 2.701949860724234,
&quot;step&quot;: 486
},
{
&quot;loss&quot;: 1.1306,
&quot;grad_norm&quot;: 6.751303672790527,
&quot;learning_rate&quot;: 1.5303715675315105e-05,
&quot;epoch&quot;: 2.7075208913649025,
&quot;step&quot;: 487
},
{
&quot;loss&quot;: 1.6064,
&quot;grad_norm&quot;: 7.458596706390381,
&quot;learning_rate&quot;: 1.5290568153257274e-05,
&quot;epoch&quot;: 2.713091922005571,
&quot;step&quot;: 488
},
{
&quot;loss&quot;: 1.3423,
&quot;grad_norm&quot;: 4.847519397735596,
&quot;learning_rate&quot;: 1.5277420631199444e-05,
&quot;epoch&quot;: 2.7186629526462394,
&quot;step&quot;: 489
},
{
&quot;loss&quot;: 1.0908,
&quot;grad_norm&quot;: 6.585028648376465,
&quot;learning_rate&quot;: 1.5264273109141613e-05,
&quot;epoch&quot;: 2.724233983286908,
&quot;step&quot;: 490
},
{
&quot;loss&quot;: 1.6632,
&quot;grad_norm&quot;: 5.222984790802002,
&quot;learning_rate&quot;: 1.5251125587083783e-05,
&quot;epoch&quot;: 2.7298050139275767,
&quot;step&quot;: 491
},
{
&quot;loss&quot;: 1.3113,
&quot;grad_norm&quot;: 6.947058200836182,
&quot;learning_rate&quot;: 1.523797806502595e-05,
&quot;epoch&quot;: 2.735376044568245,
&quot;step&quot;: 492
},
{
&quot;loss&quot;: 1.0863,
&quot;grad_norm&quot;: 5.885672569274902,
&quot;learning_rate&quot;: 1.522483054296812e-05,
&quot;epoch&quot;: 2.7409470752089136,
&quot;step&quot;: 493
},
{
&quot;loss&quot;: 1.1982,
&quot;grad_norm&quot;: 7.9502034187316895,
&quot;learning_rate&quot;: 1.521168302091029e-05,
&quot;epoch&quot;: 2.7465181058495824,
&quot;step&quot;: 494
},
{
&quot;loss&quot;: 1.3941,
&quot;grad_norm&quot;: 5.9523773193359375,
&quot;learning_rate&quot;: 1.5198535498852457e-05,
&quot;epoch&quot;: 2.752089136490251,
&quot;step&quot;: 495
},
{
&quot;loss&quot;: 1.3251,
&quot;grad_norm&quot;: 7.984345436096191,
&quot;learning_rate&quot;: 1.5185387976794627e-05,
&quot;epoch&quot;: 2.7576601671309193,
&quot;step&quot;: 496
},
{
&quot;loss&quot;: 0.8109,
&quot;grad_norm&quot;: 8.467183113098145,
&quot;learning_rate&quot;: 1.5172240454736796e-05,
&quot;epoch&quot;: 2.7632311977715878,
&quot;step&quot;: 497
},
{
&quot;loss&quot;: 1.1339,
&quot;grad_norm&quot;: 7.878790378570557,
&quot;learning_rate&quot;: 1.5159092932678966e-05,
&quot;epoch&quot;: 2.768802228412256,
&quot;step&quot;: 498
},
{
&quot;loss&quot;: 1.1736,
&quot;grad_norm&quot;: 5.638209819793701,
&quot;learning_rate&quot;: 1.5145945410621134e-05,
&quot;epoch&quot;: 2.7743732590529246,
&quot;step&quot;: 499
},
{
&quot;loss&quot;: 1.3546,
&quot;grad_norm&quot;: 7.818211078643799,
&quot;learning_rate&quot;: 1.5132797888563303e-05,
&quot;epoch&quot;: 2.779944289693593,
&quot;step&quot;: 500
},
{
&quot;eval_loss&quot;: 2.6166257858276367,
&quot;eval_runtime&quot;: 35.971,
&quot;eval_samples_per_second&quot;: 39.921,
&quot;eval_steps_per_second&quot;: 2.002,
&quot;epoch&quot;: 2.779944289693593,
&quot;step&quot;: 500
},
{
&quot;loss&quot;: 1.4636,
&quot;grad_norm&quot;: 6.118830680847168,
&quot;learning_rate&quot;: 1.5119650366505473e-05,
&quot;epoch&quot;: 2.785515320334262,
&quot;step&quot;: 501
},
{
&quot;loss&quot;: 1.5519,
&quot;grad_norm&quot;: 7.9165778160095215,
&quot;learning_rate&quot;: 1.510650284444764e-05,
&quot;epoch&quot;: 2.7910863509749304,
&quot;step&quot;: 502
},
{
&quot;loss&quot;: 1.5206,
&quot;grad_norm&quot;: 6.975761413574219,
&quot;learning_rate&quot;: 1.5093355322389812e-05,
&quot;epoch&quot;: 2.796657381615599,
&quot;step&quot;: 503
},
{
&quot;loss&quot;: 1.0665,
&quot;grad_norm&quot;: 9.277933120727539,
&quot;learning_rate&quot;: 1.5080207800331981e-05,
&quot;epoch&quot;: 2.8022284122562673,
&quot;step&quot;: 504
},
{
&quot;loss&quot;: 1.2801,
&quot;grad_norm&quot;: 8.121682167053223,
&quot;learning_rate&quot;: 1.5067060278274151e-05,
&quot;epoch&quot;: 2.807799442896936,
&quot;step&quot;: 505
},
{
&quot;loss&quot;: 1.565,
&quot;grad_norm&quot;: 8.76021957397461,
&quot;learning_rate&quot;: 1.5053912756216319e-05,
&quot;epoch&quot;: 2.8133704735376046,
&quot;step&quot;: 506
},
{
&quot;loss&quot;: 1.3502,
&quot;grad_norm&quot;: 8.618566513061523,
&quot;learning_rate&quot;: 1.5040765234158488e-05,
&quot;epoch&quot;: 2.818941504178273,
&quot;step&quot;: 507
},
{
&quot;loss&quot;: 1.5859,
&quot;grad_norm&quot;: 8.027894020080566,
&quot;learning_rate&quot;: 1.5027617712100658e-05,
&quot;epoch&quot;: 2.8245125348189415,
&quot;step&quot;: 508
},
{
&quot;loss&quot;: 1.4159,
&quot;grad_norm&quot;: 7.063473701477051,
&quot;learning_rate&quot;: 1.5014470190042826e-05,
&quot;epoch&quot;: 2.83008356545961,
&quot;step&quot;: 509
},
{
&quot;loss&quot;: 1.5672,
&quot;grad_norm&quot;: 6.095931053161621,
&quot;learning_rate&quot;: 1.5001322667984995e-05,
&quot;epoch&quot;: 2.8356545961002784,
&quot;step&quot;: 510
},
{
&quot;loss&quot;: 1.2551,
&quot;grad_norm&quot;: 6.445271968841553,
&quot;learning_rate&quot;: 1.4988175145927165e-05,
&quot;epoch&quot;: 2.841225626740947,
&quot;step&quot;: 511
},
{
&quot;loss&quot;: 0.9671,
&quot;grad_norm&quot;: 7.601891040802002,
&quot;learning_rate&quot;: 1.4975027623869334e-05,
&quot;epoch&quot;: 2.8467966573816157,
&quot;step&quot;: 512
},
{
&quot;loss&quot;: 1.4462,
&quot;grad_norm&quot;: 8.017728805541992,
&quot;learning_rate&quot;: 1.4961880101811502e-05,
&quot;epoch&quot;: 2.852367688022284,
&quot;step&quot;: 513
},
{
&quot;loss&quot;: 1.2006,
&quot;grad_norm&quot;: 6.753676891326904,
&quot;learning_rate&quot;: 1.4948732579753672e-05,
&quot;epoch&quot;: 2.8579387186629526,
&quot;step&quot;: 514
},
{
&quot;loss&quot;: 0.9354,
&quot;grad_norm&quot;: 6.220627784729004,
&quot;learning_rate&quot;: 1.4935585057695843e-05,
&quot;epoch&quot;: 2.863509749303621,
&quot;step&quot;: 515
},
{
&quot;loss&quot;: 1.5554,
&quot;grad_norm&quot;: 7.825878620147705,
&quot;learning_rate&quot;: 1.4922437535638009e-05,
&quot;epoch&quot;: 2.86908077994429,
&quot;step&quot;: 516
},
{
&quot;loss&quot;: 0.9281,
&quot;grad_norm&quot;: 7.7669548988342285,
&quot;learning_rate&quot;: 1.490929001358018e-05,
&quot;epoch&quot;: 2.8746518105849583,
&quot;step&quot;: 517
},
{
&quot;loss&quot;: 1.1678,
&quot;grad_norm&quot;: 6.18816614151001,
&quot;learning_rate&quot;: 1.489614249152235e-05,
&quot;epoch&quot;: 2.8802228412256268,
&quot;step&quot;: 518
},
{
&quot;loss&quot;: 0.8378,
&quot;grad_norm&quot;: 11.241938591003418,
&quot;learning_rate&quot;: 1.4882994969464517e-05,
&quot;epoch&quot;: 2.885793871866295,
&quot;step&quot;: 519
},
{
&quot;loss&quot;: 1.6602,
&quot;grad_norm&quot;: 6.708087921142578,
&quot;learning_rate&quot;: 1.4869847447406687e-05,
&quot;epoch&quot;: 2.8913649025069637,
&quot;step&quot;: 520
},
{
&quot;loss&quot;: 1.5575,
&quot;grad_norm&quot;: 8.96353530883789,
&quot;learning_rate&quot;: 1.4856699925348856e-05,
&quot;epoch&quot;: 2.896935933147632,
&quot;step&quot;: 521
},
{
&quot;loss&quot;: 1.3553,
&quot;grad_norm&quot;: 7.286456108093262,
&quot;learning_rate&quot;: 1.4843552403291026e-05,
&quot;epoch&quot;: 2.902506963788301,
&quot;step&quot;: 522
},
{
&quot;loss&quot;: 1.3618,
&quot;grad_norm&quot;: 6.448929309844971,
&quot;learning_rate&quot;: 1.4830404881233194e-05,
&quot;epoch&quot;: 2.9080779944289694,
&quot;step&quot;: 523
},
{
&quot;loss&quot;: 1.0911,
&quot;grad_norm&quot;: 6.1524739265441895,
&quot;learning_rate&quot;: 1.4817257359175363e-05,
&quot;epoch&quot;: 2.913649025069638,
&quot;step&quot;: 524
},
{
&quot;loss&quot;: 1.2465,
&quot;grad_norm&quot;: 6.833171367645264,
&quot;learning_rate&quot;: 1.4804109837117533e-05,
&quot;epoch&quot;: 2.9192200557103063,
&quot;step&quot;: 525
},
{
&quot;loss&quot;: 0.9937,
&quot;grad_norm&quot;: 8.745670318603516,
&quot;learning_rate&quot;: 1.47909623150597e-05,
&quot;epoch&quot;: 2.924791086350975,
&quot;step&quot;: 526
},
{
&quot;loss&quot;: 1.3931,
&quot;grad_norm&quot;: 6.3659186363220215,
&quot;learning_rate&quot;: 1.477781479300187e-05,
&quot;epoch&quot;: 2.9303621169916436,
&quot;step&quot;: 527
},
{
&quot;loss&quot;: 1.029,
&quot;grad_norm&quot;: 8.309256553649902,
&quot;learning_rate&quot;: 1.476466727094404e-05,
&quot;epoch&quot;: 2.935933147632312,
&quot;step&quot;: 528
},
{
&quot;loss&quot;: 1.383,
&quot;grad_norm&quot;: 7.611057758331299,
&quot;learning_rate&quot;: 1.4751519748886211e-05,
&quot;epoch&quot;: 2.9415041782729805,
&quot;step&quot;: 529
},
{
&quot;loss&quot;: 1.4833,
&quot;grad_norm&quot;: 9.441068649291992,
&quot;learning_rate&quot;: 1.4738372226828379e-05,
&quot;epoch&quot;: 2.947075208913649,
&quot;step&quot;: 530
},
{
&quot;loss&quot;: 1.2739,
&quot;grad_norm&quot;: 7.198431968688965,
&quot;learning_rate&quot;: 1.4725224704770548e-05,
&quot;epoch&quot;: 2.9526462395543174,
&quot;step&quot;: 531
},
{
&quot;loss&quot;: 1.4294,
&quot;grad_norm&quot;: 8.88117790222168,
&quot;learning_rate&quot;: 1.4712077182712718e-05,
&quot;epoch&quot;: 2.958217270194986,
&quot;step&quot;: 532
},
{
&quot;loss&quot;: 1.0204,
&quot;grad_norm&quot;: 9.982294082641602,
&quot;learning_rate&quot;: 1.4698929660654886e-05,
&quot;epoch&quot;: 2.9637883008356547,
&quot;step&quot;: 533
},
{
&quot;loss&quot;: 1.1488,
&quot;grad_norm&quot;: 8.535533905029297,
&quot;learning_rate&quot;: 1.4685782138597055e-05,
&quot;epoch&quot;: 2.969359331476323,
&quot;step&quot;: 534
},
{
&quot;loss&quot;: 1.49,
&quot;grad_norm&quot;: 6.813885688781738,
&quot;learning_rate&quot;: 1.4672634616539225e-05,
&quot;epoch&quot;: 2.9749303621169916,
&quot;step&quot;: 535
},
{
&quot;loss&quot;: 1.113,
&quot;grad_norm&quot;: 9.557439804077148,
&quot;learning_rate&quot;: 1.4659487094481394e-05,
&quot;epoch&quot;: 2.98050139275766,
&quot;step&quot;: 536
},
{
&quot;loss&quot;: 1.5004,
&quot;grad_norm&quot;: 6.406128883361816,
&quot;learning_rate&quot;: 1.4646339572423562e-05,
&quot;epoch&quot;: 2.986072423398329,
&quot;step&quot;: 537
},
{
&quot;loss&quot;: 1.1722,
&quot;grad_norm&quot;: 7.9670915603637695,
&quot;learning_rate&quot;: 1.4633192050365732e-05,
&quot;epoch&quot;: 2.9916434540389973,
&quot;step&quot;: 538
},
{
&quot;loss&quot;: 1.5033,
&quot;grad_norm&quot;: 9.402728080749512,
&quot;learning_rate&quot;: 1.4620044528307901e-05,
&quot;epoch&quot;: 2.997214484679666,
&quot;step&quot;: 539
},
{
&quot;loss&quot;: 1.279,
&quot;grad_norm&quot;: 7.38714075088501,
&quot;learning_rate&quot;: 1.4606897006250069e-05,
&quot;epoch&quot;: 3.0,
&quot;step&quot;: 540
},
{
&quot;loss&quot;: 0.6426,
&quot;grad_norm&quot;: 7.639667510986328,
&quot;learning_rate&quot;: 1.4593749484192238e-05,
&quot;epoch&quot;: 3.0055710306406684,
&quot;step&quot;: 541
},
{
&quot;loss&quot;: 0.8426,
&quot;grad_norm&quot;: 7.864633560180664,
&quot;learning_rate&quot;: 1.458060196213441e-05,
&quot;epoch&quot;: 3.011142061281337,
&quot;step&quot;: 542
},
{
&quot;loss&quot;: 1.0651,
&quot;grad_norm&quot;: 6.637276649475098,
&quot;learning_rate&quot;: 1.4567454440076576e-05,
&quot;epoch&quot;: 3.0167130919220058,
&quot;step&quot;: 543
},
{
&quot;loss&quot;: 1.0804,
&quot;grad_norm&quot;: 7.148686408996582,
&quot;learning_rate&quot;: 1.4554306918018747e-05,
&quot;epoch&quot;: 3.022284122562674,
&quot;step&quot;: 544
},
{
&quot;loss&quot;: 1.0182,
&quot;grad_norm&quot;: 6.767364501953125,
&quot;learning_rate&quot;: 1.4541159395960917e-05,
&quot;epoch&quot;: 3.0278551532033426,
&quot;step&quot;: 545
},
{
&quot;loss&quot;: 0.8165,
&quot;grad_norm&quot;: 6.77062463760376,
&quot;learning_rate&quot;: 1.4528011873903086e-05,
&quot;epoch&quot;: 3.033426183844011,
&quot;step&quot;: 546
},
{
&quot;loss&quot;: 0.9941,
&quot;grad_norm&quot;: 8.067922592163086,
&quot;learning_rate&quot;: 1.4514864351845254e-05,
&quot;epoch&quot;: 3.0389972144846795,
&quot;step&quot;: 547
},
{
&quot;loss&quot;: 0.9579,
&quot;grad_norm&quot;: 8.817468643188477,
&quot;learning_rate&quot;: 1.4501716829787423e-05,
&quot;epoch&quot;: 3.0445682451253484,
&quot;step&quot;: 548
},
{
&quot;loss&quot;: 0.6023,
&quot;grad_norm&quot;: 8.70374870300293,
&quot;learning_rate&quot;: 1.4488569307729593e-05,
&quot;epoch&quot;: 3.050139275766017,
&quot;step&quot;: 549
},
{
&quot;loss&quot;: 1.1392,
&quot;grad_norm&quot;: 9.344374656677246,
&quot;learning_rate&quot;: 1.447542178567176e-05,
&quot;epoch&quot;: 3.0557103064066853,
&quot;step&quot;: 550
},
{
&quot;eval_loss&quot;: 3.0347440242767334,
&quot;eval_runtime&quot;: 35.9429,
&quot;eval_samples_per_second&quot;: 39.952,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 3.0557103064066853,
&quot;step&quot;: 550
},
{
&quot;loss&quot;: 0.663,
&quot;grad_norm&quot;: 10.07166862487793,
&quot;learning_rate&quot;: 1.446227426361393e-05,
&quot;epoch&quot;: 3.0612813370473537,
&quot;step&quot;: 551
},
{
&quot;loss&quot;: 1.1801,
&quot;grad_norm&quot;: 14.619653701782227,
&quot;learning_rate&quot;: 1.44491267415561e-05,
&quot;epoch&quot;: 3.066852367688022,
&quot;step&quot;: 552
},
{
&quot;loss&quot;: 0.9107,
&quot;grad_norm&quot;: 10.427509307861328,
&quot;learning_rate&quot;: 1.443597921949827e-05,
&quot;epoch&quot;: 3.0724233983286906,
&quot;step&quot;: 553
},
{
&quot;loss&quot;: 0.9474,
&quot;grad_norm&quot;: 8.392213821411133,
&quot;learning_rate&quot;: 1.4422831697440437e-05,
&quot;epoch&quot;: 3.0779944289693595,
&quot;step&quot;: 554
},
{
&quot;loss&quot;: 0.7972,
&quot;grad_norm&quot;: 13.848929405212402,
&quot;learning_rate&quot;: 1.4409684175382607e-05,
&quot;epoch&quot;: 3.083565459610028,
&quot;step&quot;: 555
},
{
&quot;loss&quot;: 0.7298,
&quot;grad_norm&quot;: 9.263422966003418,
&quot;learning_rate&quot;: 1.4396536653324778e-05,
&quot;epoch&quot;: 3.0891364902506964,
&quot;step&quot;: 556
},
{
&quot;loss&quot;: 0.5749,
&quot;grad_norm&quot;: 11.082460403442383,
&quot;learning_rate&quot;: 1.4383389131266946e-05,
&quot;epoch&quot;: 3.094707520891365,
&quot;step&quot;: 557
},
{
&quot;loss&quot;: 0.7301,
&quot;grad_norm&quot;: 7.7812604904174805,
&quot;learning_rate&quot;: 1.4370241609209115e-05,
&quot;epoch&quot;: 3.1002785515320332,
&quot;step&quot;: 558
},
{
&quot;loss&quot;: 0.5884,
&quot;grad_norm&quot;: 12.2935791015625,
&quot;learning_rate&quot;: 1.4357094087151285e-05,
&quot;epoch&quot;: 3.105849582172702,
&quot;step&quot;: 559
},
{
&quot;loss&quot;: 0.8034,
&quot;grad_norm&quot;: 8.129678726196289,
&quot;learning_rate&quot;: 1.4343946565093454e-05,
&quot;epoch&quot;: 3.1114206128133706,
&quot;step&quot;: 560
},
{
&quot;loss&quot;: 0.6528,
&quot;grad_norm&quot;: 8.628301620483398,
&quot;learning_rate&quot;: 1.4330799043035622e-05,
&quot;epoch&quot;: 3.116991643454039,
&quot;step&quot;: 561
},
{
&quot;loss&quot;: 0.8483,
&quot;grad_norm&quot;: 10.514995574951172,
&quot;learning_rate&quot;: 1.4317651520977792e-05,
&quot;epoch&quot;: 3.1225626740947074,
&quot;step&quot;: 562
},
{
&quot;loss&quot;: 0.7009,
&quot;grad_norm&quot;: 8.187010765075684,
&quot;learning_rate&quot;: 1.4304503998919961e-05,
&quot;epoch&quot;: 3.128133704735376,
&quot;step&quot;: 563
},
{
&quot;loss&quot;: 0.8732,
&quot;grad_norm&quot;: 10.525712013244629,
&quot;learning_rate&quot;: 1.4291356476862129e-05,
&quot;epoch&quot;: 3.1337047353760448,
&quot;step&quot;: 564
},
{
&quot;loss&quot;: 0.8319,
&quot;grad_norm&quot;: 9.198347091674805,
&quot;learning_rate&quot;: 1.4278208954804298e-05,
&quot;epoch&quot;: 3.139275766016713,
&quot;step&quot;: 565
},
{
&quot;loss&quot;: 0.726,
&quot;grad_norm&quot;: 8.486757278442383,
&quot;learning_rate&quot;: 1.4265061432746468e-05,
&quot;epoch&quot;: 3.1448467966573816,
&quot;step&quot;: 566
},
{
&quot;loss&quot;: 0.4993,
&quot;grad_norm&quot;: 8.220407485961914,
&quot;learning_rate&quot;: 1.4251913910688636e-05,
&quot;epoch&quot;: 3.15041782729805,
&quot;step&quot;: 567
},
{
&quot;loss&quot;: 1.0212,
&quot;grad_norm&quot;: 7.644767761230469,
&quot;learning_rate&quot;: 1.4238766388630805e-05,
&quot;epoch&quot;: 3.1559888579387185,
&quot;step&quot;: 568
},
{
&quot;loss&quot;: 0.8306,
&quot;grad_norm&quot;: 11.287712097167969,
&quot;learning_rate&quot;: 1.4225618866572977e-05,
&quot;epoch&quot;: 3.1615598885793874,
&quot;step&quot;: 569
},
{
&quot;loss&quot;: 0.626,
&quot;grad_norm&quot;: 7.9160637855529785,
&quot;learning_rate&quot;: 1.4212471344515146e-05,
&quot;epoch&quot;: 3.167130919220056,
&quot;step&quot;: 570
},
{
&quot;loss&quot;: 0.75,
&quot;grad_norm&quot;: 11.988582611083984,
&quot;learning_rate&quot;: 1.4199323822457314e-05,
&quot;epoch&quot;: 3.1727019498607243,
&quot;step&quot;: 571
},
{
&quot;loss&quot;: 1.2685,
&quot;grad_norm&quot;: 9.961721420288086,
&quot;learning_rate&quot;: 1.4186176300399483e-05,
&quot;epoch&quot;: 3.1782729805013927,
&quot;step&quot;: 572
},
{
&quot;loss&quot;: 0.7429,
&quot;grad_norm&quot;: 12.098424911499023,
&quot;learning_rate&quot;: 1.4173028778341653e-05,
&quot;epoch&quot;: 3.183844011142061,
&quot;step&quot;: 573
},
{
&quot;loss&quot;: 1.0086,
&quot;grad_norm&quot;: 8.59049129486084,
&quot;learning_rate&quot;: 1.415988125628382e-05,
&quot;epoch&quot;: 3.1894150417827296,
&quot;step&quot;: 574
},
{
&quot;loss&quot;: 1.1215,
&quot;grad_norm&quot;: 10.50232219696045,
&quot;learning_rate&quot;: 1.414673373422599e-05,
&quot;epoch&quot;: 3.1949860724233985,
&quot;step&quot;: 575
},
{
&quot;loss&quot;: 0.6729,
&quot;grad_norm&quot;: 11.673900604248047,
&quot;learning_rate&quot;: 1.413358621216816e-05,
&quot;epoch&quot;: 3.200557103064067,
&quot;step&quot;: 576
},
{
&quot;loss&quot;: 1.2036,
&quot;grad_norm&quot;: 6.419600009918213,
&quot;learning_rate&quot;: 1.412043869011033e-05,
&quot;epoch&quot;: 3.2061281337047354,
&quot;step&quot;: 577
},
{
&quot;loss&quot;: 0.6877,
&quot;grad_norm&quot;: 10.218490600585938,
&quot;learning_rate&quot;: 1.4107291168052497e-05,
&quot;epoch&quot;: 3.211699164345404,
&quot;step&quot;: 578
},
{
&quot;loss&quot;: 0.5637,
&quot;grad_norm&quot;: 5.7183918952941895,
&quot;learning_rate&quot;: 1.4094143645994667e-05,
&quot;epoch&quot;: 3.2172701949860723,
&quot;step&quot;: 579
},
{
&quot;loss&quot;: 0.7498,
&quot;grad_norm&quot;: 11.460823059082031,
&quot;learning_rate&quot;: 1.4080996123936836e-05,
&quot;epoch&quot;: 3.222841225626741,
&quot;step&quot;: 580
},
{
&quot;loss&quot;: 0.6792,
&quot;grad_norm&quot;: 8.623233795166016,
&quot;learning_rate&quot;: 1.4067848601879004e-05,
&quot;epoch&quot;: 3.2284122562674096,
&quot;step&quot;: 581
},
{
&quot;loss&quot;: 0.6752,
&quot;grad_norm&quot;: 11.339884757995605,
&quot;learning_rate&quot;: 1.4054701079821174e-05,
&quot;epoch&quot;: 3.233983286908078,
&quot;step&quot;: 582
},
{
&quot;loss&quot;: 0.8586,
&quot;grad_norm&quot;: 12.452316284179688,
&quot;learning_rate&quot;: 1.4041553557763345e-05,
&quot;epoch&quot;: 3.2395543175487465,
&quot;step&quot;: 583
},
{
&quot;loss&quot;: 0.8345,
&quot;grad_norm&quot;: 6.755831241607666,
&quot;learning_rate&quot;: 1.4028406035705514e-05,
&quot;epoch&quot;: 3.245125348189415,
&quot;step&quot;: 584
},
{
&quot;loss&quot;: 0.6932,
&quot;grad_norm&quot;: 9.68067741394043,
&quot;learning_rate&quot;: 1.4015258513647682e-05,
&quot;epoch&quot;: 3.2506963788300833,
&quot;step&quot;: 585
},
{
&quot;loss&quot;: 1.2071,
&quot;grad_norm&quot;: 11.948298454284668,
&quot;learning_rate&quot;: 1.4002110991589852e-05,
&quot;epoch&quot;: 3.256267409470752,
&quot;step&quot;: 586
},
{
&quot;loss&quot;: 0.7349,
&quot;grad_norm&quot;: 11.49226188659668,
&quot;learning_rate&quot;: 1.3988963469532021e-05,
&quot;epoch&quot;: 3.2618384401114207,
&quot;step&quot;: 587
},
{
&quot;loss&quot;: 0.7923,
&quot;grad_norm&quot;: 10.757736206054688,
&quot;learning_rate&quot;: 1.3975815947474189e-05,
&quot;epoch&quot;: 3.267409470752089,
&quot;step&quot;: 588
},
{
&quot;loss&quot;: 0.6857,
&quot;grad_norm&quot;: 8.46744441986084,
&quot;learning_rate&quot;: 1.3962668425416358e-05,
&quot;epoch&quot;: 3.2729805013927575,
&quot;step&quot;: 589
},
{
&quot;loss&quot;: 0.9153,
&quot;grad_norm&quot;: 6.472330093383789,
&quot;learning_rate&quot;: 1.3949520903358528e-05,
&quot;epoch&quot;: 3.2785515320334264,
&quot;step&quot;: 590
},
{
&quot;loss&quot;: 0.7542,
&quot;grad_norm&quot;: 12.151514053344727,
&quot;learning_rate&quot;: 1.3936373381300696e-05,
&quot;epoch&quot;: 3.284122562674095,
&quot;step&quot;: 591
},
{
&quot;loss&quot;: 0.9487,
&quot;grad_norm&quot;: 11.680760383605957,
&quot;learning_rate&quot;: 1.3923225859242865e-05,
&quot;epoch&quot;: 3.2896935933147633,
&quot;step&quot;: 592
},
{
&quot;loss&quot;: 0.6893,
&quot;grad_norm&quot;: 9.367558479309082,
&quot;learning_rate&quot;: 1.3910078337185035e-05,
&quot;epoch&quot;: 3.2952646239554317,
&quot;step&quot;: 593
},
{
&quot;loss&quot;: 0.7126,
&quot;grad_norm&quot;: 10.658570289611816,
&quot;learning_rate&quot;: 1.3896930815127206e-05,
&quot;epoch&quot;: 3.3008356545961,
&quot;step&quot;: 594
},
{
&quot;loss&quot;: 0.8014,
&quot;grad_norm&quot;: 8.675304412841797,
&quot;learning_rate&quot;: 1.3883783293069372e-05,
&quot;epoch&quot;: 3.3064066852367686,
&quot;step&quot;: 595
},
{
&quot;loss&quot;: 0.7078,
&quot;grad_norm&quot;: 6.470170974731445,
&quot;learning_rate&quot;: 1.3870635771011543e-05,
&quot;epoch&quot;: 3.3119777158774375,
&quot;step&quot;: 596
},
{
&quot;loss&quot;: 0.6612,
&quot;grad_norm&quot;: 7.141599178314209,
&quot;learning_rate&quot;: 1.3857488248953713e-05,
&quot;epoch&quot;: 3.317548746518106,
&quot;step&quot;: 597
},
{
&quot;loss&quot;: 0.8968,
&quot;grad_norm&quot;: 9.977639198303223,
&quot;learning_rate&quot;: 1.384434072689588e-05,
&quot;epoch&quot;: 3.3231197771587744,
&quot;step&quot;: 598
},
{
&quot;loss&quot;: 0.8395,
&quot;grad_norm&quot;: 10.208252906799316,
&quot;learning_rate&quot;: 1.383119320483805e-05,
&quot;epoch&quot;: 3.328690807799443,
&quot;step&quot;: 599
},
{
&quot;loss&quot;: 0.9248,
&quot;grad_norm&quot;: 9.933085441589355,
&quot;learning_rate&quot;: 1.381804568278022e-05,
&quot;epoch&quot;: 3.3342618384401113,
&quot;step&quot;: 600
},
{
&quot;eval_loss&quot;: 2.9689695835113525,
&quot;eval_runtime&quot;: 35.9526,
&quot;eval_samples_per_second&quot;: 39.941,
&quot;eval_steps_per_second&quot;: 2.003,
&quot;epoch&quot;: 3.3342618384401113,
&quot;step&quot;: 600
},
{
&quot;train_runtime&quot;: 2220.807,
&quot;train_samples_per_second&quot;: 5.944,
&quot;train_steps_per_second&quot;: 0.743,
&quot;total_flos&quot;: 4.35104765343744e+16,
&quot;train_loss&quot;: 1.654274252106746,
&quot;epoch&quot;: 3.3342618384401113,
&quot;step&quot;: 600
}
]</pre></details>
<script type="application/json" id="run-payload">{"run_meta": {"model": "unsloth/Phi-4-unsloth-bnb-4bit", "dataset": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "examples_total": 2872, "examples_train": 1436, "examples_eval": 1436, "world_size": 1, "effective_batch_size": 8, "steps_per_epoch_approx": 179.5, "max_steps": 2000, "eval_steps": 50, "save_steps": 50, "learning_rate": 9.95267419777795e-06, "warmup_steps": 10, "lr_scheduler_type": "linear", "weight_decay": 0.009206070410847844, "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.0, "best_checkpoint": "outputs/continue_r1_from_350_20260112_073729/checkpoint-100", "LR_AUTO_ENABLED": true, "LR_AUTO_USE_N": "train", "LR_AUTO_N_REF": 1436, "LR_AUTO_BASE": 1e-05, "LR_AUTO_MULT": 0.5, "LR_AUTO_FINAL": 5e-06, "best_step": 100, "best_eval_loss": 2.2380564212799072, "best_blended": 1.3520409573791146, "best_blended_step": 600}, "config_snapshot": {"MODEL_NAME": "unsloth/Phi-4-unsloth-bnb-4bit", "CHAT_TEMPLATE": "phi-4", "MAX_SEQ_LENGTH": 2048, "LOAD_IN_4BIT": true, "DATASET_NAME": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "DATASET_SPLIT": "train", "PER_DEVICE_TRAIN_BATCH_SIZE": 2, "GRADIENT_ACCUMULATION_STEPS": 4, "WARMUP_STEPS": 10, "MAX_STEPS": 2000, "LEARNING_RATE": 9.95267419777795e-06, "WEIGHT_DECAY": 0.009206070410847844, "LR_SCHEDULER_TYPE": "linear", "SEED": 3407, "PLOTLY_DARK_MODE": true, "PLOTLY_BASE_COLOR": "#00CC96", "PLOTLY_EMA_SPAN": 25, "LR_AUTO_ENABLED": true, "LR_AUTO_USE_N": "train", "LR_AUTO_N_REF": 1436, "LR_AUTO_BASE": 1e-05, "LR_AUTO_MULT": 0.5, "LR_AUTO_FINAL": 5e-06}, "run_manifest": {"model_name": "unsloth/Phi-4-unsloth-bnb-4bit", "dataset": {"name": "Mathieu-Thomas-JOSSET/michael_abab_conversations_infini_instruct.jsonl", "split": "train"}, "training": {"max_steps": 2000, "learning_rate": 9.95267419777795e-06, "per_device_train_batch_size": 2, "gradient_accumulation_steps": 4, "max_seq_length": 2048, "seed": 3407, "optimizer": 
"adamw_8bit", "lr_scheduler_type": "linear"}, "auto_lr": {"enabled": true, "use_n": "train", "n_ref": 1436, "base": 1e-05, "mult": 0.5, "final": 5e-06}, "best": {"checkpoint": "/content/outputs/continue_r1_from_350_20260112_073729/checkpoint-100", "metric": 2.2380564212799072, "metric_name": "eval_loss"}, "plotly": {"html": "training_loss_step.html"}}, "log_history": [{"loss": 2.2041, "grad_norm": 4.026190757751465, "learning_rate": 0.0, "epoch": 0.005571030640668524, "step": 1}, {"loss": 2.6901, "grad_norm": 1.616629719734192, "learning_rate": 1.4636285584967574e-07, "epoch": 0.011142061281337047, "step": 2}, {"loss": 2.6774, "grad_norm": 13.836981773376465, "learning_rate": 2.927257116993515e-07, "epoch": 0.016713091922005572, "step": 3}, {"loss": 2.4478, "grad_norm": 1.857710361480713, "learning_rate": 4.3908856754902726e-07, "epoch": 0.022284122562674095, "step": 4}, {"loss": 1.9988, "grad_norm": 1.4818029403686523, "learning_rate": 5.85451423398703e-07, "epoch": 0.027855153203342618, "step": 5}, {"loss": 2.0358, "grad_norm": 1.726440191268921, "learning_rate": 7.318142792483787e-07, "epoch": 0.033426183844011144, "step": 6}, {"loss": 2.5824, "grad_norm": 2.0604233741760254, "learning_rate": 8.781771350980545e-07, "epoch": 0.03899721448467967, "step": 7}, {"loss": 2.3479, "grad_norm": 1.7288694381713867, "learning_rate": 1.0245399909477302e-06, "epoch": 0.04456824512534819, "step": 8}, {"loss": 2.6387, "grad_norm": 1.9069620370864868, "learning_rate": 1.170902846797406e-06, "epoch": 0.05013927576601671, "step": 9}, {"loss": 2.6083, "grad_norm": 1.4719465970993042, "learning_rate": 1.3172657026470817e-06, "epoch": 0.055710306406685235, "step": 10}, {"loss": 2.3015, "grad_norm": 1.6306267976760864, "learning_rate": 1.4636285584967574e-06, "epoch": 0.06128133704735376, "step": 11}, {"loss": 2.7042, "grad_norm": 1.4724116325378418, "learning_rate": 1.6099914143464333e-06, "epoch": 0.06685236768802229, "step": 12}, {"loss": 2.5386, "grad_norm": 1.5470020771026611, 
"learning_rate": 1.756354270196109e-06, "epoch": 0.07242339832869081, "step": 13}, {"loss": 2.5886
</div>
</body>
</html>