{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 387,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07759456838021339,
      "grad_norm": 1.9683642394428182,
      "learning_rate": 2.307692307692308e-06,
      "loss": 0.7343237400054932,
      "step": 10
    },
    {
      "epoch": 0.15518913676042678,
      "grad_norm": 1.4175428237350762,
      "learning_rate": 4.871794871794872e-06,
      "loss": 0.5461452007293701,
      "step": 20
    },
    {
      "epoch": 0.23278370514064015,
      "grad_norm": 0.5442834252561063,
      "learning_rate": 7.435897435897437e-06,
      "loss": 0.3490773677825928,
      "step": 30
    },
    {
      "epoch": 0.31037827352085356,
      "grad_norm": 0.32322946422972365,
      "learning_rate": 1e-05,
      "loss": 0.2592954635620117,
      "step": 40
    },
    {
      "epoch": 0.3879728419010669,
      "grad_norm": 0.24901563193155196,
      "learning_rate": 9.979639600327522e-06,
      "loss": 0.2136533737182617,
      "step": 50
    },
    {
      "epoch": 0.4655674102812803,
      "grad_norm": 0.2047675084448879,
      "learning_rate": 9.918724219660013e-06,
      "loss": 0.18301695585250854,
      "step": 60
    },
    {
      "epoch": 0.5431619786614937,
      "grad_norm": 0.1694310997257767,
      "learning_rate": 9.817749962596115e-06,
      "loss": 0.16246029138565063,
      "step": 70
    },
    {
      "epoch": 0.6207565470417071,
      "grad_norm": 0.22587456656054467,
      "learning_rate": 9.677539179628005e-06,
      "loss": 0.14934264421463012,
      "step": 80
    },
    {
      "epoch": 0.6983511154219205,
      "grad_norm": 0.22154973989105028,
      "learning_rate": 9.499233769787534e-06,
      "loss": 0.134801185131073,
      "step": 90
    },
    {
      "epoch": 0.7759456838021338,
      "grad_norm": 0.2099862635469814,
      "learning_rate": 9.284285880837947e-06,
      "loss": 0.13017673492431642,
      "step": 100
    },
    {
      "epoch": 0.8535402521823472,
      "grad_norm": 0.32230657820182124,
      "learning_rate": 9.034446082750352e-06,
      "loss": 0.12214579582214355,
      "step": 110
    },
    {
      "epoch": 0.9311348205625606,
      "grad_norm": 0.324253054340729,
      "learning_rate": 8.751749110782013e-06,
      "loss": 0.12026152610778809,
      "step": 120
    },
    {
      "epoch": 1.0077594568380213,
      "grad_norm": 0.20488241588612174,
      "learning_rate": 8.438497294267117e-06,
      "loss": 0.11126101016998291,
      "step": 130
    },
    {
      "epoch": 1.0853540252182348,
      "grad_norm": 0.20661218086124847,
      "learning_rate": 8.097241806078616e-06,
      "loss": 0.10776399374008179,
      "step": 140
    },
    {
      "epoch": 1.162948593598448,
      "grad_norm": 0.25468202960165104,
      "learning_rate": 7.730761885468486e-06,
      "loss": 0.10431833267211914,
      "step": 150
    },
    {
      "epoch": 1.2405431619786615,
      "grad_norm": 0.17930064486716413,
      "learning_rate": 7.342042203498952e-06,
      "loss": 0.10304663181304932,
      "step": 160
    },
    {
      "epoch": 1.3181377303588748,
      "grad_norm": 0.20225538073749422,
      "learning_rate": 6.934248555404197e-06,
      "loss": 0.09784629344940185,
      "step": 170
    },
    {
      "epoch": 1.3957322987390883,
      "grad_norm": 0.2256721972453044,
      "learning_rate": 6.510702077847864e-06,
      "loss": 0.09537227749824524,
      "step": 180
    },
    {
      "epoch": 1.4733268671193016,
      "grad_norm": 0.21487787771920072,
      "learning_rate": 6.074852201055121e-06,
      "loss": 0.09520423412322998,
      "step": 190
    },
    {
      "epoch": 1.5509214354995149,
      "grad_norm": 0.17540761321861204,
      "learning_rate": 5.630248556101448e-06,
      "loss": 0.09088362455368042,
      "step": 200
    },
    {
      "epoch": 1.6285160038797284,
      "grad_norm": 0.21743503130668765,
      "learning_rate": 5.180512066149682e-06,
      "loss": 0.0899280071258545,
      "step": 210
    },
    {
      "epoch": 1.706110572259942,
      "grad_norm": 0.20331687416060285,
      "learning_rate": 4.729305457072913e-06,
      "loss": 0.0881616234779358,
      "step": 220
    },
    {
      "epoch": 1.7837051406401552,
      "grad_norm": 0.15781467110120098,
      "learning_rate": 4.280303427629404e-06,
      "loss": 0.08638249635696411,
      "step": 230
    },
    {
      "epoch": 1.8612997090203685,
      "grad_norm": 0.1623620489054104,
      "learning_rate": 3.8371627221284495e-06,
      "loss": 0.08716154098510742,
      "step": 240
    },
    {
      "epoch": 1.938894277400582,
      "grad_norm": 0.15611783173066054,
      "learning_rate": 3.403492349320101e-06,
      "loss": 0.08580605983734131,
      "step": 250
    },
    {
      "epoch": 2.0155189136760425,
      "grad_norm": 0.15287072067575233,
      "learning_rate": 2.982824190050958e-06,
      "loss": 0.08316840529441834,
      "step": 260
    },
    {
      "epoch": 2.093113482056256,
      "grad_norm": 0.1853136112632167,
      "learning_rate": 2.5785842330619038e-06,
      "loss": 0.08091338872909545,
      "step": 270
    },
    {
      "epoch": 2.1707080504364695,
      "grad_norm": 0.14114872525549504,
      "learning_rate": 2.1940646731880887e-06,
      "loss": 0.08085420131683349,
      "step": 280
    },
    {
      "epoch": 2.248302618816683,
      "grad_norm": 0.13643528182686213,
      "learning_rate": 1.8323970991978823e-06,
      "loss": 0.08156624436378479,
      "step": 290
    },
    {
      "epoch": 2.325897187196896,
      "grad_norm": 0.14573681730374075,
      "learning_rate": 1.4965269896332884e-06,
      "loss": 0.0808843195438385,
      "step": 300
    },
    {
      "epoch": 2.4034917555771096,
      "grad_norm": 0.1466398992341211,
      "learning_rate": 1.1891897243618184e-06,
      "loss": 0.07979943156242371,
      "step": 310
    },
    {
      "epoch": 2.481086323957323,
      "grad_norm": 0.12798260710398743,
      "learning_rate": 9.128883072055411e-07,
      "loss": 0.08049517869949341,
      "step": 320
    },
    {
      "epoch": 2.558680892337536,
      "grad_norm": 0.13826353734235647,
      "learning_rate": 6.698729810778065e-07,
      "loss": 0.08011389374732972,
      "step": 330
    },
    {
      "epoch": 2.6362754607177497,
      "grad_norm": 0.1305401343538733,
      "learning_rate": 4.6212290164521554e-07,
      "loss": 0.08163015246391296,
      "step": 340
    },
    {
      "epoch": 2.713870029097963,
      "grad_norm": 0.12804004522045906,
      "learning_rate": 2.9133001876746004e-07,
      "loss": 0.08051948547363282,
      "step": 350
    },
    {
      "epoch": 2.7914645974781767,
      "grad_norm": 0.12808224007612634,
      "learning_rate": 1.5888529698718347e-07,
      "loss": 0.07719261646270752,
      "step": 360
    },
    {
      "epoch": 2.86905916585839,
      "grad_norm": 0.12117673381149041,
      "learning_rate": 6.58673872923693e-08,
      "loss": 0.08128957152366638,
      "step": 370
    },
    {
      "epoch": 2.946653734238603,
      "grad_norm": 0.124324493318766,
      "learning_rate": 1.3033842410251074e-08,
      "loss": 0.07743191719055176,
      "step": 380
    },
    {
      "epoch": 3.0,
      "step": 387,
      "total_flos": 3081875480379392.0,
      "train_loss": 0.06056562058377327,
      "train_runtime": 29609.547,
      "train_samples_per_second": 6.685,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 10,
  "max_steps": 387,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3081875480379392.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}