Llama3.2-3B_Paper_Impact_me…/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 166,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030211480362537766,
      "grad_norm": 0.21704457700252533,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.284,
      "step": 5
    },
    {
      "epoch": 0.06042296072507553,
      "grad_norm": 0.07700642943382263,
      "learning_rate": 1.0588235294117648e-05,
      "loss": 0.091,
      "step": 10
    },
    {
      "epoch": 0.09063444108761329,
      "grad_norm": 1.0004132986068726,
      "learning_rate": 1.647058823529412e-05,
      "loss": 0.0881,
      "step": 15
    },
    {
      "epoch": 0.12084592145015106,
      "grad_norm": 0.017268722876906395,
      "learning_rate": 1.9991110182465032e-05,
      "loss": 0.0856,
      "step": 20
    },
    {
      "epoch": 0.1510574018126888,
      "grad_norm": 0.06252593547105789,
      "learning_rate": 1.9891281165856876e-05,
      "loss": 0.0776,
      "step": 25
    },
    {
      "epoch": 0.18126888217522658,
      "grad_norm": 0.013158817775547504,
      "learning_rate": 1.968162302997659e-05,
      "loss": 0.0796,
      "step": 30
    },
    {
      "epoch": 0.21148036253776434,
      "grad_norm": 0.054746970534324646,
      "learning_rate": 1.9364463741042694e-05,
      "loss": 0.0775,
      "step": 35
    },
    {
      "epoch": 0.24169184290030213,
      "grad_norm": 0.027843380346894264,
      "learning_rate": 1.8943324918225495e-05,
      "loss": 0.0776,
      "step": 40
    },
    {
      "epoch": 0.2719033232628399,
      "grad_norm": 0.09170668572187424,
      "learning_rate": 1.8422882730893323e-05,
      "loss": 0.0778,
      "step": 45
    },
    {
      "epoch": 0.3021148036253776,
      "grad_norm": 0.049589045345783234,
      "learning_rate": 1.7808915976161364e-05,
      "loss": 0.0776,
      "step": 50
    },
    {
      "epoch": 0.3323262839879154,
      "grad_norm": 0.057375721633434296,
      "learning_rate": 1.710824191327075e-05,
      "loss": 0.0787,
      "step": 55
    },
    {
      "epoch": 0.36253776435045315,
      "grad_norm": 0.04218236356973648,
      "learning_rate": 1.632864056726917e-05,
      "loss": 0.079,
      "step": 60
    },
    {
      "epoch": 0.39274924471299094,
      "grad_norm": 0.1125708743929863,
      "learning_rate": 1.5478768342496872e-05,
      "loss": 0.0776,
      "step": 65
    },
    {
      "epoch": 0.4229607250755287,
      "grad_norm": 0.024594679474830627,
      "learning_rate": 1.4568061905081874e-05,
      "loss": 0.0779,
      "step": 70
    },
    {
      "epoch": 0.45317220543806647,
      "grad_norm": 0.01017661951482296,
      "learning_rate": 1.3606633401697557e-05,
      "loss": 0.0782,
      "step": 75
    },
    {
      "epoch": 0.48338368580060426,
      "grad_norm": 0.07011096179485321,
      "learning_rate": 1.2605158178034656e-05,
      "loss": 0.0791,
      "step": 80
    },
    {
      "epoch": 0.513595166163142,
      "grad_norm": 0.01498446986079216,
      "learning_rate": 1.157475624372018e-05,
      "loss": 0.0792,
      "step": 85
    },
    {
      "epoch": 0.5438066465256798,
      "grad_norm": 0.03184051066637039,
      "learning_rate": 1.0526868799852797e-05,
      "loss": 0.0779,
      "step": 90
    },
    {
      "epoch": 0.5740181268882175,
      "grad_norm": 0.078987717628479,
      "learning_rate": 9.473131200147205e-06,
      "loss": 0.0781,
      "step": 95
    },
    {
      "epoch": 0.6042296072507553,
      "grad_norm": 0.05952491611242294,
      "learning_rate": 8.425243756279824e-06,
      "loss": 0.0771,
      "step": 100
    },
    {
      "epoch": 0.6344410876132931,
      "grad_norm": 0.014677044935524464,
      "learning_rate": 7.394841821965345e-06,
      "loss": 0.0771,
      "step": 105
    },
    {
      "epoch": 0.6646525679758308,
      "grad_norm": 0.03106486238539219,
      "learning_rate": 6.3933665983024465e-06,
      "loss": 0.0776,
      "step": 110
    },
    {
      "epoch": 0.6948640483383686,
      "grad_norm": 0.03548077121376991,
      "learning_rate": 5.431938094918132e-06,
      "loss": 0.0767,
      "step": 115
    },
    {
      "epoch": 0.7250755287009063,
      "grad_norm": 0.02386642061173916,
      "learning_rate": 4.5212316575031325e-06,
      "loss": 0.0778,
      "step": 120
    },
    {
      "epoch": 0.7552870090634441,
      "grad_norm": 0.03368431329727173,
      "learning_rate": 3.6713594327308343e-06,
      "loss": 0.0776,
      "step": 125
    },
    {
      "epoch": 0.7854984894259819,
      "grad_norm": 0.016041960567235947,
      "learning_rate": 2.891758086729253e-06,
      "loss": 0.0769,
      "step": 130
    },
    {
      "epoch": 0.8157099697885196,
      "grad_norm": 0.03274780884385109,
      "learning_rate": 2.19108402383864e-06,
      "loss": 0.0768,
      "step": 135
    },
    {
      "epoch": 0.8459214501510574,
      "grad_norm": 0.03985007107257843,
      "learning_rate": 1.5771172691066793e-06,
      "loss": 0.0765,
      "step": 140
    },
    {
      "epoch": 0.8761329305135952,
      "grad_norm": 0.02575680799782276,
      "learning_rate": 1.0566750817745076e-06,
      "loss": 0.077,
      "step": 145
    },
    {
      "epoch": 0.9063444108761329,
      "grad_norm": 0.04101819917559624,
      "learning_rate": 6.355362589573078e-07,
      "loss": 0.0758,
      "step": 150
    },
    {
      "epoch": 0.9365558912386707,
      "grad_norm": 0.06996775418519974,
      "learning_rate": 3.1837697002341293e-07,
      "loss": 0.0775,
      "step": 155
    },
    {
      "epoch": 0.9667673716012085,
      "grad_norm": 0.007283793296664953,
      "learning_rate": 1.0871883414312778e-07,
      "loss": 0.0758,
      "step": 160
    },
    {
      "epoch": 0.9969788519637462,
      "grad_norm": 0.017074227333068848,
      "learning_rate": 8.889817534969425e-09,
      "loss": 0.0768,
      "step": 165
    },
    {
      "epoch": 1.0,
      "step": 166,
      "total_flos": 2.8024067250035098e+17,
      "train_loss": 0.08477670932749667,
      "train_runtime": 1255.2183,
      "train_samples_per_second": 16.86,
      "train_steps_per_second": 0.132
    }
  ],
  "logging_steps": 5,
  "max_steps": 166,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.8024067250035098e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}