{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 144,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.034904013961605584,
      "grad_norm": 0.2724517285823822,
      "learning_rate": 5.333333333333334e-06,
      "loss": 0.0934,
      "step": 5
    },
    {
      "epoch": 0.06980802792321117,
      "grad_norm": 0.3414818048477173,
      "learning_rate": 1.2e-05,
      "loss": 0.0773,
      "step": 10
    },
    {
      "epoch": 0.10471204188481675,
      "grad_norm": 0.07792749255895615,
      "learning_rate": 1.866666666666667e-05,
      "loss": 0.0711,
      "step": 15
    },
    {
      "epoch": 0.13961605584642234,
      "grad_norm": 0.0294520054012537,
      "learning_rate": 1.995259033893236e-05,
      "loss": 0.0736,
      "step": 20
    },
    {
      "epoch": 0.17452006980802792,
      "grad_norm": 0.013957683928310871,
      "learning_rate": 1.9760758775559275e-05,
      "loss": 0.0697,
      "step": 25
    },
    {
      "epoch": 0.2094240837696335,
      "grad_norm": 0.065118707716465,
      "learning_rate": 1.9424380828337146e-05,
      "loss": 0.0699,
      "step": 30
    },
    {
      "epoch": 0.2443280977312391,
      "grad_norm": 0.021100476384162903,
      "learning_rate": 1.894843789440892e-05,
      "loss": 0.0697,
      "step": 35
    },
    {
      "epoch": 0.2792321116928447,
      "grad_norm": 0.026198429986834526,
      "learning_rate": 1.833997817889878e-05,
      "loss": 0.0695,
      "step": 40
    },
    {
      "epoch": 0.31413612565445026,
      "grad_norm": 0.07283973693847656,
      "learning_rate": 1.760801231854278e-05,
      "loss": 0.07,
      "step": 45
    },
    {
      "epoch": 0.34904013961605584,
      "grad_norm": 0.04578598588705063,
      "learning_rate": 1.676337994380903e-05,
      "loss": 0.0701,
      "step": 50
    },
    {
      "epoch": 0.38394415357766143,
      "grad_norm": 0.10095158964395523,
      "learning_rate": 1.581858915557953e-05,
      "loss": 0.0698,
      "step": 55
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 0.028562646359205246,
      "learning_rate": 1.4787631293572094e-05,
      "loss": 0.0699,
      "step": 60
    },
    {
      "epoch": 0.4537521815008726,
      "grad_norm": 0.02697976492345333,
      "learning_rate": 1.368577373958362e-05,
      "loss": 0.0695,
      "step": 65
    },
    {
      "epoch": 0.4886561954624782,
      "grad_norm": 0.0685800239443779,
      "learning_rate": 1.2529333823916807e-05,
      "loss": 0.0696,
      "step": 70
    },
    {
      "epoch": 0.5235602094240838,
      "grad_norm": 0.13133621215820312,
      "learning_rate": 1.133543718319398e-05,
      "loss": 0.0713,
      "step": 75
    },
    {
      "epoch": 0.5584642233856894,
      "grad_norm": 0.017290577292442322,
      "learning_rate": 1.0121764148019977e-05,
      "loss": 0.0696,
      "step": 80
    },
    {
      "epoch": 0.5933682373472949,
      "grad_norm": 0.05858515202999115,
      "learning_rate": 8.906287916221259e-06,
      "loss": 0.0696,
      "step": 85
    },
    {
      "epoch": 0.6282722513089005,
      "grad_norm": 0.07648473978042603,
      "learning_rate": 7.707008389035102e-06,
      "loss": 0.0699,
      "step": 90
    },
    {
      "epoch": 0.6631762652705061,
      "grad_norm": 0.052451424300670624,
      "learning_rate": 6.5416856118498874e-06,
      "loss": 0.0697,
      "step": 95
    },
    {
      "epoch": 0.6980802792321117,
      "grad_norm": 0.03691520541906357,
      "learning_rate": 5.427576766953615e-06,
      "loss": 0.0697,
      "step": 100
    },
    {
      "epoch": 0.7329842931937173,
      "grad_norm": 0.003152969991788268,
      "learning_rate": 4.381180613146396e-06,
      "loss": 0.0695,
      "step": 105
    },
    {
      "epoch": 0.7678883071553229,
      "grad_norm": 0.017924955114722252,
      "learning_rate": 3.4179931567925216e-06,
      "loss": 0.0694,
      "step": 110
    },
    {
      "epoch": 0.8027923211169284,
      "grad_norm": 0.04167533293366432,
      "learning_rate": 2.5522781725621814e-06,
      "loss": 0.0694,
      "step": 115
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 0.03422262519598007,
      "learning_rate": 1.7968559722048906e-06,
      "loss": 0.0692,
      "step": 120
    },
    {
      "epoch": 0.8726003490401396,
      "grad_norm": 0.0365980863571167,
      "learning_rate": 1.1629135494628097e-06,
      "loss": 0.0696,
      "step": 125
    },
    {
      "epoch": 0.9075043630017452,
      "grad_norm": 0.032294586300849915,
      "learning_rate": 6.598389126745209e-07,
      "loss": 0.0695,
      "step": 130
    },
    {
      "epoch": 0.9424083769633508,
      "grad_norm": 0.001334571628831327,
      "learning_rate": 2.9508205842594727e-07,
      "loss": 0.0695,
      "step": 135
    },
    {
      "epoch": 0.9773123909249564,
      "grad_norm": 0.05335932970046997,
      "learning_rate": 7.404464507973608e-08,
      "loss": 0.0693,
      "step": 140
    },
    {
      "epoch": 1.0,
      "step": 144,
      "total_flos": 2.4545020729727386e+17,
      "train_loss": 0.07097241137590674,
      "train_runtime": 1113.1898,
      "train_samples_per_second": 16.457,
      "train_steps_per_second": 0.129
    }
  ],
  "logging_steps": 5,
  "max_steps": 144,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4545020729727386e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}