prompt-injection-judge-3b/debug.log

[2026-03-29 17:06:48,176] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:1870] bf16 support detected, enabling for this configuration.
[2026-03-29 17:06:48,413] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:1870] baseline 0.000GB ()
[2026-03-29 17:06:48,417] [INFO] [axolotl.cli.config.load_cfg:248] [PID:1870] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "axolotl_config_path": "config.yaml",
  "base_model": "dphn/Dolphin3.0-Llama3.2-3B",
  "base_model_config": "dphn/Dolphin3.0-Llama3.2-3B",
  "batch_size": 16,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1
  },
  "chat_template": "tokenizer_default",
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_processes": 26,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "data_files": [
        "train.jsonl"
      ],
      "field_messages": "messages",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "karan11/defender-judge-fine-tune",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "env_capabilities": {
    "torch_version": "2.7.1"
  },
  "eval_batch_size": 4,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": false,
  "eval_steps": 50,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "include_tkps": true,
  "is_llama_derived_model": true,
  "learning_rate": 0.00015,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": true,
  "local_rank": 0,
  "logging_steps": 5,
  "lora_alpha": 64,
  "lora_dropout": 0.1,
  "lora_r": 32,
  "lora_target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 4,
  "model_config_type": "llama",
  "neftune_noise_alpha": 5.0,
  "num_epochs": 3.0,
  "optimizer": "adamw_bnb_8bit",
  "output_dir": "./outputs/defender-judge",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "remove_unused_columns": false,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 50,
  "sequence_len": 8192,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "dphn/Dolphin3.0-Llama3.2-3B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.1,
  "weight_decay": 0.05,
  "world_size": 1
}
[2026-03-29 17:06:48,431] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:1870] Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
[2026-03-29 17:06:49,360] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:1870] EOS: 128256 / <|im_end|>
[2026-03-29 17:06:49,362] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1870] BOS: 128000 / <|begin_of_text|>
[2026-03-29 17:06:49,364] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1870] PAD: 128001 / <|end_of_text|>
[2026-03-29 17:06:49,365] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1870] UNK: None / None
[2026-03-29 17:06:49,368] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:476] [PID:1870] Unable to find prepared dataset in last_run_prepared/b72df10b48d5f00efef65cbe761c72f7
[2026-03-29 17:06:49,370] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:1870] Loading raw datasets...
[2026-03-29 17:06:49,372] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:1870] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
[2026-03-29 17:06:50,431] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:1870] Loading dataset: karan11/defender-judge-fine-tune with base_type: chat_template and prompt_style: None
[2026-03-29 17:06:50,440] [INFO] [axolotl.prompt_strategies.chat_template.__call__:969] [PID:1870] Using chat template:
---
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Dolphin, created by Eric Hartford. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Dolphin, created by Eric Hartford. You are a helpful assistant.<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\n<tool_call>\n{"name": "' }}
            {{- tool_call.name }}
            {{- '", "arguments": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}

---

Tokenizing Prompts (num_proc=26):   0%|                                                                                                                  | 0/2700 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=26):   4%|████                                                                                                    | 104/2700 [00:01<00:33, 77.15 examples/s]
Tokenizing Prompts (num_proc=26):   8%|███████▉                                                                                               | 208/2700 [00:01<00:16, 147.13 examples/s]
Tokenizing Prompts (num_proc=26):  12%|███████████▉                                                                                           | 312/2700 [00:01<00:09, 241.56 examples/s]
Tokenizing Prompts (num_proc=26):  15%|███████████████▊                                                                                       | 416/2700 [00:01<00:07, 324.49 examples/s]
Tokenizing Prompts (num_proc=26):  19%|███████████████████▊                                                                                   | 520/2700 [00:01<00:05, 411.19 examples/s]
Tokenizing Prompts (num_proc=26):  23%|███████████████████████▊                                                                               | 624/2700 [00:02<00:04, 468.20 examples/s]
Tokenizing Prompts (num_proc=26):  27%|███████████████████████████▊                                                                           | 728/2700 [00:02<00:03, 525.61 examples/s]
Tokenizing Prompts (num_proc=26):  31%|███████████████████████████████▋                                                                       | 832/2700 [00:02<00:03, 579.18 examples/s]
Tokenizing Prompts (num_proc=26):  35%|███████████████████████████████████▋                                                                   | 936/2700 [00:02<00:02, 623.87 examples/s]
Tokenizing Prompts (num_proc=26):  39%|███████████████████████████████████████▎                                                              | 1040/2700 [00:02<00:02, 631.67 examples/s]
Tokenizing Prompts (num_proc=26):  42%|███████████████████████████████████████████▏                                                          | 1144/2700 [00:02<00:02, 631.58 examples/s]
Tokenizing Prompts (num_proc=26):  46%|███████████████████████████████████████████████▏                                                      | 1248/2700 [00:03<00:02, 645.11 examples/s]
Tokenizing Prompts (num_proc=26):  54%|███████████████████████████████████████████████████████                                               | 1456/2700 [00:03<00:01, 821.63 examples/s]
Tokenizing Prompts (num_proc=26):  58%|██████████████████████████████████████████████████████████▉                                           | 1560/2700 [00:03<00:01, 804.13 examples/s]
Tokenizing Prompts (num_proc=26):  62%|██████████████████████████████████████████████████████████████▊                                       | 1664/2700 [00:03<00:01, 772.18 examples/s]
Tokenizing Prompts (num_proc=26):  65%|█████████████████████████████████<E29688><E29688>
[2026-03-29 17:06:56,225] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:1870] min_input_len: 915
[2026-03-29 17:06:56,227] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:1870] max_input_len: 5341

Dropping Long Sequences (>8192) (num_proc=26):   0%|                                                                                                     | 0/2700 [00:00<?, ? examples/s]
Dropping Long Sequences (>8192) (num_proc=26):   4%|███▌                                                                                       | 104/2700 [00:01<00:26, 99.21 examples/s]
Dropping Long Sequences (>8192) (num_proc=26):  23%|████████████████████▊                                                                     | 624/2700 [00:01<00:02, 701.67 examples/s]
Dropping Long Sequences (>8192) (num_proc=26):  39%|█████████████████████████████████▉                                                      | 1040/2700 [00:01<00:01, 1190.54 examples/s]
Dropping Long Sequences (>8192) (num_proc=26): 100%|████████████████████████████████████████████████████████████████████████████████████████| 2700/2700 [00:01<00:00, 1481.80 examples/s]

Drop Samples with Zero Trainable Tokens (num_proc=26):   0%|                                                                                             | 0/2700 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=26):   4%|███▏                                                                               | 104/2700 [00:01<00:36, 71.14 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=26):  39%|███████████████████████████████▏                                                 | 1040/2700 [00:01<00:01, 881.31 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=26):  58%|██████████████████████████████████████████████▏                                 | 1559/2700 [00:01<00:01, 1117.53 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=26): 100%|████████████████████████████████████████████████████████████████████████████████| 2700/2700 [00:02<00:00, 1205.04 examples/s]

Add position_id column (Sample Packing) (num_proc=26):   0%|                                                                                             | 0/2700 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=26):   4%|███▏                                                                              | 104/2700 [00:00<00:13, 188.30 examples/s]
Add position_id column (Sample Packing) (num_proc=26):  15%|████████████▋                                                                     | 416/2700 [00:00<00:02, 773.39 examples/s]
Add position_id column (Sample Packing) (num_proc=26):  58%|██████████████████████████████████████████████▏                                 | 1560/2700 [00:00<00:00, 3137.08 examples/s]
Add position_id column (Sample Packing) (num_proc=26): 100%|████████████████████████████████████████████████████████████████████████████████| 2700/2700 [00:00<00:00, 5046.91 examples/s]
Add position_id column (Sample Packing) (num_proc=26): 100%|████████████████████████████████████████████████████████████████████████████████| 2700/2700 [00:01<00:00, 2033.93 examples/s]

Saving the dataset (0/10 shards):   0%|                                                                                                                  | 0/2700 [00:00<?, ? examples/s]
Saving the dataset (0/10 shards):  10%|██████████▏                                                                                           | 270/2700 [00:00<00:01, 1392.03 examples/s]
Saving the dataset (1/10 shards):  10%|██████████▏                                                                                           | 270/2700 [00:00<00:01, 1392.03 examples/s]
Saving the dataset (2/10 shards):  40%|████████████████████████████████████████▍                                                            | 1080/2700 [00:00<00:01, 1392.03 examples/s]
Saving the dataset (3/10 shards):  40%|████████████████████████████████████████▍                                                            | 1080/2700 [00:00<00:01, 1392.03 examples/s]
Saving the dataset (4/10 shards):  40%|████████████████████████████████████████▍                                                            | 1080/2700 [00:00<00:01, 1392.03 examples/s]
Saving the dataset (5/10 shards):  70%|██████████████████████████████████████████████████████████████████████▋                              | 1890/2700 [00:00<00:00, 1392.03 examples/s]
Saving the dataset (6/10 shards):  70%|██████████████████████████████████████████████████████████████████████▋                              | 1890/2700 [00:00<00:00, 1392.03 examples/s]
Saving the dataset (7/10 shards):  70%|██████████████████████████████████████████████████████████████████████▋                              | 1890/2700 [00:00<00:00, 1392.03 examples/s]
Saving the dataset (8/10 shards):  80%|████████████████████████████████████████████████████████████████████████████████▊                    | 2160/2700 [00:00<00:00, 1392.03 examples/s]
Saving the dataset (9/10 shards):  90%|██████████████████████████████████████████████████████████████████████████████████████████▉          | 2430/2700 [00:00<00:00, 1392.03 examples/s]
Saving the dataset (10/10 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2700/2700 [00:00<00:00, 1392.03 examples/s]
Saving the dataset (10/10 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2700/2700 [00:00<00:00, 8593.97 examples/s]
[2026-03-29 17:07:02,957] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:404] [PID:1870] total_num_tokens: 2_740_122
[2026-03-29 17:07:03,005] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:422] [PID:1870] `total_supervised_tokens: 202_860`
[2026-03-29 17:07:04,075] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.48715806007385254
[2026-03-29 17:07:04,570] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.4939417839050293
[2026-03-29 17:07:05,063] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.4897637367248535
[2026-03-29 17:07:05,552] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.4878394603729248
[2026-03-29 17:07:05,581] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:1870] gather_len_batches: [85]
[2026-03-29 17:07:05,582] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:481] [PID:1870] data_loader_len: 21
[2026-03-29 17:07:05,583] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:497] [PID:1870] sample_packing_eff_est across ranks: [0.9837869083180147]
[2026-03-29 17:07:05,585] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:509] [PID:1870] sample_packing_eff_est: 0.99
[2026-03-29 17:07:05,586] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:520] [PID:1870] total_num_steps: 63
[2026-03-29 17:07:05,588] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:1870] Maximum number of steps set at 63
[2026-03-29 17:07:05,617] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:1870] Loading tokenizer... dphn/Dolphin3.0-Llama3.2-3B
[2026-03-29 17:07:06,638] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:1870] EOS: 128256 / <|im_end|>
[2026-03-29 17:07:06,639] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:1870] BOS: 128000 / <|begin_of_text|>
[2026-03-29 17:07:06,641] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:1870] PAD: 128001 / <|end_of_text|>
[2026-03-29 17:07:06,642] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:1870] UNK: None / None
[2026-03-29 17:07:06,644] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:1870] Loading model
[2026-03-29 17:07:06,747] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:1870] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-29 17:07:06,752] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:1870] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-29 17:07:06,755] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:1870] Applying multipack dataloader patch for sample packing...

model.safetensors.index.json: 0.00B [00:00, ?B/s]
model.safetensors.index.json: 20.9kB [00:00, 78.6MB/s]

model-00001-of-00002.safetensors:   0%|                                                                                                                      | 0.00/4.97G [00:00<?, ?B/s]
model-00001-of-00002.safetensors:   0%|                                                                                                             | 931k/4.97G [00:01<2:46:31, 497kB/s]
model-00001-of-00002.safetensors:   1%|█▌                                                                                                           | 70.1M/4.97G [00:03<02:56, 27.8MB/s]
model-00001-of-00002.safetensors:   3%|███                                                                                                           | 140M/4.97G [00:03<01:34, 51.2MB/s]
model-00001-of-00002.safetensors:   4%|████▌                                                                                                         | 207M/4.97G [00:06<02:08, 37.1MB/s]
model-00001-of-00002.safetensors:   6%|██████                                                                                                        | 274M/4.97G [00:08<02:34, 30.4MB/s]
model-00001-of-00002.safetensors:   7%|███████▌                                                                                                      | 341M/4.97G [00:11<02:47, 27.6MB/s]
model-00001-of-00002.safetensors:   7%|███████▋                                                                                                      | 345M/4.97G [00:12<02:53, 26.6MB/s]
model-00001-of-00002.safetensors:   8%|█████████▏                                                                                                    | 414M/4.97G [00:13<02:31, 30.0MB/s]
model-00001-of-00002.safetensors:  10%|██████████▋                                                                                                   | 481M/4.97G [00:15<02:12, 33.8MB/s]
model-00001-of-00002.safetensors:  11%|████████████▏                                                                                                 | 548M/4.97G [00:16<01:45, 41.9MB/s]
model-00001-of-00002.safetensors:  12%|█████████████▋                                                                                                | 616M/4.97G [00:17<01:43, 41.8MB/s]
model-00001-of-00002.safetensors:  14%|███████████████                                                                                               | 683M/4.97G [00:20<01:53, 37.7MB/s]
model-00001-of-00002.safetensors:  15%|████████████████▌                                                                                             | 750M/4.97G [00:22<02:02, 34.5MB/s]
model-00001-of-00002.safetensors:  16%|██████████████████                                                                                            | 817M/4.97G [00:23<01:47, 38.6MB/s]
model-00001-of-00002.safetensors:  18%|███████████████████▌                                                                                          | 884M/4.97G [00:26<01:59, 34.3MB/s]
model-00001-of-00002.safetensors:  19%|█████████████████████                                                                                         | 951M/4.97G [00:26<01:25, 47.2MB/s]
model-00001-of-00002.safetensors:  20%|██████████████████████▎                                                                                      | 1.02G/4.97G [00:28<01:28, 44.4MB/s]
model-00001-of-00002.safetensors:  22%|███████████████████████▊                                                                                     | 1.08G/4.97G [00:29<01:21, 47.4MB/s]
model-00001-of-00002.safetensors:  23%|█████████████████████████▎       

model-00002-of-00002.safetensors:   0%|                                                                                                                      | 0.00/1.46G [00:00<?, ?B/s]
model-00002-of-00002.safetensors:   0%|                                                                                                             | 609k/1.46G [00:02<1:34:37, 257kB/s]
model-00002-of-00002.safetensors:   5%|█████                                                                                                        | 67.7M/1.46G [00:05<01:45, 13.2MB/s]
model-00002-of-00002.safetensors:   9%|██████████▏                                                                                                   | 135M/1.46G [00:07<00:56, 23.3MB/s]
model-00002-of-00002.safetensors:  14%|███████████████▏                                                                                              | 202M/1.46G [00:10<00:56, 22.2MB/s]
model-00002-of-00002.safetensors:  18%|████████████████████▎                                                                                         | 269M/1.46G [00:11<00:39, 30.3MB/s]
model-00002-of-00002.safetensors:  23%|█████████████████████████▎                                                                                    | 336M/1.46G [00:11<00:27, 41.4MB/s]
model-00002-of-00002.safetensors:  28%|██████████████████████████████▎                                                                               | 403M/1.46G [00:12<00:21, 48.1MB/s]
model-00002-of-00002.safetensors:  32%|███████████████████████████████████▍                                                                          | 470M/1.46G [00:13<00:17, 55.8MB/s]
model-00002-of-00002.safetensors:  37%|████████████████████████████████████████▍                                                                     | 537M/1.46G [00:14<00:14, 64.2MB/s]
model-00002-of-00002.safetensors:  41%|█████████████████████████████████████████████▌                                                                | 604M/1.46G [00:15<00:12, 66.4MB/s]
model-00002-of-00002.safetensors:  46%|██████████████████████████████████████████████████▌                                                           | 671M/1.46G [00:15<00:10, 73.8MB/s]
model-00002-of-00002.safetensors:  51%|███████████████████████████████████████████████████████▋                                                      | 738M/1.46G [00:16<00:09, 75.1MB/s]
model-00002-of-00002.safetensors:  54%|███████████████████████████████████████████████████████████▍                                                  | 789M/1.46G [00:17<00:09, 71.6MB/s]
model-00002-of-00002.safetensors:  59%|████████████████████████████████████████████████████████████████▌                                             | 856M/1.46G [00:18<00:08, 71.0MB/s]
model-00002-of-00002.safetensors:  63%|█████████████████████████████████████████████████████████████████████▌                                        | 924M/1.46G [00:19<00:07, 71.9MB/s]
model-00002-of-00002.safetensors:  68%|███

Loading checkpoint shards:   0%|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                              | 1/2 [00:06<00:06,  6.41s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  3.97s/it]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████<E29688><E29688>

generation_config.json:   0%|                                                                                    | 0.00/206 [00:00<?, ?B/s]
generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████| 206/206 [00:00<00:00, 556kB/s]
[2026-03-29 17:08:50,906] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:863] [PID:1870] converting PEFT model w/ prepare_model_for_kbit_training
[2026-03-29 17:08:50,909] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:1870] Converting modules to torch.bfloat16
[2026-03-29 17:08:50,913] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1870] Memory usage after model load 4.879GB (+4.879GB allocated, +4.984GB reserved)
trainable params: 48,627,712 || all params: 3,261,383,680 || trainable%: 1.4910
[2026-03-29 17:08:51,319] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:1870] after adapters 3.559GB (+3.559GB allocated, +5.166GB reserved)
[2026-03-29 17:08:55,931] [INFO] [axolotl.train.save_initial_configs:398] [PID:1870] Pre-saving adapter config to ./outputs/defender-judge...
[2026-03-29 17:08:55,954] [INFO] [axolotl.train.save_initial_configs:402] [PID:1870] Pre-saving tokenizer to ./outputs/defender-judge...
[2026-03-29 17:08:56,307] [INFO] [axolotl.train.save_initial_configs:407] [PID:1870] Pre-saving model config to ./outputs/defender-judge...
[2026-03-29 17:08:56,337] [INFO] [axolotl.train.execute_training:196] [PID:1870] Starting trainer...
[2026-03-29 17:08:58,996] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.915003776550293
[2026-03-29 17:08:59,888] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.8903026580810547
[2026-03-29 17:09:00,762] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.8731865882873535
[2026-03-29 17:09:01,657] [DEBUG] [axolotl.utils.samplers.multipack.__len__:458] [PID:1870] generate_batches time: 0.8936741352081299
[2026-03-29 17:09:01,660] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:434] [PID:1870] gather_len_batches: [85]

  0%|                                                                                                                                                                   | 0/63 [00:00<?, ?it/s][2026-03-29 17:09:05,685] [WARNING] [py.warnings._showwarnmsg:110] [PID:1870] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")


  2%|██▍                                                                                                                                                        | 1/63 [00:17<18:06, 17.53s/it]
  3%|████▉                                                                                                                                                      | 2/63 [00:30<15:12, 14.96s/it]
  5%|███████▌                                                                                                                                                        | 3/63 [00:43<14:08, 14.14s/it]
  6%|██████████▏                                                                                                                                                     | 4/63 [00:56<13:31, 13.75s/it]
  8%|████████████▋                                                                                                                                                   | 5/63 [01:10<13:07, 13.57s/it]
                                                                                                                                                                                                    
{'loss': 2.3013, 'grad_norm': 1.6521763801574707, 'learning_rate': 9.999999999999999e-05, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 718.74, 'epoch': 0.24}

  8%|████████████▋                                                                                                                                                   | 5/63 [01:10<13:07, 13.57s/it]
 10%|███████████████▏                                                                                                                                                | 6/63 [01:23<12:45, 13.43s/it]
 11%|█████████████████▊                                                                                                                                              | 7/63 [01:36<12:26, 13.34s/it]
 13%|████████████████████▎                                                                                                                                           | 8/63 [01:49<12:10, 13.27s/it]
 14%|██████████████████████▊                                                                                                                                         | 9/63 [02:02<11:54, 13.23s/it]
 16%|█████████████████████████▏                                                                                                                                     | 10/63 [02:15<11:39, 13.20s/it]
                                                                                                                                                                                                    
{'loss': 1.3236, 'grad_norm': 1.0924193859100342, 'learning_rate': 0.00014897709775520417, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 731.56, 'epoch': 0.47}

 16%|█████████████████████████▏                                                                                                                                     | 10/63 [02:15<11:39, 13.20s/it]
 17%|███████████████████████████▊                                                                                                                                   | 11/63 [02:29<11:25, 13.18s/it]
 19%|██████████████████████████████▎                                                                                                                                | 12/63 [02:42<11:12, 13.18s/it]
 21%|████████████████████████████████▊                                                                                                                              | 13/63 [02:55<10:58, 13.18s/it]
 22%|███████████████████████████████████▎                                                                                                                           | 14/63 [03:08<10:45, 13.17s/it]
 24%|█████████████████████████████████████▊                                                                                                                         | 15/63 [03:21<10:32, 13.17s/it]
                                                                                                                                                                                                    
{'loss': 0.5684, 'grad_norm': 0.6611356735229492, 'learning_rate': 0.00014282678705231832, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 722.65, 'epoch': 0.71}

 24%|█████████████████████████████████████▊                                                                                                                         | 15/63 [03:21<10:32, 13.17s/it]
 25%|████████████████████████████████████████▍                                                                                                                      | 16/63 [03:34<10:18, 13.16s/it]
 27%|██████████████████████████████████████████▉                                                                                                                    | 17/63 [03:48<10:05, 13.16s/it]
 29%|█████████████████████████████████████████████▍                                                                                                                 | 18/63 [04:01<09:52, 13.17s/it]
 30%|███████████████████████████████████████████████▉                                                                                                               | 19/63 [04:14<09:39, 13.17s/it]
 32%|██████████████████████████████████████████████████▍                                                                                                            | 20/63 [04:27<09:25, 13.16s/it]
                                                                                                                                                                                                    
{'loss': 0.3071, 'grad_norm': 0.6052656173706055, 'learning_rate': 0.00013155799573326722, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 728.39, 'epoch': 0.94}

 32%|██████████████████████████████████████████████████▍                                                                                                            | 20/63 [04:27<09:25, 13.16s/it]
 33%|█████████████████████████████████████████████████████                                                                                                          | 21/63 [04:40<09:12, 13.15s/it]
 35%|███████████████████████████████████████████████████████▌                                                                                                       | 22/63 [04:43<06:50, 10.00s/it]
 37%|██████████████████████████████████████████████████████████                                                                                                     | 23/63 [05:00<07:59, 12.00s/it]
 38%|████████████████████████████████████████████████████████████▌                                                                                                  | 24/63 [05:13<08:01, 12.34s/it]
 40%|███████████████████████████████████████████████████████████████                                                                                                | 25/63 [05:26<07:58, 12.58s/it]
                                                                                                                                                                                                    
{'loss': 0.2304, 'grad_norm': 0.5231208801269531, 'learning_rate': 0.00011602111185918203, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 625.16, 'epoch': 1.14}

 40%|███████████████████████████████████████████████████████████████                                                                                                | 25/63 [05:26<07:58, 12.58s/it]
 41%|█████████████████████████████████████████████████████████████████▌                                                                                             | 26/63 [05:39<07:52, 12.77s/it]
 43%|████████████████████████████████████████████████████████████████████▏                                                                                          | 27/63 [05:52<07:43, 12.89s/it]
 44%|██████████████████████████████████████████████████████████████████████▋                                                                                        | 28/63 [06:05<07:33, 12.96s/it]
 46%|█████████████████████████████████████████████████████████████████████████▏                                                                                     | 29/63 [06:18<07:22, 13.02s/it]
 48%|███████████████████████████████████████████████████████████████████████████▋                                                                                   | 30/63 [06:32<07:11, 13.06s/it]
                                                                                                                                                                                                    
{'loss': 0.2163, 'grad_norm': 0.34258556365966797, 'learning_rate': 9.738861082512709e-05, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 724.72, 'epoch': 1.38}

 48%|███████████████████████████████████████████████████████████████████████████▋                                                                                   | 30/63 [06:32<07:11, 13.06s/it]
 49%|██████████████████████████████████████████████████████████████████████████████▏                                                                                | 31/63 [06:45<06:59, 13.11s/it]
 51%|████████████████████████████████████████████████████████████████████████████████▊                                                                              | 32/63 [06:58<06:46, 13.12s/it]
 52%|███████████████████████████████████████████████████████████████████████████████████▎                                                                           | 33/63 [07:11<06:34, 13.14s/it]
 54%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 34/63 [07:24<06:20, 13.13s/it]
 56%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 35/63 [07:37<06:07, 13.14s/it]
                                                                                                                                                                                                    
{'loss': 0.2039, 'grad_norm': 0.3575713038444519, 'learning_rate': 7.706657567761216e-05, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 723.32, 'epoch': 1.61}

 56%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 35/63 [07:37<06:07, 13.14s/it]
 57%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                    | 36/63 [07:51<05:54, 13.15s/it]
 59%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 37/63 [08:04<05:42, 13.16s/it]
 60%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 38/63 [08:17<05:28, 13.15s/it]
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 39/63 [08:30<05:15, 13.16s/it]
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 40/63 [08:43<05:02, 13.17s/it]
                                                                                                                                                                                                    
{'loss': 0.1938, 'grad_norm': 0.4452803134918213, 'learning_rate': 5.658858846444006e-05, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 725.05, 'epoch': 1.85}

 63%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                          | 40/63 [08:43<05:02, 13.17s/it]
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 41/63 [08:56<04:49, 13.16s/it]
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                     | 42/63 [09:10<04:36, 13.16s/it]
 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 43/63 [09:23<04:23, 13.16s/it]
 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 44/63 [09:26<03:14, 10.26s/it]
 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 45/63 [09:43<03:38, 12.13s/it]
                                                                                                                                                                                                    
{'loss': 0.1849, 'grad_norm': 0.2797813415527344, 'learning_rate': 3.750000000000001e-05, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 607.58, 'epoch': 2.05}

 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                             | 45/63 [09:43<03:38, 12.13s/it]
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 46/63 [09:56<03:31, 12.44s/it]
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 47/63 [10:09<03:22, 12.66s/it]
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 48/63 [10:22<03:12, 12.80s/it]
 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 49/63 [10:35<03:00, 12.92s/it]
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 50/63 [10:49<02:48, 12.99s/it]
                                                                                                                                                                                                    
{'loss': 0.1819, 'grad_norm': 0.36559921503067017, 'learning_rate': 2.1241311512366167e-05, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 730.88, 'epoch': 2.28}

 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 50/63 [10:49<02:48, 12.99s/it][2026-03-29 17:19:50,790] [INFO] [axolotl.core.trainers.base._save:671] [PID:1870] Saving model checkpoint to ./outputs/defender-judge/checkpoint-50
[2026-03-29 17:19:53,187] [WARNING] [py.warnings._showwarnmsg:110] [PID:1870] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:186: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")


 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 51/63 [11:04<02:45, 13.77s/it]
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 52/63 [11:17<02:29, 13.59s/it]
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 53/63 [11:31<02:14, 13.47s/it]
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 54/63 [11:44<02:00, 13.37s/it]
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 55/63 [11:57<01:46, 13.30s/it]
                                                                                                                                                                                                    
{'loss': 0.1818, 'grad_norm': 0.2546774744987488, 'learning_rate': 9.039468659513327e-06, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 729.82, 'epoch': 2.52}

 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 55/63 [11:57<01:46, 13.30s/it]
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 56/63 [12:10<01:32, 13.26s/it]
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 57/63 [12:23<01:19, 13.22s/it]
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 58/63 [12:36<01:06, 13.21s/it]
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 59/63 [12:49<00:52, 13.19s/it]
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 60/63 [13:03<00:39, 13.18s/it]
                                                                                                                                                                                                    
{'loss': 0.1846, 'grad_norm': 0.27984631061553955, 'learning_rate': 1.8152713029423283e-06, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'tokens_per_second_per_gpu': 723.26, 'epoch': 2.75}

 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 60/63 [13:03<00:39, 13.18s/it]
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 61/63 [13:16<00:26, 13.17s/it]
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 62/63 [13:29<00:13, 13.17s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [13:42<00:00, 13.17s/it][2026-03-29 17:22:44,281] [INFO] [axolotl.core.trainers.base._save:671] [PID:1870] Saving model checkpoint to ./outputs/defender-judge/checkpoint-63

                                                                                                                                                                                                    
{'train_runtime': 825.577, 'train_samples_per_second': 1.221, 'train_steps_per_second': 0.076, 'train_loss': 0.49069616813508293, 'memory/max_active (GiB)': 57.73, 'memory/max_allocated (GiB)': 57.73, 'memory/device_reserved (GiB)': 66.98, 'epoch': 2.89}

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [13:45<00:00, 13.17s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [13:45<00:00, 13.10s/it]
[2026-03-29 17:22:47,727] [INFO] [axolotl.train.save_trained_model:218] [PID:1870] Training completed! Saving trained model to ./outputs/defender-judge.
[2026-03-29 17:22:48,827] [INFO] [axolotl.train.save_trained_model:336] [PID:1870] Model successfully saved to ./outputs/defender-judge