Files
ModelHub XC 40496fc651 初始化项目,由ModelHub XC社区提供模型
Model: Gandalf1/qwen3-8b-finance-finqa-phase3-merged
Source: Original Platform
2026-06-01 14:21:54 +08:00

1881 lines
217 KiB
Plaintext
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

[2026-05-14 13:44:44,807] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:11984] bf16 support detected, enabling for this configuration.
[2026-05-14 13:44:45,352] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:11984] baseline 0.000GB ()
[2026-05-14 13:44:45,353] [INFO] [axolotl.cli.config.load_cfg:333] [PID:11984] config:
{
"activation_offloading": false,
"adapter": "qlora",
"attn_implementation": "flash_attention_2",
"attn_needs_dtype_cast": true,
"attn_supports_packing": true,
"attn_uses_flash_lib": true,
"auto_resume_from_checkpoints": true,
"axolotl_config_path": "./data/config.yaml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 80,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 1,
"n_node": 1,
"tf32": true
},
"chat_template": "qwen3",
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 128,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"field_tools": "tools",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
"roles_to_train": [
"assistant"
],
"train_on_eos": "turn",
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": false,
"device": "cuda:0",
"device_map": "auto",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 10,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"fp16": false,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
"hub_strategy": "all_checkpoints",
"include_tkps": true,
"is_preprocess": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 10,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_mlp_kernel": true,
"lora_o_kernel": true,
"lora_qkv_kernel": true,
"lora_r": 32,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"loss_watchdog_patience": 3,
"loss_watchdog_threshold": 5.0,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 10,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_4bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/finance-synthetic-sft-phase2",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"relora_prune_method": "magnitude",
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.16666666666666666,
"save_total_limit": 3,
"saves_per_epoch": 3,
"seed": 42,
"sequence_len": 8192,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.05,
"weight_decay": 0.01,
"world_size": 1
}
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:11984] EOS: 151645 / <|im_end|>
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:11984] BOS: None / None
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:11984] PAD: 151643 / <|endoftext|>
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:11984] UNK: None / None
[2026-05-14 13:44:46,922] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:11984] Unable to find prepared dataset in last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d
[2026-05-14 13:44:46,922] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:11984] Loading raw datasets...
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]
README.md: 0.00B [00:00, ?B/s]
README.md: 1.66kB [00:00, 6.58MB/s]
Download complete: : 0.00B [00:00, ?B/s]
data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00<?, ?B/s]
data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00<?, ?B/s]
data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00<?, ?B/s]
data/train-00000-of-00001.parquet: 19%|██████▋ | 5.65M/29.6M [00:00<00:00, 28.1MB/s]
data/train-00000-of-00001.parquet: 85%|█████████████████████████████▊ | 25.2M/29.6M [00:00<00:00, 69.0MB/s]
data/train-00000-of-00001.parquet: 100%|███████████████████████████████████| 29.6M/29.6M [00:01<00:00, 28.9MB/s]
Generating train split: 0%| | 0/14763 [00:00<?, ? examples/s]
Generating train split: 100%|███████████████████████████████████| 14763/14763 [00:00<00:00, 59899.50 examples/s]
Generating train split: 100%|███████████████████████████████████| 14763/14763 [00:00<00:00, 58973.94 examples/s]
[2026-05-14 13:44:49,700] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:11984] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None
[2026-05-14 13:44:49,704] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:11984] Using chat template:
---
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
{%- set ns.real_last_index = real_last_index %}
{%- else %}
{%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n\n' }}
{%- endif %}
{%- endif %}
---
Tokenizing Prompts (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=128): 1%|▏ | 116/14763 [00:07<15:10, 16.08 examples/s]
Tokenizing Prompts (num_proc=128): 2%|▍ | 232/14763 [00:08<07:37, 31.73 examples/s]
Tokenizing Prompts (num_proc=128): 2%|▋ | 348/14763 [00:09<05:19, 45.05 examples/s]
Tokenizing Prompts (num_proc=128): 3%|▉ | 464/14763 [00:10<04:11, 56.86 examples/s]
Tokenizing Prompts (num_proc=128): 4%|█▏ | 580/14763 [00:12<03:33, 66.54 examples/s]
Tokenizing Prompts (num_proc=128): 5%|█▎ | 696/14763 [00:13<03:13, 72.63 examples/s]
Tokenizing Prompts (num_proc=128): 6%|█▌ | 812/14763 [00:14<02:59, 77.53 examples/s]
Tokenizing Prompts (num_proc=128): 6%|█▊ | 928/14763 [00:16<02:51, 80.80 examples/s]
Tokenizing Prompts (num_proc=128): 7%|█▉ | 1044/14763 [00:17<02:54, 78.42 examples/s]
Tokenizing Prompts (num_proc=128): 8%|██▏ | 1160/14763 [00:18<02:41, 84.20 examples/s]
Tokenizing Prompts (num_proc=128): 9%|██▍ | 1276/14763 [00:19<02:28, 91.02 examples/s]
Tokenizing Prompts (num_proc=128): 9%|██▋ | 1392/14763 [00:21<02:33, 87.25 examples/s]
Tokenizing Prompts (num_proc=128): 10%|██▊ | 1508/14763 [00:22<02:25, 90.92 examples/s]
Tokenizing Prompts (num_proc=128): 11%|███ | 1624/14763 [00:23<02:21, 92.95 examples/s]
Tokenizing Prompts (num_proc=128): 12%|███▎ | 1740/14763 [00:24<02:20, 92.68 examples/s]
Tokenizing Prompts (num_proc=128): 13%|███▌ | 1856/14763 [00:26<02:23, 89.77 examples/s]
Tokenizing Prompts (num_proc=128): 13%|███▋ | 1972/14763 [00:27<02:25, 87.76 examples/s]
Tokenizing Prompts (num_proc=128): 14%|███▉ | 2088/14763 [00:28<02:18, 91.45 examples/s]
Tokenizing Prompts (num_proc=128): 15%|████▏ | 2204/14763 [00:29<02:16, 92.15 examples/s]
Tokenizing Prompts (num_proc=128): 16%|████▍ | 2320/14763 [00:31<02:10, 95.70 examples/s]
Tokenizing Prompts (num_proc=128): 17%|████▌ | 2436/14763 [00:32<02:08, 95.82 examples/s]
Tokenizing Prompts (num_proc=128): 17%|████▊ | 2552/14763 [00:33<02:08, 95.13 examples/s]
Tokenizing Prompts (num_proc=128): 18%|█████ | 2668/14763 [00:34<02:14, 90.10 examples/s]
Tokenizing Prompts (num_proc=128): 19%|█████▎ | 2784/14763 [00:36<02:15, 88.73 examples/s]
Tokenizing Prompts (num_proc=128): 20%|█████▌ | 2900/14763 [00:37<02:05, 94.65 examples/s]
Tokenizing Prompts (num_proc=128): 20%|█████▋ | 3016/14763 [00:38<02:09, 90.38 examples/s]
Tokenizing Prompts (num_proc=128): 21%|█████▉ | 3132/14763 [00:39<02:04, 93.11 examples/s]
Tokenizing Prompts (num_proc=128): 22%|██████▏ | 3248/14763 [00:41<02:02, 94.23 examples/s]
Tokenizing Prompts (num_proc=128): 23%|██████▍ | 3364/14763 [00:42<01:58, 96.24 examples/s]
Tokenizing Prompts (num_proc=128): 24%|██████▌ | 3480/14763 [00:43<02:01, 93.09 examples/s]
Tokenizing Prompts (num_proc=128): 24%|██████▊ | 3596/14763 [00:44<01:55, 96.34 examples/s]
Tokenizing Prompts (num_proc=128): 25%|███████ | 3712/14763 [00:45<01:54, 96.79 examples/s]
Tokenizing Prompts (num_proc=128): 26%|███████▎ | 3828/14763 [00:47<01:57, 93.43 examples/s]
Tokenizing Prompts (num_proc=128): 27%|███████▍ | 3944/14763 [00:48<01:50, 97.81 examples/s]
Tokenizing Prompts (num_proc=128): 28%|███████▋ | 4060/14763 [00:49<02:00, 88.88 examples/s]
Tokenizing Prompts (num_proc=128): 28%|███████▉ | 4176/14763 [00:50<01:50, 96.18 examples/s]
Tokenizing Prompts (num_proc=128): 29%|████████▏ | 4292/14763 [00:52<01:56, 89.67 examples/s]
Tokenizing Prompts (num_proc=128): 30%|████████▎ | 4408/14763 [00:53<01:56, 88.83 examples/s]
Tokenizing Prompts (num_proc=128): 31%|████████▌ | 4524/14763 [00:54<01:46, 95.96 examples/s]
Tokenizing Prompts (num_proc=128): 31%|████████▊ | 4640/14763 [00:56<01:48, 93.18 examples/s]
Tokenizing Prompts (num_proc=128): 32%|█████████ | 4756/14763 [00:57<01:46, 94.26 examples/s]
Tokenizing Prompts (num_proc=128): 33%|█████████▏ | 4872/14763 [00:58<01:45, 93.61 examples/s]
Tokenizing Prompts (num_proc=128): 34%|█████████▍ | 4988/14763 [00:59<01:40, 97.16 examples/s]
Tokenizing Prompts (num_proc=128): 35%|█████████▋ | 5103/14763 [01:00<01:41, 95.06 examples/s]
Tokenizing Prompts (num_proc=128): 35%|█████████▉ | 5218/14763 [01:02<01:41, 94.13 examples/s]
Tokenizing Prompts (num_proc=128): 36%|██████████ | 5333/14763 [01:03<01:38, 95.28 examples/s]
Tokenizing Prompts (num_proc=128): 37%|██████████▎ | 5448/14763 [01:04<01:39, 93.31 examples/s]
Tokenizing Prompts (num_proc=128): 38%|██████████▌ | 5563/14763 [01:05<01:41, 90.98 examples/s]
Tokenizing Prompts (num_proc=128): 38%|██████████▊ | 5678/14763 [01:06<01:35, 95.59 examples/s]
Tokenizing Prompts (num_proc=128): 39%|██████████▉ | 5793/14763 [01:08<01:38, 91.25 examples/s]
Tokenizing Prompts (num_proc=128): 40%|███████████▏ | 5908/14763 [01:09<01:29, 99.04 examples/s]
Tokenizing Prompts (num_proc=128): 41%|███████████▍ | 6023/14763 [01:10<01:29, 97.97 examples/s]
Tokenizing Prompts (num_proc=128): 42%|███████████▋ | 6138/14763 [01:11<01:26, 99.63 examples/s]
Tokenizing Prompts (num_proc=128): 42%|███████████▊ | 6253/14763 [01:13<01:33, 91.29 examples/s]
Tokenizing Prompts (num_proc=128): 43%|████████████ | 6368/14763 [01:14<01:35, 88.20 examples/s]
Tokenizing Prompts (num_proc=128): 44%|████████████▎ | 6483/14763 [01:15<01:32, 89.27 examples/s]
Tokenizing Prompts (num_proc=128): 45%|████████████▌ | 6598/14763 [01:16<01:27, 93.59 examples/s]
Tokenizing Prompts (num_proc=128): 45%|████████████▋ | 6713/14763 [01:18<01:30, 89.32 examples/s]
Tokenizing Prompts (num_proc=128): 46%|████████████▉ | 6828/14763 [01:19<01:28, 89.89 examples/s]
Tokenizing Prompts (num_proc=128): 47%|█████████████▏ | 6943/14763 [01:20<01:20, 97.28 examples/s]
Tokenizing Prompts (num_proc=128): 48%|█████████████▍ | 7058/14763 [01:21<01:22, 93.14 examples/s]
Tokenizing Prompts (num_proc=128): 49%|█████████████▌ | 7173/14763 [01:23<01:20, 93.80 examples/s]
Tokenizing Prompts (num_proc=128): 49%|█████████████▊ | 7288/14763 [01:24<01:19, 93.60 examples/s]
Tokenizing Prompts (num_proc=128): 50%|██████████████ | 7403/14763 [01:25<01:16, 95.63 examples/s]
Tokenizing Prompts (num_proc=128): 51%|██████████████▎ | 7518/14763 [01:26<01:15, 96.13 examples/s]
Tokenizing Prompts (num_proc=128): 52%|██████████████▍ | 7633/14763 [01:27<01:12, 98.83 examples/s]
Tokenizing Prompts (num_proc=128): 52%|██████████████▋ | 7748/14763 [01:29<01:16, 91.30 examples/s]
Tokenizing Prompts (num_proc=128): 53%|██████████████▉ | 7863/14763 [01:30<01:14, 92.54 examples/s]
Tokenizing Prompts (num_proc=128): 54%|███████████████▏ | 7978/14763 [01:31<01:11, 94.85 examples/s]
Tokenizing Prompts (num_proc=128): 55%|███████████████▎ | 8093/14763 [01:33<01:16, 86.71 examples/s]
Tokenizing Prompts (num_proc=128): 56%|███████████████▌ | 8208/14763 [01:33<01:07, 96.61 examples/s]
Tokenizing Prompts (num_proc=128): 56%|███████████████▊ | 8323/14763 [01:35<01:07, 95.37 examples/s]
Tokenizing Prompts (num_proc=128): 57%|████████████████ | 8438/14763 [01:36<01:07, 93.22 examples/s]
Tokenizing Prompts (num_proc=128): 58%|████████████████▏ | 8553/14763 [01:37<01:06, 93.13 examples/s]
Tokenizing Prompts (num_proc=128): 59%|████████████████▍ | 8668/14763 [01:39<01:05, 92.66 examples/s]
Tokenizing Prompts (num_proc=128): 59%|████████████████▋ | 8783/14763 [01:40<01:03, 94.35 examples/s]
Tokenizing Prompts (num_proc=128): 60%|████████████████▉ | 8898/14763 [01:41<01:02, 93.38 examples/s]
Tokenizing Prompts (num_proc=128): 61%|█████████████████ | 9013/14763 [01:42<01:04, 89.77 examples/s]
Tokenizing Prompts (num_proc=128): 62%|█████████████████▎ | 9128/14763 [01:44<01:03, 89.33 examples/s]
Tokenizing Prompts (num_proc=128): 63%|█████████████████▌ | 9243/14763 [01:45<00:59, 92.63 examples/s]
Tokenizing Prompts (num_proc=128): 63%|█████████████████▋ | 9358/14763 [01:46<00:59, 91.22 examples/s]
Tokenizing Prompts (num_proc=128): 64%|█████████████████▉ | 9473/14763 [01:47<00:53, 98.16 examples/s]
Tokenizing Prompts (num_proc=128): 65%|██████████████████▏ | 9588/14763 [01:48<00:55, 92.98 examples/s]
Tokenizing Prompts (num_proc=128): 66%|██████████████████▍ | 9703/14763 [01:50<00:54, 93.42 examples/s]
Tokenizing Prompts (num_proc=128): 67%|██████████████████▌ | 9818/14763 [01:51<00:52, 93.97 examples/s]
Tokenizing Prompts (num_proc=128): 67%|██████████████████▊ | 9933/14763 [01:52<00:53, 90.51 examples/s]
Tokenizing Prompts (num_proc=128): 68%|██████████████████▍ | 10048/14763 [01:53<00:50, 93.19 examples/s]
Tokenizing Prompts (num_proc=128): 69%|██████████████████▌ | 10163/14763 [01:55<00:49, 93.72 examples/s]
Tokenizing Prompts (num_proc=128): 70%|██████████████████▊ | 10278/14763 [01:56<00:47, 93.64 examples/s]
Tokenizing Prompts (num_proc=128): 70%|███████████████████ | 10393/14763 [01:57<00:48, 90.63 examples/s]
Tokenizing Prompts (num_proc=128): 71%|███████████████████▏ | 10508/14763 [01:58<00:46, 92.19 examples/s]
Tokenizing Prompts (num_proc=128): 72%|███████████████████▍ | 10623/14763 [02:00<00:46, 89.14 examples/s]
Tokenizing Prompts (num_proc=128): 73%|███████████████████▋ | 10738/14763 [02:01<00:44, 91.28 examples/s]
Tokenizing Prompts (num_proc=128): 74%|███████████████████▊ | 10853/14763 [02:02<00:42, 91.99 examples/s]
Tokenizing Prompts (num_proc=128): 74%|████████████████████ | 10968/14763 [02:03<00:41, 92.56 examples/s]
Tokenizing Prompts (num_proc=128): 75%|████████████████████▎ | 11083/14763 [02:05<00:40, 91.53 examples/s]
Tokenizing Prompts (num_proc=128): 76%|████████████████████▍ | 11198/14763 [02:06<00:37, 94.50 examples/s]
Tokenizing Prompts (num_proc=128): 77%|████████████████████▋ | 11313/14763 [02:07<00:37, 92.39 examples/s]
Tokenizing Prompts (num_proc=128): 77%|████████████████████▉ | 11428/14763 [02:08<00:36, 91.70 examples/s]
Tokenizing Prompts (num_proc=128): 78%|█████████████████████ | 11543/14763 [02:09<00:33, 96.40 examples/s]
Tokenizing Prompts (num_proc=128): 79%|█████████████████████▎ | 11658/14763 [02:11<00:34, 91.26 examples/s]
Tokenizing Prompts (num_proc=128): 80%|█████████████████████▌ | 11773/14763 [02:12<00:31, 96.18 examples/s]
Tokenizing Prompts (num_proc=128): 81%|█████████████████████▋ | 11888/14763 [02:13<00:30, 95.39 examples/s]
Tokenizing Prompts (num_proc=128): 81%|█████████████████████▉ | 12003/14763 [02:15<00:30, 91.02 examples/s]
Tokenizing Prompts (num_proc=128): 82%|██████████████████████▏ | 12118/14763 [02:16<00:28, 93.21 examples/s]
Tokenizing Prompts (num_proc=128): 83%|██████████████████████▎ | 12233/14763 [02:17<00:26, 93.79 examples/s]
Tokenizing Prompts (num_proc=128): 84%|██████████████████████▌ | 12348/14763 [02:18<00:26, 92.24 examples/s]
Tokenizing Prompts (num_proc=128): 84%|██████████████████████▊ | 12463/14763 [02:19<00:24, 93.55 examples/s]
Tokenizing Prompts (num_proc=128): 85%|███████████████████████ | 12578/14763 [02:21<00:24, 90.75 examples/s]
Tokenizing Prompts (num_proc=128): 86%|███████████████████████▏ | 12693/14763 [02:22<00:22, 90.43 examples/s]
Tokenizing Prompts (num_proc=128): 87%|███████████████████████▍ | 12808/14763 [02:23<00:21, 92.00 examples/s]
Tokenizing Prompts (num_proc=128): 88%|███████████████████████▋ | 12923/14763 [02:25<00:20, 90.22 examples/s]
Tokenizing Prompts (num_proc=128): 88%|███████████████████████▊ | 13038/14763 [02:26<00:18, 94.26 examples/s]
Tokenizing Prompts (num_proc=128): 89%|████████████████████████ | 13153/14763 [02:27<00:17, 93.37 examples/s]
Tokenizing Prompts (num_proc=128): 90%|████████████████████████▎ | 13268/14763 [02:28<00:16, 91.32 examples/s]
Tokenizing Prompts (num_proc=128): 91%|████████████████████████▍ | 13383/14763 [02:30<00:15, 90.95 examples/s]
Tokenizing Prompts (num_proc=128): 91%|████████████████████████▋ | 13498/14763 [02:31<00:13, 95.45 examples/s]
Tokenizing Prompts (num_proc=128): 92%|████████████████████████▉ | 13613/14763 [02:32<00:12, 91.03 examples/s]
Tokenizing Prompts (num_proc=128): 93%|█████████████████████████ | 13728/14763 [02:33<00:11, 93.72 examples/s]
Tokenizing Prompts (num_proc=128): 94%|█████████████████████████▎ | 13843/14763 [02:34<00:09, 92.35 examples/s]
Tokenizing Prompts (num_proc=128): 95%|█████████████████████████▌ | 13958/14763 [02:36<00:08, 92.91 examples/s]
Tokenizing Prompts (num_proc=128): 95%|█████████████████████████▋ | 14073/14763 [02:37<00:07, 96.57 examples/s]
Tokenizing Prompts (num_proc=128): 96%|█████████████████████████▉ | 14188/14763 [02:38<00:06, 95.35 examples/s]
Tokenizing Prompts (num_proc=128): 97%|██████████████████████████▏| 14303/14763 [02:39<00:05, 90.78 examples/s]
Tokenizing Prompts (num_proc=128): 98%|██████████████████████████▎| 14418/14763 [02:40<00:03, 95.12 examples/s]
Tokenizing Prompts (num_proc=128): 98%|██████████████████████████▌| 14533/14763 [02:42<00:02, 91.54 examples/s]
Tokenizing Prompts (num_proc=128): 99%|██████████████████████████▊| 14648/14763 [02:43<00:01, 92.23 examples/s]
Tokenizing Prompts (num_proc=128): 100%|███████████████████████████| 14763/14763 [02:44<00:00, 88.07 examples/s]
Tokenizing Prompts (num_proc=128): 100%|███████████████████████████| 14763/14763 [02:46<00:00, 88.40 examples/s]
[2026-05-14 13:47:44,920] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:11984] min_input_len: 591
[2026-05-14 13:47:44,920] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:11984] max_input_len: 4338
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 1%| | 116/14763 [00:04<10:09, 24.03 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 2%| | 348/14763 [00:04<02:40, 89.78 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 5%|▏ | 696/14763 [00:05<01:04, 217.48 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 8%|▏ | 1160/14763 [00:05<00:30, 439.33 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 10%|▏ | 1508/14763 [00:05<00:20, 641.12 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 13%|▎ | 1856/14763 [00:05<00:14, 871.04 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 16%|▏| 2320/14763 [00:05<00:10, 1213.83 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 56%|▌| 8208/14763 [00:05<00:00, 8610.30 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 100%|█| 14763/14763 [00:06<00:00, 2153.50 examples/s
Drop Samples with Zero Trainable Tokens (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 1%| | 116/14763 [00:06<12:42, 19.22 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 2%|▏ | 348/14763 [00:06<03:20, 72.06 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 5%|▎ | 696/14763 [00:06<01:18, 178.63 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 6%|▍ | 928/14763 [00:06<00:51, 270.26 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 8%|▍ | 1160/14763 [00:06<00:34, 388.71 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 11%|▋ | 1624/14763 [00:06<00:18, 704.03 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 13%|▊ | 1972/14763 [00:06<00:13, 958.47 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 16%|▊ | 2320/14763 [00:06<00:09, 1257.68 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 18%|▉ | 2668/14763 [00:06<00:07, 1537.53 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 20%|█ | 3016/14763 [00:07<00:06, 1697.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 23%|█▏ | 3364/14763 [00:07<00:05, 1936.50 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 25%|█▎ | 3712/14763 [00:07<00:05, 2089.13 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 28%|█▍ | 4176/14763 [00:07<00:04, 2513.82 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 31%|█▌ | 4524/14763 [00:07<00:03, 2716.76 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 98%|██▉| 14533/14763 [00:07<00:00, 25159.95 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 100%|████| 14763/14763 [00:08<00:00, 1708.40 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=128): 1%| | 116/14763 [00:06<13:54, 17.56 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 2%|▏ | 232/14763 [00:06<05:48, 41.74 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 4%|▎ | 580/14763 [00:06<01:41, 139.80 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 6%|▍ | 928/14763 [00:06<00:51, 269.75 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 9%|▌ | 1276/14763 [00:07<00:31, 432.17 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 10%|▌ | 1508/14763 [00:07<00:24, 548.53 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 12%|▋ | 1740/14763 [00:07<00:18, 693.70 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 13%|▊ | 1972/14763 [00:07<00:14, 863.55 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 15%|▋ | 2204/14763 [00:07<00:11, 1055.77 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 32%|█▌ | 4756/14763 [00:07<00:02, 4976.24 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 61%|██▍ | 9013/14763 [00:07<00:00, 11982.32 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 100%|████| 14763/14763 [00:08<00:00, 1660.74 examples/s]
Saving the dataset (0/57 shards): 0%| | 0/14763 [00:00<?, ? examples/s]
Saving the dataset (0/57 shards): 2%|▌ | 259/14763 [00:58<54:53, 4.40 examples/s]
Saving the dataset (0/57 shards): 2%|▌ | 259/14763 [01:00<56:01, 4.31 examples/s]
[2026-05-14 13:49:12,125] [ERROR] [axolotl.telemetry.errors.wrapper:158] [PID:11984] Error captured in telemetry. Run ID: 13cb3f8e-4ac0-4cd8-8c6a-299c044c5614
Traceback (most recent call last):
File "/workspace/axolotl-venv/bin/axolotl", line 12, in <module>
sys.exit(main())
^^^^^^
File "/workspace/axolotl/src/axolotl/cli/main.py", line 456, in main
cli()
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1485, in __call__
return self.main(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1406, in main
rv = self.invoke(ctx)
^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1873, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1269, in invoke
return ctx.invoke(self.callback, **ctx.params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 824, in invoke
return callback(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/cli/utils/args.py", line 48, in wrapper
return func(*args, **filtered_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/cli/main.py", line 75, in preprocess
do_cli(config=config, **kwargs)
File "/workspace/axolotl/src/axolotl/cli/preprocess.py", line 120, in do_cli
do_preprocess(parsed_cfg, parsed_cli_args)
File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/cli/preprocess.py", line 74, in do_preprocess
load_datasets(cfg=cfg, cli_args=cli_args)
File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/common/datasets.py", line 61, in load_datasets
train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/utils.py", line 50, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 65, in prepare_datasets
return _prepare_standard_dataset(cfg, tokenizer, processor)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 98, in _prepare_standard_dataset
train_dataset, eval_dataset, prompters = loader.load(_load_datasets)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/lock.py", line 38, in load
result = load_fn()
^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 77, in _load_datasets
train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 496, in _load_and_prepare_datasets
dataset, prompters = _load_tokenized_prepared_datasets(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 299, in _load_tokenized_prepared_datasets
dataset, prompters = _load_raw_datasets(
^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 364, in _load_raw_datasets
save_preprocessed_dataset(cfg, dataset, dataset_hash, split)
File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 440, in save_preprocessed_dataset
dataset.save_to_disk(
File "/workspace/axolotl-venv/lib/python3.12/site-packages/datasets/arrow_dataset.py", line 1909, in save_to_disk
for job_id, done, content in iflatmap_unordered(
^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/datasets/utils/py_utils.py", line 617, in iflatmap_unordered
raise RuntimeError(
RuntimeError: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.
[2026-05-14 13:52:16,708] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:25326] bf16 support detected, enabling for this configuration.
[2026-05-14 13:52:17,989] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:25326] baseline 0.000GB ()
[2026-05-14 13:52:17,990] [INFO] [axolotl.cli.config.load_cfg:333] [PID:25326] config:
{
"activation_offloading": false,
"adapter": "qlora",
"attn_implementation": "flash_attention_2",
"attn_needs_dtype_cast": true,
"attn_supports_packing": true,
"attn_uses_flash_lib": true,
"auto_resume_from_checkpoints": true,
"axolotl_config_path": "./data/config.yaml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 80,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 1,
"n_node": 1,
"tf32": true
},
"chat_template": "qwen3",
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 1,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"field_tools": "tools",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
"roles_to_train": [
"assistant"
],
"train_on_eos": "turn",
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": false,
"device": "cuda:0",
"device_map": "auto",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 10,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"fp16": false,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
"hub_strategy": "all_checkpoints",
"include_tkps": true,
"is_preprocess": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 10,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_mlp_kernel": true,
"lora_o_kernel": true,
"lora_qkv_kernel": true,
"lora_r": 32,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"loss_watchdog_patience": 3,
"loss_watchdog_threshold": 5.0,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 10,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_4bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/finance-synthetic-sft-phase2",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"relora_prune_method": "magnitude",
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.16666666666666666,
"save_total_limit": 3,
"saves_per_epoch": 3,
"seed": 42,
"sequence_len": 8192,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.05,
"weight_decay": 0.01,
"world_size": 1
}
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:25326] EOS: 151645 / <|im_end|>
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:25326] BOS: None / None
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:25326] PAD: 151643 / <|endoftext|>
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:25326] UNK: None / None
[2026-05-14 13:52:19,358] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:25326] Unable to find prepared dataset in last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d
[2026-05-14 13:52:19,358] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:25326] Loading raw datasets...
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]
Download complete: : 0.00B [00:00, ?B/s]
[2026-05-14 13:52:20,634] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:25326] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None
[2026-05-14 13:52:20,637] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:25326] Using chat template:
---
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
{%- set ns.real_last_index = real_last_index %}
{%- else %}
{%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n\n' }}
{%- endif %}
{%- endif %}
---
Tokenizing Prompts (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:15<03:39, 62.81 examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:31<03:39, 62.81 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:31<03:17, 64.63 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:42<03:17, 64.63 examples/s]
Tokenizing Prompts (num_proc=1): 20%|██████ | 3000/14763 [00:46<03:00, 65.17 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:00<02:42, 66.31 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:11<02:42, 66.31 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:16<02:28, 65.94 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:31<02:28, 65.94 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:31<02:12, 66.36 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:42<02:12, 66.36 examples/s]
Tokenizing Prompts (num_proc=1): 47%|██████████████▏ | 7000/14763 [01:45<01:55, 67.19 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [01:59<01:38, 68.66 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [02:11<01:38, 68.66 examples/s]
Tokenizing Prompts (num_proc=1): 61%|██████████████████▎ | 9000/14763 [02:12<01:21, 70.88 examples/s]
Tokenizing Prompts (num_proc=1): 68%|███████████████████▋ | 10000/14763 [02:25<01:06, 72.15 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:39<00:51, 73.24 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:51<00:51, 73.24 examples/s]
Tokenizing Prompts (num_proc=1): 81%|███████████████████████▌ | 12000/14763 [02:52<00:37, 74.31 examples/s]
Tokenizing Prompts (num_proc=1): 88%|█████████████████████████▌ | 13000/14763 [03:05<00:23, 74.83 examples/s]
Tokenizing Prompts (num_proc=1): 95%|███████████████████████████▌ | 14000/14763 [03:18<00:10, 74.41 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:29<00:00, 74.24 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:29<00:00, 70.49 examples/s]
[2026-05-14 13:55:57,931] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:25326] min_input_len: 591
[2026-05-14 13:55:57,932] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:25326] max_input_len: 4338
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 7%|▏ | 1000/14763 [00:00<00:09, 1431.08 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 14%|▍ | 2000/14763 [00:01<00:08, 1553.70 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 20%|▌ | 3000/14763 [00:01<00:07, 1608.36 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 27%|▊ | 4000/14763 [00:02<00:06, 1658.84 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 34%|█ | 5000/14763 [00:03<00:05, 1675.80 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 41%|█▏ | 6000/14763 [00:03<00:05, 1706.11 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 47%|█▍ | 7000/14763 [00:04<00:04, 1696.20 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 54%|█▋ | 8000/14763 [00:04<00:03, 1694.97 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 61%|█▊ | 9000/14763 [00:05<00:03, 1706.05 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 68%|█▎| 10000/14763 [00:05<00:02, 1713.44 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 75%|█▍| 11000/14763 [00:06<00:02, 1720.45 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 81%|█▋| 12000/14763 [00:07<00:01, 1728.08 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 88%|█▊| 13000/14763 [00:07<00:01, 1723.26 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 95%|█▉| 14000/14763 [00:08<00:00, 1718.49 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1725.22 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1665.67 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 7%|▍ | 1000/14763 [00:00<00:09, 1480.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 14%|▉ | 2000/14763 [00:01<00:07, 1624.88 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 20%|█▍ | 3000/14763 [00:01<00:06, 1691.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 27%|█▉ | 4000/14763 [00:02<00:06, 1693.88 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 34%|██▎ | 5000/14763 [00:02<00:05, 1702.30 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 41%|██▊ | 6000/14763 [00:03<00:05, 1728.09 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 47%|███▎ | 7000/14763 [00:04<00:04, 1714.08 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 54%|███▊ | 8000/14763 [00:04<00:03, 1706.22 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 61%|████▎ | 9000/14763 [00:05<00:03, 1738.85 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 68%|████ | 10000/14763 [00:05<00:02, 1742.02 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 75%|████▍ | 11000/14763 [00:06<00:02, 1720.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 81%|████▉ | 12000/14763 [00:07<00:01, 1726.95 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 88%|█████▎| 13000/14763 [00:07<00:01, 1728.68 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 95%|█████▋| 14000/14763 [00:08<00:00, 1737.40 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1747.36 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1688.84 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=1): 7%|▌ | 1000/14763 [00:01<00:16, 853.52 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 14%|█ | 2000/14763 [00:02<00:13, 919.04 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 20%|█▍ | 3000/14763 [00:03<00:11, 1002.51 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 27%|█▉ | 4000/14763 [00:03<00:10, 1071.46 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 34%|██▎ | 5000/14763 [00:04<00:08, 1105.51 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 41%|██▊ | 6000/14763 [00:05<00:07, 1120.70 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 47%|███▎ | 7000/14763 [00:06<00:06, 1133.90 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 54%|███▊ | 8000/14763 [00:07<00:05, 1136.07 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 61%|████▎ | 9000/14763 [00:08<00:05, 1149.89 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 68%|████ | 10000/14763 [00:09<00:04, 1157.43 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 75%|████▍ | 11000/14763 [00:09<00:03, 1166.93 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 81%|████▉ | 12000/14763 [00:10<00:02, 1159.14 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 88%|█████▎| 13000/14763 [00:11<00:01, 1152.36 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 95%|█████▋| 14000/14763 [00:12<00:00, 1156.19 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1150.85 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1103.88 examples/s]
Saving the dataset (0/1 shards): 0%| | 0/14763 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 14%|███▉ | 2000/14763 [00:08<00:51, 247.02 examples/s]
Saving the dataset (0/1 shards): 54%|███████████████▏ | 8000/14763 [00:08<00:05, 1273.03 examples/s]
Saving the dataset (0/1 shards): 95%|█████████████████████████▌ | 14000/14763 [00:08<00:00, 2655.38 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:08<00:00, 2655.38 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:09<00:00, 1547.31 examples/s]
[2026-05-14 13:56:38,558] [INFO] [axolotl.common.datasets.load_datasets:74] [PID:25326] check_dataset_labels...
[2026-05-14 13:56:38,572] [INFO] [axolotl.utils.tokenization.check_example_labels:44] [PID:25326] <|im_start|>(-100, 151644) system(-100, 8948) 
(-100, 198) You(-100, 2610)  are(-100, 525)  B(-100, 425) any(-100, 3767) an(-100, 276) Tree(-100, 6533) ,(-100, 11)  an(-100, 458)  expert(-100, 6203)  Indian(-100, 7748)  personal(-100, 4345)  finance(-100, 17017)  assistant(-100, 17847) .(-100, 13)  You(-100, 1446)  have(-100, 614)  access(-100, 2615)  to(-100, 311)  financial(-100, 5896)  calculation(-100, 21937)  tools(-100, 7375) .(-100, 13)  Use(-100, 5443)  them(-100, 1105)  to(-100, 311)  provide(-100, 3410)  accurate(-100, 13382) ,(-100, 11)  personalized(-100, 34549)  advice(-100, 9462)  based(-100, 3118)  on(-100, 389)  FY(-100, 46366)  (-100, 220) 2(-100, 17) 0(-100, 15) 2(-100, 17) 4(-100, 19) -(-100, 12) 2(-100, 17) 5(-100, 20)  Indian(-100, 7748)  tax(-100, 3742)  rules(-100, 5601)  and(-100, 323)  current(-100, 1482)  financial(-100, 5896)  regulations(-100, 14305) .(-100, 13)  Always(-100, 23240)  show(-100, 1473)  your(-100, 697)  reasoning(-100, 32711)  before(-100, 1573)  taking(-100, 4633)  action(-100, 1917) :(-100, 25)  decom(-100, 28502) pose(-100, 2900)  the(-100, 279)  problem(-100, 3491) ,(-100, 11)  identify(-100, 10542)  what(-100, 1128)  information(-100, 1995)  the(-100, 279)  user(-100, 1196)  provided(-100, 3897)  vs(-100, 6165)  what(-100, 1128) 's(-100, 594)  missing(-100, 7402) ,(-100, 11)  state(-100, 1584)  any(-100, 894)  assumptions(-100, 31846)  explicitly(-100, 20975) ,(-100, 11)  then(-100, 1221)  decide(-100, 10279)  whether(-100, 3425)  to(-100, 311)  use(-100, 990)  tools(-100, 7375) ,(-100, 11)  ask(-100, 2548)  for(-100, 369)  clarification(-100, 63684) ,(-100, 11)  or(-100, 476)  answer(-100, 4226)  directly(-100, 5961) .(-100, 13)  Never(-100, 14695)  guarantee(-100, 15440)  returns(-100, 4675)  on(-100, 389)  market(-100, 3081) -linked(-100, 54414)  instruments(-100, 23316) .(-100, 13)  When(-100, 3197)  information(-100, 1995)  is(-100, 374)  missing(-100, 7402) ,(-100, 11)  either(-100, 2987)  ask(-100, 2548)  the(-100, 279)  user(-100, 1196)  or(-100, 476)  clearly(-100, 9355)  state(-100, 1584)  your(-100, 697)  assumptions(-100, 31846) .
(-100, 382) #(-100, 2)  Tools(-100, 13852) 
(-100, 271) You(-100, 2610)  may(-100, 1231)  call(-100, 1618)  one(-100, 825)  or(-100, 476)  more(-100, 803)  functions(-100, 5746)  to(-100, 311)  assist(-100, 7789)  with(-100, 448)  the(-100, 279)  user(-100, 1196)  query(-100, 3239) .
(-100, 382) You(-100, 2610)  are(-100, 525)  provided(-100, 3897)  with(-100, 448)  function(-100, 729)  signatures(-100, 32628)  within(-100, 2878)  <(-100, 366) tools(-100, 15918) ></(-100, 1472) tools(-100, 15918) >(-100, 29)  XML(-100, 11874)  tags(-100, 9492) :
(-100, 510) <(-100, 27) tools(-100, 15918) >
(-100, 397) {"(-100, 4913) type(-100, 1313) ":(-100, 788)  "(-100, 330) function(-100, 1688) ",(-100, 497)  "(-100, 330) function(-100, 1688) ":(-100, 788)  {"(-100, 5212) name(-100, 606) ":(-100, 788)  "(-100, 330) calculate(-100, 35597) _s(-100, 643) ip(-100, 573) _returns(-100, 58900) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Calculate(-100, 47866)  the(-100, 279)  future(-100, 3853)  value(-100, 897)  of(-100, 315)  a(-100, 264)  System(-100, 739) atic(-100, 774)  Investment(-100, 32250)  Plan(-100, 9680)  ((-100, 320) S(-100, 50) IP(-100, 3298) ).(-100, 568) ",(-100, 497)  "(-100, 330) parameters(-100, 13786) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) object(-100, 1700) ",(-100, 497)  "(-100, 330) properties(-100, 13193) ":(-100, 788)  {"(-100, 5212) monthly(-100, 69138) _amount(-100, 13471) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) number(-100, 4082) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Monthly(-100, 72007)  SIP(-100, 65441)  investment(-100, 9162)  amount(-100, 3311)  in(-100, 304)  IN(-100, 1964) R(-100, 49) ."(-100, 1189) },(-100, 2137)  "(-100, 330) annual(-100, 63609) _return(-100, 12511) _pct(-100, 71512) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) number(-100, 4082) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Expected(-100, 18896)  annual(-100, 9775)  return(-100, 470)  percentage(-100, 11414) ."(-100, 1189) },(-100, 2137)  "(-100, 330) ten(-100, 1960) ure(-100, 552) _years(-100, 74490) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) integer(-100, 11662) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Invest(-100, 33876) ment(-100, 478)  duration(-100, 8090)  in(-100, 304)  years(-100, 1635) ."(-100, 1189) }},(-100, 38154)  "(-100, 330) required(-100, 6279) ":(-100, 788)  ["(-100, 4383) monthly(-100, 69138) _amount(-100, 13471) ",(-100, 497)  "(-100, 330) annual(-100, 63609) _return(-100, 12511) _pct(-100, 71512) ",(-100, 497)  "(-100, 330) ten(-100, 1960) ure(-100, 552) _years(-100, 74490) "](-100, 1341) }}(-100, 3417) }
(-100, 532) </(-100, 522) tools(-100, 15918) >
(-100, 1339) For(-100, 2461)  each(-100, 1817)  function(-100, 729)  call(-100, 1618) ,(-100, 11)  return(-100, 470)  a(-100, 264)  json(-100, 2951)  object(-100, 1633)  with(-100, 448)  function(-100, 729)  name(-100, 829)  and(-100, 323)  arguments(-100, 5977)  within(-100, 2878)  (-100, 220) <tool_call>(-100, 151657) </tool_call>(-100, 151658)  XML(-100, 11874)  tags(-100, 9492) :
(-100, 510) <tool_call>(-100, 151657) 
(-100, 198) {"(-100, 4913) name(-100, 606) ":(-100, 788)  <(-100, 366) function(-100, 1688) -name(-100, 11494) >,(-100, 8066)  "(-100, 330) arguments(-100, 16370) ":(-100, 788)  <(-100, 366) args(-100, 2116) -json(-100, 56080) -object(-100, 40432) >}
(-100, 31296) </tool_call>(-100, 151658) <|im_end|>(-100, 151645) 
(-100, 198) <|im_start|>(-100, 151644) user(-100, 872) 
(-100, 198) I(-100, 40) 'm(-100, 2776)  (-100, 220) 4(-100, 19) 9(-100, 24) ,(-100, 11)  single(-100, 3175)  with(-100, 448)  one(-100, 825)  dependent(-100, 17749) ,(-100, 11)  and(-100, 323)  honestly(-100, 26044)  feeling(-100, 8266)  a(-100, 264)  bit(-100, 2699)  overwhelmed(-100, 42106)  about(-100, 911)  retirement(-100, 20950)  planning(-100, 9115) .(-100, 13)  Can(-100, 2980)  someone(-100, 4325)  explain(-100, 10339)  N(-100, 451) PS(-100, 5012)  properly(-100, 10277)  —(-100, 1959)  like(-100, 1075)  what(-100, 1128)  are(-100, 525)  the(-100, 279)  different(-100, 2155)  tiers(-100, 63171) ,(-100, 11)  how(-100, 1246)  exactly(-100, 6896)  do(-100, 653)  the(-100, 279)  tax(-100, 3742)  benefits(-100, 7567)  work(-100, 975)  under(-100, 1212)  (-100, 220) 8(-100, 23) 0(-100, 15) C(-100, 34)  and(-100, 323)  (-100, 220) 8(-100, 23) 0(-100, 15) CC(-100, 3706) D(-100, 35) ,(-100, 11)  and(-100, 323)  what(-100, 1128)  are(-100, 525)  the(-100, 279)  real(-100, 1931)  withdrawal(-100, 29736)  rules(-100, 5601) ?(-100, 30)  I(-100, 358) 'm(-100, 2776)  a(-100, 264)  pharmacist(-100, 89294)  in(-100, 304)  Vis(-100, 7656) akh(-100, 21758) apat(-100, 25760) nam(-100, 12400)  earning(-100, 27644)  around(-100, 2163)  (-100, 220) 3(-100, 18) 0(-100, 15)  lak(-100, 93686) hs(-100, 4997)  per(-100, 817)  year(-100, 1042)  and(-100, 323)  just(-100, 1101)  trying(-100, 4460)  to(-100, 311)  figure(-100, 7071)  out(-100, 700)  if(-100, 421)  this(-100, 419)  is(-100, 374)  a(-100, 264)  good(-100, 1661)  option(-100, 2999)  for(-100, 369)  me(-100, 752)  at(-100, 518)  my(-100, 847)  age(-100, 4231) .(-100, 13) <|im_end|>(-100, 151645) 
(-100, 198) <|im_start|>(-100, 151644) assistant(-100, 77091) 
(-100, 198) <think>(-100, 151667) 
(198, 198) The(785, 785)  user(1196, 1196)  wants(6801, 6801)  a(264, 264)  conceptual(43801, 43801)  overview(23251, 23251)  of(315, 315)  N(451, 451) PS(5012, 5012)  focusing(21080, 21080)  on(389, 389)  tiers(63171, 63171) ,(11, 11)  tax(3742, 3742)  benefits(7567, 7567)  ((320, 320) 8(23, 23) 0(15, 15) C(34, 34) ,(11, 11)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ),(701, 701)  and(323, 323)  withdrawal(29736, 29736)  rules(5601, 5601) .(13, 13)  I(358, 358)  need(1184, 1184)  to(311, 311)  clarify(37163, 37163)  that(429, 429)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) )(8, 8)  is(374, 374)  part(949, 949)  of(315, 315)  (220, 220) 8(23, 23) 0(15, 15) C(34, 34)  limit(3930, 3930) ,(11, 11)  and(323, 323)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) B(33, 33) )(8, 8)  is(374, 374)  extra(4960, 4960)  ₹(89791, 89791) 5(20, 20) 0(15, 15) k(74, 74) .(13, 13)  Employer(74852, 74852)  contribution(18527, 18527)  ((320, 320) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 2(17, 17) ))(593, 593)  is(374, 374)  separate(8651, 8651) .(13, 13)  Withdraw(67381, 67381) al(278, 278)  rules(5601, 5601) :(25, 25)  at(518, 518)  (220, 220) 6(21, 21) 0(15, 15) ,(11, 11)  (220, 220) 6(21, 21) 0(15, 15) %(4, 4)  tax(3742, 3742) -free(12577, 12577)  lump(48529, 48529)  sum(2629, 2629)  +(488, 488)  (220, 220) 4(19, 19) 0(15, 15) %(4, 4)  ann(2935, 2935) uity(34494, 34494) ;(26, 26)  partial(7130, 7130)  withdrawals(85204, 85204)  after(1283, 1283)  (220, 220) 3(18, 18)  years(1635, 1635) ;(26, 26)  early(4124, 4124)  exit(4869, 4869)  after(1283, 1283)  (220, 220) 1(16, 16) 0(15, 15)  years(1635, 1635) .(13, 13)  Also(7281, 7281)  mention(6286, 6286)  that(429, 429)  tax(3742, 3742)  benefits(7567, 7567)  only(1172, 1172)  apply(3796, 3796)  under(1212, 1212)  old(2310, 2310)  regime(17484, 17484) .(13, 13)  User(2657, 2657)  is(374, 374)  (220, 220) 4(19, 19) 9(24, 24) ,(11, 11)  so(773, 773)  mention(6286, 6286)  horizon(34074, 34074)  and(323, 323)  suitability(80628, 80628) .(13, 13)  Avoid(34006, 34006)  calculations(28117, 28117) ,(11, 11)  use(990, 990)  qualitative(61029, 61029)  language(4128, 4128) .(13, 13)  Include(29734, 29734)  cave(25385, 25385) ats(1862, 1862)  about(911, 911)  regime(17484, 17484)  choice(5754, 5754)  and(323, 323)  liquidity(52013, 52013) .
(624, 624) </think>(151668, 151668) 
(271, 271) The(785, 785)  National(5055, 5055)  Pension(80457, 80457)  System(739, 739)  ((320, 320) N(45, 45) PS(5012, 5012) )(8, 8)  is(374, 374)  a(264, 264)  voluntary(35979, 35979)  retirement(20950, 20950)  savings(18986, 18986)  scheme(12859, 12859)  regulated(34219, 34219)  by(553, 553)  P(393, 393) FR(10504, 10504) DA(6352, 6352) .(13, 13)  It(1084, 1084)  has(702, 702)  two(1378, 1378)  tiers(63171, 63171) :(25, 25)  Tier(35232, 35232)  I(358, 358)  ((320, 320) mandatory(80116, 80116)  for(369, 369)  tax(3742, 3742)  benefits(7567, 7567) ,(11, 11)  lock(5296, 5296) -in(3419, 3419)  till(11956, 11956)  age(4231, 4231)  (220, 220) 6(21, 21) 0(15, 15) )(8, 8)  and(323, 323)  Tier(35232, 35232)  II(7946, 7946)  ((320, 320) vol(12536, 12536) untary(58956, 58956) ,(11, 11)  no(902, 902)  lock(5296, 5296) -in(3419, 3419) ,(11, 11)  no(902, 902)  tax(3742, 3742)  benefits(7567, 7567) ).(568, 568)  For(1752, 1752)  sal(4274, 4274) ar(277, 277) ied(1122, 1122)  individuals(7775, 7775) ,(11, 11)  your(697, 697)  own(1828, 1828)  Tier(35232, 35232)  I(358, 358)  contribution(18527, 18527)  up(705, 705)  to(311, 311)  (220, 220) 1(16, 16) 0(15, 15) %(4, 4)  of(315, 315)  salary(16107, 16107)  qualifies(79907, 79907)  under(1212, 1212)  Section(11113, 11113)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) )(8, 8)  within(2878, 2878)  the(279, 279)  overall(8084, 8084)  ₹(89791, 89791) 1(16, 16) .(13, 13) 5(20, 20)  lakh(62173, 62173)  (220, 220) 8(23, 23) 0(15, 15) C(34, 34)  limit(3930, 3930) ,(11, 11)  and(323, 323)  an(458, 458)  additional(5107, 5107)  ₹(89791, 89791) 5(20, 20) 0(15, 15) ,(11, 11) 0(15, 15) 0(15, 15) 0(15, 15)  under(1212, 1212)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) B(33, 33) )(8, 8)  ((320, 320) over(1975, 1975)  and(323, 323)  above(3403, 3403)  (220, 220) 8(23, 23) 0(15, 15) C(34, 34) ).(568, 568)  Employer(74852, 74852)  contributions(19026, 19026)  up(705, 705)  to(311, 311)  (220, 220) 1(16, 16) 0(15, 15) %(4, 4)  of(315, 315)  salary(16107, 16107)  are(525, 525)  exempt(37928, 37928)  under(1212, 1212)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 2(17, 17) )(8, 8)  with(448, 448)  no(902, 902)  upper(8416, 8416)  cap(2062, 2062) .(13, 13)  At(2411, 2411)  age(4231, 4231)  (220, 220) 4(19, 19) 9(24, 24) ,(11, 11)  you(498, 498)  have(614, 614)  about(911, 911)  (220, 220) 1(16, 16) 1(16, 16)  years(1635, 1635)  to(311, 311)  retirement(20950, 20950) .(13, 13)  Withdraw(67381, 67381) als(1127, 1127) :(25, 25)  At(2411, 2411)  (220, 220) 6(21, 21) 0(15, 15) ,(11, 11)  you(498, 498)  can(646, 646)  take(1896, 1896)  (220, 220) 6(21, 21) 0(15, 15) %(4, 4)  tax(3742, 3742) -free(12577, 12577)  lump(48529, 48529)  sum(2629, 2629) ,(11, 11)  but(714, 714)  must(1969, 1969)  use(990, 990)  (220, 220) 4(19, 19) 0(15, 15) %(4, 4)  to(311, 311)  buy(3695, 3695)  an(458, 458)  ann(2935, 2935) uity(34494, 34494)  ((320, 320) tax(17616, 17616) able(480, 480) ).(568, 568)  Partial(24552, 24552)  withdrawals(85204, 85204)  ((320, 320) up(454, 454)  to(311, 311)  (220, 220) 2(17, 17) 5(20, 20) %(4, 4)  of(315, 315)  own(1828, 1828)  contributions(19026, 19026) )(8, 8)  are(525, 525)  allowed(5420, 5420)  after(1283, 1283)  (220, 220) 3(18, 18)  years(1635, 1635)  for(369, 369)  specific(3151, 3151)  needs(3880, 3880)  like(1075, 1075)  children(2841, 2841) 's(594, 594)  education(6731, 6731) ,(11, 11)  marriage(10866, 10866) ,(11, 11)  or(476, 476)  buying(11833, 11833)  a(264, 264)  house(3753, 3753) .(13, 13)  Early(22752, 22752)  exit(4869, 4869)  ((320, 320) before(14801, 14801)  (220, 220) 6(21, 21) 0(15, 15) )(8, 8)  after(1283, 1283)  (220, 220) 1(16, 16) 0(15, 15)  years(1635, 1635) :(25, 25)  (220, 220) 8(23, 23) 0(15, 15) %(4, 4)  must(1969, 1969)  be(387, 387)  ann(2935, 2935) uit(3083, 3083) ized(1506, 1506) ,(11, 11)  (220, 220) 2(17, 17) 0(15, 15) %(4, 4)  tax(3742, 3742) -free(12577, 12577) .(13, 13)  N(451, 451) PS(5012, 5012)  offers(6081, 6081)  flexibility(24177, 24177)  to(311, 311)  choose(5157, 5157)  asset(9329, 9329)  allocation(23757, 23757)  ((320, 320) Equ(24509, 24509) ity(487, 487)  up(705, 705)  to(311, 311)  (220, 220) 7(22, 22) 5(20, 20) %(4, 4)  for(369, 369)  Tier(35232, 35232)  I(358, 358) ).(568, 568)  For(1752, 1752)  a(264, 264)  high(1550, 1550)  ear(2430, 2430) ner(1194, 1194)  like(1075, 1075)  you(498, 498) ,(11, 11)  N(451, 451) PS(5012, 5012)  is(374, 374)  beneficial(23699, 23699)  under(1212, 1212)  the(279, 279)  old(2310, 2310)  tax(3742, 3742)  regime(17484, 17484)  due(4152, 4152)  to(311, 311)  deductions(70637, 70637) ;(26, 26)  in(304, 304)  the(279, 279)  new(501, 501)  regime(17484, 17484) ,(11, 11)  no(902, 902)  deductions(70637, 70637)  apply(3796, 3796) .(13, 13)  Consider(21144, 21144)  your(697, 697)  liquidity(52013, 52013)  needs(3880, 3880)  and(323, 323)  retirement(20950, 20950)  goals(8845, 8845) .(13, 13)  Consult(19960, 19960)  a(264, 264)  tax(3742, 3742)  advisor(36613, 36613)  to(311, 311)  optimize(29436, 29436)  your(697, 697)  overall(8084, 8084)  plan(3119, 3119) .(13, 13) <|im_end|>(151645, 151645) 
(-100, 198)
[2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:45] [PID:25326]
[2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:48] [PID:25326] Total input len: 946
[2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:49] [PID:25326] Count of labels: 506
[2026-05-14 13:56:38,574] [INFO] [axolotl.common.datasets.load_datasets:90] [PID:25326] printing prompters...
[2026-05-14 13:56:38,574] [INFO] [axolotl.common.datasets.load_datasets:92] [PID:25326] Pre-tokenized or custom dataset types are unsupported for logging
[2026-05-14 13:56:38,968] [INFO] [axolotl.cli.preprocess.do_preprocess:92] [PID:25326] Success! Preprocessed data path: `dataset_prepared_path: last_run_prepared`
[2026-05-14 13:57:44,324] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:26066] bf16 support detected, enabling for this configuration.
[2026-05-14 13:57:44,465] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:26066] baseline 0.000GB ()
[2026-05-14 13:57:44,466] [INFO] [axolotl.cli.config.load_cfg:333] [PID:26066] config:
{
"activation_offloading": false,
"adapter": "qlora",
"attn_implementation": "flash_attention_2",
"attn_needs_dtype_cast": true,
"attn_supports_packing": true,
"attn_uses_flash_lib": true,
"auto_resume_from_checkpoints": true,
"axolotl_config_path": "./data/config.yaml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 80,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 1,
"n_node": 1,
"tf32": true
},
"chat_template": "qwen3",
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 1,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"field_tools": "tools",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
"roles_to_train": [
"assistant"
],
"train_on_eos": "turn",
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 10,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"fp16": false,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
"hub_strategy": "all_checkpoints",
"include_tkps": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 10,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_mlp_kernel": true,
"lora_o_kernel": true,
"lora_qkv_kernel": true,
"lora_r": 32,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"loss_watchdog_patience": 3,
"loss_watchdog_threshold": 5.0,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 10,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_4bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/finance-synthetic-sft-phase2",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"relora_prune_method": "magnitude",
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.16666666666666666,
"save_total_limit": 3,
"saves_per_epoch": 3,
"seed": 42,
"sequence_len": 8192,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.05,
"weight_decay": 0.01,
"world_size": 1
}
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26066] EOS: 151645 / <|im_end|>
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26066] BOS: None / None
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26066] PAD: 151643 / <|endoftext|>
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26066] UNK: None / None
[2026-05-14 13:57:45,657] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:477] [PID:26066] Loading prepared dataset from disk at last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d...
[2026-05-14 13:57:45,727] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:26066] total_num_tokens: 23_382_259
[2026-05-14 13:57:45,926] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:26066] `total_supervised_tokens: 11_016_035`
[2026-05-14 13:57:46,079] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:57:47,437] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:57:47,736] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.30095791816711426
[2026-05-14 13:57:47,738] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:57:48,034] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.29753828048706055
[2026-05-14 13:57:48,036] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:57:48,309] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.27533483505249023
[2026-05-14 13:57:48,312] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:57:48,612] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.3027362823486328
[2026-05-14 13:57:48,657] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26066] gather_len_batches: [287]
[2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:26066] data_loader_len: 35
[2026-05-14 13:57:48,658] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:26066] sample_packing_eff_est across ranks: [0.9945225306919643]
[2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:26066] sample_packing_eff_est: 1.0
[2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:26066] total_num_steps: 70
[2026-05-14 13:57:48,658] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:26066] Maximum number of steps set at 70
[2026-05-14 13:57:48,706] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:26066] loading tokenizer... Qwen/Qwen3-8B
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26066] EOS: 151645 / <|im_end|>
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26066] BOS: None / None
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26066] PAD: 151643 / <|endoftext|>
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26066] UNK: None / None
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:26066] Loading model
[2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:26066] Patched OptimState8bit for torch.compile compatibility
[2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:26066] Patched OptimState4bit for torch.compile compatibility
[2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:26066] Patched OptimStateFp8 for torch.compile compatibility
[2026-05-14 13:57:49,826] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:26066] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-05-14 13:57:49,827] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:26066] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-05-14 13:57:49,830] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:26066] Applying multipack dataloader patch for sample packing...
[2026-05-14 13:57:49,830] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:26066] Cannot patch self-attention - requires no dropout
[2026-05-14 13:57:49,864] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:26066] Applying Cut Cross Entropy to model type: qwen3
model.safetensors.index.json: 0.00B [00:00, ?B/s]
model.safetensors.index.json: 32.9kB [00:00, 46.7MB/s]
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
Downloading (incomplete total...): 0%| | 0.00/3.99G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/7.95G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/11.9G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/15.1G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 143k/16.4G [00:01<6:20:01, 718kB/s]
Downloading (incomplete total...): 0%| | 19.2M/16.4G [00:09<2:08:37, 2.12MB/s]
Downloading (incomplete total...): 0%| | 44.3M/16.4G [00:12<1:05:18, 4.17MB/s]
Downloading (incomplete total...): 0%|▏ | 71.8M/16.4G [00:14<39:14, 6.93MB/s]
Downloading (incomplete total...): 1%|▏ | 105M/16.4G [00:18<37:09, 7.30MB/s]
Downloading (incomplete total...): 1%|▏ | 105M/16.4G [00:29<37:09, 7.30MB/s]
Downloading (incomplete total...): 1%|▍ | 171M/16.4G [00:31<46:24, 5.82MB/s]
Downloading (incomplete total...): 1%|▌ | 238M/16.4G [00:40<40:41, 6.61MB/s]
Downloading (incomplete total...): 2%|▋ | 305M/16.4G [00:41<26:04, 10.3MB/s]
Downloading (incomplete total...): 2%|▊ | 373M/16.4G [00:45<23:02, 11.6MB/s]
Downloading (incomplete total...): 3%|█ | 494M/16.4G [00:50<16:35, 16.0MB/s]
Downloading (incomplete total...): 3%|█▏ | 561M/16.4G [00:50<12:11, 21.6MB/s]
Downloading (incomplete total...): 4%|█▍ | 628M/16.4G [00:56<15:25, 17.0MB/s]
Downloading (incomplete total...): 5%|█▉ | 897M/16.4G [01:00<08:17, 31.1MB/s]
Downloading (incomplete total...): 7%|██▎ | 1.10G/16.4G [01:00<05:07, 49.6MB/s]
Downloading (incomplete total...): 10%|███▌ | 1.63G/16.4G [01:01<02:06, 116MB/s]
Downloading (incomplete total...): 12%|████▍ | 2.04G/16.4G [01:01<01:17, 185MB/s]
Downloading (incomplete total...): 14%|████▉ | 2.24G/16.4G [01:02<01:12, 196MB/s]
Downloading (incomplete total...): 19%|██████▊ | 3.11G/16.4G [01:02<00:30, 436MB/s]
Downloading (incomplete total...): 21%|███████▋ | 3.52G/16.4G [01:03<00:26, 483MB/s]
Downloading (incomplete total...): 26%|█████████▍ | 4.32G/16.4G [01:05<00:25, 465MB/s]
Downloading (incomplete total...): 26%|█████████▍ | 4.32G/16.4G [01:05<00:26, 453MB/s]
Downloading (incomplete total...): 28%|██████████▏ | 4.66G/16.4G [01:05<00:24, 476MB/s]
Downloading (incomplete total...): 32%|███████████▍ | 5.19G/16.4G [01:06<00:19, 584MB/s]
Downloading (incomplete total...): 38%|█████████████▋ | 6.25G/16.4G [01:06<00:12, 835MB/s]
Downloading (incomplete total...): 38%|█████████████▎ | 6.25G/16.4G [01:06<00:08, 1.20GB/s]
Downloading (incomplete total...): 38%|█████████████▎ | 6.25G/16.4G [01:06<00:08, 1.20GB/s]
Downloading (incomplete total...): 42%|██████████████▌ | 6.82G/16.4G [01:06<00:07, 1.36GB/s]
Downloading (incomplete total...): 44%|███████████████▍ | 7.22G/16.4G [01:07<00:08, 1.08GB/s]
Downloading (incomplete total...): 46%|████████████████▏ | 7.56G/16.4G [01:08<00:08, 1.03GB/s]
Downloading (incomplete total...): 47%|████████████████▉ | 7.70G/16.4G [01:08<00:08, 986MB/s]
Downloading (incomplete total...): 49%|█████████████████▌ | 7.97G/16.4G [01:09<00:12, 701MB/s]
Downloading (incomplete total...): 49%|█████████████████▊ | 8.10G/16.4G [01:09<00:11, 697MB/s]
Downloading (incomplete total...): 53%|██████████████████▉ | 8.64G/16.4G [01:09<00:08, 878MB/s]
Downloading (incomplete total...): 54%|███████████████████▌ | 8.91G/16.4G [01:10<00:09, 806MB/s]
Downloading (incomplete total...): 58%|████████████████████▏ | 9.43G/16.4G [01:10<00:06, 1.07GB/s]
Downloading (incomplete total...): 58%|████████████████████▏ | 9.43G/16.4G [01:10<00:06, 1.07GB/s]
Downloading (incomplete total...): 58%|████████████████████▏ | 9.43G/16.4G [01:10<00:06, 1.07GB/s]
Downloading (incomplete total...): 62%|█████████████████████▋ | 10.2G/16.4G [01:10<00:05, 1.14GB/s]
Downloading (incomplete total...): 67%|███████████████████████▎ | 10.9G/16.4G [01:11<00:04, 1.29GB/s]
Downloading (incomplete total...): 76%|██████████████████████████▋ | 12.5G/16.4G [01:11<00:01, 3.23GB/s]
Downloading (incomplete total...): 76%|██████████████████████████▋ | 12.5G/16.4G [01:11<00:01, 3.23GB/s]
Downloading (incomplete total...): 76%|██████████████████████████▋ | 12.5G/16.4G [01:11<00:01, 3.23GB/s]
Downloading (incomplete total...): 81%|████████████████████████████▍ | 13.3G/16.4G [01:11<00:01, 2.97GB/s]
Fetching 5 files: 20%|███████████▊ | 1/5 [01:11<04:45, 71.45s/it]
Downloading (incomplete total...): 85%|█████████████████████████████▋ | 13.9G/16.4G [01:11<00:00, 2.88GB/s]
Downloading (incomplete total...): 89%|███████████████████████████████▏ | 14.6G/16.4G [01:11<00:00, 3.15GB/s]
Downloading (incomplete total...): 95%|█████████████████████████████████ | 15.5G/16.4G [01:12<00:00, 3.40GB/s]
Downloading (incomplete total...): 100%|██████████████████████████████████▉| 16.4G/16.4G [01:12<00:00, 2.91GB/s]
Fetching 5 files: 40%|███████████████████████▌ | 2/5 [01:12<01:30, 30.12s/it]
Fetching 5 files: 100%|███████████████████████████████████████████████████████████| 5/5 [01:12<00:00, 14.53s/it]
Download complete: 100%|███████████████████████████████████████████████████| 16.4G/16.4G [01:12<00:00, 2.91GB/s]
Download complete: 100%|████████████████████████████████████████████████████| 16.4G/16.4G [01:12<00:00, 225MB/s]
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 0%|▏ | 1/399 [00:00<03:25, 1.94it/s]
Loading weights: 1%|▎ | 2/399 [00:00<03:13, 2.06it/s]
Loading weights: 2%|▊ | 6/399 [00:01<00:51, 7.59it/s]
Loading weights: 7%|███▋ | 26/399 [00:01<00:09, 40.43it/s]
Loading weights: 10%|█████▌ | 39/399 [00:01<00:06, 57.29it/s]
Loading weights: 15%|████████▌ | 60/399 [00:01<00:03, 87.30it/s]
Loading weights: 20%|██████████▉ | 78/399 [00:01<00:02, 107.71it/s]
Loading weights: 23%|█████████████ | 93/399 [00:01<00:02, 118.03it/s]
Loading weights: 28%|███████████████▎ | 111/399 [00:01<00:02, 133.33it/s]
Loading weights: 32%|█████████████████▊ | 127/399 [00:02<00:03, 83.34it/s]
Loading weights: 37%|████████████████████▎ | 147/399 [00:02<00:02, 104.23it/s]
Loading weights: 41%|██████████████████████▎ | 162/399 [00:02<00:02, 112.28it/s]
Loading weights: 45%|████████████████████████▉ | 181/399 [00:02<00:01, 125.40it/s]
Loading weights: 49%|███████████████████████████ | 196/399 [00:02<00:01, 110.94it/s]
Loading weights: 52%|█████████████████████████████▎ | 209/399 [00:02<00:02, 89.79it/s]
Loading weights: 55%|██████████████████████████████▉ | 220/399 [00:03<00:02, 71.86it/s]
Loading weights: 58%|████████████████████████████████▋ | 233/399 [00:03<00:02, 82.21it/s]
Loading weights: 62%|██████████████████████████████████▋ | 247/399 [00:03<00:01, 92.76it/s]
Loading weights: 66%|████████████████████████████████████▌ | 265/399 [00:03<00:01, 111.25it/s]
Loading weights: 70%|██████████████████████████████████████▌ | 280/399 [00:03<00:01, 117.73it/s]
Loading weights: 74%|████████████████████████████████████████▌ | 294/399 [00:03<00:00, 121.58it/s]
Loading weights: 77%|██████████████████████████████████████████▍ | 308/399 [00:03<00:00, 116.04it/s]
Loading weights: 80%|████████████████████████████████████████████▏ | 321/399 [00:03<00:00, 118.64it/s]
Loading weights: 84%|██████████████████████████████████████████████▏ | 335/399 [00:03<00:00, 124.20it/s]
Loading weights: 89%|████████████████████████████████████████████████▊ | 354/399 [00:04<00:00, 142.23it/s]
Loading weights: 92%|██████████████████████████████████████████████████▊ | 369/399 [00:04<00:00, 137.77it/s]
Loading weights: 97%|█████████████████████████████████████████████████████▌ | 389/399 [00:04<00:00, 153.22it/s]
Loading weights: 100%|████████████████████████████████████████████████████████| 399/399 [00:04<00:00, 92.82it/s]
generation_config.json: 0%| | 0.00/239 [00:00<?, ?B/s]
generation_config.json: 100%|██████████████████████████████████████████████████| 239/239 [00:00<00:00, 2.17MB/s]
[2026-05-14 13:59:09,177] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:26066] converting PEFT model w/ prepare_model_for_kbit_training
[2026-05-14 13:59:09,183] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:26066] Converting modules to torch.bfloat16
[2026-05-14 13:59:09,188] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26066] Memory usage after model load 9.148GB (+9.148GB allocated, +10.395GB reserved)
trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545
[2026-05-14 13:59:10,518] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26066] after adapters 5.997GB (+5.997GB allocated, +10.559GB reserved)
[2026-05-14 13:59:11,364] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:478] [PID:26066] LoRA kernels: dropout=0.05 enabled
[2026-05-14 13:59:13,738] [INFO] [axolotl.train.save_initial_configs:450] [PID:26066] Pre-saving adapter config to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 13:59:13,738] [INFO] [axolotl.train.save_initial_configs:454] [PID:26066] Pre-saving tokenizer to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 13:59:13,833] [INFO] [axolotl.train.save_initial_configs:459] [PID:26066] Pre-saving model config to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 13:59:13,837] [INFO] [axolotl.train.execute_training:226] [PID:26066] Starting trainer...
[2026-05-14 13:59:14,333] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:59:14,638] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:59:14,942] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.3063161373138428
[2026-05-14 13:59:14,944] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:59:15,260] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.31746554374694824
[2026-05-14 13:59:15,262] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:59:15,571] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.31086015701293945
[2026-05-14 13:59:15,573] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:59:15,871] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.2998006343841553
[2026-05-14 13:59:15,871] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26066] gather_len_batches: [287]
0%| | 0/70 [00:00<?, ?it/s][2026-05-14 13:59:15,981] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
[2026-05-14 13:59:26,543] [ERROR] [axolotl.telemetry.errors.wrapper:158] [PID:26066] Error captured in telemetry. Run ID: 29354a0a-cd9e-4fdd-aae5-2bd9658fd326
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/workspace/axolotl/src/axolotl/cli/train.py", line 145, in <module>
fire.Fire(do_cli)
File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire
component, remaining_args = _CallAndUpdateTrace(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/cli/train.py", line 96, in do_cli
do_train(parsed_cfg, parsed_cli_args)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 50, in do_train
model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/train.py", line 628, in train
execute_training(cfg, trainer, resume_from_checkpoint)
File "/workspace/axolotl/src/axolotl/train.py", line 227, in execute_training
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1425, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1507, in _inner_training_loop
self._run_epoch(
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1735, in _run_epoch
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/core/trainers/mixins/layer_offloading.py", line 304, in training_step
return super().training_step(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/core/trainers/mixins/activation_checkpointing.py", line 65, in training_step
return super().training_step(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1907, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/core/trainers/base.py", line 456, in compute_loss
return super().compute_loss(
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1979, in compute_loss
outputs = model(**inputs)
^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 823, in forward
return model_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 811, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/peft_model.py", line 1993, in forward
return self.base_model(
^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 330, in forward
return self.model.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 53, in cce_forward
outputs: BaseModelOutputWithPast = self.model(
^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 952, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/utils/output_capturing.py", line 248, in wrapper
outputs = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 424, in forward
hidden_states = decoder_layer(
^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 92, in __call__
return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/_compile.py", line 53, in inner
return disable_fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint
ret = function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 332, in forward
hidden_states = self.mlp(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 729, in apply_lora_mlp_swiglu
out = LoRA_MLP.apply(
^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/autograd/function.py", line 581, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 527, in decorate_fwd
return fwd(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 414, in forward
output = matmul_lora(
^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 273, in matmul_lora
out += s * X_lora @ A @ B
~~^~~~~~~~
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.88 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.30 GiB is free. Process 137982 has 38.18 GiB memory in use. Of the allocated memory 35.49 GiB is allocated by PyTorch, and 2.20 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Exception in thread Thread-5 (_pin_memory_loop):
Traceback (most recent call last):
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
 self.run()
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/threading.py", line 1012, in run
0%| | 0/70 [00:11<?, ?it/s]
[2026-05-14 14:04:50,081] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:26498] bf16 support detected, enabling for this configuration.
[2026-05-14 14:04:50,664] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:26498] baseline 0.000GB ()
[2026-05-14 14:04:50,665] [INFO] [axolotl.cli.config.load_cfg:333] [PID:26498] config:
{
"activation_offloading": false,
"adapter": "qlora",
"attn_implementation": "flash_attention_2",
"attn_needs_dtype_cast": true,
"attn_supports_packing": true,
"attn_uses_flash_lib": true,
"auto_resume_from_checkpoints": true,
"axolotl_config_path": "./data/config.yaml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 32,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 1,
"n_node": 1,
"tf32": true
},
"chat_template": "qwen3",
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 1,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"field_tools": "tools",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
"roles_to_train": [
"assistant"
],
"train_on_eos": "turn",
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 4,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"fp16": false,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
"hub_strategy": "all_checkpoints",
"include_tkps": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 10,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_mlp_kernel": true,
"lora_o_kernel": true,
"lora_qkv_kernel": true,
"lora_r": 32,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"loss_watchdog_patience": 3,
"loss_watchdog_threshold": 5.0,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 4,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_4bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/finance-synthetic-sft-phase2",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"relora_prune_method": "magnitude",
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.16666666666666666,
"save_total_limit": 3,
"saves_per_epoch": 3,
"seed": 42,
"sequence_len": 6144,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.05,
"weight_decay": 0.01,
"world_size": 1
}
[2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26498] EOS: 151645 / <|im_end|>
[2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26498] BOS: None / None
[2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26498] PAD: 151643 / <|endoftext|>
[2026-05-14 14:04:51,742] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26498] UNK: None / None
[2026-05-14 14:04:51,742] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:26498] Unable to find prepared dataset in last_run_prepared/8e970b09b0233ad980a67dcca6703606
[2026-05-14 14:04:51,742] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:26498] Loading raw datasets...
[2026-05-14 14:04:51,742] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:26498] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s]
Fetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]
Download complete: : 0.00B [00:00, ?B/s]
[2026-05-14 14:04:53,700] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:26498] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None
[2026-05-14 14:04:53,702] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:26498] Using chat template:
---
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
{%- set ns.real_last_index = real_last_index %}
{%- else %}
{%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n\n' }}
{%- endif %}
{%- endif %}
---
Tokenizing Prompts (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:16<03:41, 62.00 examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:30<03:41, 62.00 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:31<03:18, 64.18 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:42<03:18, 64.18 examples/s]
Tokenizing Prompts (num_proc=1): 20%|██████ | 3000/14763 [00:46<03:01, 64.99 examples/s]
Tokenizing Prompts (num_proc=1): 20%|██████ | 3000/14763 [01:00<03:01, 64.99 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:01<02:42, 66.27 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:12<02:42, 66.27 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:16<02:27, 66.04 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:30<02:27, 66.04 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:30<02:11, 66.86 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:42<02:11, 66.86 examples/s]
Tokenizing Prompts (num_proc=1): 47%|██████████████▏ | 7000/14763 [01:45<01:54, 67.64 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [01:59<01:37, 69.19 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [02:10<01:37, 69.19 examples/s]
Tokenizing Prompts (num_proc=1): 61%|██████████████████▎ | 9000/14763 [02:12<01:20, 71.45 examples/s]
Tokenizing Prompts (num_proc=1): 61%|██████████████████▎ | 9000/14763 [02:22<01:20, 71.45 examples/s]
Tokenizing Prompts (num_proc=1): 68%|███████████████████▋ | 10000/14763 [02:25<01:05, 72.84 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:38<00:50, 73.81 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:50<00:50, 73.81 examples/s]
Tokenizing Prompts (num_proc=1): 81%|███████████████████████▌ | 12000/14763 [02:51<00:36, 75.07 examples/s]
Tokenizing Prompts (num_proc=1): 81%|███████████████████████▌ | 12000/14763 [03:02<00:36, 75.07 examples/s]
Tokenizing Prompts (num_proc=1): 88%|█████████████████████████▌ | 13000/14763 [03:04<00:23, 75.34 examples/s]
Tokenizing Prompts (num_proc=1): 95%|███████████████████████████▌ | 14000/14763 [03:17<00:10, 74.83 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:28<00:00, 74.72 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:28<00:00, 70.85 examples/s]
[2026-05-14 14:08:29,854] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:26498] min_input_len: 591
[2026-05-14 14:08:29,855] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:26498] max_input_len: 4338
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 7%|▏ | 1000/14763 [00:00<00:09, 1453.44 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 14%|▍ | 2000/14763 [00:01<00:08, 1568.36 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 20%|▌ | 3000/14763 [00:01<00:07, 1627.08 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 27%|▊ | 4000/14763 [00:02<00:06, 1669.29 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 34%|█ | 5000/14763 [00:03<00:05, 1674.26 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 41%|█▏ | 6000/14763 [00:03<00:05, 1693.46 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 47%|█▍ | 7000/14763 [00:04<00:04, 1681.13 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 54%|█▋ | 8000/14763 [00:04<00:04, 1678.51 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 61%|█▊ | 9000/14763 [00:05<00:03, 1687.52 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 68%|█▎| 10000/14763 [00:05<00:02, 1704.69 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 75%|█▍| 11000/14763 [00:06<00:02, 1705.31 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 81%|█▋| 12000/14763 [00:07<00:01, 1711.63 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 88%|█▊| 13000/14763 [00:07<00:01, 1707.70 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 95%|█▉| 14000/14763 [00:08<00:00, 1700.81 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1708.72 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1659.28 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 7%|▍ | 1000/14763 [00:00<00:09, 1488.10 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 14%|▉ | 2000/14763 [00:01<00:07, 1597.83 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 20%|█▍ | 3000/14763 [00:01<00:07, 1650.05 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 27%|█▉ | 4000/14763 [00:02<00:06, 1685.61 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 34%|██▎ | 5000/14763 [00:03<00:05, 1689.72 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 41%|██▊ | 6000/14763 [00:03<00:05, 1699.22 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 47%|███▎ | 7000/14763 [00:04<00:04, 1687.34 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 54%|███▊ | 8000/14763 [00:04<00:04, 1673.68 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 61%|████▎ | 9000/14763 [00:05<00:03, 1696.45 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 68%|████ | 10000/14763 [00:05<00:02, 1705.90 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 75%|████▍ | 11000/14763 [00:06<00:02, 1708.96 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 81%|████▉ | 12000/14763 [00:07<00:01, 1710.59 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 88%|█████▎| 13000/14763 [00:07<00:01, 1712.56 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 95%|█████▋| 14000/14763 [00:08<00:00, 1710.18 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1722.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1668.57 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=1): 7%|▌ | 1000/14763 [00:01<00:15, 881.50 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 14%|▉ | 2000/14763 [00:02<00:12, 1008.58 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 20%|█▍ | 3000/14763 [00:02<00:10, 1079.47 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 27%|█▉ | 4000/14763 [00:03<00:09, 1103.19 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 34%|██▎ | 5000/14763 [00:04<00:08, 1128.61 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 41%|██▊ | 6000/14763 [00:05<00:07, 1136.62 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 47%|███▎ | 7000/14763 [00:06<00:06, 1144.16 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 54%|███▊ | 8000/14763 [00:07<00:05, 1141.22 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 61%|████▎ | 9000/14763 [00:08<00:04, 1156.14 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 68%|████ | 10000/14763 [00:08<00:04, 1164.26 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 75%|████▍ | 11000/14763 [00:09<00:03, 1152.89 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 81%|████▉ | 12000/14763 [00:10<00:02, 1155.75 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 88%|█████▎| 13000/14763 [00:11<00:01, 1149.65 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 95%|█████▋| 14000/14763 [00:12<00:00, 1154.78 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1158.66 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1120.44 examples/s]
Saving the dataset (0/1 shards): 0%| | 0/14763 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 14%|███▉ | 2000/14763 [00:08<00:55, 228.58 examples/s]
Saving the dataset (0/1 shards): 54%|███████████████▏ | 8000/14763 [00:08<00:05, 1181.51 examples/s]
Saving the dataset (0/1 shards): 95%|█████████████████████████▌ | 14000/14763 [00:09<00:00, 2471.27 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:09<00:00, 2471.27 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:10<00:00, 1448.15 examples/s]
[2026-05-14 14:09:11,124] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:26498] total_num_tokens: 23_382_259
[2026-05-14 14:09:11,329] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:26498] `total_supervised_tokens: 11_016_035`
[2026-05-14 14:09:11,489] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:12,642] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:12,941] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3018453121185303
[2026-05-14 14:09:12,944] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:13,260] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.31858301162719727
[2026-05-14 14:09:13,263] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:13,584] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3229527473449707
[2026-05-14 14:09:13,586] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:13,864] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.2801856994628906
[2026-05-14 14:09:13,913] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26498] gather_len_batches: [960]
[2026-05-14 14:09:13,914] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:26498] data_loader_len: 120
[2026-05-14 14:09:13,914] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:26498] sample_packing_eff_est across ranks: [0.991069327460395]
[2026-05-14 14:09:13,914] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:26498] sample_packing_eff_est: 1.0
[2026-05-14 14:09:13,914] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:26498] total_num_steps: 240
[2026-05-14 14:09:13,914] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:26498] Maximum number of steps set at 240
[2026-05-14 14:09:13,976] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:26498] loading tokenizer... Qwen/Qwen3-8B
[2026-05-14 14:09:15,874] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26498] EOS: 151645 / <|im_end|>
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26498] BOS: None / None
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26498] PAD: 151643 / <|endoftext|>
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26498] UNK: None / None
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:26498] Loading model
[2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:26498] Patched OptimState8bit for torch.compile compatibility
[2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:26498] Patched OptimState4bit for torch.compile compatibility
[2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:26498] Patched OptimStateFp8 for torch.compile compatibility
[2026-05-14 14:09:15,972] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:26498] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-05-14 14:09:15,973] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:26498] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-05-14 14:09:15,975] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:26498] Applying multipack dataloader patch for sample packing...
[2026-05-14 14:09:15,975] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:26498] Cannot patch self-attention - requires no dropout
[2026-05-14 14:09:15,999] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:26498] Applying Cut Cross Entropy to model type: qwen3
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 0%|▏ | 1/399 [00:00<01:16, 5.20it/s]
Loading weights: 1%|▎ | 2/399 [00:00<01:16, 5.17it/s]
Loading weights: 2%|▊ | 6/399 [00:00<00:25, 15.60it/s]
Loading weights: 7%|███▋ | 26/399 [00:00<00:05, 69.68it/s]
Loading weights: 10%|█████▌ | 39/399 [00:00<00:04, 86.64it/s]
Loading weights: 15%|████████▎ | 59/399 [00:00<00:02, 119.74it/s]
Loading weights: 18%|██████████▏ | 73/399 [00:00<00:02, 123.51it/s]
Loading weights: 23%|████████████▉ | 92/399 [00:01<00:02, 142.48it/s]
Loading weights: 27%|██████████████▉ | 108/399 [00:01<00:01, 147.13it/s]
Loading weights: 32%|█████████████████▎ | 126/399 [00:01<00:01, 141.71it/s]
Loading weights: 35%|███████████████████▊ | 141/399 [00:01<00:02, 98.40it/s]
Loading weights: 38%|█████████████████████ | 153/399 [00:01<00:02, 101.45it/s]
Loading weights: 42%|███████████████████████▎ | 169/399 [00:01<00:02, 113.51it/s]
Loading weights: 46%|█████████████████████████ | 182/399 [00:01<00:01, 116.72it/s]
Loading weights: 51%|███████████████████████████▊ | 202/399 [00:01<00:01, 138.13it/s]
Loading weights: 55%|██████████████████████████████▏ | 219/399 [00:02<00:01, 145.18it/s]
Loading weights: 59%|████████████████████████████████▌ | 236/399 [00:02<00:01, 146.87it/s]
Loading weights: 63%|██████████████████████████████████▋ | 252/399 [00:02<00:00, 147.08it/s]
Loading weights: 67%|█████████████████████████████████████ | 269/399 [00:02<00:00, 148.79it/s]
Loading weights: 72%|███████████████████████████████████████▌ | 287/399 [00:02<00:00, 155.51it/s]
Loading weights: 76%|█████████████████████████████████████████▊ | 303/399 [00:02<00:00, 147.81it/s]
Loading weights: 81%|████████████████████████████████████████████▌ | 323/399 [00:02<00:00, 161.15it/s]
Loading weights: 85%|██████████████████████████████████████████████▊ | 340/399 [00:02<00:00, 158.55it/s]
Loading weights: 89%|█████████████████████████████████████████████████▏ | 357/399 [00:02<00:00, 157.03it/s]
Loading weights: 93%|███████████████████████████████████████████████████▍ | 373/399 [00:03<00:00, 151.93it/s]
Loading weights: 98%|█████████████████████████████████████████████████████▊ | 390/399 [00:03<00:00, 150.24it/s]
Loading weights: 100%|███████████████████████████████████████████████████████| 399/399 [00:03<00:00, 125.43it/s]
[2026-05-14 14:09:21,850] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:26498] converting PEFT model w/ prepare_model_for_kbit_training
[2026-05-14 14:09:21,856] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:26498] Converting modules to torch.bfloat16
[2026-05-14 14:09:21,862] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26498] Memory usage after model load 9.148GB (+9.148GB allocated, +10.395GB reserved)
trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545
[2026-05-14 14:09:23,159] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26498] after adapters 5.997GB (+5.997GB allocated, +10.559GB reserved)
[2026-05-14 14:09:24,066] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:478] [PID:26498] LoRA kernels: dropout=0.05 enabled
[2026-05-14 14:09:25,128] [INFO] [axolotl.train.save_initial_configs:450] [PID:26498] Pre-saving adapter config to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 14:09:25,129] [INFO] [axolotl.train.save_initial_configs:454] [PID:26498] Pre-saving tokenizer to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 14:09:25,238] [INFO] [axolotl.train.save_initial_configs:459] [PID:26498] Pre-saving model config to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 14:09:25,241] [INFO] [axolotl.train.execute_training:226] [PID:26498] Starting trainer...
[2026-05-14 14:09:25,751] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:26,070] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:26,381] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3131840229034424
[2026-05-14 14:09:26,383] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:26,743] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.36243748664855957
[2026-05-14 14:09:26,746] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:27,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3450140953063965
[2026-05-14 14:09:27,092] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:09:27,428] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.33806943893432617
[2026-05-14 14:09:27,428] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26498] gather_len_batches: [960]
0%| | 0/240 [00:00<?, ?it/s][2026-05-14 14:09:27,540] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
0%|▎ | 1/240 [01:24<5:35:43, 84.28s/it]
1%|▌ | 2/240 [02:30<4:53:08, 73.90s/it]
1%|▉ | 3/240 [03:35<4:35:16, 69.69s/it]
2%|█▏ | 4/240 [04:39<4:25:33, 67.51s/it]
2%|█▌ | 5/240 [05:44<4:20:00, 66.38s/it]Process Process-1:
Traceback (most recent call last):
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/workspace/axolotl/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py", line 54, in patched_worker_loop
return _worker_loop(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 315, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/queues.py", line 113, in get
if not self._poll(timeout):
^^^^^^^^^^^^^^^^^^^
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 257, in poll
return self._poll(timeout)
^^^^^^^^^^^^^^^^^^^
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 440, in _poll
r = wait([self], timeout)
^^^^^^^^^^^^^^^^^^^^^
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
ready = selector.select(timeout)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/selectors.py", line 415, in select
fd_event_list = self._selector.poll(timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/train.py", line 179, in <lambda>
lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl/src/axolotl/train.py", line 171, in terminate_handler
_model.save_pretrained(cfg.output_dir)
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/peft_model.py", line 294, in save_pretrained
output_state_dict = get_peft_model_state_dict(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/save_and_load.py", line 111, in get_peft_model_state_dict
state_dict = model.state_dict()
^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict
module.state_dict(
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict
module.state_dict(
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict
module.state_dict(
[Previous line repeated 5 more times]
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2262, in state_dict
self._save_to_state_dict(destination, prefix, keep_vars)
File "/workspace/axolotl-venv/lib/python3.12/site-packages/bitsandbytes/nn/modules.py", line 526, in _save_to_state_dict
for k, v in self.weight.quant_state.as_dict(packed=True).items():
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/axolotl-venv/lib/python3.12/site-packages/bitsandbytes/functional.py", line 523, in as_dict
"nested_quant_map": self.state2.code.clone(), # un-shared to avoid restoring it after shared tensors are removed by safetensors
^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: initialization error
Search for `cudaErrorInitializationError' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[2026-05-14 14:16:00,926] [WARNING] [py.warnings._showwarnmsg:112] [PID:26498] /workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/other.py:1419: UserWarning: Unable to fetch remote file due to the following error DataLoader worker (pid 26631) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace. - silently ignoring the lookup for the file config.json in Qwen/Qwen3-8B.
warnings.warn(
[2026-05-14 14:16:00,927] [WARNING] [py.warnings._showwarnmsg:112] [PID:26498] /workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:372: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified.
warnings.warn(
2%|█▌ | 5/240 [06:33<5:08:32, 78.78s/it]
[2026-05-14 14:16:47,206] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:27401] bf16 support detected, enabling for this configuration.
[2026-05-14 14:16:47,436] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:27401] baseline 0.000GB ()
[2026-05-14 14:16:47,437] [INFO] [axolotl.cli.config.load_cfg:333] [PID:27401] config:
{
"activation_offloading": false,
"adapter": "qlora",
"attn_implementation": "flash_attention_2",
"attn_needs_dtype_cast": true,
"attn_supports_packing": true,
"attn_uses_flash_lib": true,
"auto_resume_from_checkpoints": true,
"axolotl_config_path": "./data/config.yaml",
"base_model": "Qwen/Qwen3-8B",
"base_model_config": "Qwen/Qwen3-8B",
"batch_size": 64,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 1,
"n_node": 1,
"tf32": true
},
"chat_template": "qwen3",
"context_parallel_size": 1,
"cut_cross_entropy": true,
"dataloader_num_workers": 1,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 1,
"dataset_prepared_path": "last_run_prepared",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"field_tools": "tools",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
"roles_to_train": [
"assistant"
],
"train_on_eos": "turn",
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": false,
"device": "cuda:0",
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.9.1"
},
"eval_batch_size": 8,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_table_size": 0,
"experimental_skip_move_to_device": true,
"fp16": false,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 8,
"gradient_checkpointing": true,
"gradient_checkpointing_kwargs": {
"use_reentrant": false
},
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
"hub_strategy": "all_checkpoints",
"include_tkps": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": true,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 10,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_mlp_kernel": true,
"lora_o_kernel": true,
"lora_qkv_kernel": true,
"lora_r": 32,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"loraplus_lr_embedding": 1e-06,
"loss_watchdog_patience": 3,
"loss_watchdog_threshold": 5.0,
"lr_scheduler": "cosine",
"max_grad_norm": 1.0,
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 8,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_4bit",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "./outputs/finance-synthetic-sft-phase2",
"pad_to_sequence_len": true,
"plugins": [
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
],
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"relora_prune_method": "magnitude",
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.16666666666666666,
"save_total_limit": 3,
"saves_per_epoch": 3,
"seed": 42,
"sequence_len": 6144,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Qwen/Qwen3-8B",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"val_set_size": 0.0,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"warmup_ratio": 0.05,
"weight_decay": 0.01,
"world_size": 1
}
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:27401] EOS: 151645 / <|im_end|>
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:27401] BOS: None / None
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:27401] PAD: 151643 / <|endoftext|>
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:27401] UNK: None / None
[2026-05-14 14:16:48,544] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:477] [PID:27401] Loading prepared dataset from disk at last_run_prepared/8e970b09b0233ad980a67dcca6703606...
[2026-05-14 14:16:48,619] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:27401] total_num_tokens: 23_382_259
[2026-05-14 14:16:48,820] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:27401] `total_supervised_tokens: 11_016_035`
[2026-05-14 14:16:48,973] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:16:49,950] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:16:50,214] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2658224105834961
[2026-05-14 14:16:50,216] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:16:50,480] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26596903800964355
[2026-05-14 14:16:50,483] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:16:50,747] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26615142822265625
[2026-05-14 14:16:50,749] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:16:51,014] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26613616943359375
[2026-05-14 14:16:51,076] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27401] gather_len_batches: [478]
[2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:27401] data_loader_len: 59
[2026-05-14 14:16:51,076] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:27401] sample_packing_eff_est across ranks: [0.9952160610480953]
[2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:27401] sample_packing_eff_est: 1.0
[2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:27401] total_num_steps: 118
[2026-05-14 14:16:51,076] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:27401] Maximum number of steps set at 118
[2026-05-14 14:16:51,107] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:27401] loading tokenizer... Qwen/Qwen3-8B
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:27401] EOS: 151645 / <|im_end|>
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:27401] BOS: None / None
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:27401] PAD: 151643 / <|endoftext|>
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:27401] UNK: None / None
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:27401] Loading model
[2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:27401] Patched OptimState8bit for torch.compile compatibility
[2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:27401] Patched OptimState4bit for torch.compile compatibility
[2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:27401] Patched OptimStateFp8 for torch.compile compatibility
[2026-05-14 14:16:52,231] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:27401] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-05-14 14:16:52,232] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:27401] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-05-14 14:16:52,233] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:27401] Applying multipack dataloader patch for sample packing...
[2026-05-14 14:16:52,233] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:27401] Cannot patch self-attention - requires no dropout
[2026-05-14 14:16:52,248] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:27401] Applying Cut Cross Entropy to model type: qwen3
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 0%|▏ | 1/399 [00:00<01:18, 5.09it/s]
Loading weights: 1%|▎ | 2/399 [00:00<01:17, 5.10it/s]
Loading weights: 2%|▊ | 6/399 [00:00<00:25, 15.28it/s]
Loading weights: 7%|███▋ | 26/399 [00:00<00:05, 69.67it/s]
Loading weights: 10%|█████▌ | 39/399 [00:00<00:04, 87.27it/s]
Loading weights: 15%|████████▍ | 60/399 [00:00<00:02, 118.35it/s]
Loading weights: 20%|██████████▉ | 78/399 [00:00<00:02, 135.62it/s]
Loading weights: 23%|█████████████ | 93/399 [00:01<00:02, 139.69it/s]
Loading weights: 29%|███████████████▋ | 114/399 [00:01<00:01, 153.18it/s]
Loading weights: 33%|█████████████████▉ | 130/399 [00:01<00:01, 152.10it/s]
Loading weights: 37%|████████████████████▍ | 148/399 [00:01<00:01, 152.66it/s]
Loading weights: 42%|███████████████████████ | 167/399 [00:01<00:01, 162.57it/s]
Loading weights: 46%|█████████████████████████▎ | 184/399 [00:01<00:01, 157.05it/s]
Loading weights: 51%|███████████████████████████▊ | 202/399 [00:01<00:01, 161.91it/s]
Loading weights: 55%|██████████████████████████████▏ | 219/399 [00:01<00:01, 157.80it/s]
Loading weights: 59%|████████████████████████████████▌ | 236/399 [00:01<00:01, 156.28it/s]
Loading weights: 64%|███████████████████████████████████▍ | 257/399 [00:02<00:00, 163.72it/s]
Loading weights: 69%|█████████████████████████████████████▊ | 274/399 [00:02<00:00, 161.44it/s]
Loading weights: 73%|████████████████████████████████████████ | 291/399 [00:02<00:00, 158.63it/s]
Loading weights: 77%|██████████████████████████████████████████▎ | 307/399 [00:02<00:00, 158.03it/s]
Loading weights: 81%|████████████████████████████████████████████▋ | 324/399 [00:02<00:00, 157.01it/s]
Loading weights: 86%|███████████████████████████████████████████████▎ | 343/399 [00:02<00:00, 166.08it/s]
Loading weights: 90%|█████████████████████████████████████████████████▌ | 360/399 [00:02<00:00, 159.82it/s]
Loading weights: 95%|████████████████████████████████████████████████████ | 378/399 [00:02<00:00, 165.32it/s]
Loading weights: 99%|██████████████████████████████████████████████████████▍| 395/399 [00:02<00:00, 159.54it/s]
Loading weights: 100%|███████████████████████████████████████████████████████| 399/399 [00:02<00:00, 137.78it/s]
[2026-05-14 14:16:56,731] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:27401] converting PEFT model w/ prepare_model_for_kbit_training
[2026-05-14 14:16:56,737] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:27401] Converting modules to torch.bfloat16
[2026-05-14 14:16:56,742] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:27401] Memory usage after model load 9.148GB (+9.148GB allocated, +10.395GB reserved)
trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545
[2026-05-14 14:16:58,024] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:27401] after adapters 5.997GB (+5.997GB allocated, +10.559GB reserved)
[2026-05-14 14:16:58,858] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:478] [PID:27401] LoRA kernels: dropout=0.05 enabled
[2026-05-14 14:16:59,965] [INFO] [axolotl.train.save_initial_configs:450] [PID:27401] Pre-saving adapter config to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 14:16:59,966] [INFO] [axolotl.train.save_initial_configs:454] [PID:27401] Pre-saving tokenizer to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 14:17:00,066] [INFO] [axolotl.train.save_initial_configs:459] [PID:27401] Pre-saving model config to ./outputs/finance-synthetic-sft-phase2...
[2026-05-14 14:17:00,070] [INFO] [axolotl.train.execute_training:226] [PID:27401] Starting trainer...
[2026-05-14 14:17:00,556] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:17:00,857] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:17:01,154] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2984797954559326
[2026-05-14 14:17:01,156] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:17:01,452] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.29767322540283203
[2026-05-14 14:17:01,454] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:17:01,750] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.29801011085510254
[2026-05-14 14:17:01,752] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
[2026-05-14 14:17:02,048] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2980222702026367
[2026-05-14 14:17:02,049] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27401] gather_len_batches: [478]
0%| | 0/118 [00:00<?, ?it/s][2026-05-14 14:17:02,153] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
1%|▌ | 1/118 [02:11<4:17:03, 131.83s/it]
2%|█▏ | 2/118 [04:20<4:10:45, 129.70s/it]
3%|█▊ | 3/118 [06:27<4:06:16, 128.49s/it]
3%|██▍ | 4/118 [08:36<4:04:27, 128.66s/it]
4%|███ | 5/118 [10:43<4:01:24, 128.18s/it]
5%|███▋ | 6/118 [12:51<3:58:58, 128.02s/it]
6%|████▎ | 7/118 [14:58<3:56:40, 127.93s/it]
7%|████▉ | 8/118 [17:06<3:54:21, 127.84s/it]
8%|█████▍ | 9/118 [19:13<3:52:03, 127.74s/it]
8%|██████ | 10/118 [21:21<3:49:57, 127.76s/it]
{'loss': '1.607', 'grad_norm': '1.072', 'learning_rate': '1.994e-05', 'ppl': '4.987', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '163.5', 'tokens/total': 3932160, 'tokens/trainable': 1840181, 'epoch': '0.1674'}
8%|██████ | 10/118 [21:21<3:49:57, 127.76s/it]
9%|██████▌ | 11/118 [23:29<3:47:45, 127.71s/it]
10%|███████▏ | 12/118 [25:36<3:45:31, 127.66s/it]
11%|███████▊ | 13/118 [27:44<3:43:33, 127.75s/it]
12%|████████▍ | 14/118 [29:52<3:41:22, 127.72s/it]
13%|█████████ | 15/118 [31:59<3:39:02, 127.60s/it]
14%|█████████▋ | 16/118 [34:07<3:36:55, 127.60s/it]
14%|██████████▏ | 17/118 [36:15<3:35:02, 127.75s/it]
15%|██████████▊ | 18/118 [38:23<3:32:54, 127.75s/it]
16%|███████████▍ | 19/118 [40:31<3:30:48, 127.77s/it]
17%|████████████ | 20/118 [42:39<3:28:53, 127.89s/it]
{'loss': '1.412', 'grad_norm': '0.3657', 'learning_rate': '1.925e-05', 'ppl': '4.104', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '176.4', 'tokens/total': 7864320, 'tokens/trainable': 3698601, 'epoch': '0.3347'}
17%|████████████ | 20/118 [42:39<3:28:53, 127.89s/it][2026-05-14 14:59:41,397] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-20
18%|████████████▋ | 21/118 [44:50<3:28:30, 128.98s/it]
19%|█████████████▏ | 22/118 [46:58<3:25:48, 128.63s/it]
19%|█████████████▊ | 23/118 [49:06<3:23:18, 128.41s/it]
20%|██████████████▍ | 24/118 [51:14<3:21:03, 128.33s/it]
21%|███████████████ | 25/118 [53:22<3:18:39, 128.17s/it]
22%|███████████████▋ | 26/118 [55:30<3:16:24, 128.09s/it]
23%|████████████████▏ | 27/118 [57:38<3:14:14, 128.07s/it]
24%|████████████████▊ | 28/118 [59:45<3:11:53, 127.92s/it]
25%|████████████████▉ | 29/118 [1:01:53<3:09:39, 127.86s/it]
25%|█████████████████▌ | 30/118 [1:04:01<3:07:31, 127.86s/it]
{'loss': '1.32', 'grad_norm': '0.2517', 'learning_rate': '1.786e-05', 'ppl': '3.745', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '181.4', 'tokens/total': 11796480, 'tokens/trainable': 5537401, 'epoch': '0.5021'}
25%|█████████████████▌ | 30/118 [1:04:01<3:07:31, 127.86s/it]
26%|██████████████████▏ | 31/118 [1:06:09<3:05:31, 127.94s/it]
27%|██████████████████▋ | 32/118 [1:08:17<3:03:13, 127.83s/it]
28%|███████████████████▎ | 33/118 [1:10:25<3:01:05, 127.83s/it]
29%|███████████████████▉ | 34/118 [1:12:33<2:59:01, 127.88s/it]
30%|████████████████████▍ | 35/118 [1:14:40<2:56:50, 127.84s/it]
31%|█████████████████████ | 36/118 [1:16:50<2:55:20, 128.29s/it]
31%|█████████████████████▋ | 37/118 [1:18:57<2:52:58, 128.13s/it]
32%|██████████████████████▏ | 38/118 [1:21:05<2:50:32, 127.91s/it]
33%|██████████████████████▊ | 39/118 [1:23:12<2:48:14, 127.78s/it]
34%|███████████████████████▍ | 40/118 [1:25:20<2:46:01, 127.71s/it]
{'loss': '1.293', 'grad_norm': '0.2172', 'learning_rate': '1.586e-05', 'ppl': '3.645', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '184.9', 'tokens/total': 15728640, 'tokens/trainable': 7378776, 'epoch': '0.6695'}
34%|███████████████████████▍ | 40/118 [1:25:20<2:46:01, 127.71s/it][2026-05-14 15:42:22,458] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-40
35%|███████████████████████▉ | 41/118 [1:27:30<2:44:46, 128.39s/it]
36%|████████████████████████▌ | 42/118 [1:29:37<2:42:14, 128.09s/it]
36%|█████████████████████████▏ | 43/118 [1:31:45<2:39:59, 127.99s/it]
37%|█████████████████████████▋ | 44/118 [1:33:53<2:37:48, 127.96s/it]
38%|██████████████████████████▎ | 45/118 [1:36:00<2:35:34, 127.87s/it]
39%|██████████████████████████▉ | 46/118 [1:38:08<2:33:22, 127.81s/it]
40%|███████████████████████████▍ | 47/118 [1:40:16<2:31:12, 127.79s/it]
41%|████████████████████████████ | 48/118 [1:42:23<2:28:52, 127.61s/it]
42%|████████████████████████████▋ | 49/118 [1:44:30<2:26:36, 127.49s/it]
42%|█████████████████████████████▏ | 50/118 [1:46:37<2:24:22, 127.39s/it]
{'loss': '1.269', 'grad_norm': '0.1839', 'learning_rate': '1.341e-05', 'ppl': '3.556', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '178.7', 'tokens/total': 19660800, 'tokens/trainable': 9224452, 'epoch': '0.8368'}
42%|█████████████████████████████▏ | 50/118 [1:46:37<2:24:22, 127.39s/it]
43%|█████████████████████████████▊ | 51/118 [1:48:45<2:22:22, 127.50s/it]
44%|██████████████████████████████▍ | 52/118 [1:50:53<2:20:17, 127.54s/it]
45%|██████████████████████████████▉ | 53/118 [1:53:00<2:18:08, 127.52s/it]
46%|███████████████████████████████▌ | 54/118 [1:55:08<2:16:09, 127.66s/it]
47%|████████████████████████████████▏ | 55/118 [1:57:16<2:13:55, 127.55s/it]
47%|████████████████████████████████▋ | 56/118 [1:59:23<2:11:46, 127.52s/it]
48%|█████████████████████████████████▎ | 57/118 [2:01:30<2:09:35, 127.47s/it]
49%|█████████████████████████████████▉ | 58/118 [2:03:38<2:07:34, 127.57s/it]
50%|██████████████████████████████████▌ | 59/118 [2:05:46<2:05:32, 127.67s/it]
51%|███████████████████████████████████ | 60/118 [2:07:18<1:53:04, 116.97s/it]
{'loss': '1.242', 'grad_norm': '0.1736', 'learning_rate': '1.069e-05', 'ppl': '3.463', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '200.6', 'tokens/total': 23482368, 'tokens/trainable': 11016035, 'epoch': '1'}
51%|███████████████████████████████████ | 60/118 [2:07:18<1:53:04, 116.97s/it][2026-05-14 16:24:20,726] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-60
[2026-05-14 16:24:24,709] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
52%|███████████████████████████████████▋ | 61/118 [2:09:31<1:55:40, 121.76s/it]
53%|████████████████████████████████████▎ | 62/118 [2:11:38<1:55:11, 123.42s/it]
53%|████████████████████████████████████▊ | 63/118 [2:13:46<1:54:15, 124.64s/it]
54%|█████████████████████████████████████▍ | 64/118 [2:15:54<1:53:02, 125.60s/it]
55%|██████████████████████████████████████ | 65/118 [2:18:01<1:51:26, 126.16s/it]
56%|██████████████████████████████████████▌ | 66/118 [2:20:08<1:49:38, 126.50s/it]
57%|███████████████████████████████████████▏ | 67/118 [2:22:16<1:47:50, 126.88s/it]
58%|███████████████████████████████████████▊ | 68/118 [2:24:24<1:46:03, 127.26s/it]
58%|████████████████████████▌ | 69/118 [2:26:32<1:44:03, 127.42s/it]
59%|████████████████████████▉ | 70/118 [2:28:40<1:41:58, 127.47s/it]
{'loss': '1.222', 'grad_norm': '0.157', 'learning_rate': '7.93e-06', 'ppl': '3.394', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '178.7', 'tokens/total': 27414528, 'tokens/trainable': 12851399, 'epoch': '1.167'}
59%|████████████████████████▉ | 70/118 [2:28:40<1:41:58, 127.47s/it]
60%|█████████████████████████▎ | 71/118 [2:30:48<1:40:00, 127.67s/it]
61%|█████████████████████████▋ | 72/118 [2:32:56<1:37:53, 127.68s/it]
62%|█████████████████████████▉ | 73/118 [2:35:04<1:35:50, 127.80s/it]
63%|██████████████████████████▎ | 74/118 [2:37:11<1:33:43, 127.81s/it]
64%|██████████████████████████▋ | 75/118 [2:39:20<1:31:41, 127.94s/it]
64%|███████████████████████████ | 76/118 [2:41:27<1:29:29, 127.85s/it]
65%|███████████████████████████▍ | 77/118 [2:43:35<1:27:21, 127.85s/it]
66%|███████████████████████████▊ | 78/118 [2:45:43<1:25:19, 127.98s/it]
67%|████████████████████████████ | 79/118 [2:47:51<1:23:09, 127.95s/it]
68%|████████████████████████████▍ | 80/118 [2:49:59<1:20:57, 127.82s/it]
{'loss': '1.215', 'grad_norm': '0.1533', 'learning_rate': '5.324e-06', 'ppl': '3.37', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '180', 'tokens/total': 31346688, 'tokens/trainable': 14712337, 'epoch': '1.335'}
68%|████████████████████████████▍ | 80/118 [2:49:59<1:20:57, 127.82s/it][2026-05-14 17:07:01,536] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-80
69%|████████████████████████████▊ | 81/118 [2:52:10<1:19:22, 128.72s/it]
69%|█████████████████████████████▏ | 82/118 [2:54:17<1:16:59, 128.33s/it]
70%|█████████████████████████████▌ | 83/118 [2:56:25<1:14:42, 128.08s/it]
71%|█████████████████████████████▉ | 84/118 [2:58:33<1:12:35, 128.11s/it]
72%|██████████████████████████████▎ | 85/118 [3:00:41<1:10:24, 128.01s/it]
73%|██████████████████████████████▌ | 86/118 [3:02:48<1:08:13, 127.92s/it]
74%|██████████████████████████████▉ | 87/118 [3:04:56<1:06:04, 127.88s/it]
75%|███████████████████████████████▎ | 88/118 [3:07:04<1:03:56, 127.90s/it]
75%|███████████████████████████████▋ | 89/118 [3:09:12<1:01:47, 127.84s/it]
76%|█████████████████████████████████▌ | 90/118 [3:11:19<59:38, 127.82s/it]
{'loss': '1.202', 'grad_norm': '0.1499', 'learning_rate': '3.078e-06', 'ppl': '3.326', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '176', 'tokens/total': 35278848, 'tokens/trainable': 16552748, 'epoch': '1.502'}
76%|█████████████████████████████████▌ | 90/118 [3:11:19<59:38, 127.82s/it]
77%|█████████████████████████████████▉ | 91/118 [3:13:27<57:28, 127.72s/it]
78%|██████████████████████████████████▎ | 92/118 [3:15:34<55:18, 127.64s/it]
79%|██████████████████████████████████▋ | 93/118 [3:17:42<53:12, 127.71s/it]
80%|███████████████████████████████████ | 94/118 [3:19:50<51:07, 127.82s/it]
81%|███████████████████████████████████▍ | 95/118 [3:21:58<48:59, 127.82s/it]
81%|███████████████████████████████████▊ | 96/118 [3:24:06<46:51, 127.78s/it]
82%|████████████████████████████████████▏ | 97/118 [3:26:14<44:42, 127.76s/it]
83%|████████████████████████████████████▌ | 98/118 [3:28:21<42:35, 127.76s/it]
84%|████████████████████████████████████▉ | 99/118 [3:30:29<40:28, 127.84s/it]
85%|████████████████████████████████████▍ | 100/118 [3:32:37<38:21, 127.83s/it]
{'loss': '1.195', 'grad_norm': '0.1503', 'learning_rate': '1.363e-06', 'ppl': '3.304', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '177.9', 'tokens/total': 39211008, 'tokens/trainable': 18391700, 'epoch': '1.669'}
85%|████████████████████████████████████▍ | 100/118 [3:32:37<38:21, 127.83s/it][2026-05-14 17:49:39,854] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-100
86%|████████████████████████████████████▊ | 101/118 [3:34:48<36:26, 128.62s/it]
86%|█████████████████████████████████████▏ | 102/118 [3:36:55<34:11, 128.20s/it]
87%|█████████████████████████████████████▌ | 103/118 [3:39:02<31:59, 127.96s/it]
88%|█████████████████████████████████████▉ | 104/118 [3:41:10<29:52, 128.02s/it]
89%|██████████████████████████████████████▎ | 105/118 [3:43:18<27:44, 128.01s/it]
90%|██████████████████████████████████████▋ | 106/118 [3:45:26<25:33, 127.82s/it]
91%|██████████████████████████████████████▉ | 107/118 [3:47:33<23:24, 127.71s/it]
92%|███████████████████████████████████████▎ | 108/118 [3:49:41<21:17, 127.74s/it]
92%|███████████████████████████████████████▋ | 109/118 [3:51:49<19:09, 127.72s/it]
93%|████████████████████████████████████████ | 110/118 [3:53:57<17:02, 127.77s/it]
{'loss': '1.191', 'grad_norm': '0.1469', 'learning_rate': '3.114e-07', 'ppl': '3.292', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '179.6', 'tokens/total': 43143168, 'tokens/trainable': 20246224, 'epoch': '1.837'}
93%|████████████████████████████████████████ | 110/118 [3:53:57<17:02, 127.77s/it]
94%|████████████████████████████████████████▍ | 111/118 [3:56:05<14:55, 127.97s/it]
95%|████████████████████████████████████████▊ | 112/118 [3:58:13<12:47, 127.92s/it]
96%|█████████████████████████████████████████▏ | 113/118 [4:00:20<10:39, 127.82s/it]
97%|█████████████████████████████████████████▌ | 114/118 [4:02:28<08:31, 127.77s/it]
97%|█████████████████████████████████████████▉ | 115/118 [4:04:36<06:23, 127.90s/it]
98%|██████████████████████████████████████████▎| 116/118 [4:06:44<04:15, 127.91s/it]
99%|██████████████████████████████████████████▋| 117/118 [4:08:52<02:07, 127.75s/it]
100%|███████████████████████████████████████████| 118/118 [4:10:59<00:00, 127.59s/it][2026-05-14 18:28:01,475] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-118
{'train_runtime': '1.506e+04', 'train_samples_per_second': '0.501', 'train_steps_per_second': '0.008', 'train_loss': '1.281', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'epoch': '1.971', 'tokens/train_per_sec_per_gpu': '189.9'}
100%|███████████████████████████████████████████| 118/118 [4:11:01<00:00, 127.59s/it]
100%|███████████████████████████████████████████| 118/118 [4:11:01<00:00, 127.64s/it]
[2026-05-14 18:28:16,499] [INFO] [axolotl.train.save_trained_model:267] [PID:27401] Training completed! Saving trained model to ./outputs/finance-synthetic-sft-phase2.
[2026-05-14 18:28:17,359] [INFO] [axolotl.train.save_trained_model:388] [PID:27401] Model successfully saved to ./outputs/finance-synthetic-sft-phase2
[2026-05-14 18:28:17,446] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2
Processing Files (0 / 0) : | | 0.00B / 0.00B
New Data Upload : | | 0.00B / 0.00B 
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB 
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB 
...adapter_model.safetensors: 55%|██████████▍ | 192MB / 349MB 
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB 
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB 
...adapter_model.safetensors: 55%|██████████▍ | 192MB / 349MB 
Processing Files (2 / 3) : 56%|██████████▋ | 203MB / 361MB, ???B/s
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB 
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB 
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB 
Processing Files (3 / 3) : 100%|███████████████████| 361MB / 361MB, 786MB/s
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB 
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB 
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB 
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB 
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB 
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB 
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB 
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB 
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB 
Processing Files (3 / 3) : 100%|███████████████████| 361MB / 361MB, 262MB/s
New Data Upload : | | 0.00B / 0.00B, 0.00B/s
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB