[2026-05-14 13:44:44,807] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:11984] bf16 support detected, enabling for this configuration. [2026-05-14 13:44:45,352] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:11984] baseline 0.000GB () [2026-05-14 13:44:45,353] [INFO] [axolotl.cli.config.load_cfg:333] [PID:11984] config: { "activation_offloading": false, "adapter": "qlora", "attn_implementation": "flash_attention_2", "attn_needs_dtype_cast": true, "attn_supports_packing": true, "attn_uses_flash_lib": true, "auto_resume_from_checkpoints": true, "axolotl_config_path": "./data/config.yaml", "base_model": "Qwen/Qwen3-8B", "base_model_config": "Qwen/Qwen3-8B", "batch_size": 80, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 1, "n_node": 1, "tf32": true }, "chat_template": "qwen3", "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 128, "dataset_prepared_path": "last_run_prepared", "datasets": [ { "chat_template": "tokenizer_default", "field_messages": "messages", "field_tools": "tools", "message_property_mappings": { "content": "content", "role": "role" }, "path": "Gandalf1/indian-finance-synthetic-phase2-cleaned", "roles_to_train": [ "assistant" ], "train_on_eos": "turn", "trust_remote_code": false, "type": "chat_template" } ], "ddp": false, "device": "cuda:0", "device_map": "auto", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.9.1" }, "eval_batch_size": 10, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_table_size": 0, "experimental_skip_move_to_device": true, "fp16": false, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": false }, "hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2", "hub_strategy": "all_checkpoints", "include_tkps": true, "is_preprocess": true, "layer_offloading": false, "learning_rate": 2e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": true, "load_in_8bit": false, "local_rank": 0, "logging_steps": 10, "lora_alpha": 64, "lora_dropout": 0.05, "lora_mlp_kernel": true, "lora_o_kernel": true, "lora_qkv_kernel": true, "lora_r": 32, "lora_target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj" ], "loraplus_lr_embedding": 1e-06, "loss_watchdog_patience": 3, "loss_watchdog_threshold": 5.0, "lr_scheduler": "cosine", "max_grad_norm": 1.0, "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 10, "model_config_type": "qwen3", "num_epochs": 2.0, "num_generation_samples": 3, "optimizer": "adamw_torch_4bit", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./outputs/finance-synthetic-sft-phase2", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "relora_prune_method": "magnitude", "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.16666666666666666, "save_total_limit": 3, "saves_per_epoch": 3, "seed": 42, "sequence_len": 8192, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Qwen/Qwen3-8B", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "warmup_ratio": 0.05, "weight_decay": 0.01, "world_size": 1 } [2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:11984] EOS: 151645 / <|im_end|> [2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:11984] BOS: None / None [2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:11984] PAD: 151643 / <|endoftext|> [2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:11984] UNK: None / None [2026-05-14 13:44:46,922] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:11984] Unable to find prepared dataset in last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d [2026-05-14 13:44:46,922] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:11984] Loading raw datasets... Downloading (incomplete total...): 0.00B [00:00, ?B/s] Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s] Download complete: : 0.00B [00:00, ?B/s] README.md: 0.00B [00:00, ?B/s] README.md: 1.66kB [00:00, 6.58MB/s] Download complete: : 0.00B [00:00, ?B/s] data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Determine the real last index: use provided value or default to messages length - 1 #} {%- if real_last_index is defined and real_last_index is not none %} {%- set ns.real_last_index = real_last_index %} {%- else %} {%- set ns.real_last_index = messages|length - 1 %} {%- endif %} {%- for message in messages[::-1] %} {%- set index = (messages|length - 1) - loop.index0 %} {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} {%- set ns.multi_step_tool = false %} {%- set ns.last_query_index = index %} {%- endif %} {%- endfor %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content %} {%- set reasoning_content = '' %} {%- if message.reasoning_content is defined and message.reasoning_content is not none %} {%- set reasoning_content = message.reasoning_content %} {%- else %} {%- if '' in message.content %} {%- set content = message.content.split('')[-1].lstrip('\n') %} {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} {%- endif %} {%- endif %} {%- if loop.index0 > ns.last_query_index %} {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %} {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- if message.tool_calls %} {%- for tool_call in message.tool_calls %} {%- if (loop.first and content) or (not loop.first) %} {{- '\n' }} {%- endif %} {%- if tool_call.function %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {%- if tool_call.arguments is string %} {{- tool_call.arguments }} {%- else %} {{- tool_call.arguments | tojson }} {%- endif %} {{- '}\n' }} {%- endfor %} {%- endif %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n\n' }} {{- message.content }} {{- '\n' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- if enable_thinking is defined and enable_thinking is false %} {{- '\n\n\n\n' }} {%- else %} {{- '\n\n' }} {%- endif %} {%- endif %} --- Tokenizing Prompts (num_proc=128): 0%| | 0/14763 [00:008192) (num_proc=128): 0%| | 0/14763 [00:008192) (num_proc=128): 1%| | 116/14763 [00:04<10:09, 24.03 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 2%| | 348/14763 [00:04<02:40, 89.78 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 5%|▏ | 696/14763 [00:05<01:04, 217.48 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 8%|▏ | 1160/14763 [00:05<00:30, 439.33 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 10%|▏ | 1508/14763 [00:05<00:20, 641.12 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 13%|▎ | 1856/14763 [00:05<00:14, 871.04 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 16%|▏| 2320/14763 [00:05<00:10, 1213.83 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 56%|▌| 8208/14763 [00:05<00:00, 8610.30 examples/s] Dropping Invalid Sequences (8192) (num_proc=128): 100%|█| 14763/14763 [00:06<00:00, 2153.50 examples/s Drop Samples with Zero Trainable Tokens (num_proc=128): 0%| | 0/14763 [00:00 sys.exit(main()) ^^^^^^ File "/workspace/axolotl/src/axolotl/cli/main.py", line 456, in main cli() File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1485, in __call__ return self.main(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1406, in main rv = self.invoke(ctx) ^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1873, in invoke return _process_result(sub_ctx.command.invoke(sub_ctx)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1269, in invoke return ctx.invoke(self.callback, **ctx.params) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 824, in invoke return callback(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/cli/utils/args.py", line 48, in wrapper return func(*args, **filtered_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/cli/main.py", line 75, in preprocess do_cli(config=config, **kwargs) File "/workspace/axolotl/src/axolotl/cli/preprocess.py", line 120, in do_cli do_preprocess(parsed_cfg, parsed_cli_args) File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/cli/preprocess.py", line 74, in do_preprocess load_datasets(cfg=cfg, cli_args=cli_args) File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/common/datasets.py", line 61, in load_datasets train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets( ^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/utils.py", line 50, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 65, in prepare_datasets return _prepare_standard_dataset(cfg, tokenizer, processor) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 98, in _prepare_standard_dataset train_dataset, eval_dataset, prompters = loader.load(_load_datasets) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/lock.py", line 38, in load result = load_fn() ^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 77, in _load_datasets train_dataset, eval_dataset, prompters = _load_and_prepare_datasets( ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 496, in _load_and_prepare_datasets dataset, prompters = _load_tokenized_prepared_datasets( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 299, in _load_tokenized_prepared_datasets dataset, prompters = _load_raw_datasets( ^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 364, in _load_raw_datasets save_preprocessed_dataset(cfg, dataset, dataset_hash, split) File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 440, in save_preprocessed_dataset dataset.save_to_disk( File "/workspace/axolotl-venv/lib/python3.12/site-packages/datasets/arrow_dataset.py", line 1909, in save_to_disk for job_id, done, content in iflatmap_unordered( ^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/datasets/utils/py_utils.py", line 617, in iflatmap_unordered raise RuntimeError( RuntimeError: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing. [2026-05-14 13:52:16,708] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:25326] bf16 support detected, enabling for this configuration. [2026-05-14 13:52:17,989] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:25326] baseline 0.000GB () [2026-05-14 13:52:17,990] [INFO] [axolotl.cli.config.load_cfg:333] [PID:25326] config: { "activation_offloading": false, "adapter": "qlora", "attn_implementation": "flash_attention_2", "attn_needs_dtype_cast": true, "attn_supports_packing": true, "attn_uses_flash_lib": true, "auto_resume_from_checkpoints": true, "axolotl_config_path": "./data/config.yaml", "base_model": "Qwen/Qwen3-8B", "base_model_config": "Qwen/Qwen3-8B", "batch_size": 80, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 1, "n_node": 1, "tf32": true }, "chat_template": "qwen3", "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 1, "dataset_prepared_path": "last_run_prepared", "datasets": [ { "chat_template": "tokenizer_default", "field_messages": "messages", "field_tools": "tools", "message_property_mappings": { "content": "content", "role": "role" }, "path": "Gandalf1/indian-finance-synthetic-phase2-cleaned", "roles_to_train": [ "assistant" ], "train_on_eos": "turn", "trust_remote_code": false, "type": "chat_template" } ], "ddp": false, "device": "cuda:0", "device_map": "auto", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.9.1" }, "eval_batch_size": 10, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_table_size": 0, "experimental_skip_move_to_device": true, "fp16": false, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": false }, "hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2", "hub_strategy": "all_checkpoints", "include_tkps": true, "is_preprocess": true, "layer_offloading": false, "learning_rate": 2e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": true, "load_in_8bit": false, "local_rank": 0, "logging_steps": 10, "lora_alpha": 64, "lora_dropout": 0.05, "lora_mlp_kernel": true, "lora_o_kernel": true, "lora_qkv_kernel": true, "lora_r": 32, "lora_target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj" ], "loraplus_lr_embedding": 1e-06, "loss_watchdog_patience": 3, "loss_watchdog_threshold": 5.0, "lr_scheduler": "cosine", "max_grad_norm": 1.0, "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 10, "model_config_type": "qwen3", "num_epochs": 2.0, "num_generation_samples": 3, "optimizer": "adamw_torch_4bit", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./outputs/finance-synthetic-sft-phase2", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "relora_prune_method": "magnitude", "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.16666666666666666, "save_total_limit": 3, "saves_per_epoch": 3, "seed": 42, "sequence_len": 8192, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Qwen/Qwen3-8B", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "warmup_ratio": 0.05, "weight_decay": 0.01, "world_size": 1 } [2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:25326] EOS: 151645 / <|im_end|> [2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:25326] BOS: None / None [2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:25326] PAD: 151643 / <|endoftext|> [2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:25326] UNK: None / None [2026-05-14 13:52:19,358] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:25326] Unable to find prepared dataset in last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d [2026-05-14 13:52:19,358] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:25326] Loading raw datasets... Downloading (incomplete total...): 0.00B [00:00, ?B/s] Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s] Download complete: : 0.00B [00:00, ?B/s] Download complete: : 0.00B [00:00, ?B/s] [2026-05-14 13:52:20,634] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:25326] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None [2026-05-14 13:52:20,637] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:25326] Using chat template: --- {%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Determine the real last index: use provided value or default to messages length - 1 #} {%- if real_last_index is defined and real_last_index is not none %} {%- set ns.real_last_index = real_last_index %} {%- else %} {%- set ns.real_last_index = messages|length - 1 %} {%- endif %} {%- for message in messages[::-1] %} {%- set index = (messages|length - 1) - loop.index0 %} {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} {%- set ns.multi_step_tool = false %} {%- set ns.last_query_index = index %} {%- endif %} {%- endfor %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content %} {%- set reasoning_content = '' %} {%- if message.reasoning_content is defined and message.reasoning_content is not none %} {%- set reasoning_content = message.reasoning_content %} {%- else %} {%- if '' in message.content %} {%- set content = message.content.split('')[-1].lstrip('\n') %} {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} {%- endif %} {%- endif %} {%- if loop.index0 > ns.last_query_index %} {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %} {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- if message.tool_calls %} {%- for tool_call in message.tool_calls %} {%- if (loop.first and content) or (not loop.first) %} {{- '\n' }} {%- endif %} {%- if tool_call.function %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {%- if tool_call.arguments is string %} {{- tool_call.arguments }} {%- else %} {{- tool_call.arguments | tojson }} {%- endif %} {{- '}\n' }} {%- endfor %} {%- endif %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n\n' }} {{- message.content }} {{- '\n' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- if enable_thinking is defined and enable_thinking is false %} {{- '\n\n\n\n' }} {%- else %} {{- '\n\n' }} {%- endif %} {%- endif %} --- Tokenizing Prompts (num_proc=1): 0%| | 0/14763 [00:008192) (num_proc=1): 0%| | 0/14763 [00:008192) (num_proc=1): 7%|▏ | 1000/14763 [00:00<00:09, 1431.08 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 14%|▍ | 2000/14763 [00:01<00:08, 1553.70 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 20%|▌ | 3000/14763 [00:01<00:07, 1608.36 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 27%|▊ | 4000/14763 [00:02<00:06, 1658.84 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 34%|█ | 5000/14763 [00:03<00:05, 1675.80 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 41%|█▏ | 6000/14763 [00:03<00:05, 1706.11 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 47%|█▍ | 7000/14763 [00:04<00:04, 1696.20 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 54%|█▋ | 8000/14763 [00:04<00:03, 1694.97 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 61%|█▊ | 9000/14763 [00:05<00:03, 1706.05 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 68%|█▎| 10000/14763 [00:05<00:02, 1713.44 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 75%|█▍| 11000/14763 [00:06<00:02, 1720.45 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 81%|█▋| 12000/14763 [00:07<00:01, 1728.08 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 88%|█▊| 13000/14763 [00:07<00:01, 1723.26 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 95%|█▉| 14000/14763 [00:08<00:00, 1718.49 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1725.22 examples/s] Dropping Invalid Sequences (8192) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1665.67 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=1): 0%| | 0/14763 [00:00(-100, 151644) system(-100, 8948)  (-100, 198) You(-100, 2610)  are(-100, 525)  B(-100, 425) any(-100, 3767) an(-100, 276) Tree(-100, 6533) ,(-100, 11)  an(-100, 458)  expert(-100, 6203)  Indian(-100, 7748)  personal(-100, 4345)  finance(-100, 17017)  assistant(-100, 17847) .(-100, 13)  You(-100, 1446)  have(-100, 614)  access(-100, 2615)  to(-100, 311)  financial(-100, 5896)  calculation(-100, 21937)  tools(-100, 7375) .(-100, 13)  Use(-100, 5443)  them(-100, 1105)  to(-100, 311)  provide(-100, 3410)  accurate(-100, 13382) ,(-100, 11)  personalized(-100, 34549)  advice(-100, 9462)  based(-100, 3118)  on(-100, 389)  FY(-100, 46366)  (-100, 220) 2(-100, 17) 0(-100, 15) 2(-100, 17) 4(-100, 19) -(-100, 12) 2(-100, 17) 5(-100, 20)  Indian(-100, 7748)  tax(-100, 3742)  rules(-100, 5601)  and(-100, 323)  current(-100, 1482)  financial(-100, 5896)  regulations(-100, 14305) .(-100, 13)  Always(-100, 23240)  show(-100, 1473)  your(-100, 697)  reasoning(-100, 32711)  before(-100, 1573)  taking(-100, 4633)  action(-100, 1917) :(-100, 25)  decom(-100, 28502) pose(-100, 2900)  the(-100, 279)  problem(-100, 3491) ,(-100, 11)  identify(-100, 10542)  what(-100, 1128)  information(-100, 1995)  the(-100, 279)  user(-100, 1196)  provided(-100, 3897)  vs(-100, 6165)  what(-100, 1128) 's(-100, 594)  missing(-100, 7402) ,(-100, 11)  state(-100, 1584)  any(-100, 894)  assumptions(-100, 31846)  explicitly(-100, 20975) ,(-100, 11)  then(-100, 1221)  decide(-100, 10279)  whether(-100, 3425)  to(-100, 311)  use(-100, 990)  tools(-100, 7375) ,(-100, 11)  ask(-100, 2548)  for(-100, 369)  clarification(-100, 63684) ,(-100, 11)  or(-100, 476)  answer(-100, 4226)  directly(-100, 5961) .(-100, 13)  Never(-100, 14695)  guarantee(-100, 15440)  returns(-100, 4675)  on(-100, 389)  market(-100, 3081) -linked(-100, 54414)  instruments(-100, 23316) .(-100, 13)  When(-100, 3197)  information(-100, 1995)  is(-100, 374)  missing(-100, 7402) ,(-100, 11)  either(-100, 2987)  ask(-100, 2548)  the(-100, 279)  user(-100, 1196)  or(-100, 476)  clearly(-100, 9355)  state(-100, 1584)  your(-100, 697)  assumptions(-100, 31846) . (-100, 382) #(-100, 2)  Tools(-100, 13852)  (-100, 271) You(-100, 2610)  may(-100, 1231)  call(-100, 1618)  one(-100, 825)  or(-100, 476)  more(-100, 803)  functions(-100, 5746)  to(-100, 311)  assist(-100, 7789)  with(-100, 448)  the(-100, 279)  user(-100, 1196)  query(-100, 3239) . (-100, 382) You(-100, 2610)  are(-100, 525)  provided(-100, 3897)  with(-100, 448)  function(-100, 729)  signatures(-100, 32628)  within(-100, 2878)  <(-100, 366) tools(-100, 15918) >(-100, 29)  XML(-100, 11874)  tags(-100, 9492) : (-100, 510) <(-100, 27) tools(-100, 15918) > (-100, 397) {"(-100, 4913) type(-100, 1313) ":(-100, 788)  "(-100, 330) function(-100, 1688) ",(-100, 497)  "(-100, 330) function(-100, 1688) ":(-100, 788)  {"(-100, 5212) name(-100, 606) ":(-100, 788)  "(-100, 330) calculate(-100, 35597) _s(-100, 643) ip(-100, 573) _returns(-100, 58900) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Calculate(-100, 47866)  the(-100, 279)  future(-100, 3853)  value(-100, 897)  of(-100, 315)  a(-100, 264)  System(-100, 739) atic(-100, 774)  Investment(-100, 32250)  Plan(-100, 9680)  ((-100, 320) S(-100, 50) IP(-100, 3298) ).(-100, 568) ",(-100, 497)  "(-100, 330) parameters(-100, 13786) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) object(-100, 1700) ",(-100, 497)  "(-100, 330) properties(-100, 13193) ":(-100, 788)  {"(-100, 5212) monthly(-100, 69138) _amount(-100, 13471) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) number(-100, 4082) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Monthly(-100, 72007)  SIP(-100, 65441)  investment(-100, 9162)  amount(-100, 3311)  in(-100, 304)  IN(-100, 1964) R(-100, 49) ."(-100, 1189) },(-100, 2137)  "(-100, 330) annual(-100, 63609) _return(-100, 12511) _pct(-100, 71512) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) number(-100, 4082) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Expected(-100, 18896)  annual(-100, 9775)  return(-100, 470)  percentage(-100, 11414) ."(-100, 1189) },(-100, 2137)  "(-100, 330) ten(-100, 1960) ure(-100, 552) _years(-100, 74490) ":(-100, 788)  {"(-100, 5212) type(-100, 1313) ":(-100, 788)  "(-100, 330) integer(-100, 11662) ",(-100, 497)  "(-100, 330) description(-100, 4684) ":(-100, 788)  "(-100, 330) Invest(-100, 33876) ment(-100, 478)  duration(-100, 8090)  in(-100, 304)  years(-100, 1635) ."(-100, 1189) }},(-100, 38154)  "(-100, 330) required(-100, 6279) ":(-100, 788)  ["(-100, 4383) monthly(-100, 69138) _amount(-100, 13471) ",(-100, 497)  "(-100, 330) annual(-100, 63609) _return(-100, 12511) _pct(-100, 71512) ",(-100, 497)  "(-100, 330) ten(-100, 1960) ure(-100, 552) _years(-100, 74490) "](-100, 1341) }}(-100, 3417) } (-100, 532)  (-100, 1339) For(-100, 2461)  each(-100, 1817)  function(-100, 729)  call(-100, 1618) ,(-100, 11)  return(-100, 470)  a(-100, 264)  json(-100, 2951)  object(-100, 1633)  with(-100, 448)  function(-100, 729)  name(-100, 829)  and(-100, 323)  arguments(-100, 5977)  within(-100, 2878)  (-100, 220) (-100, 151657) (-100, 151658)  XML(-100, 11874)  tags(-100, 9492) : (-100, 510) (-100, 151657)  (-100, 198) {"(-100, 4913) name(-100, 606) ":(-100, 788)  <(-100, 366) function(-100, 1688) -name(-100, 11494) >,(-100, 8066)  "(-100, 330) arguments(-100, 16370) ":(-100, 788)  <(-100, 366) args(-100, 2116) -json(-100, 56080) -object(-100, 40432) >} (-100, 31296) (-100, 151658) <|im_end|>(-100, 151645)  (-100, 198) <|im_start|>(-100, 151644) user(-100, 872)  (-100, 198) I(-100, 40) 'm(-100, 2776)  (-100, 220) 4(-100, 19) 9(-100, 24) ,(-100, 11)  single(-100, 3175)  with(-100, 448)  one(-100, 825)  dependent(-100, 17749) ,(-100, 11)  and(-100, 323)  honestly(-100, 26044)  feeling(-100, 8266)  a(-100, 264)  bit(-100, 2699)  overwhelmed(-100, 42106)  about(-100, 911)  retirement(-100, 20950)  planning(-100, 9115) .(-100, 13)  Can(-100, 2980)  someone(-100, 4325)  explain(-100, 10339)  N(-100, 451) PS(-100, 5012)  properly(-100, 10277)  —(-100, 1959)  like(-100, 1075)  what(-100, 1128)  are(-100, 525)  the(-100, 279)  different(-100, 2155)  tiers(-100, 63171) ,(-100, 11)  how(-100, 1246)  exactly(-100, 6896)  do(-100, 653)  the(-100, 279)  tax(-100, 3742)  benefits(-100, 7567)  work(-100, 975)  under(-100, 1212)  (-100, 220) 8(-100, 23) 0(-100, 15) C(-100, 34)  and(-100, 323)  (-100, 220) 8(-100, 23) 0(-100, 15) CC(-100, 3706) D(-100, 35) ,(-100, 11)  and(-100, 323)  what(-100, 1128)  are(-100, 525)  the(-100, 279)  real(-100, 1931)  withdrawal(-100, 29736)  rules(-100, 5601) ?(-100, 30)  I(-100, 358) 'm(-100, 2776)  a(-100, 264)  pharmacist(-100, 89294)  in(-100, 304)  Vis(-100, 7656) akh(-100, 21758) apat(-100, 25760) nam(-100, 12400)  earning(-100, 27644)  around(-100, 2163)  (-100, 220) 3(-100, 18) 0(-100, 15)  lak(-100, 93686) hs(-100, 4997)  per(-100, 817)  year(-100, 1042)  and(-100, 323)  just(-100, 1101)  trying(-100, 4460)  to(-100, 311)  figure(-100, 7071)  out(-100, 700)  if(-100, 421)  this(-100, 419)  is(-100, 374)  a(-100, 264)  good(-100, 1661)  option(-100, 2999)  for(-100, 369)  me(-100, 752)  at(-100, 518)  my(-100, 847)  age(-100, 4231) .(-100, 13) <|im_end|>(-100, 151645)  (-100, 198) <|im_start|>(-100, 151644) assistant(-100, 77091)  (-100, 198) (-100, 151667)  (198, 198) The(785, 785)  user(1196, 1196)  wants(6801, 6801)  a(264, 264)  conceptual(43801, 43801)  overview(23251, 23251)  of(315, 315)  N(451, 451) PS(5012, 5012)  focusing(21080, 21080)  on(389, 389)  tiers(63171, 63171) ,(11, 11)  tax(3742, 3742)  benefits(7567, 7567)  ((320, 320) 8(23, 23) 0(15, 15) C(34, 34) ,(11, 11)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ),(701, 701)  and(323, 323)  withdrawal(29736, 29736)  rules(5601, 5601) .(13, 13)  I(358, 358)  need(1184, 1184)  to(311, 311)  clarify(37163, 37163)  that(429, 429)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) )(8, 8)  is(374, 374)  part(949, 949)  of(315, 315)  (220, 220) 8(23, 23) 0(15, 15) C(34, 34)  limit(3930, 3930) ,(11, 11)  and(323, 323)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) B(33, 33) )(8, 8)  is(374, 374)  extra(4960, 4960)  ₹(89791, 89791) 5(20, 20) 0(15, 15) k(74, 74) .(13, 13)  Employer(74852, 74852)  contribution(18527, 18527)  ((320, 320) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 2(17, 17) ))(593, 593)  is(374, 374)  separate(8651, 8651) .(13, 13)  Withdraw(67381, 67381) al(278, 278)  rules(5601, 5601) :(25, 25)  at(518, 518)  (220, 220) 6(21, 21) 0(15, 15) ,(11, 11)  (220, 220) 6(21, 21) 0(15, 15) %(4, 4)  tax(3742, 3742) -free(12577, 12577)  lump(48529, 48529)  sum(2629, 2629)  +(488, 488)  (220, 220) 4(19, 19) 0(15, 15) %(4, 4)  ann(2935, 2935) uity(34494, 34494) ;(26, 26)  partial(7130, 7130)  withdrawals(85204, 85204)  after(1283, 1283)  (220, 220) 3(18, 18)  years(1635, 1635) ;(26, 26)  early(4124, 4124)  exit(4869, 4869)  after(1283, 1283)  (220, 220) 1(16, 16) 0(15, 15)  years(1635, 1635) .(13, 13)  Also(7281, 7281)  mention(6286, 6286)  that(429, 429)  tax(3742, 3742)  benefits(7567, 7567)  only(1172, 1172)  apply(3796, 3796)  under(1212, 1212)  old(2310, 2310)  regime(17484, 17484) .(13, 13)  User(2657, 2657)  is(374, 374)  (220, 220) 4(19, 19) 9(24, 24) ,(11, 11)  so(773, 773)  mention(6286, 6286)  horizon(34074, 34074)  and(323, 323)  suitability(80628, 80628) .(13, 13)  Avoid(34006, 34006)  calculations(28117, 28117) ,(11, 11)  use(990, 990)  qualitative(61029, 61029)  language(4128, 4128) .(13, 13)  Include(29734, 29734)  cave(25385, 25385) ats(1862, 1862)  about(911, 911)  regime(17484, 17484)  choice(5754, 5754)  and(323, 323)  liquidity(52013, 52013) . (624, 624) (151668, 151668)  (271, 271) The(785, 785)  National(5055, 5055)  Pension(80457, 80457)  System(739, 739)  ((320, 320) N(45, 45) PS(5012, 5012) )(8, 8)  is(374, 374)  a(264, 264)  voluntary(35979, 35979)  retirement(20950, 20950)  savings(18986, 18986)  scheme(12859, 12859)  regulated(34219, 34219)  by(553, 553)  P(393, 393) FR(10504, 10504) DA(6352, 6352) .(13, 13)  It(1084, 1084)  has(702, 702)  two(1378, 1378)  tiers(63171, 63171) :(25, 25)  Tier(35232, 35232)  I(358, 358)  ((320, 320) mandatory(80116, 80116)  for(369, 369)  tax(3742, 3742)  benefits(7567, 7567) ,(11, 11)  lock(5296, 5296) -in(3419, 3419)  till(11956, 11956)  age(4231, 4231)  (220, 220) 6(21, 21) 0(15, 15) )(8, 8)  and(323, 323)  Tier(35232, 35232)  II(7946, 7946)  ((320, 320) vol(12536, 12536) untary(58956, 58956) ,(11, 11)  no(902, 902)  lock(5296, 5296) -in(3419, 3419) ,(11, 11)  no(902, 902)  tax(3742, 3742)  benefits(7567, 7567) ).(568, 568)  For(1752, 1752)  sal(4274, 4274) ar(277, 277) ied(1122, 1122)  individuals(7775, 7775) ,(11, 11)  your(697, 697)  own(1828, 1828)  Tier(35232, 35232)  I(358, 358)  contribution(18527, 18527)  up(705, 705)  to(311, 311)  (220, 220) 1(16, 16) 0(15, 15) %(4, 4)  of(315, 315)  salary(16107, 16107)  qualifies(79907, 79907)  under(1212, 1212)  Section(11113, 11113)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) )(8, 8)  within(2878, 2878)  the(279, 279)  overall(8084, 8084)  ₹(89791, 89791) 1(16, 16) .(13, 13) 5(20, 20)  lakh(62173, 62173)  (220, 220) 8(23, 23) 0(15, 15) C(34, 34)  limit(3930, 3930) ,(11, 11)  and(323, 323)  an(458, 458)  additional(5107, 5107)  ₹(89791, 89791) 5(20, 20) 0(15, 15) ,(11, 11) 0(15, 15) 0(15, 15) 0(15, 15)  under(1212, 1212)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 1(16, 16) B(33, 33) )(8, 8)  ((320, 320) over(1975, 1975)  and(323, 323)  above(3403, 3403)  (220, 220) 8(23, 23) 0(15, 15) C(34, 34) ).(568, 568)  Employer(74852, 74852)  contributions(19026, 19026)  up(705, 705)  to(311, 311)  (220, 220) 1(16, 16) 0(15, 15) %(4, 4)  of(315, 315)  salary(16107, 16107)  are(525, 525)  exempt(37928, 37928)  under(1212, 1212)  (220, 220) 8(23, 23) 0(15, 15) CC(3706, 3706) D(35, 35) ((7, 7) 2(17, 17) )(8, 8)  with(448, 448)  no(902, 902)  upper(8416, 8416)  cap(2062, 2062) .(13, 13)  At(2411, 2411)  age(4231, 4231)  (220, 220) 4(19, 19) 9(24, 24) ,(11, 11)  you(498, 498)  have(614, 614)  about(911, 911)  (220, 220) 1(16, 16) 1(16, 16)  years(1635, 1635)  to(311, 311)  retirement(20950, 20950) .(13, 13)  Withdraw(67381, 67381) als(1127, 1127) :(25, 25)  At(2411, 2411)  (220, 220) 6(21, 21) 0(15, 15) ,(11, 11)  you(498, 498)  can(646, 646)  take(1896, 1896)  (220, 220) 6(21, 21) 0(15, 15) %(4, 4)  tax(3742, 3742) -free(12577, 12577)  lump(48529, 48529)  sum(2629, 2629) ,(11, 11)  but(714, 714)  must(1969, 1969)  use(990, 990)  (220, 220) 4(19, 19) 0(15, 15) %(4, 4)  to(311, 311)  buy(3695, 3695)  an(458, 458)  ann(2935, 2935) uity(34494, 34494)  ((320, 320) tax(17616, 17616) able(480, 480) ).(568, 568)  Partial(24552, 24552)  withdrawals(85204, 85204)  ((320, 320) up(454, 454)  to(311, 311)  (220, 220) 2(17, 17) 5(20, 20) %(4, 4)  of(315, 315)  own(1828, 1828)  contributions(19026, 19026) )(8, 8)  are(525, 525)  allowed(5420, 5420)  after(1283, 1283)  (220, 220) 3(18, 18)  years(1635, 1635)  for(369, 369)  specific(3151, 3151)  needs(3880, 3880)  like(1075, 1075)  children(2841, 2841) 's(594, 594)  education(6731, 6731) ,(11, 11)  marriage(10866, 10866) ,(11, 11)  or(476, 476)  buying(11833, 11833)  a(264, 264)  house(3753, 3753) .(13, 13)  Early(22752, 22752)  exit(4869, 4869)  ((320, 320) before(14801, 14801)  (220, 220) 6(21, 21) 0(15, 15) )(8, 8)  after(1283, 1283)  (220, 220) 1(16, 16) 0(15, 15)  years(1635, 1635) :(25, 25)  (220, 220) 8(23, 23) 0(15, 15) %(4, 4)  must(1969, 1969)  be(387, 387)  ann(2935, 2935) uit(3083, 3083) ized(1506, 1506) ,(11, 11)  (220, 220) 2(17, 17) 0(15, 15) %(4, 4)  tax(3742, 3742) -free(12577, 12577) .(13, 13)  N(451, 451) PS(5012, 5012)  offers(6081, 6081)  flexibility(24177, 24177)  to(311, 311)  choose(5157, 5157)  asset(9329, 9329)  allocation(23757, 23757)  ((320, 320) Equ(24509, 24509) ity(487, 487)  up(705, 705)  to(311, 311)  (220, 220) 7(22, 22) 5(20, 20) %(4, 4)  for(369, 369)  Tier(35232, 35232)  I(358, 358) ).(568, 568)  For(1752, 1752)  a(264, 264)  high(1550, 1550)  ear(2430, 2430) ner(1194, 1194)  like(1075, 1075)  you(498, 498) ,(11, 11)  N(451, 451) PS(5012, 5012)  is(374, 374)  beneficial(23699, 23699)  under(1212, 1212)  the(279, 279)  old(2310, 2310)  tax(3742, 3742)  regime(17484, 17484)  due(4152, 4152)  to(311, 311)  deductions(70637, 70637) ;(26, 26)  in(304, 304)  the(279, 279)  new(501, 501)  regime(17484, 17484) ,(11, 11)  no(902, 902)  deductions(70637, 70637)  apply(3796, 3796) .(13, 13)  Consider(21144, 21144)  your(697, 697)  liquidity(52013, 52013)  needs(3880, 3880)  and(323, 323)  retirement(20950, 20950)  goals(8845, 8845) .(13, 13)  Consult(19960, 19960)  a(264, 264)  tax(3742, 3742)  advisor(36613, 36613)  to(311, 311)  optimize(29436, 29436)  your(697, 697)  overall(8084, 8084)  plan(3119, 3119) .(13, 13) <|im_end|>(151645, 151645)  (-100, 198) [2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:45] [PID:25326] [2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:48] [PID:25326] Total input len: 946 [2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:49] [PID:25326] Count of labels: 506 [2026-05-14 13:56:38,574] [INFO] [axolotl.common.datasets.load_datasets:90] [PID:25326] printing prompters... [2026-05-14 13:56:38,574] [INFO] [axolotl.common.datasets.load_datasets:92] [PID:25326] Pre-tokenized or custom dataset types are unsupported for logging [2026-05-14 13:56:38,968] [INFO] [axolotl.cli.preprocess.do_preprocess:92] [PID:25326] Success! Preprocessed data path: `dataset_prepared_path: last_run_prepared` [2026-05-14 13:57:44,324] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:26066] bf16 support detected, enabling for this configuration. [2026-05-14 13:57:44,465] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:26066] baseline 0.000GB () [2026-05-14 13:57:44,466] [INFO] [axolotl.cli.config.load_cfg:333] [PID:26066] config: { "activation_offloading": false, "adapter": "qlora", "attn_implementation": "flash_attention_2", "attn_needs_dtype_cast": true, "attn_supports_packing": true, "attn_uses_flash_lib": true, "auto_resume_from_checkpoints": true, "axolotl_config_path": "./data/config.yaml", "base_model": "Qwen/Qwen3-8B", "base_model_config": "Qwen/Qwen3-8B", "batch_size": 80, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 1, "n_node": 1, "tf32": true }, "chat_template": "qwen3", "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 1, "dataset_prepared_path": "last_run_prepared", "datasets": [ { "chat_template": "tokenizer_default", "field_messages": "messages", "field_tools": "tools", "message_property_mappings": { "content": "content", "role": "role" }, "path": "Gandalf1/indian-finance-synthetic-phase2-cleaned", "roles_to_train": [ "assistant" ], "train_on_eos": "turn", "trust_remote_code": false, "type": "chat_template" } ], "ddp": false, "device": "cuda:0", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.9.1" }, "eval_batch_size": 10, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_table_size": 0, "experimental_skip_move_to_device": true, "fp16": false, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": false }, "hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2", "hub_strategy": "all_checkpoints", "include_tkps": true, "layer_offloading": false, "learning_rate": 2e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": true, "load_in_8bit": false, "local_rank": 0, "logging_steps": 10, "lora_alpha": 64, "lora_dropout": 0.05, "lora_mlp_kernel": true, "lora_o_kernel": true, "lora_qkv_kernel": true, "lora_r": 32, "lora_target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj" ], "loraplus_lr_embedding": 1e-06, "loss_watchdog_patience": 3, "loss_watchdog_threshold": 5.0, "lr_scheduler": "cosine", "max_grad_norm": 1.0, "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 10, "model_config_type": "qwen3", "num_epochs": 2.0, "num_generation_samples": 3, "optimizer": "adamw_torch_4bit", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./outputs/finance-synthetic-sft-phase2", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "relora_prune_method": "magnitude", "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.16666666666666666, "save_total_limit": 3, "saves_per_epoch": 3, "seed": 42, "sequence_len": 8192, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Qwen/Qwen3-8B", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "warmup_ratio": 0.05, "weight_decay": 0.01, "world_size": 1 } [2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26066] EOS: 151645 / <|im_end|> [2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26066] BOS: None / None [2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26066] PAD: 151643 / <|endoftext|> [2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26066] UNK: None / None [2026-05-14 13:57:45,657] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:477] [PID:26066] Loading prepared dataset from disk at last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d... [2026-05-14 13:57:45,727] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:26066] total_num_tokens: 23_382_259 [2026-05-14 13:57:45,926] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:26066] `total_supervised_tokens: 11_016_035` [2026-05-14 13:57:46,079] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially. [2026-05-14 13:57:47,437] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially. [2026-05-14 13:57:47,736] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.30095791816711426 [2026-05-14 13:57:47,738] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially. [2026-05-14 13:57:48,034] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.29753828048706055 [2026-05-14 13:57:48,036] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially. [2026-05-14 13:57:48,309] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.27533483505249023 [2026-05-14 13:57:48,312] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially. [2026-05-14 13:57:48,612] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.3027362823486328 [2026-05-14 13:57:48,657] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26066] gather_len_batches: [287] [2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:26066] data_loader_len: 35 [2026-05-14 13:57:48,658] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:26066] sample_packing_eff_est across ranks: [0.9945225306919643] [2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:26066] sample_packing_eff_est: 1.0 [2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:26066] total_num_steps: 70 [2026-05-14 13:57:48,658] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:26066] Maximum number of steps set at 70 [2026-05-14 13:57:48,706] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:26066] loading tokenizer... Qwen/Qwen3-8B [2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26066] EOS: 151645 / <|im_end|> [2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26066] BOS: None / None [2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26066] PAD: 151643 / <|endoftext|> [2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26066] UNK: None / None [2026-05-14 13:57:49,701] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:26066] Loading model [2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:26066] Patched OptimState8bit for torch.compile compatibility [2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:26066] Patched OptimState4bit for torch.compile compatibility [2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:26066] Patched OptimStateFp8 for torch.compile compatibility [2026-05-14 13:57:49,826] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:26066] Patched Trainer.evaluation_loop with nanmean loss calculation [2026-05-14 13:57:49,827] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:26066] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2026-05-14 13:57:49,830] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:26066] Applying multipack dataloader patch for sample packing... [2026-05-14 13:57:49,830] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:26066] Cannot patch self-attention - requires no dropout [2026-05-14 13:57:49,864] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:26066] Applying Cut Cross Entropy to model type: qwen3 model.safetensors.index.json: 0.00B [00:00, ?B/s] model.safetensors.index.json: 32.9kB [00:00, 46.7MB/s] Downloading (incomplete total...): 0.00B [00:00, ?B/s] Fetching 5 files: 0%| | 0/5 [00:00", line 198, in _run_module_as_main File "", line 88, in _run_code File "/workspace/axolotl/src/axolotl/cli/train.py", line 145, in fire.Fire(do_cli) File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire component_trace = _Fire(component, args, parsed_flag_args, context, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire component, remaining_args = _CallAndUpdateTrace( ^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace component = fn(*varargs, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/cli/train.py", line 96, in do_cli do_train(parsed_cfg, parsed_cli_args) File "/workspace/axolotl/src/axolotl/cli/train.py", line 50, in do_train model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/train.py", line 628, in train execute_training(cfg, trainer, resume_from_checkpoint) File "/workspace/axolotl/src/axolotl/train.py", line 227, in execute_training trainer.train(resume_from_checkpoint=resume_from_checkpoint) File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1425, in train return inner_training_loop( ^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1507, in _inner_training_loop self._run_epoch( File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1735, in _run_epoch tr_loss_step = self.training_step(model, inputs, num_items_in_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/core/trainers/mixins/layer_offloading.py", line 304, in training_step return super().training_step(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/core/trainers/mixins/activation_checkpointing.py", line 65, in training_step return super().training_step(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1907, in training_step loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/core/trainers/base.py", line 456, in compute_loss return super().compute_loss( ^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1979, in compute_loss outputs = model(**inputs) ^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 823, in forward return model_forward(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 811, in __call__ return convert_to_fp32(self.model_forward(*args, **kwargs)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/peft_model.py", line 1993, in forward return self.base_model( ^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 330, in forward return self.model.forward(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 53, in cce_forward outputs: BaseModelOutputWithPast = self.model( ^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 952, in wrapper output = func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/utils/output_capturing.py", line 248, in wrapper outputs = func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 424, in forward hidden_states = decoder_layer( ^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 92, in __call__ return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/_compile.py", line 53, in inner return disable_fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn return fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint ret = function(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 332, in forward hidden_states = self.mlp(hidden_states) ^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 729, in apply_lora_mlp_swiglu out = LoRA_MLP.apply( ^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/autograd/function.py", line 581, in apply return super().apply(*args, **kwargs) # type: ignore[misc] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 527, in decorate_fwd return fwd(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 414, in forward output = matmul_lora( ^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 273, in matmul_lora out += s * X_lora @ A @ B ~~^~~~~~~~ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.88 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.30 GiB is free. Process 137982 has 38.18 GiB memory in use. Of the allocated memory 35.49 GiB is allocated by PyTorch, and 2.20 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) Exception in thread Thread-5 (_pin_memory_loop): Traceback (most recent call last): File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/threading.py", line 1075, in _bootstrap_inner  self.run() File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/threading.py", line 1012, in run 0%| | 0/70 [00:11 [2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26498] BOS: None / None [2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26498] PAD: 151643 / <|endoftext|> [2026-05-14 14:04:51,742] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26498] UNK: None / None [2026-05-14 14:04:51,742] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:26498] Unable to find prepared dataset in last_run_prepared/8e970b09b0233ad980a67dcca6703606 [2026-05-14 14:04:51,742] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:26498] Loading raw datasets... [2026-05-14 14:04:51,742] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:26498] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. Downloading (incomplete total...): 0.00B [00:00, ?B/s] Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s] Download complete: : 0.00B [00:00, ?B/s] Download complete: : 0.00B [00:00, ?B/s] [2026-05-14 14:04:53,700] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:26498] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None [2026-05-14 14:04:53,702] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:26498] Using chat template: --- {%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Determine the real last index: use provided value or default to messages length - 1 #} {%- if real_last_index is defined and real_last_index is not none %} {%- set ns.real_last_index = real_last_index %} {%- else %} {%- set ns.real_last_index = messages|length - 1 %} {%- endif %} {%- for message in messages[::-1] %} {%- set index = (messages|length - 1) - loop.index0 %} {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} {%- set ns.multi_step_tool = false %} {%- set ns.last_query_index = index %} {%- endif %} {%- endfor %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content %} {%- set reasoning_content = '' %} {%- if message.reasoning_content is defined and message.reasoning_content is not none %} {%- set reasoning_content = message.reasoning_content %} {%- else %} {%- if '' in message.content %} {%- set content = message.content.split('')[-1].lstrip('\n') %} {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} {%- endif %} {%- endif %} {%- if loop.index0 > ns.last_query_index %} {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %} {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- else %} {{- '<|im_start|>' + message.role + '\n' + content }} {%- endif %} {%- if message.tool_calls %} {%- for tool_call in message.tool_calls %} {%- if (loop.first and content) or (not loop.first) %} {{- '\n' }} {%- endif %} {%- if tool_call.function %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {%- if tool_call.arguments is string %} {{- tool_call.arguments }} {%- else %} {{- tool_call.arguments | tojson }} {%- endif %} {{- '}\n' }} {%- endfor %} {%- endif %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n\n' }} {{- message.content }} {{- '\n' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- if enable_thinking is defined and enable_thinking is false %} {{- '\n\n\n\n' }} {%- else %} {{- '\n\n' }} {%- endif %} {%- endif %} --- Tokenizing Prompts (num_proc=1): 0%| | 0/14763 [00:006144) (num_proc=1): 0%| | 0/14763 [00:006144) (num_proc=1): 7%|▏ | 1000/14763 [00:00<00:09, 1453.44 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 14%|▍ | 2000/14763 [00:01<00:08, 1568.36 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 20%|▌ | 3000/14763 [00:01<00:07, 1627.08 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 27%|▊ | 4000/14763 [00:02<00:06, 1669.29 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 34%|█ | 5000/14763 [00:03<00:05, 1674.26 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 41%|█▏ | 6000/14763 [00:03<00:05, 1693.46 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 47%|█▍ | 7000/14763 [00:04<00:04, 1681.13 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 54%|█▋ | 8000/14763 [00:04<00:04, 1678.51 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 61%|█▊ | 9000/14763 [00:05<00:03, 1687.52 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 68%|█▎| 10000/14763 [00:05<00:02, 1704.69 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 75%|█▍| 11000/14763 [00:06<00:02, 1705.31 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 81%|█▋| 12000/14763 [00:07<00:01, 1711.63 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 88%|█▊| 13000/14763 [00:07<00:01, 1707.70 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 95%|█▉| 14000/14763 [00:08<00:00, 1700.81 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1708.72 examples/s] Dropping Invalid Sequences (6144) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1659.28 examples/s] Drop Samples with Zero Trainable Tokens (num_proc=1): 0%| | 0/14763 [00:00 [2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26498] BOS: None / None [2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26498] PAD: 151643 / <|endoftext|> [2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26498] UNK: None / None [2026-05-14 14:09:15,875] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:26498] Loading model [2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:26498] Patched OptimState8bit for torch.compile compatibility [2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:26498] Patched OptimState4bit for torch.compile compatibility [2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:26498] Patched OptimStateFp8 for torch.compile compatibility [2026-05-14 14:09:15,972] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:26498] Patched Trainer.evaluation_loop with nanmean loss calculation [2026-05-14 14:09:15,973] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:26498] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2026-05-14 14:09:15,975] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:26498] Applying multipack dataloader patch for sample packing... [2026-05-14 14:09:15,975] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:26498] Cannot patch self-attention - requires no dropout [2026-05-14 14:09:15,999] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:26498] Applying Cut Cross Entropy to model type: qwen3 Loading weights: 0%| | 0/399 [00:00 lambda signum, frame: terminate_handler(signum, frame, _model_weakref), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl/src/axolotl/train.py", line 171, in terminate_handler _model.save_pretrained(cfg.output_dir) File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/peft_model.py", line 294, in save_pretrained output_state_dict = get_peft_model_state_dict( ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/save_and_load.py", line 111, in get_peft_model_state_dict state_dict = model.state_dict() ^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict module.state_dict( File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict module.state_dict( File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict module.state_dict( [Previous line repeated 5 more times] File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2262, in state_dict self._save_to_state_dict(destination, prefix, keep_vars) File "/workspace/axolotl-venv/lib/python3.12/site-packages/bitsandbytes/nn/modules.py", line 526, in _save_to_state_dict for k, v in self.weight.quant_state.as_dict(packed=True).items(): ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/workspace/axolotl-venv/lib/python3.12/site-packages/bitsandbytes/functional.py", line 523, in as_dict "nested_quant_map": self.state2.code.clone(), # un-shared to avoid restoring it after shared tensors are removed by safetensors ^^^^^^^^^^^^^^^^^^^^^^^^ torch.AcceleratorError: CUDA error: initialization error Search for `cudaErrorInitializationError' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. [2026-05-14 14:16:00,926] [WARNING] [py.warnings._showwarnmsg:112] [PID:26498] /workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/other.py:1419: UserWarning: Unable to fetch remote file due to the following error DataLoader worker (pid 26631) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace. - silently ignoring the lookup for the file config.json in Qwen/Qwen3-8B. warnings.warn( [2026-05-14 14:16:00,927] [WARNING] [py.warnings._showwarnmsg:112] [PID:26498] /workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:372: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified. warnings.warn( 2%|█▌ | 5/240 [06:33<5:08:32, 78.78s/it] [2026-05-14 14:16:47,206] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:27401] bf16 support detected, enabling for this configuration. [2026-05-14 14:16:47,436] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:27401] baseline 0.000GB () [2026-05-14 14:16:47,437] [INFO] [axolotl.cli.config.load_cfg:333] [PID:27401] config: { "activation_offloading": false, "adapter": "qlora", "attn_implementation": "flash_attention_2", "attn_needs_dtype_cast": true, "attn_supports_packing": true, "attn_uses_flash_lib": true, "auto_resume_from_checkpoints": true, "axolotl_config_path": "./data/config.yaml", "base_model": "Qwen/Qwen3-8B", "base_model_config": "Qwen/Qwen3-8B", "batch_size": 64, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 1, "n_node": 1, "tf32": true }, "chat_template": "qwen3", "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 1, "dataset_prepared_path": "last_run_prepared", "datasets": [ { "chat_template": "tokenizer_default", "field_messages": "messages", "field_tools": "tools", "message_property_mappings": { "content": "content", "role": "role" }, "path": "Gandalf1/indian-finance-synthetic-phase2-cleaned", "roles_to_train": [ "assistant" ], "train_on_eos": "turn", "trust_remote_code": false, "type": "chat_template" } ], "ddp": false, "device": "cuda:0", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.9.1" }, "eval_batch_size": 8, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_table_size": 0, "experimental_skip_move_to_device": true, "fp16": false, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": false }, "hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2", "hub_strategy": "all_checkpoints", "include_tkps": true, "layer_offloading": false, "learning_rate": 2e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": true, "load_in_8bit": false, "local_rank": 0, "logging_steps": 10, "lora_alpha": 64, "lora_dropout": 0.05, "lora_mlp_kernel": true, "lora_o_kernel": true, "lora_qkv_kernel": true, "lora_r": 32, "lora_target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj" ], "loraplus_lr_embedding": 1e-06, "loss_watchdog_patience": 3, "loss_watchdog_threshold": 5.0, "lr_scheduler": "cosine", "max_grad_norm": 1.0, "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 8, "model_config_type": "qwen3", "num_epochs": 2.0, "num_generation_samples": 3, "optimizer": "adamw_torch_4bit", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./outputs/finance-synthetic-sft-phase2", "pad_to_sequence_len": true, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "relora_prune_method": "magnitude", "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.16666666666666666, "save_total_limit": 3, "saves_per_epoch": 3, "seed": 42, "sequence_len": 6144, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Qwen/Qwen3-8B", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "warmup_ratio": 0.05, "weight_decay": 0.01, "world_size": 1 } [2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:27401] EOS: 151645 / <|im_end|> [2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:27401] BOS: None / None [2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:27401] PAD: 151643 / <|endoftext|> [2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:27401] UNK: None / None [2026-05-14 14:16:48,544] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:477] [PID:27401] Loading prepared dataset from disk at last_run_prepared/8e970b09b0233ad980a67dcca6703606... [2026-05-14 14:16:48,619] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:27401] total_num_tokens: 23_382_259 [2026-05-14 14:16:48,820] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:27401] `total_supervised_tokens: 11_016_035` [2026-05-14 14:16:48,973] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially. [2026-05-14 14:16:49,950] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially. [2026-05-14 14:16:50,214] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2658224105834961 [2026-05-14 14:16:50,216] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially. [2026-05-14 14:16:50,480] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26596903800964355 [2026-05-14 14:16:50,483] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially. [2026-05-14 14:16:50,747] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26615142822265625 [2026-05-14 14:16:50,749] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially. [2026-05-14 14:16:51,014] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26613616943359375 [2026-05-14 14:16:51,076] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27401] gather_len_batches: [478] [2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:27401] data_loader_len: 59 [2026-05-14 14:16:51,076] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:27401] sample_packing_eff_est across ranks: [0.9952160610480953] [2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:27401] sample_packing_eff_est: 1.0 [2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:27401] total_num_steps: 118 [2026-05-14 14:16:51,076] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:27401] Maximum number of steps set at 118 [2026-05-14 14:16:51,107] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:27401] loading tokenizer... Qwen/Qwen3-8B [2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:27401] EOS: 151645 / <|im_end|> [2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:27401] BOS: None / None [2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:27401] PAD: 151643 / <|endoftext|> [2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:27401] UNK: None / None [2026-05-14 14:16:52,139] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:27401] Loading model [2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:27401] Patched OptimState8bit for torch.compile compatibility [2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:27401] Patched OptimState4bit for torch.compile compatibility [2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:27401] Patched OptimStateFp8 for torch.compile compatibility [2026-05-14 14:16:52,231] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:27401] Patched Trainer.evaluation_loop with nanmean loss calculation [2026-05-14 14:16:52,232] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:27401] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2026-05-14 14:16:52,233] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:27401] Applying multipack dataloader patch for sample packing... [2026-05-14 14:16:52,233] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:27401] Cannot patch self-attention - requires no dropout [2026-05-14 14:16:52,248] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:27401] Applying Cut Cross Entropy to model type: qwen3 Loading weights: 0%| | 0/399 [00:00