1881 lines
217 KiB
Plaintext
1881 lines
217 KiB
Plaintext
|
|
[2026-05-14 13:44:44,807] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:11984] bf16 support detected, enabling for this configuration.
|
|||
|
|
[2026-05-14 13:44:45,352] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:11984] baseline 0.000GB ()
|
|||
|
|
[2026-05-14 13:44:45,353] [INFO] [axolotl.cli.config.load_cfg:333] [PID:11984] config:
|
|||
|
|
{
|
|||
|
|
"activation_offloading": false,
|
|||
|
|
"adapter": "qlora",
|
|||
|
|
"attn_implementation": "flash_attention_2",
|
|||
|
|
"attn_needs_dtype_cast": true,
|
|||
|
|
"attn_supports_packing": true,
|
|||
|
|
"attn_uses_flash_lib": true,
|
|||
|
|
"auto_resume_from_checkpoints": true,
|
|||
|
|
"axolotl_config_path": "./data/config.yaml",
|
|||
|
|
"base_model": "Qwen/Qwen3-8B",
|
|||
|
|
"base_model_config": "Qwen/Qwen3-8B",
|
|||
|
|
"batch_size": 80,
|
|||
|
|
"bf16": true,
|
|||
|
|
"capabilities": {
|
|||
|
|
"bf16": true,
|
|||
|
|
"compute_capability": "sm_80",
|
|||
|
|
"fp8": false,
|
|||
|
|
"n_gpu": 1,
|
|||
|
|
"n_node": 1,
|
|||
|
|
"tf32": true
|
|||
|
|
},
|
|||
|
|
"chat_template": "qwen3",
|
|||
|
|
"context_parallel_size": 1,
|
|||
|
|
"cut_cross_entropy": true,
|
|||
|
|
"dataloader_num_workers": 1,
|
|||
|
|
"dataloader_pin_memory": true,
|
|||
|
|
"dataloader_prefetch_factor": 256,
|
|||
|
|
"dataset_num_proc": 128,
|
|||
|
|
"dataset_prepared_path": "last_run_prepared",
|
|||
|
|
"datasets": [
|
|||
|
|
{
|
|||
|
|
"chat_template": "tokenizer_default",
|
|||
|
|
"field_messages": "messages",
|
|||
|
|
"field_tools": "tools",
|
|||
|
|
"message_property_mappings": {
|
|||
|
|
"content": "content",
|
|||
|
|
"role": "role"
|
|||
|
|
},
|
|||
|
|
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
|
|||
|
|
"roles_to_train": [
|
|||
|
|
"assistant"
|
|||
|
|
],
|
|||
|
|
"train_on_eos": "turn",
|
|||
|
|
"trust_remote_code": false,
|
|||
|
|
"type": "chat_template"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"ddp": false,
|
|||
|
|
"device": "cuda:0",
|
|||
|
|
"device_map": "auto",
|
|||
|
|
"dion_rank_fraction": 1.0,
|
|||
|
|
"dion_rank_multiple_of": 1,
|
|||
|
|
"eaft_alpha": 1.0,
|
|||
|
|
"eaft_k": 20,
|
|||
|
|
"env_capabilities": {
|
|||
|
|
"torch_version": "2.9.1"
|
|||
|
|
},
|
|||
|
|
"eval_batch_size": 10,
|
|||
|
|
"eval_causal_lm_metrics": [
|
|||
|
|
"sacrebleu",
|
|||
|
|
"comet",
|
|||
|
|
"ter",
|
|||
|
|
"chrf"
|
|||
|
|
],
|
|||
|
|
"eval_max_new_tokens": 128,
|
|||
|
|
"eval_sample_packing": true,
|
|||
|
|
"eval_table_size": 0,
|
|||
|
|
"experimental_skip_move_to_device": true,
|
|||
|
|
"fp16": false,
|
|||
|
|
"generate_samples": false,
|
|||
|
|
"generation_do_sample": true,
|
|||
|
|
"generation_max_new_tokens": 50,
|
|||
|
|
"generation_prompt_ratio": 0.5,
|
|||
|
|
"generation_temperature": 0.7,
|
|||
|
|
"gradient_accumulation_steps": 8,
|
|||
|
|
"gradient_checkpointing": true,
|
|||
|
|
"gradient_checkpointing_kwargs": {
|
|||
|
|
"use_reentrant": false
|
|||
|
|
},
|
|||
|
|
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
|
|||
|
|
"hub_strategy": "all_checkpoints",
|
|||
|
|
"include_tkps": true,
|
|||
|
|
"is_preprocess": true,
|
|||
|
|
"layer_offloading": false,
|
|||
|
|
"learning_rate": 2e-05,
|
|||
|
|
"lisa_layers_attribute": "model.layers",
|
|||
|
|
"load_best_model_at_end": false,
|
|||
|
|
"load_in_4bit": true,
|
|||
|
|
"load_in_8bit": false,
|
|||
|
|
"local_rank": 0,
|
|||
|
|
"logging_steps": 10,
|
|||
|
|
"lora_alpha": 64,
|
|||
|
|
"lora_dropout": 0.05,
|
|||
|
|
"lora_mlp_kernel": true,
|
|||
|
|
"lora_o_kernel": true,
|
|||
|
|
"lora_qkv_kernel": true,
|
|||
|
|
"lora_r": 32,
|
|||
|
|
"lora_target_modules": [
|
|||
|
|
"q_proj",
|
|||
|
|
"k_proj",
|
|||
|
|
"v_proj",
|
|||
|
|
"o_proj",
|
|||
|
|
"gate_proj",
|
|||
|
|
"down_proj",
|
|||
|
|
"up_proj"
|
|||
|
|
],
|
|||
|
|
"loraplus_lr_embedding": 1e-06,
|
|||
|
|
"loss_watchdog_patience": 3,
|
|||
|
|
"loss_watchdog_threshold": 5.0,
|
|||
|
|
"lr_scheduler": "cosine",
|
|||
|
|
"max_grad_norm": 1.0,
|
|||
|
|
"mean_resizing_embeddings": false,
|
|||
|
|
"merge_method": "memory_efficient",
|
|||
|
|
"micro_batch_size": 10,
|
|||
|
|
"model_config_type": "qwen3",
|
|||
|
|
"num_epochs": 2.0,
|
|||
|
|
"num_generation_samples": 3,
|
|||
|
|
"optimizer": "adamw_torch_4bit",
|
|||
|
|
"otel_metrics_host": "localhost",
|
|||
|
|
"otel_metrics_port": 8000,
|
|||
|
|
"output_dir": "./outputs/finance-synthetic-sft-phase2",
|
|||
|
|
"pad_to_sequence_len": true,
|
|||
|
|
"plugins": [
|
|||
|
|
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
|
|||
|
|
],
|
|||
|
|
"pretrain_multipack_attn": true,
|
|||
|
|
"profiler_steps_start": 0,
|
|||
|
|
"qlora_sharded_model_loading": false,
|
|||
|
|
"quantize_moe_experts": false,
|
|||
|
|
"ray_num_workers": 1,
|
|||
|
|
"relora_prune_method": "magnitude",
|
|||
|
|
"resources_per_worker": {
|
|||
|
|
"GPU": 1
|
|||
|
|
},
|
|||
|
|
"sample_packing": true,
|
|||
|
|
"sample_packing_bin_size": 200,
|
|||
|
|
"sample_packing_group_size": 100000,
|
|||
|
|
"save_only_model": false,
|
|||
|
|
"save_safetensors": true,
|
|||
|
|
"save_steps": 0.16666666666666666,
|
|||
|
|
"save_total_limit": 3,
|
|||
|
|
"saves_per_epoch": 3,
|
|||
|
|
"seed": 42,
|
|||
|
|
"sequence_len": 8192,
|
|||
|
|
"shuffle_before_merging_datasets": false,
|
|||
|
|
"shuffle_merged_datasets": true,
|
|||
|
|
"skip_prepare_dataset": false,
|
|||
|
|
"streaming_multipack_buffer_size": 10000,
|
|||
|
|
"strict": false,
|
|||
|
|
"tensor_parallel_size": 1,
|
|||
|
|
"tf32": true,
|
|||
|
|
"tiled_mlp_use_original_mlp": true,
|
|||
|
|
"tokenizer_config": "Qwen/Qwen3-8B",
|
|||
|
|
"tokenizer_save_jinja_files": true,
|
|||
|
|
"torch_dtype": "torch.bfloat16",
|
|||
|
|
"train_on_inputs": false,
|
|||
|
|
"trl": {
|
|||
|
|
"async_prefetch": false,
|
|||
|
|
"log_completions": false,
|
|||
|
|
"mask_truncated_completions": false,
|
|||
|
|
"ref_model_mixup_alpha": 0.9,
|
|||
|
|
"ref_model_sync_steps": 64,
|
|||
|
|
"replay_buffer_size": 0,
|
|||
|
|
"replay_recompute_logps": true,
|
|||
|
|
"reroll_max_groups": 1,
|
|||
|
|
"reroll_start_fraction": 1.0,
|
|||
|
|
"reward_num_workers": 1,
|
|||
|
|
"scale_rewards": true,
|
|||
|
|
"skip_zero_advantage_batches": true,
|
|||
|
|
"sync_ref_model": false,
|
|||
|
|
"use_data_producer": false,
|
|||
|
|
"use_vllm": false,
|
|||
|
|
"vllm_lora_sync": false,
|
|||
|
|
"vllm_server_host": "0.0.0.0",
|
|||
|
|
"vllm_server_port": 8000
|
|||
|
|
},
|
|||
|
|
"use_otel_metrics": false,
|
|||
|
|
"use_ray": false,
|
|||
|
|
"val_set_size": 0.0,
|
|||
|
|
"vllm": {
|
|||
|
|
"device": "auto",
|
|||
|
|
"dtype": "auto",
|
|||
|
|
"gpu_memory_utilization": 0.9,
|
|||
|
|
"host": "0.0.0.0",
|
|||
|
|
"port": 8000
|
|||
|
|
},
|
|||
|
|
"warmup_ratio": 0.05,
|
|||
|
|
"weight_decay": 0.01,
|
|||
|
|
"world_size": 1
|
|||
|
|
}
|
|||
|
|
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:11984] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:11984] BOS: None / None
|
|||
|
|
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:11984] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 13:44:46,921] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:11984] UNK: None / None
|
|||
|
|
[2026-05-14 13:44:46,922] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:11984] Unable to find prepared dataset in last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d
|
|||
|
|
[2026-05-14 13:44:46,922] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:11984] Loading raw datasets...
|
|||
|
|
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
|
|||
|
|
Fetching 0 files: 0it [00:00, ?it/s][A
Fetching 0 files: 0it [00:00, ?it/s]
|
|||
|
|
Download complete: : 0.00B [00:00, ?B/s]
|
|||
|
|
README.md: 0.00B [00:00, ?B/s][A
README.md: 1.66kB [00:00, 6.58MB/s]
|
|||
|
|
Download complete: : 0.00B [00:00, ?B/s]
|
|||
|
|
data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00<?, ?B/s]
data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00<?, ?B/s]
data/train-00000-of-00001.parquet: 0%| | 0.00/29.6M [00:00<?, ?B/s]
data/train-00000-of-00001.parquet: 19%|██████▋ | 5.65M/29.6M [00:00<00:00, 28.1MB/s]
data/train-00000-of-00001.parquet: 85%|█████████████████████████████▊ | 25.2M/29.6M [00:00<00:00, 69.0MB/s]
data/train-00000-of-00001.parquet: 100%|███████████████████████████████████| 29.6M/29.6M [00:01<00:00, 28.9MB/s]
|
|||
|
|
Generating train split: 0%| | 0/14763 [00:00<?, ? examples/s]
Generating train split: 100%|███████████████████████████████████| 14763/14763 [00:00<00:00, 59899.50 examples/s]
Generating train split: 100%|███████████████████████████████████| 14763/14763 [00:00<00:00, 58973.94 examples/s]
|
|||
|
|
[2026-05-14 13:44:49,700] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:11984] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None
|
|||
|
|
[2026-05-14 13:44:49,704] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:11984] Using chat template:
|
|||
|
|
---
|
|||
|
|
{%- if tools %}
|
|||
|
|
{{- '<|im_start|>system\n' }}
|
|||
|
|
{%- if messages[0].role == 'system' %}
|
|||
|
|
{{- messages[0].content + '\n\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
|||
|
|
{%- for tool in tools %}
|
|||
|
|
{{- "\n" }}
|
|||
|
|
{{- tool | tojson }}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- if messages[0].role == 'system' %}
|
|||
|
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
|||
|
|
{#- Determine the real last index: use provided value or default to messages length - 1 #}
|
|||
|
|
{%- if real_last_index is defined and real_last_index is not none %}
|
|||
|
|
{%- set ns.real_last_index = real_last_index %}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- set ns.real_last_index = messages|length - 1 %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- for message in messages[::-1] %}
|
|||
|
|
{%- set index = (messages|length - 1) - loop.index0 %}
|
|||
|
|
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
|||
|
|
{%- set ns.multi_step_tool = false %}
|
|||
|
|
{%- set ns.last_query_index = index %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- for message in messages %}
|
|||
|
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
|||
|
|
{%- elif message.role == "assistant" %}
|
|||
|
|
{%- set content = message.content %}
|
|||
|
|
{%- set reasoning_content = '' %}
|
|||
|
|
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
|||
|
|
{%- set reasoning_content = message.reasoning_content %}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- if '</think>' in message.content %}
|
|||
|
|
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
|||
|
|
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if loop.index0 > ns.last_query_index %}
|
|||
|
|
{%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if message.tool_calls %}
|
|||
|
|
{%- for tool_call in message.tool_calls %}
|
|||
|
|
{%- if (loop.first and content) or (not loop.first) %}
|
|||
|
|
{{- '\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if tool_call.function %}
|
|||
|
|
{%- set tool_call = tool_call.function %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '<tool_call>\n{"name": "' }}
|
|||
|
|
{{- tool_call.name }}
|
|||
|
|
{{- '", "arguments": ' }}
|
|||
|
|
{%- if tool_call.arguments is string %}
|
|||
|
|
{{- tool_call.arguments }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- tool_call.arguments | tojson }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '}\n</tool_call>' }}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '<|im_end|>\n' }}
|
|||
|
|
{%- elif message.role == "tool" %}
|
|||
|
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
|||
|
|
{{- '<|im_start|>user' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '\n<tool_response>\n' }}
|
|||
|
|
{{- message.content }}
|
|||
|
|
{{- '\n</tool_response>' }}
|
|||
|
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
|||
|
|
{{- '<|im_end|>\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- if add_generation_prompt %}
|
|||
|
|
{{- '<|im_start|>assistant\n' }}
|
|||
|
|
{%- if enable_thinking is defined and enable_thinking is false %}
|
|||
|
|
{{- '<think>\n\n</think>\n\n' }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<think>\n\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
Tokenizing Prompts (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=128): 1%|▏ | 116/14763 [00:07<15:10, 16.08 examples/s]
Tokenizing Prompts (num_proc=128): 2%|▍ | 232/14763 [00:08<07:37, 31.73 examples/s]
Tokenizing Prompts (num_proc=128): 2%|▋ | 348/14763 [00:09<05:19, 45.05 examples/s]
Tokenizing Prompts (num_proc=128): 3%|▉ | 464/14763 [00:10<04:11, 56.86 examples/s]
Tokenizing Prompts (num_proc=128): 4%|█▏ | 580/14763 [00:12<03:33, 66.54 examples/s]
Tokenizing Prompts (num_proc=128): 5%|█▎ | 696/14763 [00:13<03:13, 72.63 examples/s]
Tokenizing Prompts (num_proc=128): 6%|█▌ | 812/14763 [00:14<02:59, 77.53 examples/s]
Tokenizing Prompts (num_proc=128): 6%|█▊ | 928/14763 [00:16<02:51, 80.80 examples/s]
Tokenizing Prompts (num_proc=128): 7%|█▉ | 1044/14763 [00:17<02:54, 78.42 examples/s]
Tokenizing Prompts (num_proc=128): 8%|██▏ | 1160/14763 [00:18<02:41, 84.20 examples/s]
Tokenizing Prompts (num_proc=128): 9%|██▍ | 1276/14763 [00:19<02:28, 91.02 examples/s]
Tokenizing Prompts (num_proc=128): 9%|██▋ | 1392/14763 [00:21<02:33, 87.25 examples/s]
Tokenizing Prompts (num_proc=128): 10%|██▊ | 1508/14763 [00:22<02:25, 90.92 examples/s]
Tokenizing Prompts (num_proc=128): 11%|███ | 1624/14763 [00:23<02:21, 92.95 examples/s]
Tokenizing Prompts (num_proc=128): 12%|███▎ | 1740/14763 [00:24<02:20, 92.68 examples/s]
Tokenizing Prompts (num_proc=128): 13%|███▌ | 1856/14763 [00:26<02:23, 89.77 examples/s]
Tokenizing Prompts (num_proc=128): 13%|███▋ | 1972/14763 [00:27<02:25, 87.76 examples/s]
Tokenizing Prompts (num_proc=128): 14%|███▉ | 2088/14763 [00:28<02:18, 91.45 examples/s]
Tokenizing Prompts (num_proc=128): 15%|████▏ | 2204/14763 [00:29<02:16, 92.15 examples/s]
Tokenizing Prompts (num_proc=128): 16%|████▍ | 2320/14763 [00:31<02:10, 95.70 examples/s]
Tokenizing Prompts (num_proc=128): 17%|████▌ | 2436/14763 [00:32<02:08, 95.82 examples/s]
Tokenizing Prompts (num_proc=128): 17%|████▊ | 2552/14763 [00:33<02:08, 95.13 examples/s]
Tokenizing Prompts (num_proc=128): 18%|█████ | 2668/14763 [00:34<02:14, 90.10 examples/s]
Tokenizing Prompts (num_proc=128): 19%|█████▎ | 2784/14763 [00:36<02:15, 88.73 examples/s]
Tokenizing Prompts (num_proc=128): 20%|█████▌ | 2900/14763 [00:37<02:05, 94.65 examples/s]
Tokenizing Prompts (num_proc=128): 20%|█████▋ | 3016/14763 [00:38<02:09, 90.38 examples/s]
Tokenizing Prompts (num_proc=128): 21%|█████▉ | 3132/14763 [00:39<02:04, 93.11 examples/s]
Tokenizing Prompts (num_proc=128): 22%|██████▏ | 3248/14763 [00:41<02:02, 94.23 examples/s]
Tokenizing Prompts (num_proc=128): 23%|██████▍ | 3364/14763 [00:42<01:58, 96.24 examples/s]
Tokenizing Prompts (num_proc=128): 24%|██████▌ | 3480/14763 [00:43<02:01, 93.09 examples/s]
Tokenizing Prompts (num_proc=128): 24%|██████▊ | 3596/14763 [00:44<01:55, 96.34 examples/s]
Tokenizing Prompts (num_proc=128): 25%|███████ | 3712/14763 [00:45<01:54, 96.79 examples/s]
Tokenizing Prompts (num_proc=128): 26%|███████▎ | 3828/14763 [00:47<01:
|
|||
|
|
[2026-05-14 13:47:44,920] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:11984] min_input_len: 591
|
|||
|
|
[2026-05-14 13:47:44,920] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:11984] max_input_len: 4338
|
|||
|
|
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 1%| | 116/14763 [00:04<10:09, 24.03 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 2%| | 348/14763 [00:04<02:40, 89.78 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 5%|▏ | 696/14763 [00:05<01:04, 217.48 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 8%|▏ | 1160/14763 [00:05<00:30, 439.33 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 10%|▏ | 1508/14763 [00:05<00:20, 641.12 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 13%|▎ | 1856/14763 [00:05<00:14, 871.04 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 16%|▏| 2320/14763 [00:05<00:10, 1213.83 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 56%|▌| 8208/14763 [00:05<00:00, 8610.30 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=128): 100%|█| 14763/14763 [00:06<00:00, 2153.50 examples/s
|
|||
|
|
Drop Samples with Zero Trainable Tokens (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 1%| | 116/14763 [00:06<12:42, 19.22 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 2%|▏ | 348/14763 [00:06<03:20, 72.06 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 5%|▎ | 696/14763 [00:06<01:18, 178.63 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 6%|▍ | 928/14763 [00:06<00:51, 270.26 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 8%|▍ | 1160/14763 [00:06<00:34, 388.71 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 11%|▋ | 1624/14763 [00:06<00:18, 704.03 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 13%|▊ | 1972/14763 [00:06<00:13, 958.47 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 16%|▊ | 2320/14763 [00:06<00:09, 1257.68 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 18%|▉ | 2668/14763 [00:06<00:07, 1537.53 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 20%|█ | 3016/14763 [00:07<00:06, 1697.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 23%|█▏ | 3364/14763 [00:07<00:05, 1936.50 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 25%|█▎ | 3712/14763 [00:07<00:05, 2089.13 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 28%|█▍ | 4176/14763 [00:07<00:04, 2513.82 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 31%|█▌ | 4524/14763 [00:07<00:03, 2716.76 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 98%|██▉| 14533/14763 [00:07<00:00, 25159.95 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=128): 100%|████| 14763/14763 [00:08<00:00, 1708.40 examples/s]
|
|||
|
|
Add position_id column (Sample Packing) (num_proc=128): 0%| | 0/14763 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=128): 1%| | 116/14763 [00:06<13:54, 17.56 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 2%|▏ | 232/14763 [00:06<05:48, 41.74 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 4%|▎ | 580/14763 [00:06<01:41, 139.80 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 6%|▍ | 928/14763 [00:06<00:51, 269.75 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 9%|▌ | 1276/14763 [00:07<00:31, 432.17 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 10%|▌ | 1508/14763 [00:07<00:24, 548.53 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 12%|▋ | 1740/14763 [00:07<00:18, 693.70 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 13%|▊ | 1972/14763 [00:07<00:14, 863.55 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 15%|▋ | 2204/14763 [00:07<00:11, 1055.77 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 32%|█▌ | 4756/14763 [00:07<00:02, 4976.24 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 61%|██▍ | 9013/14763 [00:07<00:00, 11982.32 examples/s]
Add position_id column (Sample Packing) (num_proc=128): 100%|████| 14763/14763 [00:08<00:00, 1660.74 examples/s]
|
|||
|
|
Saving the dataset (0/57 shards): 0%| | 0/14763 [00:00<?, ? examples/s]
Saving the dataset (0/57 shards): 2%|▌ | 259/14763 [00:58<54:53, 4.40 examples/s]
Saving the dataset (0/57 shards): 2%|▌ | 259/14763 [01:00<56:01, 4.31 examples/s]
|
|||
|
|
[2026-05-14 13:49:12,125] [ERROR] [axolotl.telemetry.errors.wrapper:158] [PID:11984] Error captured in telemetry. Run ID: 13cb3f8e-4ac0-4cd8-8c6a-299c044c5614
|
|||
|
|
Traceback (most recent call last):
|
|||
|
|
File "/workspace/axolotl-venv/bin/axolotl", line 12, in <module>
|
|||
|
|
sys.exit(main())
|
|||
|
|
^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/main.py", line 456, in main
|
|||
|
|
cli()
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1485, in __call__
|
|||
|
|
return self.main(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1406, in main
|
|||
|
|
rv = self.invoke(ctx)
|
|||
|
|
^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1873, in invoke
|
|||
|
|
return _process_result(sub_ctx.command.invoke(sub_ctx))
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 1269, in invoke
|
|||
|
|
return ctx.invoke(self.callback, **ctx.params)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/click/core.py", line 824, in invoke
|
|||
|
|
return callback(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/utils/args.py", line 48, in wrapper
|
|||
|
|
return func(*args, **filtered_kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/main.py", line 75, in preprocess
|
|||
|
|
do_cli(config=config, **kwargs)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/preprocess.py", line 120, in do_cli
|
|||
|
|
do_preprocess(parsed_cfg, parsed_cli_args)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
|
|||
|
|
return func(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/preprocess.py", line 74, in do_preprocess
|
|||
|
|
load_datasets(cfg=cfg, cli_args=cli_args)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
|
|||
|
|
return func(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/common/datasets.py", line 61, in load_datasets
|
|||
|
|
train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
|
|||
|
|
^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/utils.py", line 50, in wrapper
|
|||
|
|
return func(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 65, in prepare_datasets
|
|||
|
|
return _prepare_standard_dataset(cfg, tokenizer, processor)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 98, in _prepare_standard_dataset
|
|||
|
|
train_dataset, eval_dataset, prompters = loader.load(_load_datasets)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/lock.py", line 38, in load
|
|||
|
|
result = load_fn()
|
|||
|
|
^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 77, in _load_datasets
|
|||
|
|
train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 496, in _load_and_prepare_datasets
|
|||
|
|
dataset, prompters = _load_tokenized_prepared_datasets(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 299, in _load_tokenized_prepared_datasets
|
|||
|
|
dataset, prompters = _load_raw_datasets(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 364, in _load_raw_datasets
|
|||
|
|
save_preprocessed_dataset(cfg, dataset, dataset_hash, split)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/utils/data/shared.py", line 440, in save_preprocessed_dataset
|
|||
|
|
dataset.save_to_disk(
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/datasets/arrow_dataset.py", line 1909, in save_to_disk
|
|||
|
|
for job_id, done, content in iflatmap_unordered(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/datasets/utils/py_utils.py", line 617, in iflatmap_unordered
|
|||
|
|
raise RuntimeError(
|
|||
|
|
RuntimeError: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.
|
|||
|
|
[2026-05-14 13:52:16,708] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:25326] bf16 support detected, enabling for this configuration.
|
|||
|
|
[2026-05-14 13:52:17,989] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:25326] baseline 0.000GB ()
|
|||
|
|
[2026-05-14 13:52:17,990] [INFO] [axolotl.cli.config.load_cfg:333] [PID:25326] config:
|
|||
|
|
{
|
|||
|
|
"activation_offloading": false,
|
|||
|
|
"adapter": "qlora",
|
|||
|
|
"attn_implementation": "flash_attention_2",
|
|||
|
|
"attn_needs_dtype_cast": true,
|
|||
|
|
"attn_supports_packing": true,
|
|||
|
|
"attn_uses_flash_lib": true,
|
|||
|
|
"auto_resume_from_checkpoints": true,
|
|||
|
|
"axolotl_config_path": "./data/config.yaml",
|
|||
|
|
"base_model": "Qwen/Qwen3-8B",
|
|||
|
|
"base_model_config": "Qwen/Qwen3-8B",
|
|||
|
|
"batch_size": 80,
|
|||
|
|
"bf16": true,
|
|||
|
|
"capabilities": {
|
|||
|
|
"bf16": true,
|
|||
|
|
"compute_capability": "sm_80",
|
|||
|
|
"fp8": false,
|
|||
|
|
"n_gpu": 1,
|
|||
|
|
"n_node": 1,
|
|||
|
|
"tf32": true
|
|||
|
|
},
|
|||
|
|
"chat_template": "qwen3",
|
|||
|
|
"context_parallel_size": 1,
|
|||
|
|
"cut_cross_entropy": true,
|
|||
|
|
"dataloader_num_workers": 1,
|
|||
|
|
"dataloader_pin_memory": true,
|
|||
|
|
"dataloader_prefetch_factor": 256,
|
|||
|
|
"dataset_num_proc": 1,
|
|||
|
|
"dataset_prepared_path": "last_run_prepared",
|
|||
|
|
"datasets": [
|
|||
|
|
{
|
|||
|
|
"chat_template": "tokenizer_default",
|
|||
|
|
"field_messages": "messages",
|
|||
|
|
"field_tools": "tools",
|
|||
|
|
"message_property_mappings": {
|
|||
|
|
"content": "content",
|
|||
|
|
"role": "role"
|
|||
|
|
},
|
|||
|
|
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
|
|||
|
|
"roles_to_train": [
|
|||
|
|
"assistant"
|
|||
|
|
],
|
|||
|
|
"train_on_eos": "turn",
|
|||
|
|
"trust_remote_code": false,
|
|||
|
|
"type": "chat_template"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"ddp": false,
|
|||
|
|
"device": "cuda:0",
|
|||
|
|
"device_map": "auto",
|
|||
|
|
"dion_rank_fraction": 1.0,
|
|||
|
|
"dion_rank_multiple_of": 1,
|
|||
|
|
"eaft_alpha": 1.0,
|
|||
|
|
"eaft_k": 20,
|
|||
|
|
"env_capabilities": {
|
|||
|
|
"torch_version": "2.9.1"
|
|||
|
|
},
|
|||
|
|
"eval_batch_size": 10,
|
|||
|
|
"eval_causal_lm_metrics": [
|
|||
|
|
"sacrebleu",
|
|||
|
|
"comet",
|
|||
|
|
"ter",
|
|||
|
|
"chrf"
|
|||
|
|
],
|
|||
|
|
"eval_max_new_tokens": 128,
|
|||
|
|
"eval_sample_packing": true,
|
|||
|
|
"eval_table_size": 0,
|
|||
|
|
"experimental_skip_move_to_device": true,
|
|||
|
|
"fp16": false,
|
|||
|
|
"generate_samples": false,
|
|||
|
|
"generation_do_sample": true,
|
|||
|
|
"generation_max_new_tokens": 50,
|
|||
|
|
"generation_prompt_ratio": 0.5,
|
|||
|
|
"generation_temperature": 0.7,
|
|||
|
|
"gradient_accumulation_steps": 8,
|
|||
|
|
"gradient_checkpointing": true,
|
|||
|
|
"gradient_checkpointing_kwargs": {
|
|||
|
|
"use_reentrant": false
|
|||
|
|
},
|
|||
|
|
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
|
|||
|
|
"hub_strategy": "all_checkpoints",
|
|||
|
|
"include_tkps": true,
|
|||
|
|
"is_preprocess": true,
|
|||
|
|
"layer_offloading": false,
|
|||
|
|
"learning_rate": 2e-05,
|
|||
|
|
"lisa_layers_attribute": "model.layers",
|
|||
|
|
"load_best_model_at_end": false,
|
|||
|
|
"load_in_4bit": true,
|
|||
|
|
"load_in_8bit": false,
|
|||
|
|
"local_rank": 0,
|
|||
|
|
"logging_steps": 10,
|
|||
|
|
"lora_alpha": 64,
|
|||
|
|
"lora_dropout": 0.05,
|
|||
|
|
"lora_mlp_kernel": true,
|
|||
|
|
"lora_o_kernel": true,
|
|||
|
|
"lora_qkv_kernel": true,
|
|||
|
|
"lora_r": 32,
|
|||
|
|
"lora_target_modules": [
|
|||
|
|
"q_proj",
|
|||
|
|
"k_proj",
|
|||
|
|
"v_proj",
|
|||
|
|
"o_proj",
|
|||
|
|
"gate_proj",
|
|||
|
|
"down_proj",
|
|||
|
|
"up_proj"
|
|||
|
|
],
|
|||
|
|
"loraplus_lr_embedding": 1e-06,
|
|||
|
|
"loss_watchdog_patience": 3,
|
|||
|
|
"loss_watchdog_threshold": 5.0,
|
|||
|
|
"lr_scheduler": "cosine",
|
|||
|
|
"max_grad_norm": 1.0,
|
|||
|
|
"mean_resizing_embeddings": false,
|
|||
|
|
"merge_method": "memory_efficient",
|
|||
|
|
"micro_batch_size": 10,
|
|||
|
|
"model_config_type": "qwen3",
|
|||
|
|
"num_epochs": 2.0,
|
|||
|
|
"num_generation_samples": 3,
|
|||
|
|
"optimizer": "adamw_torch_4bit",
|
|||
|
|
"otel_metrics_host": "localhost",
|
|||
|
|
"otel_metrics_port": 8000,
|
|||
|
|
"output_dir": "./outputs/finance-synthetic-sft-phase2",
|
|||
|
|
"pad_to_sequence_len": true,
|
|||
|
|
"plugins": [
|
|||
|
|
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
|
|||
|
|
],
|
|||
|
|
"pretrain_multipack_attn": true,
|
|||
|
|
"profiler_steps_start": 0,
|
|||
|
|
"qlora_sharded_model_loading": false,
|
|||
|
|
"quantize_moe_experts": false,
|
|||
|
|
"ray_num_workers": 1,
|
|||
|
|
"relora_prune_method": "magnitude",
|
|||
|
|
"resources_per_worker": {
|
|||
|
|
"GPU": 1
|
|||
|
|
},
|
|||
|
|
"sample_packing": true,
|
|||
|
|
"sample_packing_bin_size": 200,
|
|||
|
|
"sample_packing_group_size": 100000,
|
|||
|
|
"save_only_model": false,
|
|||
|
|
"save_safetensors": true,
|
|||
|
|
"save_steps": 0.16666666666666666,
|
|||
|
|
"save_total_limit": 3,
|
|||
|
|
"saves_per_epoch": 3,
|
|||
|
|
"seed": 42,
|
|||
|
|
"sequence_len": 8192,
|
|||
|
|
"shuffle_before_merging_datasets": false,
|
|||
|
|
"shuffle_merged_datasets": true,
|
|||
|
|
"skip_prepare_dataset": false,
|
|||
|
|
"streaming_multipack_buffer_size": 10000,
|
|||
|
|
"strict": false,
|
|||
|
|
"tensor_parallel_size": 1,
|
|||
|
|
"tf32": true,
|
|||
|
|
"tiled_mlp_use_original_mlp": true,
|
|||
|
|
"tokenizer_config": "Qwen/Qwen3-8B",
|
|||
|
|
"tokenizer_save_jinja_files": true,
|
|||
|
|
"torch_dtype": "torch.bfloat16",
|
|||
|
|
"train_on_inputs": false,
|
|||
|
|
"trl": {
|
|||
|
|
"async_prefetch": false,
|
|||
|
|
"log_completions": false,
|
|||
|
|
"mask_truncated_completions": false,
|
|||
|
|
"ref_model_mixup_alpha": 0.9,
|
|||
|
|
"ref_model_sync_steps": 64,
|
|||
|
|
"replay_buffer_size": 0,
|
|||
|
|
"replay_recompute_logps": true,
|
|||
|
|
"reroll_max_groups": 1,
|
|||
|
|
"reroll_start_fraction": 1.0,
|
|||
|
|
"reward_num_workers": 1,
|
|||
|
|
"scale_rewards": true,
|
|||
|
|
"skip_zero_advantage_batches": true,
|
|||
|
|
"sync_ref_model": false,
|
|||
|
|
"use_data_producer": false,
|
|||
|
|
"use_vllm": false,
|
|||
|
|
"vllm_lora_sync": false,
|
|||
|
|
"vllm_server_host": "0.0.0.0",
|
|||
|
|
"vllm_server_port": 8000
|
|||
|
|
},
|
|||
|
|
"use_otel_metrics": false,
|
|||
|
|
"use_ray": false,
|
|||
|
|
"val_set_size": 0.0,
|
|||
|
|
"vllm": {
|
|||
|
|
"device": "auto",
|
|||
|
|
"dtype": "auto",
|
|||
|
|
"gpu_memory_utilization": 0.9,
|
|||
|
|
"host": "0.0.0.0",
|
|||
|
|
"port": 8000
|
|||
|
|
},
|
|||
|
|
"warmup_ratio": 0.05,
|
|||
|
|
"weight_decay": 0.01,
|
|||
|
|
"world_size": 1
|
|||
|
|
}
|
|||
|
|
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:25326] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:25326] BOS: None / None
|
|||
|
|
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:25326] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 13:52:19,357] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:25326] UNK: None / None
|
|||
|
|
[2026-05-14 13:52:19,358] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:25326] Unable to find prepared dataset in last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d
|
|||
|
|
[2026-05-14 13:52:19,358] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:25326] Loading raw datasets...
|
|||
|
|
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
|
|||
|
|
Fetching 0 files: 0it [00:00, ?it/s][A
Fetching 0 files: 0it [00:00, ?it/s]
|
|||
|
|
Download complete: : 0.00B [00:00, ?B/s]
Download complete: : 0.00B [00:00, ?B/s]
|
|||
|
|
[2026-05-14 13:52:20,634] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:25326] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None
|
|||
|
|
[2026-05-14 13:52:20,637] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:25326] Using chat template:
|
|||
|
|
---
|
|||
|
|
{%- if tools %}
|
|||
|
|
{{- '<|im_start|>system\n' }}
|
|||
|
|
{%- if messages[0].role == 'system' %}
|
|||
|
|
{{- messages[0].content + '\n\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
|||
|
|
{%- for tool in tools %}
|
|||
|
|
{{- "\n" }}
|
|||
|
|
{{- tool | tojson }}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- if messages[0].role == 'system' %}
|
|||
|
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
|||
|
|
{#- Determine the real last index: use provided value or default to messages length - 1 #}
|
|||
|
|
{%- if real_last_index is defined and real_last_index is not none %}
|
|||
|
|
{%- set ns.real_last_index = real_last_index %}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- set ns.real_last_index = messages|length - 1 %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- for message in messages[::-1] %}
|
|||
|
|
{%- set index = (messages|length - 1) - loop.index0 %}
|
|||
|
|
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
|||
|
|
{%- set ns.multi_step_tool = false %}
|
|||
|
|
{%- set ns.last_query_index = index %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- for message in messages %}
|
|||
|
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
|||
|
|
{%- elif message.role == "assistant" %}
|
|||
|
|
{%- set content = message.content %}
|
|||
|
|
{%- set reasoning_content = '' %}
|
|||
|
|
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
|||
|
|
{%- set reasoning_content = message.reasoning_content %}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- if '</think>' in message.content %}
|
|||
|
|
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
|||
|
|
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if loop.index0 > ns.last_query_index %}
|
|||
|
|
{%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if message.tool_calls %}
|
|||
|
|
{%- for tool_call in message.tool_calls %}
|
|||
|
|
{%- if (loop.first and content) or (not loop.first) %}
|
|||
|
|
{{- '\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if tool_call.function %}
|
|||
|
|
{%- set tool_call = tool_call.function %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '<tool_call>\n{"name": "' }}
|
|||
|
|
{{- tool_call.name }}
|
|||
|
|
{{- '", "arguments": ' }}
|
|||
|
|
{%- if tool_call.arguments is string %}
|
|||
|
|
{{- tool_call.arguments }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- tool_call.arguments | tojson }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '}\n</tool_call>' }}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '<|im_end|>\n' }}
|
|||
|
|
{%- elif message.role == "tool" %}
|
|||
|
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
|||
|
|
{{- '<|im_start|>user' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '\n<tool_response>\n' }}
|
|||
|
|
{{- message.content }}
|
|||
|
|
{{- '\n</tool_response>' }}
|
|||
|
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
|||
|
|
{{- '<|im_end|>\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- if add_generation_prompt %}
|
|||
|
|
{{- '<|im_start|>assistant\n' }}
|
|||
|
|
{%- if enable_thinking is defined and enable_thinking is false %}
|
|||
|
|
{{- '<think>\n\n</think>\n\n' }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<think>\n\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
Tokenizing Prompts (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:15<03:39, 62.81 examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:31<03:39, 62.81 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:31<03:17, 64.63 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:42<03:17, 64.63 examples/s]
Tokenizing Prompts (num_proc=1): 20%|██████ | 3000/14763 [00:46<03:00, 65.17 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:00<02:42, 66.31 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:11<02:42, 66.31 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:16<02:28, 65.94 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:31<02:28, 65.94 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:31<02:12, 66.36 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:42<02:12, 66.36 examples/s]
Tokenizing Prompts (num_proc=1): 47%|██████████████▏ | 7000/14763 [01:45<01:55, 67.19 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [01:59<01:38, 68.66 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [02:11<01:38, 68.66 examples/s]
Tokenizing Prompts (num_proc=1): 61%|██████████████████▎ | 9000/14763 [02:12<01:21, 70.88 examples/s]
Tokenizing Prompts (num_proc=1): 68%|███████████████████▋ | 10000/14763 [02:25<01:06, 72.15 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:39<00:51, 73.24 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:51<00:51, 73.24 examples/s]
Tokenizing Prompts (num_proc=1): 81%|███████████████████████▌ | 12000/14763 [02:52<00:37, 74.31 examples/s]
Tokenizing Prompts (num_proc=1): 88%|█████████████████████████▌ | 13000/14763 [03:05<00:23, 74.83 examples/s]
Tokenizing Prompts (num_proc=1): 95%|███████████████████████████▌ | 14000/14763 [03:18<00:10, 74.41 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:29<00:00, 74.24 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:29<00:00, 70.49 examples/s]
|
|||
|
|
[2026-05-14 13:55:57,931] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:25326] min_input_len: 591
|
|||
|
|
[2026-05-14 13:55:57,932] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:25326] max_input_len: 4338
|
|||
|
|
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 7%|▏ | 1000/14763 [00:00<00:09, 1431.08 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 14%|▍ | 2000/14763 [00:01<00:08, 1553.70 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 20%|▌ | 3000/14763 [00:01<00:07, 1608.36 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 27%|▊ | 4000/14763 [00:02<00:06, 1658.84 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 34%|█ | 5000/14763 [00:03<00:05, 1675.80 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 41%|█▏ | 6000/14763 [00:03<00:05, 1706.11 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 47%|█▍ | 7000/14763 [00:04<00:04, 1696.20 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 54%|█▋ | 8000/14763 [00:04<00:03, 1694.97 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 61%|█▊ | 9000/14763 [00:05<00:03, 1706.05 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 68%|█▎| 10000/14763 [00:05<00:02, 1713.44 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 75%|█▍| 11000/14763 [00:06<00:02, 1720.45 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 81%|█▋| 12000/14763 [00:07<00:01, 1728.08 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 88%|█▊| 13000/14763 [00:07<00:01, 1723.26 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 95%|█▉| 14000/14763 [00:08<00:00, 1718.49 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1725.22 examples/s]
Dropping Invalid Sequences (<None or >8192) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1665.67 examples/s]
|
|||
|
|
Drop Samples with Zero Trainable Tokens (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 7%|▍ | 1000/14763 [00:00<00:09, 1480.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 14%|▉ | 2000/14763 [00:01<00:07, 1624.88 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 20%|█▍ | 3000/14763 [00:01<00:06, 1691.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 27%|█▉ | 4000/14763 [00:02<00:06, 1693.88 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 34%|██▎ | 5000/14763 [00:02<00:05, 1702.30 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 41%|██▊ | 6000/14763 [00:03<00:05, 1728.09 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 47%|███▎ | 7000/14763 [00:04<00:04, 1714.08 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 54%|███▊ | 8000/14763 [00:04<00:03, 1706.22 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 61%|████▎ | 9000/14763 [00:05<00:03, 1738.85 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 68%|████ | 10000/14763 [00:05<00:02, 1742.02 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 75%|████▍ | 11000/14763 [00:06<00:02, 1720.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 81%|████▉ | 12000/14763 [00:07<00:01, 1726.95 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 88%|█████▎| 13000/14763 [00:07<00:01, 1728.68 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 95%|█████▋| 14000/14763 [00:08<00:00, 1737.40 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1747.36 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1688.84 examples/s]
|
|||
|
|
Add position_id column (Sample Packing) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=1): 7%|▌ | 1000/14763 [00:01<00:16, 853.52 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 14%|█ | 2000/14763 [00:02<00:13, 919.04 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 20%|█▍ | 3000/14763 [00:03<00:11, 1002.51 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 27%|█▉ | 4000/14763 [00:03<00:10, 1071.46 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 34%|██▎ | 5000/14763 [00:04<00:08, 1105.51 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 41%|██▊ | 6000/14763 [00:05<00:07, 1120.70 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 47%|███▎ | 7000/14763 [00:06<00:06, 1133.90 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 54%|███▊ | 8000/14763 [00:07<00:05, 1136.07 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 61%|████▎ | 9000/14763 [00:08<00:05, 1149.89 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 68%|████ | 10000/14763 [00:09<00:04, 1157.43 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 75%|████▍ | 11000/14763 [00:09<00:03, 1166.93 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 81%|████▉ | 12000/14763 [00:10<00:02, 1159.14 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 88%|█████▎| 13000/14763 [00:11<00:01, 1152.36 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 95%|█████▋| 14000/14763 [00:12<00:00, 1156.19 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1150.85 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1103.88 examples/s]
|
|||
|
|
Saving the dataset (0/1 shards): 0%| | 0/14763 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 14%|███▉ | 2000/14763 [00:08<00:51, 247.02 examples/s]
Saving the dataset (0/1 shards): 54%|███████████████▏ | 8000/14763 [00:08<00:05, 1273.03 examples/s]
Saving the dataset (0/1 shards): 95%|█████████████████████████▌ | 14000/14763 [00:08<00:00, 2655.38 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:08<00:00, 2655.38 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:09<00:00, 1547.31 examples/s]
|
|||
|
|
[2026-05-14 13:56:38,558] [INFO] [axolotl.common.datasets.load_datasets:74] [PID:25326] check_dataset_labels...
|
|||
|
|
[2026-05-14 13:56:38,572] [INFO] [axolotl.utils.tokenization.check_example_labels:44] [PID:25326] [31m<|im_start|>[0m[97m(-100, 151644)[0m [31msystem[0m[97m(-100, 8948)[0m [31m
|
|||
|
|
[0m[97m(-100, 198)[0m [31mYou[0m[97m(-100, 2610)[0m [31m are[0m[97m(-100, 525)[0m [31m B[0m[97m(-100, 425)[0m [31many[0m[97m(-100, 3767)[0m [31man[0m[97m(-100, 276)[0m [31mTree[0m[97m(-100, 6533)[0m [31m,[0m[97m(-100, 11)[0m [31m an[0m[97m(-100, 458)[0m [31m expert[0m[97m(-100, 6203)[0m [31m Indian[0m[97m(-100, 7748)[0m [31m personal[0m[97m(-100, 4345)[0m [31m finance[0m[97m(-100, 17017)[0m [31m assistant[0m[97m(-100, 17847)[0m [31m.[0m[97m(-100, 13)[0m [31m You[0m[97m(-100, 1446)[0m [31m have[0m[97m(-100, 614)[0m [31m access[0m[97m(-100, 2615)[0m [31m to[0m[97m(-100, 311)[0m [31m financial[0m[97m(-100, 5896)[0m [31m calculation[0m[97m(-100, 21937)[0m [31m tools[0m[97m(-100, 7375)[0m [31m.[0m[97m(-100, 13)[0m [31m Use[0m[97m(-100, 5443)[0m [31m them[0m[97m(-100, 1105)[0m [31m to[0m[97m(-100, 311)[0m [31m provide[0m[97m(-100, 3410)[0m [31m accurate[0m[97m(-100, 13382)[0m [31m,[0m[97m(-100, 11)[0m [31m personalized[0m[97m(-100, 34549)[0m [31m advice[0m[97m(-100, 9462)[0m [31m based[0m[97m(-100, 3118)[0m [31m on[0m[97m(-100, 389)[0m [31m FY[0m[97m(-100, 46366)[0m [31m [0m[97m(-100, 220)[0m [31m2[0m[97m(-100, 17)[0m [31m0[0m[97m(-100, 15)[0m [31m2[0m[97m(-100, 17)[0m [31m4[0m[97m(-100, 19)[0m [31m-[0m[97m(-100, 12)[0m [31m2[0m[97m(-100, 17)[0m [31m5[0m[97m(-100, 20)[0m [31m Indian[0m[97m(-100, 7748)[0m [31m tax[0m[97m(-100, 3742)[0m [31m rules[0m[97m(-100, 5601)[0m [31m and[0m[97m(-100, 323)[0m [31m current[0m[97m(-100, 1482)[0m [31m financial[0m[97m(-100, 5896)[0m [31m regulations[0m[97m(-100, 14305)[0m [31m.[0m[97m(-100, 13)[0m [31m Always[0m[97m(-100, 23240)[0m [31m show[0m[97m(-100, 1473)[0m [31m your[0m[97m(-100, 697)[0m [31m reasoning[0m[97m(-100, 32711)[0m [31m before[0m[97m(-100, 1573)[0m [31m taking[0m[97m(-100, 4633)[0m [31m action[0m[97m(-100, 1917)[0m [31m:[0m[97m(-100, 25)[0m [31m decom[0m[97m(-100, 28502)[0m [31mpose[0m[97m(-100, 2900)[0m [31m the[0m[97m(-100, 279)[0m [31m problem[0m[97m(-100, 3491)[0m [31m,[0m[97m(-100, 11)[0m [31m identify[0m[97m(-100, 10542)[0m [31m what[0m[97m(-100, 1128)[0m [31m information[0m[97m(-100, 1995)[0m [31m the[0m[97m(-100, 279)[0m [31m user[0m[97m(-100, 1196)[0m [31m provided[0m[97m(-100, 3897)[0m [31m vs[0m[97m(-100, 6165)[0m [31m what[0m[97m(-100, 1128)[0m [31m's[0m[97m(-100, 594)[0m [31m missing[0m[97m(-100, 7402)[0m [31m,[0m[97m(-100, 11)[0m [31m state[0m[97m(-100, 1584)[0m [31m any[0m[97m(-100, 894)[0m [31m assumptions[0m[97m(-100, 31846)[0m [31m explicitly[0m[97m(-100, 20975)[0m [31m,[0m[97m(-100, 11)[0m [31m then[0m[97m(-100, 1221)[0m [31m decide[0m[97m(-100, 10279)[0m [31m whether[0m[97m(-100, 3425)[0m [31m to[0m[97m(-100, 311)[0m [31m use[0m[97m(-100, 990)[0m [31m tools[0m[97m(-100, 7375)[0m [31m,[0m[97m(-100, 11)[0m [31m ask[0m[97m(-100, 2548)[0m [31m for[0m[97m(-100, 369)[0m [31m clarification[0m[97m(-100, 63684)[0m [31m,[0m[97m(-100, 11)[0m [31m or[0m[97m(-100, 476)[0m [31m answer[0m[97m(-100, 4226)[0m [31m directly[0m[97m(-100, 5961)[0m [31m.[0m[97m(-100, 13)[0m [31m Never[0m[97m(-100, 14695)[0m [31m guarantee[0m[97m(-100, 15440)[0m [31m returns[0m[97m(-100, 4675)[0m [31m on[0m[97m(-100, 389)[0m [31m market[0m[97m(-100, 3081)[0m [31m-linked[0m[97m(-100, 54414)[0m [31m instruments[0m[97m(-100, 23316)[0m [31m.[0m[97m(-100, 13)[0m [31m When[0m[97m(-100, 3197)[0m [31m information[0m[97m(-100, 1995)[0m [31m is[0m[97m(-100, 374)[0m [31m missing[0m[97m(-100, 7402)[0m [31m,[0m[97m(-100, 11)[0m [31m either[0m[97m(-100, 2987)[0m [31m ask[0m[97m(-100, 2548)[0m [31m the[0m[97m(-100, 279)[0m [31m user[0m[97m(-100, 1196)[0m [31m or[0m[97m(-100, 476)[0m [31m clearly[0m[97m(-100, 9355)[0m [31m state[0m[97m(-100, 1584)[0m [31m your[0
|
|||
|
|
|
|||
|
|
[0m[97m(-100, 382)[0m [31m#[0m[97m(-100, 2)[0m [31m Tools[0m[97m(-100, 13852)[0m [31m
|
|||
|
|
|
|||
|
|
[0m[97m(-100, 271)[0m [31mYou[0m[97m(-100, 2610)[0m [31m may[0m[97m(-100, 1231)[0m [31m call[0m[97m(-100, 1618)[0m [31m one[0m[97m(-100, 825)[0m [31m or[0m[97m(-100, 476)[0m [31m more[0m[97m(-100, 803)[0m [31m functions[0m[97m(-100, 5746)[0m [31m to[0m[97m(-100, 311)[0m [31m assist[0m[97m(-100, 7789)[0m [31m with[0m[97m(-100, 448)[0m [31m the[0m[97m(-100, 279)[0m [31m user[0m[97m(-100, 1196)[0m [31m query[0m[97m(-100, 3239)[0m [31m.
|
|||
|
|
|
|||
|
|
[0m[97m(-100, 382)[0m [31mYou[0m[97m(-100, 2610)[0m [31m are[0m[97m(-100, 525)[0m [31m provided[0m[97m(-100, 3897)[0m [31m with[0m[97m(-100, 448)[0m [31m function[0m[97m(-100, 729)[0m [31m signatures[0m[97m(-100, 32628)[0m [31m within[0m[97m(-100, 2878)[0m [31m <[0m[97m(-100, 366)[0m [31mtools[0m[97m(-100, 15918)[0m [31m></[0m[97m(-100, 1472)[0m [31mtools[0m[97m(-100, 15918)[0m [31m>[0m[97m(-100, 29)[0m [31m XML[0m[97m(-100, 11874)[0m [31m tags[0m[97m(-100, 9492)[0m [31m:
|
|||
|
|
[0m[97m(-100, 510)[0m [31m<[0m[97m(-100, 27)[0m [31mtools[0m[97m(-100, 15918)[0m [31m>
|
|||
|
|
[0m[97m(-100, 397)[0m [31m{"[0m[97m(-100, 4913)[0m [31mtype[0m[97m(-100, 1313)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mfunction[0m[97m(-100, 1688)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mfunction[0m[97m(-100, 1688)[0m [31m":[0m[97m(-100, 788)[0m [31m {"[0m[97m(-100, 5212)[0m [31mname[0m[97m(-100, 606)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mcalculate[0m[97m(-100, 35597)[0m [31m_s[0m[97m(-100, 643)[0m [31mip[0m[97m(-100, 573)[0m [31m_returns[0m[97m(-100, 58900)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mdescription[0m[97m(-100, 4684)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mCalculate[0m[97m(-100, 47866)[0m [31m the[0m[97m(-100, 279)[0m [31m future[0m[97m(-100, 3853)[0m [31m value[0m[97m(-100, 897)[0m [31m of[0m[97m(-100, 315)[0m [31m a[0m[97m(-100, 264)[0m [31m System[0m[97m(-100, 739)[0m [31matic[0m[97m(-100, 774)[0m [31m Investment[0m[97m(-100, 32250)[0m [31m Plan[0m[97m(-100, 9680)[0m [31m ([0m[97m(-100, 320)[0m [31mS[0m[97m(-100, 50)[0m [31mIP[0m[97m(-100, 3298)[0m [31m).[0m[97m(-100, 568)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mparameters[0m[97m(-100, 13786)[0m [31m":[0m[97m(-100, 788)[0m [31m {"[0m[97m(-100, 5212)[0m [31mtype[0m[97m(-100, 1313)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mobject[0m[97m(-100, 1700)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mproperties[0m[97m(-100, 13193)[0m [31m":[0m[97m(-100, 788)[0m [31m {"[0m[97m(-100, 5212)[0m [31mmonthly[0m[97m(-100, 69138)[0m [31m_amount[0m[97m(-100, 13471)[0m [31m":[0m[97m(-100, 788)[0m [31m {"[0m[97m(-100, 5212)[0m [31mtype[0m[97m(-100, 1313)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mnumber[0m[97m(-100, 4082)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mdescription[0m[97m(-100, 4684)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mMonthly[0m[97m(-100, 72007)[0m [31m SIP[0m[97m(-100, 65441)[0m [31m investment[0m[97m(-100, 9162)[0m [31m amount[0m[97m(-100, 3311)[0m [31m in[0m[97m(-100, 304)[0m [31m IN[0m[97m(-100, 1964)[0m [31mR[0m[97m(-100, 49)[0m [31m."[0m[97m(-100, 1189)[0m [31m},[0m[97m(-100, 2137)[0m [31m "[0m[97m(-100, 330)[0m [31mannual[0m[97m(-100, 63609)[0m [31m_return[0m[97m(-100, 12511)[0m [31m_pct[0m[97m(-100, 71512)[0m [31m":[0m[97m(-100, 788)[0m [31m {"[0m[97m(-100, 5212)[0m [31mtype[0m[97m(-100, 1313)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mnumber[0m[97m(-100, 4082)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mdescription[0m[97m(-100, 4684)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mExpected[0m[97m(-100, 18896)[0m [31m annual[0m[97m(-100, 9775)[0m [31m return[0m[97m(-100, 470)[0m [31m percentage[0m[97m(-100, 11414)[0m [31m."[0m[97m(-100, 1189)[0m [31m},[0m[97m(-100, 2137)[0m [31m "[0m[97m(-100, 330)[0m [31mten[0m[97m(-100, 1960)[0m [31mure[0m[97m(-100, 552)[0m [31m_years[0m[97m(-100, 74490)[0m [31m":[0m[97m(-100, 788)[0m [31m {"[0m[97m(-100, 5212)[0m [31mtype[0m[97m(-100, 1313)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31minteger[0m[97m(-100, 11662)[0m [31m",[0m[97m(-100, 497)[0m [31m "[0m[97m(-100, 330)[0m [31mdescription[0m[97m(-100, 4684)[0m [31m":[0m[97m(-100, 788)[0m [31m "[0m[97m(-100, 330)[0m [31mInvest[0m[97m(-100, 33876)[0m [31mment[0m[97m(-100, 478)[0m [31m duration[0m[97m(-100, 8090)[0m [31m in[0m[97m(-100, 304)[0m [31m years[0m[97m(-100, 1635)[0m [31m."[0m[97m(-100, 1189)[0m [31m}},[0m[97m(-100, 38154)[0m [31m "[0m[97m(-100, 330)[0m [31mrequired[0m[97m(-100, 6279)[0m [31m
|
|||
|
|
[0m[97m(-100, 532)[0m [31m</[0m[97m(-100, 522)[0m [31mtools[0m[97m(-100, 15918)[0m [31m>
|
|||
|
|
|
|||
|
|
[0m[97m(-100, 1339)[0m [31mFor[0m[97m(-100, 2461)[0m [31m each[0m[97m(-100, 1817)[0m [31m function[0m[97m(-100, 729)[0m [31m call[0m[97m(-100, 1618)[0m [31m,[0m[97m(-100, 11)[0m [31m return[0m[97m(-100, 470)[0m [31m a[0m[97m(-100, 264)[0m [31m json[0m[97m(-100, 2951)[0m [31m object[0m[97m(-100, 1633)[0m [31m with[0m[97m(-100, 448)[0m [31m function[0m[97m(-100, 729)[0m [31m name[0m[97m(-100, 829)[0m [31m and[0m[97m(-100, 323)[0m [31m arguments[0m[97m(-100, 5977)[0m [31m within[0m[97m(-100, 2878)[0m [31m [0m[97m(-100, 220)[0m [31m<tool_call>[0m[97m(-100, 151657)[0m [31m</tool_call>[0m[97m(-100, 151658)[0m [31m XML[0m[97m(-100, 11874)[0m [31m tags[0m[97m(-100, 9492)[0m [31m:
|
|||
|
|
[0m[97m(-100, 510)[0m [31m<tool_call>[0m[97m(-100, 151657)[0m [31m
|
|||
|
|
[0m[97m(-100, 198)[0m [31m{"[0m[97m(-100, 4913)[0m [31mname[0m[97m(-100, 606)[0m [31m":[0m[97m(-100, 788)[0m [31m <[0m[97m(-100, 366)[0m [31mfunction[0m[97m(-100, 1688)[0m [31m-name[0m[97m(-100, 11494)[0m [31m>,[0m[97m(-100, 8066)[0m [31m "[0m[97m(-100, 330)[0m [31marguments[0m[97m(-100, 16370)[0m [31m":[0m[97m(-100, 788)[0m [31m <[0m[97m(-100, 366)[0m [31margs[0m[97m(-100, 2116)[0m [31m-json[0m[97m(-100, 56080)[0m [31m-object[0m[97m(-100, 40432)[0m [31m>}
|
|||
|
|
[0m[97m(-100, 31296)[0m [31m</tool_call>[0m[97m(-100, 151658)[0m [31m<|im_end|>[0m[97m(-100, 151645)[0m [31m
|
|||
|
|
[0m[97m(-100, 198)[0m [31m<|im_start|>[0m[97m(-100, 151644)[0m [31muser[0m[97m(-100, 872)[0m [31m
|
|||
|
|
[0m[97m(-100, 198)[0m [31mI[0m[97m(-100, 40)[0m [31m'm[0m[97m(-100, 2776)[0m [31m [0m[97m(-100, 220)[0m [31m4[0m[97m(-100, 19)[0m [31m9[0m[97m(-100, 24)[0m [31m,[0m[97m(-100, 11)[0m [31m single[0m[97m(-100, 3175)[0m [31m with[0m[97m(-100, 448)[0m [31m one[0m[97m(-100, 825)[0m [31m dependent[0m[97m(-100, 17749)[0m [31m,[0m[97m(-100, 11)[0m [31m and[0m[97m(-100, 323)[0m [31m honestly[0m[97m(-100, 26044)[0m [31m feeling[0m[97m(-100, 8266)[0m [31m a[0m[97m(-100, 264)[0m [31m bit[0m[97m(-100, 2699)[0m [31m overwhelmed[0m[97m(-100, 42106)[0m [31m about[0m[97m(-100, 911)[0m [31m retirement[0m[97m(-100, 20950)[0m [31m planning[0m[97m(-100, 9115)[0m [31m.[0m[97m(-100, 13)[0m [31m Can[0m[97m(-100, 2980)[0m [31m someone[0m[97m(-100, 4325)[0m [31m explain[0m[97m(-100, 10339)[0m [31m N[0m[97m(-100, 451)[0m [31mPS[0m[97m(-100, 5012)[0m [31m properly[0m[97m(-100, 10277)[0m [31m —[0m[97m(-100, 1959)[0m [31m like[0m[97m(-100, 1075)[0m [31m what[0m[97m(-100, 1128)[0m [31m are[0m[97m(-100, 525)[0m [31m the[0m[97m(-100, 279)[0m [31m different[0m[97m(-100, 2155)[0m [31m tiers[0m[97m(-100, 63171)[0m [31m,[0m[97m(-100, 11)[0m [31m how[0m[97m(-100, 1246)[0m [31m exactly[0m[97m(-100, 6896)[0m [31m do[0m[97m(-100, 653)[0m [31m the[0m[97m(-100, 279)[0m [31m tax[0m[97m(-100, 3742)[0m [31m benefits[0m[97m(-100, 7567)[0m [31m work[0m[97m(-100, 975)[0m [31m under[0m[97m(-100, 1212)[0m [31m [0m[97m(-100, 220)[0m [31m8[0m[97m(-100, 23)[0m [31m0[0m[97m(-100, 15)[0m [31mC[0m[97m(-100, 34)[0m [31m and[0m[97m(-100, 323)[0m [31m [0m[97m(-100, 220)[0m [31m8[0m[97m(-100, 23)[0m [31m0[0m[97m(-100, 15)[0m [31mCC[0m[97m(-100, 3706)[0m [31mD[0m[97m(-100, 35)[0m [31m,[0m[97m(-100, 11)[0m [31m and[0m[97m(-100, 323)[0m [31m what[0m[97m(-100, 1128)[0m [31m are[0m[97m(-100, 525)[0m [31m the[0m[97m(-100, 279)[0m [31m real[0m[97m(-100, 1931)[0m [31m withdrawal[0m[97m(-100, 29736)[0m [31m rules[0m[97m(-100, 5601)[0m [31m?[0m[97m(-100, 30)[0m [31m I[0m[97m(-100, 358)[0m [31m'm[0m[97m(-100, 2776)[0m [31m a[0m[97m(-100, 264)[0m [31m pharmacist[0m[97m(-100, 89294)[0m [31m in[0m[97m(-100, 304)[0m [31m Vis[0m[97m(-100, 7656)[0m [31makh[0m[97m(-100, 21758)[0m [31mapat[0m[97m(-100, 25760)[0m [31mnam[0m[97m(-100, 12400)[0m [31m earning[0m[97m(-100, 27644)[0m [31m around[0m[97m(-100, 2163)[0m [31m [0m[97m(-100, 220)[0m [31m3[0m[97m(-100, 18)[0m [31m0[0m[97m(-100, 15)[0m [31m lak[0m[97m(-100, 93686)[0m [31mhs[0m[97m(-100, 4997)[0m [31m per[0m[97m(-100, 817)[0m [31m year[0m[97m(-100, 1042)[0m [31m and[0m[97m(-100, 323)[0m [31m just[0m[97m(-100, 1101)[0m [31m trying[0m[97m(-100, 4460)[0m [31m to[0m[97m(-100, 311)[0m [31m figure[0m[97m(-100, 7071)[0m [31m out[0m[97m(-100, 700)[0m [31m if[0m[97m(-100, 421)[0m [31m this[0m[97m(-100, 419)[0m [31m is[0m[97m(-100, 374)[0m [31m a[0m[97m(-100, 264)[0m [31m good[0m[97m(-100, 1661)[0m [31m option[0m[97m(-100, 2999)[0m [31m for[0m[97m(-100, 369)[0m [31m me[0m[97m(-100, 752)[0m [31m at[0m[97m(-100, 518)[0m [31m my[0m[97m(-100, 847)[0m [31m age[0m[97m(-100, 4231)[0m [31m.[0m[97m(-100, 13)[0m [31m<|im_end|>[0m[97m(-100, 151645)[0m [31m
|
|||
|
|
[0m[97m(-100, 198)[0m [31m<|im_start|>[0m[97m(-100, 151644)[0m [31massistant[0m[97m(-100, 77091)[0m [31m
|
|||
|
|
[0m[97m(-100, 198)[0m [31m<think>[0m[97m(-100, 151667)[0m [32m
|
|||
|
|
[0m[97m(198, 198)[0m [32mThe[0m[97m(785, 785)[0m [32m user[0m[97m(1196, 1196)[0m [32m wants[0m[97m(6801, 6801)[0m [32m a[0m[97m(264, 264)[0m [32m conceptual[0m[97m(43801, 43801)[0m [32m overview[0m[97m(23251, 23251)[0m [32m of[0m[97m(315, 315)[0m [32m N[0m[97m(451, 451)[0m [32mPS[0m[97m(5012, 5012)[0m [32m focusing[0m[97m(21080, 21080)[0m [32m on[0m[97m(389, 389)[0m [32m tiers[0m[97m(63171, 63171)[0m [32m,[0m[97m(11, 11)[0m [32m tax[0m[97m(3742, 3742)[0m [32m benefits[0m[97m(7567, 7567)[0m [32m ([0m[97m(320, 320)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mC[0m[97m(34, 34)[0m [32m,[0m[97m(11, 11)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mCC[0m[97m(3706, 3706)[0m [32mD[0m[97m(35, 35)[0m [32m),[0m[97m(701, 701)[0m [32m and[0m[97m(323, 323)[0m [32m withdrawal[0m[97m(29736, 29736)[0m [32m rules[0m[97m(5601, 5601)[0m [32m.[0m[97m(13, 13)[0m [32m I[0m[97m(358, 358)[0m [32m need[0m[97m(1184, 1184)[0m [32m to[0m[97m(311, 311)[0m [32m clarify[0m[97m(37163, 37163)[0m [32m that[0m[97m(429, 429)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mCC[0m[97m(3706, 3706)[0m [32mD[0m[97m(35, 35)[0m [32m([0m[97m(7, 7)[0m [32m1[0m[97m(16, 16)[0m [32m)[0m[97m(8, 8)[0m [32m is[0m[97m(374, 374)[0m [32m part[0m[97m(949, 949)[0m [32m of[0m[97m(315, 315)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mC[0m[97m(34, 34)[0m [32m limit[0m[97m(3930, 3930)[0m [32m,[0m[97m(11, 11)[0m [32m and[0m[97m(323, 323)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mCC[0m[97m(3706, 3706)[0m [32mD[0m[97m(35, 35)[0m [32m([0m[97m(7, 7)[0m [32m1[0m[97m(16, 16)[0m [32mB[0m[97m(33, 33)[0m [32m)[0m[97m(8, 8)[0m [32m is[0m[97m(374, 374)[0m [32m extra[0m[97m(4960, 4960)[0m [32m ₹[0m[97m(89791, 89791)[0m [32m5[0m[97m(20, 20)[0m [32m0[0m[97m(15, 15)[0m [32mk[0m[97m(74, 74)[0m [32m.[0m[97m(13, 13)[0m [32m Employer[0m[97m(74852, 74852)[0m [32m contribution[0m[97m(18527, 18527)[0m [32m ([0m[97m(320, 320)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mCC[0m[97m(3706, 3706)[0m [32mD[0m[97m(35, 35)[0m [32m([0m[97m(7, 7)[0m [32m2[0m[97m(17, 17)[0m [32m))[0m[97m(593, 593)[0m [32m is[0m[97m(374, 374)[0m [32m separate[0m[97m(8651, 8651)[0m [32m.[0m[97m(13, 13)[0m [32m Withdraw[0m[97m(67381, 67381)[0m [32mal[0m[97m(278, 278)[0m [32m rules[0m[97m(5601, 5601)[0m [32m:[0m[97m(25, 25)[0m [32m at[0m[97m(518, 518)[0m [32m [0m[97m(220, 220)[0m [32m6[0m[97m(21, 21)[0m [32m0[0m[97m(15, 15)[0m [32m,[0m[97m(11, 11)[0m [32m [0m[97m(220, 220)[0m [32m6[0m[97m(21, 21)[0m [32m0[0m[97m(15, 15)[0m [32m%[0m[97m(4, 4)[0m [32m tax[0m[97m(3742, 3742)[0m [32m-free[0m[97m(12577, 12577)[0m [32m lump[0m[97m(48529, 48529)[0m [32m sum[0m[97m(2629, 2629)[0m [32m +[0m[97m(488, 488)[0m [32m [0m[97m(220, 220)[0m [32m4[0m[97m(19, 19)[0m [32m0[0m[97m(15, 15)[0m [32m%[0m[97m(4, 4)[0m [32m ann[0m[97m(2935, 2935)[0m [32muity[0m[97m(34494, 34494)[0m [32m;[0m[97m(26, 26)[0m [32m partial[0m[97m(7130, 7130)[0m [32m withdrawals[0m[97m(85204, 85204)[0m [32m after[0m[97m(1283, 1283)[0m [32m [0m[97m(220, 220)[0m [32m3[0m[97m(18, 18)[0m [32m years[0m[97m(1635, 1635)[0m [32m;[0m[97m(26, 26)[0m [32m early[0m[97m(4124, 4124)[0m [32m exit[0m[97m(4869, 4869)[0m [32m after[0m[97m(1283, 1283)[0m [32m [0m[97m(220, 220)[0m [32m1[0m[97m(16, 16)[0m [32m0[0m[97m(15, 15)[0m [32m years[0m[97m(1635, 1635)[0m [32m.[0m[97m(13, 13)[0m [32m Also[0m[97m(7281, 7281)[0m [32m mention[0m[97m(6286, 6286)[0m [32m that[0m[97m(429, 429)[0m [32m tax[0m[97m(3742, 3742)[0m [32m benefits[0m[97m(7567, 7567)
|
|||
|
|
[0m[97m(624, 624)[0m [32m</think>[0m[97m(151668, 151668)[0m [32m
|
|||
|
|
|
|||
|
|
[0m[97m(271, 271)[0m [32mThe[0m[97m(785, 785)[0m [32m National[0m[97m(5055, 5055)[0m [32m Pension[0m[97m(80457, 80457)[0m [32m System[0m[97m(739, 739)[0m [32m ([0m[97m(320, 320)[0m [32mN[0m[97m(45, 45)[0m [32mPS[0m[97m(5012, 5012)[0m [32m)[0m[97m(8, 8)[0m [32m is[0m[97m(374, 374)[0m [32m a[0m[97m(264, 264)[0m [32m voluntary[0m[97m(35979, 35979)[0m [32m retirement[0m[97m(20950, 20950)[0m [32m savings[0m[97m(18986, 18986)[0m [32m scheme[0m[97m(12859, 12859)[0m [32m regulated[0m[97m(34219, 34219)[0m [32m by[0m[97m(553, 553)[0m [32m P[0m[97m(393, 393)[0m [32mFR[0m[97m(10504, 10504)[0m [32mDA[0m[97m(6352, 6352)[0m [32m.[0m[97m(13, 13)[0m [32m It[0m[97m(1084, 1084)[0m [32m has[0m[97m(702, 702)[0m [32m two[0m[97m(1378, 1378)[0m [32m tiers[0m[97m(63171, 63171)[0m [32m:[0m[97m(25, 25)[0m [32m Tier[0m[97m(35232, 35232)[0m [32m I[0m[97m(358, 358)[0m [32m ([0m[97m(320, 320)[0m [32mmandatory[0m[97m(80116, 80116)[0m [32m for[0m[97m(369, 369)[0m [32m tax[0m[97m(3742, 3742)[0m [32m benefits[0m[97m(7567, 7567)[0m [32m,[0m[97m(11, 11)[0m [32m lock[0m[97m(5296, 5296)[0m [32m-in[0m[97m(3419, 3419)[0m [32m till[0m[97m(11956, 11956)[0m [32m age[0m[97m(4231, 4231)[0m [32m [0m[97m(220, 220)[0m [32m6[0m[97m(21, 21)[0m [32m0[0m[97m(15, 15)[0m [32m)[0m[97m(8, 8)[0m [32m and[0m[97m(323, 323)[0m [32m Tier[0m[97m(35232, 35232)[0m [32m II[0m[97m(7946, 7946)[0m [32m ([0m[97m(320, 320)[0m [32mvol[0m[97m(12536, 12536)[0m [32muntary[0m[97m(58956, 58956)[0m [32m,[0m[97m(11, 11)[0m [32m no[0m[97m(902, 902)[0m [32m lock[0m[97m(5296, 5296)[0m [32m-in[0m[97m(3419, 3419)[0m [32m,[0m[97m(11, 11)[0m [32m no[0m[97m(902, 902)[0m [32m tax[0m[97m(3742, 3742)[0m [32m benefits[0m[97m(7567, 7567)[0m [32m).[0m[97m(568, 568)[0m [32m For[0m[97m(1752, 1752)[0m [32m sal[0m[97m(4274, 4274)[0m [32mar[0m[97m(277, 277)[0m [32mied[0m[97m(1122, 1122)[0m [32m individuals[0m[97m(7775, 7775)[0m [32m,[0m[97m(11, 11)[0m [32m your[0m[97m(697, 697)[0m [32m own[0m[97m(1828, 1828)[0m [32m Tier[0m[97m(35232, 35232)[0m [32m I[0m[97m(358, 358)[0m [32m contribution[0m[97m(18527, 18527)[0m [32m up[0m[97m(705, 705)[0m [32m to[0m[97m(311, 311)[0m [32m [0m[97m(220, 220)[0m [32m1[0m[97m(16, 16)[0m [32m0[0m[97m(15, 15)[0m [32m%[0m[97m(4, 4)[0m [32m of[0m[97m(315, 315)[0m [32m salary[0m[97m(16107, 16107)[0m [32m qualifies[0m[97m(79907, 79907)[0m [32m under[0m[97m(1212, 1212)[0m [32m Section[0m[97m(11113, 11113)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mCC[0m[97m(3706, 3706)[0m [32mD[0m[97m(35, 35)[0m [32m([0m[97m(7, 7)[0m [32m1[0m[97m(16, 16)[0m [32m)[0m[97m(8, 8)[0m [32m within[0m[97m(2878, 2878)[0m [32m the[0m[97m(279, 279)[0m [32m overall[0m[97m(8084, 8084)[0m [32m ₹[0m[97m(89791, 89791)[0m [32m1[0m[97m(16, 16)[0m [32m.[0m[97m(13, 13)[0m [32m5[0m[97m(20, 20)[0m [32m lakh[0m[97m(62173, 62173)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mC[0m[97m(34, 34)[0m [32m limit[0m[97m(3930, 3930)[0m [32m,[0m[97m(11, 11)[0m [32m and[0m[97m(323, 323)[0m [32m an[0m[97m(458, 458)[0m [32m additional[0m[97m(5107, 5107)[0m [32m ₹[0m[97m(89791, 89791)[0m [32m5[0m[97m(20, 20)[0m [32m0[0m[97m(15, 15)[0m [32m,[0m[97m(11, 11)[0m [32m0[0m[97m(15, 15)[0m [32m0[0m[97m(15, 15)[0m [32m0[0m[97m(15, 15)[0m [32m under[0m[97m(1212, 1212)[0m [32m [0m[97m(220, 220)[0m [32m8[0m[97m(23, 23)[0m [32m0[0m[97m(15, 15)[0m [32mCC[0m[97m(3706, 3706)[0m [32mD[0m[97m(35, 35)[0m [32m([0m[97m(7, 7)[0m [32m1[0m[97m(16, 16)[0m [32mB[0m[97m(33, 33)[0m [32m)[0m[97m(8, 8)[0m [32m ([0m[97m(320, 320)[0m [32mover[0m[97m(1975, 1975)[0m [32m and[0m[97m(323, 323)[0m [32m above[0m[97m(3403
|
|||
|
|
[0m[97m(-100, 198)[0m
|
|||
|
|
[2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:45] [PID:25326]
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
[2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:48] [PID:25326] Total input len: 946
|
|||
|
|
[2026-05-14 13:56:38,574] [INFO] [axolotl.utils.tokenization.check_example_labels:49] [PID:25326] Count of labels: 506
|
|||
|
|
[2026-05-14 13:56:38,574] [INFO] [axolotl.common.datasets.load_datasets:90] [PID:25326] printing prompters...
|
|||
|
|
[2026-05-14 13:56:38,574] [INFO] [axolotl.common.datasets.load_datasets:92] [PID:25326] Pre-tokenized or custom dataset types are unsupported for logging
|
|||
|
|
[2026-05-14 13:56:38,968] [INFO] [axolotl.cli.preprocess.do_preprocess:92] [PID:25326] [32mSuccess! Preprocessed data path: `dataset_prepared_path: last_run_prepared`[39m
|
|||
|
|
[2026-05-14 13:57:44,324] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:26066] bf16 support detected, enabling for this configuration.
|
|||
|
|
[2026-05-14 13:57:44,465] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:26066] baseline 0.000GB ()
|
|||
|
|
[2026-05-14 13:57:44,466] [INFO] [axolotl.cli.config.load_cfg:333] [PID:26066] config:
|
|||
|
|
{
|
|||
|
|
"activation_offloading": false,
|
|||
|
|
"adapter": "qlora",
|
|||
|
|
"attn_implementation": "flash_attention_2",
|
|||
|
|
"attn_needs_dtype_cast": true,
|
|||
|
|
"attn_supports_packing": true,
|
|||
|
|
"attn_uses_flash_lib": true,
|
|||
|
|
"auto_resume_from_checkpoints": true,
|
|||
|
|
"axolotl_config_path": "./data/config.yaml",
|
|||
|
|
"base_model": "Qwen/Qwen3-8B",
|
|||
|
|
"base_model_config": "Qwen/Qwen3-8B",
|
|||
|
|
"batch_size": 80,
|
|||
|
|
"bf16": true,
|
|||
|
|
"capabilities": {
|
|||
|
|
"bf16": true,
|
|||
|
|
"compute_capability": "sm_80",
|
|||
|
|
"fp8": false,
|
|||
|
|
"n_gpu": 1,
|
|||
|
|
"n_node": 1,
|
|||
|
|
"tf32": true
|
|||
|
|
},
|
|||
|
|
"chat_template": "qwen3",
|
|||
|
|
"context_parallel_size": 1,
|
|||
|
|
"cut_cross_entropy": true,
|
|||
|
|
"dataloader_num_workers": 1,
|
|||
|
|
"dataloader_pin_memory": true,
|
|||
|
|
"dataloader_prefetch_factor": 256,
|
|||
|
|
"dataset_num_proc": 1,
|
|||
|
|
"dataset_prepared_path": "last_run_prepared",
|
|||
|
|
"datasets": [
|
|||
|
|
{
|
|||
|
|
"chat_template": "tokenizer_default",
|
|||
|
|
"field_messages": "messages",
|
|||
|
|
"field_tools": "tools",
|
|||
|
|
"message_property_mappings": {
|
|||
|
|
"content": "content",
|
|||
|
|
"role": "role"
|
|||
|
|
},
|
|||
|
|
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
|
|||
|
|
"roles_to_train": [
|
|||
|
|
"assistant"
|
|||
|
|
],
|
|||
|
|
"train_on_eos": "turn",
|
|||
|
|
"trust_remote_code": false,
|
|||
|
|
"type": "chat_template"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"ddp": false,
|
|||
|
|
"device": "cuda:0",
|
|||
|
|
"dion_rank_fraction": 1.0,
|
|||
|
|
"dion_rank_multiple_of": 1,
|
|||
|
|
"eaft_alpha": 1.0,
|
|||
|
|
"eaft_k": 20,
|
|||
|
|
"env_capabilities": {
|
|||
|
|
"torch_version": "2.9.1"
|
|||
|
|
},
|
|||
|
|
"eval_batch_size": 10,
|
|||
|
|
"eval_causal_lm_metrics": [
|
|||
|
|
"sacrebleu",
|
|||
|
|
"comet",
|
|||
|
|
"ter",
|
|||
|
|
"chrf"
|
|||
|
|
],
|
|||
|
|
"eval_max_new_tokens": 128,
|
|||
|
|
"eval_sample_packing": true,
|
|||
|
|
"eval_table_size": 0,
|
|||
|
|
"experimental_skip_move_to_device": true,
|
|||
|
|
"fp16": false,
|
|||
|
|
"generate_samples": false,
|
|||
|
|
"generation_do_sample": true,
|
|||
|
|
"generation_max_new_tokens": 50,
|
|||
|
|
"generation_prompt_ratio": 0.5,
|
|||
|
|
"generation_temperature": 0.7,
|
|||
|
|
"gradient_accumulation_steps": 8,
|
|||
|
|
"gradient_checkpointing": true,
|
|||
|
|
"gradient_checkpointing_kwargs": {
|
|||
|
|
"use_reentrant": false
|
|||
|
|
},
|
|||
|
|
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
|
|||
|
|
"hub_strategy": "all_checkpoints",
|
|||
|
|
"include_tkps": true,
|
|||
|
|
"layer_offloading": false,
|
|||
|
|
"learning_rate": 2e-05,
|
|||
|
|
"lisa_layers_attribute": "model.layers",
|
|||
|
|
"load_best_model_at_end": false,
|
|||
|
|
"load_in_4bit": true,
|
|||
|
|
"load_in_8bit": false,
|
|||
|
|
"local_rank": 0,
|
|||
|
|
"logging_steps": 10,
|
|||
|
|
"lora_alpha": 64,
|
|||
|
|
"lora_dropout": 0.05,
|
|||
|
|
"lora_mlp_kernel": true,
|
|||
|
|
"lora_o_kernel": true,
|
|||
|
|
"lora_qkv_kernel": true,
|
|||
|
|
"lora_r": 32,
|
|||
|
|
"lora_target_modules": [
|
|||
|
|
"q_proj",
|
|||
|
|
"k_proj",
|
|||
|
|
"v_proj",
|
|||
|
|
"o_proj",
|
|||
|
|
"gate_proj",
|
|||
|
|
"down_proj",
|
|||
|
|
"up_proj"
|
|||
|
|
],
|
|||
|
|
"loraplus_lr_embedding": 1e-06,
|
|||
|
|
"loss_watchdog_patience": 3,
|
|||
|
|
"loss_watchdog_threshold": 5.0,
|
|||
|
|
"lr_scheduler": "cosine",
|
|||
|
|
"max_grad_norm": 1.0,
|
|||
|
|
"mean_resizing_embeddings": false,
|
|||
|
|
"merge_method": "memory_efficient",
|
|||
|
|
"micro_batch_size": 10,
|
|||
|
|
"model_config_type": "qwen3",
|
|||
|
|
"num_epochs": 2.0,
|
|||
|
|
"num_generation_samples": 3,
|
|||
|
|
"optimizer": "adamw_torch_4bit",
|
|||
|
|
"otel_metrics_host": "localhost",
|
|||
|
|
"otel_metrics_port": 8000,
|
|||
|
|
"output_dir": "./outputs/finance-synthetic-sft-phase2",
|
|||
|
|
"pad_to_sequence_len": true,
|
|||
|
|
"plugins": [
|
|||
|
|
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
|
|||
|
|
],
|
|||
|
|
"pretrain_multipack_attn": true,
|
|||
|
|
"profiler_steps_start": 0,
|
|||
|
|
"qlora_sharded_model_loading": false,
|
|||
|
|
"quantize_moe_experts": false,
|
|||
|
|
"ray_num_workers": 1,
|
|||
|
|
"relora_prune_method": "magnitude",
|
|||
|
|
"resources_per_worker": {
|
|||
|
|
"GPU": 1
|
|||
|
|
},
|
|||
|
|
"sample_packing": true,
|
|||
|
|
"sample_packing_bin_size": 200,
|
|||
|
|
"sample_packing_group_size": 100000,
|
|||
|
|
"save_only_model": false,
|
|||
|
|
"save_safetensors": true,
|
|||
|
|
"save_steps": 0.16666666666666666,
|
|||
|
|
"save_total_limit": 3,
|
|||
|
|
"saves_per_epoch": 3,
|
|||
|
|
"seed": 42,
|
|||
|
|
"sequence_len": 8192,
|
|||
|
|
"shuffle_before_merging_datasets": false,
|
|||
|
|
"shuffle_merged_datasets": true,
|
|||
|
|
"skip_prepare_dataset": false,
|
|||
|
|
"streaming_multipack_buffer_size": 10000,
|
|||
|
|
"strict": false,
|
|||
|
|
"tensor_parallel_size": 1,
|
|||
|
|
"tf32": true,
|
|||
|
|
"tiled_mlp_use_original_mlp": true,
|
|||
|
|
"tokenizer_config": "Qwen/Qwen3-8B",
|
|||
|
|
"tokenizer_save_jinja_files": true,
|
|||
|
|
"torch_dtype": "torch.bfloat16",
|
|||
|
|
"train_on_inputs": false,
|
|||
|
|
"trl": {
|
|||
|
|
"async_prefetch": false,
|
|||
|
|
"log_completions": false,
|
|||
|
|
"mask_truncated_completions": false,
|
|||
|
|
"ref_model_mixup_alpha": 0.9,
|
|||
|
|
"ref_model_sync_steps": 64,
|
|||
|
|
"replay_buffer_size": 0,
|
|||
|
|
"replay_recompute_logps": true,
|
|||
|
|
"reroll_max_groups": 1,
|
|||
|
|
"reroll_start_fraction": 1.0,
|
|||
|
|
"reward_num_workers": 1,
|
|||
|
|
"scale_rewards": true,
|
|||
|
|
"skip_zero_advantage_batches": true,
|
|||
|
|
"sync_ref_model": false,
|
|||
|
|
"use_data_producer": false,
|
|||
|
|
"use_vllm": false,
|
|||
|
|
"vllm_lora_sync": false,
|
|||
|
|
"vllm_server_host": "0.0.0.0",
|
|||
|
|
"vllm_server_port": 8000
|
|||
|
|
},
|
|||
|
|
"use_otel_metrics": false,
|
|||
|
|
"use_ray": false,
|
|||
|
|
"val_set_size": 0.0,
|
|||
|
|
"vllm": {
|
|||
|
|
"device": "auto",
|
|||
|
|
"dtype": "auto",
|
|||
|
|
"gpu_memory_utilization": 0.9,
|
|||
|
|
"host": "0.0.0.0",
|
|||
|
|
"port": 8000
|
|||
|
|
},
|
|||
|
|
"warmup_ratio": 0.05,
|
|||
|
|
"weight_decay": 0.01,
|
|||
|
|
"world_size": 1
|
|||
|
|
}
|
|||
|
|
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26066] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26066] BOS: None / None
|
|||
|
|
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26066] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 13:57:45,656] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26066] UNK: None / None
|
|||
|
|
[2026-05-14 13:57:45,657] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:477] [PID:26066] Loading prepared dataset from disk at last_run_prepared/2c3cb7e1625c800787dbdc29d010a51d...
|
|||
|
|
[2026-05-14 13:57:45,727] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:26066] total_num_tokens: 23_382_259
|
|||
|
|
[2026-05-14 13:57:45,926] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:26066] `total_supervised_tokens: 11_016_035`
|
|||
|
|
[2026-05-14 13:57:46,079] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:57:47,437] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:57:47,736] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.30095791816711426
|
|||
|
|
[2026-05-14 13:57:47,738] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:57:48,034] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.29753828048706055
|
|||
|
|
[2026-05-14 13:57:48,036] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:57:48,309] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.27533483505249023
|
|||
|
|
[2026-05-14 13:57:48,312] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:57:48,612] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.3027362823486328
|
|||
|
|
[2026-05-14 13:57:48,657] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26066] gather_len_batches: [287]
|
|||
|
|
[2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:26066] data_loader_len: 35
|
|||
|
|
[2026-05-14 13:57:48,658] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:26066] sample_packing_eff_est across ranks: [0.9945225306919643]
|
|||
|
|
[2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:26066] sample_packing_eff_est: 1.0
|
|||
|
|
[2026-05-14 13:57:48,658] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:26066] total_num_steps: 70
|
|||
|
|
[2026-05-14 13:57:48,658] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:26066] Maximum number of steps set at 70
|
|||
|
|
[2026-05-14 13:57:48,706] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:26066] loading tokenizer... Qwen/Qwen3-8B
|
|||
|
|
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26066] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26066] BOS: None / None
|
|||
|
|
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26066] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26066] UNK: None / None
|
|||
|
|
[2026-05-14 13:57:49,701] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:26066] Loading model
|
|||
|
|
[2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:26066] Patched OptimState8bit for torch.compile compatibility
|
|||
|
|
[2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:26066] Patched OptimState4bit for torch.compile compatibility
|
|||
|
|
[2026-05-14 13:57:49,820] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:26066] Patched OptimStateFp8 for torch.compile compatibility
|
|||
|
|
[2026-05-14 13:57:49,826] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:26066] Patched Trainer.evaluation_loop with nanmean loss calculation
|
|||
|
|
[2026-05-14 13:57:49,827] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:26066] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
|
|||
|
|
[2026-05-14 13:57:49,830] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:26066] Applying multipack dataloader patch for sample packing...
|
|||
|
|
[2026-05-14 13:57:49,830] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:26066] Cannot patch self-attention - requires no dropout
|
|||
|
|
[2026-05-14 13:57:49,864] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:26066] Applying Cut Cross Entropy to model type: qwen3
|
|||
|
|
model.safetensors.index.json: 0.00B [00:00, ?B/s]
model.safetensors.index.json: 32.9kB [00:00, 46.7MB/s]
|
|||
|
|
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
|
|||
|
|
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s][A
Downloading (incomplete total...): 0%| | 0.00/3.99G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/7.95G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/11.9G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/15.1G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 0.00/16.4G [00:00<?, ?B/s]
Downloading (incomplete total...): 0%| | 143k/16.4G [00:01<6:20:01, 718kB/s]
Downloading (incomplete total...): 0%| | 19.2M/16.4G [00:09<2:08:37, 2.12MB/s]
Downloading (incomplete total...): 0%| | 44.3M/16.4G [00:12<1:05:18, 4.17MB/s]
Downloading (incomplete total...): 0%|▏ | 71.8M/16.4G [00:14<39:14, 6.93MB/s]
Downloading (incomplete total...): 1%|▏ | 105M/16.4G [00:18<37:09, 7.30MB/s]
Downloading (incomplete total...): 1%|▏ | 105M/16.4G [00:29<37:09, 7.30MB/s]
Downloading (incomplete total...): 1%|▍ | 171M/16.4G [00:31<46:24, 5.82MB/s]
Downloading (incomplete total...): 1%|▌ | 238M/16.4G [00:40<40:41, 6.61MB/s]
Downloading (incomplete total...): 2%|▋ | 305M/16.4G [00:41<26:04, 10.3MB/s]
Downloading (incomplete total...): 2%|▊ | 373M/16.4G [00:45<23:02, 11.6MB/s]
Downloading (incomplete total...): 3%|█ | 494M/16.4G [00:50<16:35, 16.0MB/s]
Downloading (incomplete total...): 3%|█▏ | 561M/16.4G [00:50<12:11, 21.6MB/s]
Downloading (incomplete total...): 4%|█▍ | 628M/16.4G [00:56<15:25, 17.0MB/s]
Downloading (incomplete total...): 5%|█▉ | 897M/16.4G [01:00<08:17, 31.1MB/s]
Downloading (incomplete total...): 7%|██▎ | 1.10G/16.4G [01:00<05:07, 49.6MB/s]
Downloading (incomplete total...): 10%|███▌ | 1.63G/16.4G [01:01<02:06, 116MB/s]
Downloading (incomplete total...): 12%|████▍ | 2.04G/16.4G [01:01<01:17, 185MB/s]
Downloading (incomplete total...): 14%|████▉ | 2.24G/16.4G [01:02<01:12, 196MB/s]
Downloading (incomplete total...): 19%|██████▊ | 3.11G/16.4G [01:02<00:30, 436MB/s]
Downloading (incomplete total...): 21%|███████▋ | 3.52G/16.4G [01:03<00:26, 483MB/s]
Downloading (incomplete total...): 26%|█████████▍ | 4.32G/16.4G [01:05<00:25, 465MB/s]
Downloading (incomplete total...): 26%|█████████▍ | 4.32G/16.4G [01:05<00:26, 453MB/s]
Downloading (incomplete total...): 28%|██████████▏ | 4.66G/16.4G [01:05<00:24, 476MB/s]
Downloading (incomplete total...): 32%|███████████▍ | 5.19G/16.4G [01:06<00:19, 584MB/s]
Downloading (incomplete total...): 38%|█████████████▋ | 6.25G/16.4G [01:06<00:12, 835MB/s]
Downloading (incomplete total...): 38%|█
|
|||
|
|
Fetching 5 files: 20%|███████████▊ | 1/5 [01:11<04:45, 71.45s/it][A
Downloading (incomplete total...): 85%|█████████████████████████████▋ | 13.9G/16.4G [01:11<00:00, 2.88GB/s]
Downloading (incomplete total...): 89%|███████████████████████████████▏ | 14.6G/16.4G [01:11<00:00, 3.15GB/s]
Downloading (incomplete total...): 95%|█████████████████████████████████ | 15.5G/16.4G [01:12<00:00, 3.40GB/s]
Downloading (incomplete total...): 100%|██████████████████████████████████▉| 16.4G/16.4G [01:12<00:00, 2.91GB/s]
|
|||
|
|
Fetching 5 files: 40%|███████████████████████▌ | 2/5 [01:12<01:30, 30.12s/it][A
Fetching 5 files: 100%|███████████████████████████████████████████████████████████| 5/5 [01:12<00:00, 14.53s/it]
|
|||
|
|
Download complete: 100%|███████████████████████████████████████████████████| 16.4G/16.4G [01:12<00:00, 2.91GB/s]
Download complete: 100%|████████████████████████████████████████████████████| 16.4G/16.4G [01:12<00:00, 225MB/s]
|
|||
|
|
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 0%|▏ | 1/399 [00:00<03:25, 1.94it/s]
Loading weights: 1%|▎ | 2/399 [00:00<03:13, 2.06it/s]
Loading weights: 2%|▊ | 6/399 [00:01<00:51, 7.59it/s]
Loading weights: 7%|███▋ | 26/399 [00:01<00:09, 40.43it/s]
Loading weights: 10%|█████▌ | 39/399 [00:01<00:06, 57.29it/s]
Loading weights: 15%|████████▌ | 60/399 [00:01<00:03, 87.30it/s]
Loading weights: 20%|██████████▉ | 78/399 [00:01<00:02, 107.71it/s]
Loading weights: 23%|█████████████ | 93/399 [00:01<00:02, 118.03it/s]
Loading weights: 28%|███████████████▎ | 111/399 [00:01<00:02, 133.33it/s]
Loading weights: 32%|█████████████████▊ | 127/399 [00:02<00:03, 83.34it/s]
Loading weights: 37%|████████████████████▎ | 147/399 [00:02<00:02, 104.23it/s]
Loading weights: 41%|██████████████████████▎ | 162/399 [00:02<00:02, 112.28it/s]
Loading weights: 45%|████████████████████████▉ | 181/399 [00:02<00:01, 125.40it/s]
Loading weights: 49%|███████████████████████████ | 196/399 [00:02<00:01, 110.94it/s]
Loading weights: 52%|█████████████████████████████▎ | 209/399 [00:02<00:02, 89.79it/s]
Loading weights: 55%|██████████████████████████████▉ | 220/399 [00:03<00:02, 71.86it/s]
Loading weights: 58%|████████████████████████████████▋ | 233/399 [00:03<00:02, 82.21it/s]
Loading weights: 62%|██████████████████████████████████▋ | 247/399 [00:03<00:01, 92.76it/s]
Loading weights: 66%|████████████████████████████████████▌ | 265/399 [00:03<00:01, 111.25it/s]
Loading weights: 70%|██████████████████████████████████████▌ | 280/399 [00:03<00:01, 117.73it/s]
Loading weights: 74%|████████████████████████████████████████▌ | 294/399 [00:03<00:00, 121.58it/s]
Loading weights: 77%|██████████████████████████████████████████▍ | 308/399 [00:03<00:00, 116.04it/s]
Loading weights: 80%|████████████████████████████████████████████▏ | 321/399 [00:03<00:00, 118.64it/s]
Loading weights: 84%|██████████████████████████████████████████████▏ | 335/399 [00:03<00:00, 124.20it/s]
Loading weights: 89%|████████████████████████████████████████████<E29688>
|
|||
|
|
generation_config.json: 0%| | 0.00/239 [00:00<?, ?B/s]
generation_config.json: 100%|██████████████████████████████████████████████████| 239/239 [00:00<00:00, 2.17MB/s]
|
|||
|
|
[2026-05-14 13:59:09,177] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:26066] converting PEFT model w/ prepare_model_for_kbit_training
|
|||
|
|
[2026-05-14 13:59:09,183] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:26066] Converting modules to torch.bfloat16
|
|||
|
|
[2026-05-14 13:59:09,188] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26066] Memory usage after model load 9.148GB (+9.148GB allocated, +10.395GB reserved)
|
|||
|
|
trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545
|
|||
|
|
[2026-05-14 13:59:10,518] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26066] after adapters 5.997GB (+5.997GB allocated, +10.559GB reserved)
|
|||
|
|
[2026-05-14 13:59:11,364] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:478] [PID:26066] LoRA kernels: dropout=0.05 enabled
|
|||
|
|
[2026-05-14 13:59:13,738] [INFO] [axolotl.train.save_initial_configs:450] [PID:26066] Pre-saving adapter config to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 13:59:13,738] [INFO] [axolotl.train.save_initial_configs:454] [PID:26066] Pre-saving tokenizer to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 13:59:13,833] [INFO] [axolotl.train.save_initial_configs:459] [PID:26066] Pre-saving model config to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 13:59:13,837] [INFO] [axolotl.train.execute_training:226] [PID:26066] Starting trainer...
|
|||
|
|
[2026-05-14 13:59:14,333] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:59:14,638] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:59:14,942] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.3063161373138428
|
|||
|
|
[2026-05-14 13:59:14,944] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:59:15,260] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.31746554374694824
|
|||
|
|
[2026-05-14 13:59:15,262] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:59:15,571] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.31086015701293945
|
|||
|
|
[2026-05-14 13:59:15,573] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:59:15,871] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26066] generate_batches time: 0.2998006343841553
|
|||
|
|
[2026-05-14 13:59:15,871] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26066] gather_len_batches: [287]
|
|||
|
|
0%| | 0/70 [00:00<?, ?it/s][2026-05-14 13:59:15,981] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26066] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 13:59:26,543] [ERROR] [axolotl.telemetry.errors.wrapper:158] [PID:26066] Error captured in telemetry. Run ID: 29354a0a-cd9e-4fdd-aae5-2bd9658fd326
|
|||
|
|
Traceback (most recent call last):
|
|||
|
|
File "<frozen runpy>", line 198, in _run_module_as_main
|
|||
|
|
File "<frozen runpy>", line 88, in _run_code
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/train.py", line 145, in <module>
|
|||
|
|
fire.Fire(do_cli)
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 135, in Fire
|
|||
|
|
component_trace = _Fire(component, args, parsed_flag_args, context, name)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire
|
|||
|
|
component, remaining_args = _CallAndUpdateTrace(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
|
|||
|
|
component = fn(*varargs, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/train.py", line 96, in do_cli
|
|||
|
|
do_train(parsed_cfg, parsed_cli_args)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/cli/train.py", line 50, in do_train
|
|||
|
|
model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
|
|||
|
|
return func(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/train.py", line 628, in train
|
|||
|
|
execute_training(cfg, trainer, resume_from_checkpoint)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/train.py", line 227, in execute_training
|
|||
|
|
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1425, in train
|
|||
|
|
return inner_training_loop(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1507, in _inner_training_loop
|
|||
|
|
self._run_epoch(
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1735, in _run_epoch
|
|||
|
|
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/core/trainers/mixins/layer_offloading.py", line 304, in training_step
|
|||
|
|
return super().training_step(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/core/trainers/mixins/activation_checkpointing.py", line 65, in training_step
|
|||
|
|
return super().training_step(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1907, in training_step
|
|||
|
|
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/core/trainers/base.py", line 456, in compute_loss
|
|||
|
|
return super().compute_loss(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1979, in compute_loss
|
|||
|
|
outputs = model(**inputs)
|
|||
|
|
^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
|
|||
|
|
return self._call_impl(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
|
|||
|
|
return forward_call(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 823, in forward
|
|||
|
|
return model_forward(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/accelerate/utils/operations.py", line 811, in __call__
|
|||
|
|
return convert_to_fp32(self.model_forward(*args, **kwargs))
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
|
|||
|
|
return func(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/peft_model.py", line 1993, in forward
|
|||
|
|
return self.base_model(
|
|||
|
|
^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
|
|||
|
|
return self._call_impl(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
|
|||
|
|
return forward_call(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 330, in forward
|
|||
|
|
return self.model.forward(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/cut_cross_entropy/transformers/llama.py", line 53, in cce_forward
|
|||
|
|
outputs: BaseModelOutputWithPast = self.model(
|
|||
|
|
^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
|
|||
|
|
return self._call_impl(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
|
|||
|
|
return forward_call(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 952, in wrapper
|
|||
|
|
output = func(self, *args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/utils/output_capturing.py", line 248, in wrapper
|
|||
|
|
outputs = func(self, *args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 424, in forward
|
|||
|
|
hidden_states = decoder_layer(
|
|||
|
|
^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/modeling_layers.py", line 92, in __call__
|
|||
|
|
return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/_compile.py", line 53, in inner
|
|||
|
|
return disable_fn(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
|
|||
|
|
return fn(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint
|
|||
|
|
ret = function(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
|
|||
|
|
return self._call_impl(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
|
|||
|
|
return forward_call(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 332, in forward
|
|||
|
|
hidden_states = self.mlp(hidden_states)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
|
|||
|
|
return self._call_impl(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
|
|||
|
|
return forward_call(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 729, in apply_lora_mlp_swiglu
|
|||
|
|
out = LoRA_MLP.apply(
|
|||
|
|
^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/autograd/function.py", line 581, in apply
|
|||
|
|
return super().apply(*args, **kwargs) # type: ignore[misc]
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 527, in decorate_fwd
|
|||
|
|
return fwd(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 414, in forward
|
|||
|
|
output = matmul_lora(
|
|||
|
|
^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/kernels/lora.py", line 273, in matmul_lora
|
|||
|
|
out += s * X_lora @ A @ B
|
|||
|
|
~~^~~~~~~~
|
|||
|
|
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.88 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.30 GiB is free. Process 137982 has 38.18 GiB memory in use. Of the allocated memory 35.49 GiB is allocated by PyTorch, and 2.20 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
|||
|
|
Exception in thread Thread-5 (_pin_memory_loop):
|
|||
|
|
Traceback (most recent call last):
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
|
|||
|
|
[0m self.run()
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/threading.py", line 1012, in run
|
|||
|
|
0%| | 0/70 [00:11<?, ?it/s]
|
|||
|
|
[2026-05-14 14:04:50,081] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:26498] bf16 support detected, enabling for this configuration.
|
|||
|
|
[2026-05-14 14:04:50,664] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:26498] baseline 0.000GB ()
|
|||
|
|
[2026-05-14 14:04:50,665] [INFO] [axolotl.cli.config.load_cfg:333] [PID:26498] config:
|
|||
|
|
{
|
|||
|
|
"activation_offloading": false,
|
|||
|
|
"adapter": "qlora",
|
|||
|
|
"attn_implementation": "flash_attention_2",
|
|||
|
|
"attn_needs_dtype_cast": true,
|
|||
|
|
"attn_supports_packing": true,
|
|||
|
|
"attn_uses_flash_lib": true,
|
|||
|
|
"auto_resume_from_checkpoints": true,
|
|||
|
|
"axolotl_config_path": "./data/config.yaml",
|
|||
|
|
"base_model": "Qwen/Qwen3-8B",
|
|||
|
|
"base_model_config": "Qwen/Qwen3-8B",
|
|||
|
|
"batch_size": 32,
|
|||
|
|
"bf16": true,
|
|||
|
|
"capabilities": {
|
|||
|
|
"bf16": true,
|
|||
|
|
"compute_capability": "sm_80",
|
|||
|
|
"fp8": false,
|
|||
|
|
"n_gpu": 1,
|
|||
|
|
"n_node": 1,
|
|||
|
|
"tf32": true
|
|||
|
|
},
|
|||
|
|
"chat_template": "qwen3",
|
|||
|
|
"context_parallel_size": 1,
|
|||
|
|
"cut_cross_entropy": true,
|
|||
|
|
"dataloader_num_workers": 1,
|
|||
|
|
"dataloader_pin_memory": true,
|
|||
|
|
"dataloader_prefetch_factor": 256,
|
|||
|
|
"dataset_num_proc": 1,
|
|||
|
|
"dataset_prepared_path": "last_run_prepared",
|
|||
|
|
"datasets": [
|
|||
|
|
{
|
|||
|
|
"chat_template": "tokenizer_default",
|
|||
|
|
"field_messages": "messages",
|
|||
|
|
"field_tools": "tools",
|
|||
|
|
"message_property_mappings": {
|
|||
|
|
"content": "content",
|
|||
|
|
"role": "role"
|
|||
|
|
},
|
|||
|
|
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
|
|||
|
|
"roles_to_train": [
|
|||
|
|
"assistant"
|
|||
|
|
],
|
|||
|
|
"train_on_eos": "turn",
|
|||
|
|
"trust_remote_code": false,
|
|||
|
|
"type": "chat_template"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"ddp": false,
|
|||
|
|
"device": "cuda:0",
|
|||
|
|
"dion_rank_fraction": 1.0,
|
|||
|
|
"dion_rank_multiple_of": 1,
|
|||
|
|
"eaft_alpha": 1.0,
|
|||
|
|
"eaft_k": 20,
|
|||
|
|
"env_capabilities": {
|
|||
|
|
"torch_version": "2.9.1"
|
|||
|
|
},
|
|||
|
|
"eval_batch_size": 4,
|
|||
|
|
"eval_causal_lm_metrics": [
|
|||
|
|
"sacrebleu",
|
|||
|
|
"comet",
|
|||
|
|
"ter",
|
|||
|
|
"chrf"
|
|||
|
|
],
|
|||
|
|
"eval_max_new_tokens": 128,
|
|||
|
|
"eval_sample_packing": true,
|
|||
|
|
"eval_table_size": 0,
|
|||
|
|
"experimental_skip_move_to_device": true,
|
|||
|
|
"fp16": false,
|
|||
|
|
"generate_samples": false,
|
|||
|
|
"generation_do_sample": true,
|
|||
|
|
"generation_max_new_tokens": 50,
|
|||
|
|
"generation_prompt_ratio": 0.5,
|
|||
|
|
"generation_temperature": 0.7,
|
|||
|
|
"gradient_accumulation_steps": 8,
|
|||
|
|
"gradient_checkpointing": true,
|
|||
|
|
"gradient_checkpointing_kwargs": {
|
|||
|
|
"use_reentrant": false
|
|||
|
|
},
|
|||
|
|
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
|
|||
|
|
"hub_strategy": "all_checkpoints",
|
|||
|
|
"include_tkps": true,
|
|||
|
|
"layer_offloading": false,
|
|||
|
|
"learning_rate": 2e-05,
|
|||
|
|
"lisa_layers_attribute": "model.layers",
|
|||
|
|
"load_best_model_at_end": false,
|
|||
|
|
"load_in_4bit": true,
|
|||
|
|
"load_in_8bit": false,
|
|||
|
|
"local_rank": 0,
|
|||
|
|
"logging_steps": 10,
|
|||
|
|
"lora_alpha": 64,
|
|||
|
|
"lora_dropout": 0.05,
|
|||
|
|
"lora_mlp_kernel": true,
|
|||
|
|
"lora_o_kernel": true,
|
|||
|
|
"lora_qkv_kernel": true,
|
|||
|
|
"lora_r": 32,
|
|||
|
|
"lora_target_modules": [
|
|||
|
|
"q_proj",
|
|||
|
|
"k_proj",
|
|||
|
|
"v_proj",
|
|||
|
|
"o_proj",
|
|||
|
|
"gate_proj",
|
|||
|
|
"down_proj",
|
|||
|
|
"up_proj"
|
|||
|
|
],
|
|||
|
|
"loraplus_lr_embedding": 1e-06,
|
|||
|
|
"loss_watchdog_patience": 3,
|
|||
|
|
"loss_watchdog_threshold": 5.0,
|
|||
|
|
"lr_scheduler": "cosine",
|
|||
|
|
"max_grad_norm": 1.0,
|
|||
|
|
"mean_resizing_embeddings": false,
|
|||
|
|
"merge_method": "memory_efficient",
|
|||
|
|
"micro_batch_size": 4,
|
|||
|
|
"model_config_type": "qwen3",
|
|||
|
|
"num_epochs": 2.0,
|
|||
|
|
"num_generation_samples": 3,
|
|||
|
|
"optimizer": "adamw_torch_4bit",
|
|||
|
|
"otel_metrics_host": "localhost",
|
|||
|
|
"otel_metrics_port": 8000,
|
|||
|
|
"output_dir": "./outputs/finance-synthetic-sft-phase2",
|
|||
|
|
"pad_to_sequence_len": true,
|
|||
|
|
"plugins": [
|
|||
|
|
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
|
|||
|
|
],
|
|||
|
|
"pretrain_multipack_attn": true,
|
|||
|
|
"profiler_steps_start": 0,
|
|||
|
|
"qlora_sharded_model_loading": false,
|
|||
|
|
"quantize_moe_experts": false,
|
|||
|
|
"ray_num_workers": 1,
|
|||
|
|
"relora_prune_method": "magnitude",
|
|||
|
|
"resources_per_worker": {
|
|||
|
|
"GPU": 1
|
|||
|
|
},
|
|||
|
|
"sample_packing": true,
|
|||
|
|
"sample_packing_bin_size": 200,
|
|||
|
|
"sample_packing_group_size": 100000,
|
|||
|
|
"save_only_model": false,
|
|||
|
|
"save_safetensors": true,
|
|||
|
|
"save_steps": 0.16666666666666666,
|
|||
|
|
"save_total_limit": 3,
|
|||
|
|
"saves_per_epoch": 3,
|
|||
|
|
"seed": 42,
|
|||
|
|
"sequence_len": 6144,
|
|||
|
|
"shuffle_before_merging_datasets": false,
|
|||
|
|
"shuffle_merged_datasets": true,
|
|||
|
|
"skip_prepare_dataset": false,
|
|||
|
|
"streaming_multipack_buffer_size": 10000,
|
|||
|
|
"strict": false,
|
|||
|
|
"tensor_parallel_size": 1,
|
|||
|
|
"tf32": true,
|
|||
|
|
"tiled_mlp_use_original_mlp": true,
|
|||
|
|
"tokenizer_config": "Qwen/Qwen3-8B",
|
|||
|
|
"tokenizer_save_jinja_files": true,
|
|||
|
|
"torch_dtype": "torch.bfloat16",
|
|||
|
|
"train_on_inputs": false,
|
|||
|
|
"trl": {
|
|||
|
|
"async_prefetch": false,
|
|||
|
|
"log_completions": false,
|
|||
|
|
"mask_truncated_completions": false,
|
|||
|
|
"ref_model_mixup_alpha": 0.9,
|
|||
|
|
"ref_model_sync_steps": 64,
|
|||
|
|
"replay_buffer_size": 0,
|
|||
|
|
"replay_recompute_logps": true,
|
|||
|
|
"reroll_max_groups": 1,
|
|||
|
|
"reroll_start_fraction": 1.0,
|
|||
|
|
"reward_num_workers": 1,
|
|||
|
|
"scale_rewards": true,
|
|||
|
|
"skip_zero_advantage_batches": true,
|
|||
|
|
"sync_ref_model": false,
|
|||
|
|
"use_data_producer": false,
|
|||
|
|
"use_vllm": false,
|
|||
|
|
"vllm_lora_sync": false,
|
|||
|
|
"vllm_server_host": "0.0.0.0",
|
|||
|
|
"vllm_server_port": 8000
|
|||
|
|
},
|
|||
|
|
"use_otel_metrics": false,
|
|||
|
|
"use_ray": false,
|
|||
|
|
"val_set_size": 0.0,
|
|||
|
|
"vllm": {
|
|||
|
|
"device": "auto",
|
|||
|
|
"dtype": "auto",
|
|||
|
|
"gpu_memory_utilization": 0.9,
|
|||
|
|
"host": "0.0.0.0",
|
|||
|
|
"port": 8000
|
|||
|
|
},
|
|||
|
|
"warmup_ratio": 0.05,
|
|||
|
|
"weight_decay": 0.01,
|
|||
|
|
"world_size": 1
|
|||
|
|
}
|
|||
|
|
[2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26498] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26498] BOS: None / None
|
|||
|
|
[2026-05-14 14:04:51,741] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26498] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 14:04:51,742] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26498] UNK: None / None
|
|||
|
|
[2026-05-14 14:04:51,742] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:26498] Unable to find prepared dataset in last_run_prepared/8e970b09b0233ad980a67dcca6703606
|
|||
|
|
[2026-05-14 14:04:51,742] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:26498] Loading raw datasets...
|
|||
|
|
[2026-05-14 14:04:51,742] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:26498] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
|
|||
|
|
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
|
|||
|
|
Fetching 0 files: 0it [00:00, ?it/s][A
Fetching 0 files: 0it [00:00, ?it/s]
|
|||
|
|
Download complete: : 0.00B [00:00, ?B/s]
Download complete: : 0.00B [00:00, ?B/s]
|
|||
|
|
[2026-05-14 14:04:53,700] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:26498] Loading dataset: Gandalf1/indian-finance-synthetic-phase2-cleaned with base_type: chat_template and prompt_style: None
|
|||
|
|
[2026-05-14 14:04:53,702] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:26498] Using chat template:
|
|||
|
|
---
|
|||
|
|
{%- if tools %}
|
|||
|
|
{{- '<|im_start|>system\n' }}
|
|||
|
|
{%- if messages[0].role == 'system' %}
|
|||
|
|
{{- messages[0].content + '\n\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
|||
|
|
{%- for tool in tools %}
|
|||
|
|
{{- "\n" }}
|
|||
|
|
{{- tool | tojson }}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- if messages[0].role == 'system' %}
|
|||
|
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
|||
|
|
{#- Determine the real last index: use provided value or default to messages length - 1 #}
|
|||
|
|
{%- if real_last_index is defined and real_last_index is not none %}
|
|||
|
|
{%- set ns.real_last_index = real_last_index %}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- set ns.real_last_index = messages|length - 1 %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- for message in messages[::-1] %}
|
|||
|
|
{%- set index = (messages|length - 1) - loop.index0 %}
|
|||
|
|
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
|||
|
|
{%- set ns.multi_step_tool = false %}
|
|||
|
|
{%- set ns.last_query_index = index %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- for message in messages %}
|
|||
|
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
|||
|
|
{%- elif message.role == "assistant" %}
|
|||
|
|
{%- set content = message.content %}
|
|||
|
|
{%- set reasoning_content = '' %}
|
|||
|
|
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
|||
|
|
{%- set reasoning_content = message.reasoning_content %}
|
|||
|
|
{%- else %}
|
|||
|
|
{%- if '</think>' in message.content %}
|
|||
|
|
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
|||
|
|
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if loop.index0 > ns.last_query_index %}
|
|||
|
|
{%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if message.tool_calls %}
|
|||
|
|
{%- for tool_call in message.tool_calls %}
|
|||
|
|
{%- if (loop.first and content) or (not loop.first) %}
|
|||
|
|
{{- '\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- if tool_call.function %}
|
|||
|
|
{%- set tool_call = tool_call.function %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '<tool_call>\n{"name": "' }}
|
|||
|
|
{{- tool_call.name }}
|
|||
|
|
{{- '", "arguments": ' }}
|
|||
|
|
{%- if tool_call.arguments is string %}
|
|||
|
|
{{- tool_call.arguments }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- tool_call.arguments | tojson }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '}\n</tool_call>' }}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '<|im_end|>\n' }}
|
|||
|
|
{%- elif message.role == "tool" %}
|
|||
|
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
|||
|
|
{{- '<|im_start|>user' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{{- '\n<tool_response>\n' }}
|
|||
|
|
{{- message.content }}
|
|||
|
|
{{- '\n</tool_response>' }}
|
|||
|
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
|||
|
|
{{- '<|im_end|>\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endfor %}
|
|||
|
|
{%- if add_generation_prompt %}
|
|||
|
|
{{- '<|im_start|>assistant\n' }}
|
|||
|
|
{%- if enable_thinking is defined and enable_thinking is false %}
|
|||
|
|
{{- '<think>\n\n</think>\n\n' }}
|
|||
|
|
{%- else %}
|
|||
|
|
{{- '<think>\n\n' }}
|
|||
|
|
{%- endif %}
|
|||
|
|
{%- endif %}
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
Tokenizing Prompts (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:16<03:41, 62.00 examples/s]
Tokenizing Prompts (num_proc=1): 7%|██ | 1000/14763 [00:30<03:41, 62.00 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:31<03:18, 64.18 examples/s]
Tokenizing Prompts (num_proc=1): 14%|████ | 2000/14763 [00:42<03:18, 64.18 examples/s]
Tokenizing Prompts (num_proc=1): 20%|██████ | 3000/14763 [00:46<03:01, 64.99 examples/s]
Tokenizing Prompts (num_proc=1): 20%|██████ | 3000/14763 [01:00<03:01, 64.99 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:01<02:42, 66.27 examples/s]
Tokenizing Prompts (num_proc=1): 27%|████████▏ | 4000/14763 [01:12<02:42, 66.27 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:16<02:27, 66.04 examples/s]
Tokenizing Prompts (num_proc=1): 34%|██████████▏ | 5000/14763 [01:30<02:27, 66.04 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:30<02:11, 66.86 examples/s]
Tokenizing Prompts (num_proc=1): 41%|████████████▏ | 6000/14763 [01:42<02:11, 66.86 examples/s]
Tokenizing Prompts (num_proc=1): 47%|██████████████▏ | 7000/14763 [01:45<01:54, 67.64 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [01:59<01:37, 69.19 examples/s]
Tokenizing Prompts (num_proc=1): 54%|████████████████▎ | 8000/14763 [02:10<01:37, 69.19 examples/s]
Tokenizing Prompts (num_proc=1): 61%|██████████████████▎ | 9000/14763 [02:12<01:20, 71.45 examples/s]
Tokenizing Prompts (num_proc=1): 61%|██████████████████▎ | 9000/14763 [02:22<01:20, 71.45 examples/s]
Tokenizing Prompts (num_proc=1): 68%|███████████████████▋ | 10000/14763 [02:25<01:05, 72.84 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:38<00:50, 73.81 examples/s]
Tokenizing Prompts (num_proc=1): 75%|█████████████████████▌ | 11000/14763 [02:50<00:50, 73.81 examples/s]
Tokenizing Prompts (num_proc=1): 81%|███████████████████████▌ | 12000/14763 [02:51<00:36, 75.07 examples/s]
Tokenizing Prompts (num_proc=1): 81%|███████████████████████▌ | 12000/14763 [03:02<00:36, 75.07 examples/s]
Tokenizing Prompts (num_proc=1): 88%|█████████████████████████▌ | 13000/14763 [03:04<00:23, 75.34 examples/s]
Tokenizing Prompts (num_proc=1): 95%|███████████████████████████▌ | 14000/14763 [03:17<00:10, 74.83 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:28<00:00, 74.72 examples/s]
Tokenizing Prompts (num_proc=1): 100%|█████████████████████████████| 14763/14763 [03:28<00:00, 70.85 examples/s]
|
|||
|
|
[2026-05-14 14:08:29,854] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:26498] min_input_len: 591
|
|||
|
|
[2026-05-14 14:08:29,855] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:26498] max_input_len: 4338
|
|||
|
|
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 7%|▏ | 1000/14763 [00:00<00:09, 1453.44 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 14%|▍ | 2000/14763 [00:01<00:08, 1568.36 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 20%|▌ | 3000/14763 [00:01<00:07, 1627.08 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 27%|▊ | 4000/14763 [00:02<00:06, 1669.29 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 34%|█ | 5000/14763 [00:03<00:05, 1674.26 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 41%|█▏ | 6000/14763 [00:03<00:05, 1693.46 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 47%|█▍ | 7000/14763 [00:04<00:04, 1681.13 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 54%|█▋ | 8000/14763 [00:04<00:04, 1678.51 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 61%|█▊ | 9000/14763 [00:05<00:03, 1687.52 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 68%|█▎| 10000/14763 [00:05<00:02, 1704.69 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 75%|█▍| 11000/14763 [00:06<00:02, 1705.31 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 81%|█▋| 12000/14763 [00:07<00:01, 1711.63 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 88%|█▊| 13000/14763 [00:07<00:01, 1707.70 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 95%|█▉| 14000/14763 [00:08<00:00, 1700.81 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1708.72 examples/s]
Dropping Invalid Sequences (<None or >6144) (num_proc=1): 100%|██| 14763/14763 [00:08<00:00, 1659.28 examples/s]
|
|||
|
|
Drop Samples with Zero Trainable Tokens (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 7%|▍ | 1000/14763 [00:00<00:09, 1488.10 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 14%|▉ | 2000/14763 [00:01<00:07, 1597.83 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 20%|█▍ | 3000/14763 [00:01<00:07, 1650.05 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 27%|█▉ | 4000/14763 [00:02<00:06, 1685.61 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 34%|██▎ | 5000/14763 [00:03<00:05, 1689.72 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 41%|██▊ | 6000/14763 [00:03<00:05, 1699.22 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 47%|███▎ | 7000/14763 [00:04<00:04, 1687.34 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 54%|███▊ | 8000/14763 [00:04<00:04, 1673.68 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 61%|████▎ | 9000/14763 [00:05<00:03, 1696.45 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 68%|████ | 10000/14763 [00:05<00:02, 1705.90 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 75%|████▍ | 11000/14763 [00:06<00:02, 1708.96 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 81%|████▉ | 12000/14763 [00:07<00:01, 1710.59 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 88%|█████▎| 13000/14763 [00:07<00:01, 1712.56 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 95%|█████▋| 14000/14763 [00:08<00:00, 1710.18 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1722.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=1): 100%|██████| 14763/14763 [00:08<00:00, 1668.57 examples/s]
|
|||
|
|
Add position_id column (Sample Packing) (num_proc=1): 0%| | 0/14763 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=1): 7%|▌ | 1000/14763 [00:01<00:15, 881.50 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 14%|▉ | 2000/14763 [00:02<00:12, 1008.58 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 20%|█▍ | 3000/14763 [00:02<00:10, 1079.47 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 27%|█▉ | 4000/14763 [00:03<00:09, 1103.19 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 34%|██▎ | 5000/14763 [00:04<00:08, 1128.61 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 41%|██▊ | 6000/14763 [00:05<00:07, 1136.62 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 47%|███▎ | 7000/14763 [00:06<00:06, 1144.16 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 54%|███▊ | 8000/14763 [00:07<00:05, 1141.22 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 61%|████▎ | 9000/14763 [00:08<00:04, 1156.14 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 68%|████ | 10000/14763 [00:08<00:04, 1164.26 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 75%|████▍ | 11000/14763 [00:09<00:03, 1152.89 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 81%|████▉ | 12000/14763 [00:10<00:02, 1155.75 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 88%|█████▎| 13000/14763 [00:11<00:01, 1149.65 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 95%|█████▋| 14000/14763 [00:12<00:00, 1154.78 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1158.66 examples/s]
Add position_id column (Sample Packing) (num_proc=1): 100%|██████| 14763/14763 [00:13<00:00, 1120.44 examples/s]
|
|||
|
|
Saving the dataset (0/1 shards): 0%| | 0/14763 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 14%|███▉ | 2000/14763 [00:08<00:55, 228.58 examples/s]
Saving the dataset (0/1 shards): 54%|███████████████▏ | 8000/14763 [00:08<00:05, 1181.51 examples/s]
Saving the dataset (0/1 shards): 95%|█████████████████████████▌ | 14000/14763 [00:09<00:00, 2471.27 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:09<00:00, 2471.27 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████| 14763/14763 [00:10<00:00, 1448.15 examples/s]
|
|||
|
|
[2026-05-14 14:09:11,124] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:26498] total_num_tokens: 23_382_259
|
|||
|
|
[2026-05-14 14:09:11,329] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:26498] `total_supervised_tokens: 11_016_035`
|
|||
|
|
[2026-05-14 14:09:11,489] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:12,642] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:12,941] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3018453121185303
|
|||
|
|
[2026-05-14 14:09:12,944] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:13,260] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.31858301162719727
|
|||
|
|
[2026-05-14 14:09:13,263] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:13,584] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3229527473449707
|
|||
|
|
[2026-05-14 14:09:13,586] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:13,864] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.2801856994628906
|
|||
|
|
[2026-05-14 14:09:13,913] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26498] gather_len_batches: [960]
|
|||
|
|
[2026-05-14 14:09:13,914] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:26498] data_loader_len: 120
|
|||
|
|
[2026-05-14 14:09:13,914] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:26498] sample_packing_eff_est across ranks: [0.991069327460395]
|
|||
|
|
[2026-05-14 14:09:13,914] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:26498] sample_packing_eff_est: 1.0
|
|||
|
|
[2026-05-14 14:09:13,914] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:26498] total_num_steps: 240
|
|||
|
|
[2026-05-14 14:09:13,914] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:26498] Maximum number of steps set at 240
|
|||
|
|
[2026-05-14 14:09:13,976] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:26498] loading tokenizer... Qwen/Qwen3-8B
|
|||
|
|
[2026-05-14 14:09:15,874] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:26498] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:26498] BOS: None / None
|
|||
|
|
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:26498] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:26498] UNK: None / None
|
|||
|
|
[2026-05-14 14:09:15,875] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:26498] Loading model
|
|||
|
|
[2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:26498] Patched OptimState8bit for torch.compile compatibility
|
|||
|
|
[2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:26498] Patched OptimState4bit for torch.compile compatibility
|
|||
|
|
[2026-05-14 14:09:15,966] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:26498] Patched OptimStateFp8 for torch.compile compatibility
|
|||
|
|
[2026-05-14 14:09:15,972] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:26498] Patched Trainer.evaluation_loop with nanmean loss calculation
|
|||
|
|
[2026-05-14 14:09:15,973] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:26498] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
|
|||
|
|
[2026-05-14 14:09:15,975] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:26498] Applying multipack dataloader patch for sample packing...
|
|||
|
|
[2026-05-14 14:09:15,975] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:26498] Cannot patch self-attention - requires no dropout
|
|||
|
|
[2026-05-14 14:09:15,999] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:26498] Applying Cut Cross Entropy to model type: qwen3
|
|||
|
|
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 0%|▏ | 1/399 [00:00<01:16, 5.20it/s]
Loading weights: 1%|▎ | 2/399 [00:00<01:16, 5.17it/s]
Loading weights: 2%|▊ | 6/399 [00:00<00:25, 15.60it/s]
Loading weights: 7%|███▋ | 26/399 [00:00<00:05, 69.68it/s]
Loading weights: 10%|█████▌ | 39/399 [00:00<00:04, 86.64it/s]
Loading weights: 15%|████████▎ | 59/399 [00:00<00:02, 119.74it/s]
Loading weights: 18%|██████████▏ | 73/399 [00:00<00:02, 123.51it/s]
Loading weights: 23%|████████████▉ | 92/399 [00:01<00:02, 142.48it/s]
Loading weights: 27%|██████████████▉ | 108/399 [00:01<00:01, 147.13it/s]
Loading weights: 32%|█████████████████▎ | 126/399 [00:01<00:01, 141.71it/s]
Loading weights: 35%|███████████████████▊ | 141/399 [00:01<00:02, 98.40it/s]
Loading weights: 38%|█████████████████████ | 153/399 [00:01<00:02, 101.45it/s]
Loading weights: 42%|███████████████████████▎ | 169/399 [00:01<00:02, 113.51it/s]
Loading weights: 46%|█████████████████████████ | 182/399 [00:01<00:01, 116.72it/s]
Loading weights: 51%|███████████████████████████▊ | 202/399 [00:01<00:01, 138.13it/s]
Loading weights: 55%|██████████████████████████████▏ | 219/399 [00:02<00:01, 145.18it/s]
Loading weights: 59%|████████████████████████████████▌ | 236/399 [00:02<00:01, 146.87it/s]
Loading weights: 63%|██████████████████████████████████▋ | 252/399 [00:02<00:00, 147.08it/s]
Loading weights: 67%|█████████████████████████████████████ | 269/399 [00:02<00:00, 148.79it/s]
Loading weights: 72%|███████████████████████████████████████▌ | 287/399 [00:02<00:00, 155.51it/s]
Loading weights: 76%|█████████████████████████████████████████▊ | 303/399 [00:02<00:00, 147.81it/s]
Loading weights: 81%|████████████████████████████████████████████▌ | 323/399 [00:02<00:00, 161.15it/s]
Loading weights: 85%|██████████████████████████████████████████████▊ | 340/399 [00:02<00:00, 158.55it/s]
Loading weights: 89%|█████████████████████████████████████████████████▏ | 357/399 [00:02<00:00, 157.03it/s]
Loading weights: 93%|████████████████████████████████████████████<E29688>
|
|||
|
|
[2026-05-14 14:09:21,850] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:26498] converting PEFT model w/ prepare_model_for_kbit_training
|
|||
|
|
[2026-05-14 14:09:21,856] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:26498] Converting modules to torch.bfloat16
|
|||
|
|
[2026-05-14 14:09:21,862] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26498] Memory usage after model load 9.148GB (+9.148GB allocated, +10.395GB reserved)
|
|||
|
|
trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545
|
|||
|
|
[2026-05-14 14:09:23,159] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:26498] after adapters 5.997GB (+5.997GB allocated, +10.559GB reserved)
|
|||
|
|
[2026-05-14 14:09:24,066] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:478] [PID:26498] LoRA kernels: dropout=0.05 enabled
|
|||
|
|
[2026-05-14 14:09:25,128] [INFO] [axolotl.train.save_initial_configs:450] [PID:26498] Pre-saving adapter config to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 14:09:25,129] [INFO] [axolotl.train.save_initial_configs:454] [PID:26498] Pre-saving tokenizer to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 14:09:25,238] [INFO] [axolotl.train.save_initial_configs:459] [PID:26498] Pre-saving model config to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 14:09:25,241] [INFO] [axolotl.train.execute_training:226] [PID:26498] Starting trainer...
|
|||
|
|
[2026-05-14 14:09:25,751] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:26,070] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:26,381] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3131840229034424
|
|||
|
|
[2026-05-14 14:09:26,383] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:26,743] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.36243748664855957
|
|||
|
|
[2026-05-14 14:09:26,746] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:27,089] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.3450140953063965
|
|||
|
|
[2026-05-14 14:09:27,092] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:09:27,428] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:26498] generate_batches time: 0.33806943893432617
|
|||
|
|
[2026-05-14 14:09:27,428] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:26498] gather_len_batches: [960]
|
|||
|
|
0%| | 0/240 [00:00<?, ?it/s][2026-05-14 14:09:27,540] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:26498] Using single process for pack_parallel, running sequentially.
|
|||
|
|
0%|▎ | 1/240 [01:24<5:35:43, 84.28s/it]
1%|▌ | 2/240 [02:30<4:53:08, 73.90s/it]
1%|▉ | 3/240 [03:35<4:35:16, 69.69s/it]
2%|█▏ | 4/240 [04:39<4:25:33, 67.51s/it]
2%|█▌ | 5/240 [05:44<4:20:00, 66.38s/it]Process Process-1:
|
|||
|
|
Traceback (most recent call last):
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
|
|||
|
|
self.run()
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 108, in run
|
|||
|
|
self._target(*self._args, **self._kwargs)
|
|||
|
|
File "/workspace/axolotl/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py", line 54, in patched_worker_loop
|
|||
|
|
return _worker_loop(*args, **kwargs)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 315, in _worker_loop
|
|||
|
|
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/queues.py", line 113, in get
|
|||
|
|
if not self._poll(timeout):
|
|||
|
|
^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 257, in poll
|
|||
|
|
return self._poll(timeout)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 440, in _poll
|
|||
|
|
r = wait([self], timeout)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
|
|||
|
|
ready = selector.select(timeout)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/selectors.py", line 415, in select
|
|||
|
|
fd_event_list = self._selector.poll(timeout)
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/train.py", line 179, in <lambda>
|
|||
|
|
lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl/src/axolotl/train.py", line 171, in terminate_handler
|
|||
|
|
_model.save_pretrained(cfg.output_dir)
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/peft_model.py", line 294, in save_pretrained
|
|||
|
|
output_state_dict = get_peft_model_state_dict(
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/save_and_load.py", line 111, in get_peft_model_state_dict
|
|||
|
|
state_dict = model.state_dict()
|
|||
|
|
^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict
|
|||
|
|
module.state_dict(
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict
|
|||
|
|
module.state_dict(
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2265, in state_dict
|
|||
|
|
module.state_dict(
|
|||
|
|
[Previous line repeated 5 more times]
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 2262, in state_dict
|
|||
|
|
self._save_to_state_dict(destination, prefix, keep_vars)
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/bitsandbytes/nn/modules.py", line 526, in _save_to_state_dict
|
|||
|
|
for k, v in self.weight.quant_state.as_dict(packed=True).items():
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
File "/workspace/axolotl-venv/lib/python3.12/site-packages/bitsandbytes/functional.py", line 523, in as_dict
|
|||
|
|
"nested_quant_map": self.state2.code.clone(), # un-shared to avoid restoring it after shared tensors are removed by safetensors
|
|||
|
|
^^^^^^^^^^^^^^^^^^^^^^^^
|
|||
|
|
torch.AcceleratorError: CUDA error: initialization error
|
|||
|
|
Search for `cudaErrorInitializationError' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
|
|||
|
|
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
|
|||
|
|
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
|
|||
|
|
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
|||
|
|
|
|||
|
|
[2026-05-14 14:16:00,926] [WARNING] [py.warnings._showwarnmsg:112] [PID:26498] /workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/other.py:1419: UserWarning: Unable to fetch remote file due to the following error DataLoader worker (pid 26631) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace. - silently ignoring the lookup for the file config.json in Qwen/Qwen3-8B.
|
|||
|
|
warnings.warn(
|
|||
|
|
|
|||
|
|
[2026-05-14 14:16:00,927] [WARNING] [py.warnings._showwarnmsg:112] [PID:26498] /workspace/axolotl-venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:372: UserWarning: Could not find a config file in Qwen/Qwen3-8B - will assume that the vocabulary was not modified.
|
|||
|
|
warnings.warn(
|
|||
|
|
|
|||
|
|
2%|█▌ | 5/240 [06:33<5:08:32, 78.78s/it]
|
|||
|
|
[0m[2026-05-14 14:16:47,206] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:27401] bf16 support detected, enabling for this configuration.
|
|||
|
|
[2026-05-14 14:16:47,436] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:27401] baseline 0.000GB ()
|
|||
|
|
[2026-05-14 14:16:47,437] [INFO] [axolotl.cli.config.load_cfg:333] [PID:27401] config:
|
|||
|
|
{
|
|||
|
|
"activation_offloading": false,
|
|||
|
|
"adapter": "qlora",
|
|||
|
|
"attn_implementation": "flash_attention_2",
|
|||
|
|
"attn_needs_dtype_cast": true,
|
|||
|
|
"attn_supports_packing": true,
|
|||
|
|
"attn_uses_flash_lib": true,
|
|||
|
|
"auto_resume_from_checkpoints": true,
|
|||
|
|
"axolotl_config_path": "./data/config.yaml",
|
|||
|
|
"base_model": "Qwen/Qwen3-8B",
|
|||
|
|
"base_model_config": "Qwen/Qwen3-8B",
|
|||
|
|
"batch_size": 64,
|
|||
|
|
"bf16": true,
|
|||
|
|
"capabilities": {
|
|||
|
|
"bf16": true,
|
|||
|
|
"compute_capability": "sm_80",
|
|||
|
|
"fp8": false,
|
|||
|
|
"n_gpu": 1,
|
|||
|
|
"n_node": 1,
|
|||
|
|
"tf32": true
|
|||
|
|
},
|
|||
|
|
"chat_template": "qwen3",
|
|||
|
|
"context_parallel_size": 1,
|
|||
|
|
"cut_cross_entropy": true,
|
|||
|
|
"dataloader_num_workers": 1,
|
|||
|
|
"dataloader_pin_memory": true,
|
|||
|
|
"dataloader_prefetch_factor": 256,
|
|||
|
|
"dataset_num_proc": 1,
|
|||
|
|
"dataset_prepared_path": "last_run_prepared",
|
|||
|
|
"datasets": [
|
|||
|
|
{
|
|||
|
|
"chat_template": "tokenizer_default",
|
|||
|
|
"field_messages": "messages",
|
|||
|
|
"field_tools": "tools",
|
|||
|
|
"message_property_mappings": {
|
|||
|
|
"content": "content",
|
|||
|
|
"role": "role"
|
|||
|
|
},
|
|||
|
|
"path": "Gandalf1/indian-finance-synthetic-phase2-cleaned",
|
|||
|
|
"roles_to_train": [
|
|||
|
|
"assistant"
|
|||
|
|
],
|
|||
|
|
"train_on_eos": "turn",
|
|||
|
|
"trust_remote_code": false,
|
|||
|
|
"type": "chat_template"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"ddp": false,
|
|||
|
|
"device": "cuda:0",
|
|||
|
|
"dion_rank_fraction": 1.0,
|
|||
|
|
"dion_rank_multiple_of": 1,
|
|||
|
|
"eaft_alpha": 1.0,
|
|||
|
|
"eaft_k": 20,
|
|||
|
|
"env_capabilities": {
|
|||
|
|
"torch_version": "2.9.1"
|
|||
|
|
},
|
|||
|
|
"eval_batch_size": 8,
|
|||
|
|
"eval_causal_lm_metrics": [
|
|||
|
|
"sacrebleu",
|
|||
|
|
"comet",
|
|||
|
|
"ter",
|
|||
|
|
"chrf"
|
|||
|
|
],
|
|||
|
|
"eval_max_new_tokens": 128,
|
|||
|
|
"eval_sample_packing": true,
|
|||
|
|
"eval_table_size": 0,
|
|||
|
|
"experimental_skip_move_to_device": true,
|
|||
|
|
"fp16": false,
|
|||
|
|
"generate_samples": false,
|
|||
|
|
"generation_do_sample": true,
|
|||
|
|
"generation_max_new_tokens": 50,
|
|||
|
|
"generation_prompt_ratio": 0.5,
|
|||
|
|
"generation_temperature": 0.7,
|
|||
|
|
"gradient_accumulation_steps": 8,
|
|||
|
|
"gradient_checkpointing": true,
|
|||
|
|
"gradient_checkpointing_kwargs": {
|
|||
|
|
"use_reentrant": false
|
|||
|
|
},
|
|||
|
|
"hub_model_id": "Gandalf1/qwen3-8b-finance-sft-phase2",
|
|||
|
|
"hub_strategy": "all_checkpoints",
|
|||
|
|
"include_tkps": true,
|
|||
|
|
"layer_offloading": false,
|
|||
|
|
"learning_rate": 2e-05,
|
|||
|
|
"lisa_layers_attribute": "model.layers",
|
|||
|
|
"load_best_model_at_end": false,
|
|||
|
|
"load_in_4bit": true,
|
|||
|
|
"load_in_8bit": false,
|
|||
|
|
"local_rank": 0,
|
|||
|
|
"logging_steps": 10,
|
|||
|
|
"lora_alpha": 64,
|
|||
|
|
"lora_dropout": 0.05,
|
|||
|
|
"lora_mlp_kernel": true,
|
|||
|
|
"lora_o_kernel": true,
|
|||
|
|
"lora_qkv_kernel": true,
|
|||
|
|
"lora_r": 32,
|
|||
|
|
"lora_target_modules": [
|
|||
|
|
"q_proj",
|
|||
|
|
"k_proj",
|
|||
|
|
"v_proj",
|
|||
|
|
"o_proj",
|
|||
|
|
"gate_proj",
|
|||
|
|
"down_proj",
|
|||
|
|
"up_proj"
|
|||
|
|
],
|
|||
|
|
"loraplus_lr_embedding": 1e-06,
|
|||
|
|
"loss_watchdog_patience": 3,
|
|||
|
|
"loss_watchdog_threshold": 5.0,
|
|||
|
|
"lr_scheduler": "cosine",
|
|||
|
|
"max_grad_norm": 1.0,
|
|||
|
|
"mean_resizing_embeddings": false,
|
|||
|
|
"merge_method": "memory_efficient",
|
|||
|
|
"micro_batch_size": 8,
|
|||
|
|
"model_config_type": "qwen3",
|
|||
|
|
"num_epochs": 2.0,
|
|||
|
|
"num_generation_samples": 3,
|
|||
|
|
"optimizer": "adamw_torch_4bit",
|
|||
|
|
"otel_metrics_host": "localhost",
|
|||
|
|
"otel_metrics_port": 8000,
|
|||
|
|
"output_dir": "./outputs/finance-synthetic-sft-phase2",
|
|||
|
|
"pad_to_sequence_len": true,
|
|||
|
|
"plugins": [
|
|||
|
|
"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
|
|||
|
|
],
|
|||
|
|
"pretrain_multipack_attn": true,
|
|||
|
|
"profiler_steps_start": 0,
|
|||
|
|
"qlora_sharded_model_loading": false,
|
|||
|
|
"quantize_moe_experts": false,
|
|||
|
|
"ray_num_workers": 1,
|
|||
|
|
"relora_prune_method": "magnitude",
|
|||
|
|
"resources_per_worker": {
|
|||
|
|
"GPU": 1
|
|||
|
|
},
|
|||
|
|
"sample_packing": true,
|
|||
|
|
"sample_packing_bin_size": 200,
|
|||
|
|
"sample_packing_group_size": 100000,
|
|||
|
|
"save_only_model": false,
|
|||
|
|
"save_safetensors": true,
|
|||
|
|
"save_steps": 0.16666666666666666,
|
|||
|
|
"save_total_limit": 3,
|
|||
|
|
"saves_per_epoch": 3,
|
|||
|
|
"seed": 42,
|
|||
|
|
"sequence_len": 6144,
|
|||
|
|
"shuffle_before_merging_datasets": false,
|
|||
|
|
"shuffle_merged_datasets": true,
|
|||
|
|
"skip_prepare_dataset": false,
|
|||
|
|
"streaming_multipack_buffer_size": 10000,
|
|||
|
|
"strict": false,
|
|||
|
|
"tensor_parallel_size": 1,
|
|||
|
|
"tf32": true,
|
|||
|
|
"tiled_mlp_use_original_mlp": true,
|
|||
|
|
"tokenizer_config": "Qwen/Qwen3-8B",
|
|||
|
|
"tokenizer_save_jinja_files": true,
|
|||
|
|
"torch_dtype": "torch.bfloat16",
|
|||
|
|
"train_on_inputs": false,
|
|||
|
|
"trl": {
|
|||
|
|
"async_prefetch": false,
|
|||
|
|
"log_completions": false,
|
|||
|
|
"mask_truncated_completions": false,
|
|||
|
|
"ref_model_mixup_alpha": 0.9,
|
|||
|
|
"ref_model_sync_steps": 64,
|
|||
|
|
"replay_buffer_size": 0,
|
|||
|
|
"replay_recompute_logps": true,
|
|||
|
|
"reroll_max_groups": 1,
|
|||
|
|
"reroll_start_fraction": 1.0,
|
|||
|
|
"reward_num_workers": 1,
|
|||
|
|
"scale_rewards": true,
|
|||
|
|
"skip_zero_advantage_batches": true,
|
|||
|
|
"sync_ref_model": false,
|
|||
|
|
"use_data_producer": false,
|
|||
|
|
"use_vllm": false,
|
|||
|
|
"vllm_lora_sync": false,
|
|||
|
|
"vllm_server_host": "0.0.0.0",
|
|||
|
|
"vllm_server_port": 8000
|
|||
|
|
},
|
|||
|
|
"use_otel_metrics": false,
|
|||
|
|
"use_ray": false,
|
|||
|
|
"val_set_size": 0.0,
|
|||
|
|
"vllm": {
|
|||
|
|
"device": "auto",
|
|||
|
|
"dtype": "auto",
|
|||
|
|
"gpu_memory_utilization": 0.9,
|
|||
|
|
"host": "0.0.0.0",
|
|||
|
|
"port": 8000
|
|||
|
|
},
|
|||
|
|
"warmup_ratio": 0.05,
|
|||
|
|
"weight_decay": 0.01,
|
|||
|
|
"world_size": 1
|
|||
|
|
}
|
|||
|
|
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:27401] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:27401] BOS: None / None
|
|||
|
|
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:27401] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 14:16:48,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:27401] UNK: None / None
|
|||
|
|
[2026-05-14 14:16:48,544] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:477] [PID:27401] Loading prepared dataset from disk at last_run_prepared/8e970b09b0233ad980a67dcca6703606...
|
|||
|
|
[2026-05-14 14:16:48,619] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:27401] total_num_tokens: 23_382_259
|
|||
|
|
[2026-05-14 14:16:48,820] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:27401] `total_supervised_tokens: 11_016_035`
|
|||
|
|
[2026-05-14 14:16:48,973] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:16:49,950] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:16:50,214] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2658224105834961
|
|||
|
|
[2026-05-14 14:16:50,216] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:16:50,480] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26596903800964355
|
|||
|
|
[2026-05-14 14:16:50,483] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:16:50,747] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26615142822265625
|
|||
|
|
[2026-05-14 14:16:50,749] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:16:51,014] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.26613616943359375
|
|||
|
|
[2026-05-14 14:16:51,076] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27401] gather_len_batches: [478]
|
|||
|
|
[2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:27401] data_loader_len: 59
|
|||
|
|
[2026-05-14 14:16:51,076] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:27401] sample_packing_eff_est across ranks: [0.9952160610480953]
|
|||
|
|
[2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:27401] sample_packing_eff_est: 1.0
|
|||
|
|
[2026-05-14 14:16:51,076] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:27401] total_num_steps: 118
|
|||
|
|
[2026-05-14 14:16:51,076] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:27401] Maximum number of steps set at 118
|
|||
|
|
[2026-05-14 14:16:51,107] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:27401] loading tokenizer... Qwen/Qwen3-8B
|
|||
|
|
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:27401] EOS: 151645 / <|im_end|>
|
|||
|
|
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:27401] BOS: None / None
|
|||
|
|
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:27401] PAD: 151643 / <|endoftext|>
|
|||
|
|
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:27401] UNK: None / None
|
|||
|
|
[2026-05-14 14:16:52,139] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:27401] Loading model
|
|||
|
|
[2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:27401] Patched OptimState8bit for torch.compile compatibility
|
|||
|
|
[2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:27401] Patched OptimState4bit for torch.compile compatibility
|
|||
|
|
[2026-05-14 14:16:52,226] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:27401] Patched OptimStateFp8 for torch.compile compatibility
|
|||
|
|
[2026-05-14 14:16:52,231] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:27401] Patched Trainer.evaluation_loop with nanmean loss calculation
|
|||
|
|
[2026-05-14 14:16:52,232] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:27401] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
|
|||
|
|
[2026-05-14 14:16:52,233] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:598] [PID:27401] Applying multipack dataloader patch for sample packing...
|
|||
|
|
[2026-05-14 14:16:52,233] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:556] [PID:27401] Cannot patch self-attention - requires no dropout
|
|||
|
|
[2026-05-14 14:16:52,248] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:27401] Applying Cut Cross Entropy to model type: qwen3
|
|||
|
|
Loading weights: 0%| | 0/399 [00:00<?, ?it/s]
Loading weights: 0%|▏ | 1/399 [00:00<01:18, 5.09it/s]
Loading weights: 1%|▎ | 2/399 [00:00<01:17, 5.10it/s]
Loading weights: 2%|▊ | 6/399 [00:00<00:25, 15.28it/s]
Loading weights: 7%|███▋ | 26/399 [00:00<00:05, 69.67it/s]
Loading weights: 10%|█████▌ | 39/399 [00:00<00:04, 87.27it/s]
Loading weights: 15%|████████▍ | 60/399 [00:00<00:02, 118.35it/s]
Loading weights: 20%|██████████▉ | 78/399 [00:00<00:02, 135.62it/s]
Loading weights: 23%|█████████████ | 93/399 [00:01<00:02, 139.69it/s]
Loading weights: 29%|███████████████▋ | 114/399 [00:01<00:01, 153.18it/s]
Loading weights: 33%|█████████████████▉ | 130/399 [00:01<00:01, 152.10it/s]
Loading weights: 37%|████████████████████▍ | 148/399 [00:01<00:01, 152.66it/s]
Loading weights: 42%|███████████████████████ | 167/399 [00:01<00:01, 162.57it/s]
Loading weights: 46%|█████████████████████████▎ | 184/399 [00:01<00:01, 157.05it/s]
Loading weights: 51%|███████████████████████████▊ | 202/399 [00:01<00:01, 161.91it/s]
Loading weights: 55%|██████████████████████████████▏ | 219/399 [00:01<00:01, 157.80it/s]
Loading weights: 59%|████████████████████████████████▌ | 236/399 [00:01<00:01, 156.28it/s]
Loading weights: 64%|███████████████████████████████████▍ | 257/399 [00:02<00:00, 163.72it/s]
Loading weights: 69%|█████████████████████████████████████▊ | 274/399 [00:02<00:00, 161.44it/s]
Loading weights: 73%|████████████████████████████████████████ | 291/399 [00:02<00:00, 158.63it/s]
Loading weights: 77%|██████████████████████████████████████████▎ | 307/399 [00:02<00:00, 158.03it/s]
Loading weights: 81%|████████████████████████████████████████████▋ | 324/399 [00:02<00:00, 157.01it/s]
Loading weights: 86%|███████████████████████████████████████████████▎ | 343/399 [00:02<00:00, 166.08it/s]
Loading weights: 90%|█████████████████████████████████████████████████▌ | 360/399 [00:02<00:00, 159.82it/s]
Loading weights: 95%|████████████████████████████████████████████████████ | 378/399 [00:02<00:00, 165.32it/s]
Loading weights: 99%|███████████████████<E29688><E29688>
|
|||
|
|
[2026-05-14 14:16:56,731] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:27401] converting PEFT model w/ prepare_model_for_kbit_training
|
|||
|
|
[2026-05-14 14:16:56,737] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:27401] Converting modules to torch.bfloat16
|
|||
|
|
[2026-05-14 14:16:56,742] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:27401] Memory usage after model load 9.148GB (+9.148GB allocated, +10.395GB reserved)
|
|||
|
|
trainable params: 87,293,952 || all params: 8,278,029,312 || trainable%: 1.0545
|
|||
|
|
[2026-05-14 14:16:58,024] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:27401] after adapters 5.997GB (+5.997GB allocated, +10.559GB reserved)
|
|||
|
|
[2026-05-14 14:16:58,858] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:478] [PID:27401] LoRA kernels: dropout=0.05 enabled
|
|||
|
|
[2026-05-14 14:16:59,965] [INFO] [axolotl.train.save_initial_configs:450] [PID:27401] Pre-saving adapter config to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 14:16:59,966] [INFO] [axolotl.train.save_initial_configs:454] [PID:27401] Pre-saving tokenizer to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 14:17:00,066] [INFO] [axolotl.train.save_initial_configs:459] [PID:27401] Pre-saving model config to ./outputs/finance-synthetic-sft-phase2...
|
|||
|
|
[2026-05-14 14:17:00,070] [INFO] [axolotl.train.execute_training:226] [PID:27401] Starting trainer...
|
|||
|
|
[2026-05-14 14:17:00,556] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:17:00,857] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:17:01,154] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2984797954559326
|
|||
|
|
[2026-05-14 14:17:01,156] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:17:01,452] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.29767322540283203
|
|||
|
|
[2026-05-14 14:17:01,454] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:17:01,750] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.29801011085510254
|
|||
|
|
[2026-05-14 14:17:01,752] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
[2026-05-14 14:17:02,048] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27401] generate_batches time: 0.2980222702026367
|
|||
|
|
[2026-05-14 14:17:02,049] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27401] gather_len_batches: [478]
|
|||
|
|
0%| | 0/118 [00:00<?, ?it/s][2026-05-14 14:17:02,153] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
1%|▌ | 1/118 [02:11<4:17:03, 131.83s/it]
2%|█▏ | 2/118 [04:20<4:10:45, 129.70s/it]
3%|█▊ | 3/118 [06:27<4:06:16, 128.49s/it]
3%|██▍ | 4/118 [08:36<4:04:27, 128.66s/it]
4%|███ | 5/118 [10:43<4:01:24, 128.18s/it]
5%|███▋ | 6/118 [12:51<3:58:58, 128.02s/it]
6%|████▎ | 7/118 [14:58<3:56:40, 127.93s/it]
7%|████▉ | 8/118 [17:06<3:54:21, 127.84s/it]
8%|█████▍ | 9/118 [19:13<3:52:03, 127.74s/it]
8%|██████ | 10/118 [21:21<3:49:57, 127.76s/it]
{'loss': '1.607', 'grad_norm': '1.072', 'learning_rate': '1.994e-05', 'ppl': '4.987', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '163.5', 'tokens/total': 3932160, 'tokens/trainable': 1840181, 'epoch': '0.1674'}
|
|||
|
|
8%|██████ | 10/118 [21:21<3:49:57, 127.76s/it]
9%|██████▌ | 11/118 [23:29<3:47:45, 127.71s/it]
10%|███████▏ | 12/118 [25:36<3:45:31, 127.66s/it]
11%|███████▊ | 13/118 [27:44<3:43:33, 127.75s/it]
12%|████████▍ | 14/118 [29:52<3:41:22, 127.72s/it]
13%|█████████ | 15/118 [31:59<3:39:02, 127.60s/it]
14%|█████████▋ | 16/118 [34:07<3:36:55, 127.60s/it]
14%|██████████▏ | 17/118 [36:15<3:35:02, 127.75s/it]
15%|██████████▊ | 18/118 [38:23<3:32:54, 127.75s/it]
16%|███████████▍ | 19/118 [40:31<3:30:48, 127.77s/it]
17%|████████████ | 20/118 [42:39<3:28:53, 127.89s/it]
{'loss': '1.412', 'grad_norm': '0.3657', 'learning_rate': '1.925e-05', 'ppl': '4.104', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '176.4', 'tokens/total': 7864320, 'tokens/trainable': 3698601, 'epoch': '0.3347'}
|
|||
|
|
17%|████████████ | 20/118 [42:39<3:28:53, 127.89s/it][2026-05-14 14:59:41,397] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-20
|
|||
|
|
18%|████████████▋ | 21/118 [44:50<3:28:30, 128.98s/it]
19%|█████████████▏ | 22/118 [46:58<3:25:48, 128.63s/it]
19%|█████████████▊ | 23/118 [49:06<3:23:18, 128.41s/it]
20%|██████████████▍ | 24/118 [51:14<3:21:03, 128.33s/it]
21%|███████████████ | 25/118 [53:22<3:18:39, 128.17s/it]
22%|███████████████▋ | 26/118 [55:30<3:16:24, 128.09s/it]
23%|████████████████▏ | 27/118 [57:38<3:14:14, 128.07s/it]
24%|████████████████▊ | 28/118 [59:45<3:11:53, 127.92s/it]
25%|████████████████▉ | 29/118 [1:01:53<3:09:39, 127.86s/it]
25%|█████████████████▌ | 30/118 [1:04:01<3:07:31, 127.86s/it]
{'loss': '1.32', 'grad_norm': '0.2517', 'learning_rate': '1.786e-05', 'ppl': '3.745', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '181.4', 'tokens/total': 11796480, 'tokens/trainable': 5537401, 'epoch': '0.5021'}
|
|||
|
|
25%|█████████████████▌ | 30/118 [1:04:01<3:07:31, 127.86s/it]
26%|██████████████████▏ | 31/118 [1:06:09<3:05:31, 127.94s/it]
27%|██████████████████▋ | 32/118 [1:08:17<3:03:13, 127.83s/it]
28%|███████████████████▎ | 33/118 [1:10:25<3:01:05, 127.83s/it]
29%|███████████████████▉ | 34/118 [1:12:33<2:59:01, 127.88s/it]
30%|████████████████████▍ | 35/118 [1:14:40<2:56:50, 127.84s/it]
31%|█████████████████████ | 36/118 [1:16:50<2:55:20, 128.29s/it]
31%|█████████████████████▋ | 37/118 [1:18:57<2:52:58, 128.13s/it]
32%|██████████████████████▏ | 38/118 [1:21:05<2:50:32, 127.91s/it]
33%|██████████████████████▊ | 39/118 [1:23:12<2:48:14, 127.78s/it]
34%|███████████████████████▍ | 40/118 [1:25:20<2:46:01, 127.71s/it]
{'loss': '1.293', 'grad_norm': '0.2172', 'learning_rate': '1.586e-05', 'ppl': '3.645', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '184.9', 'tokens/total': 15728640, 'tokens/trainable': 7378776, 'epoch': '0.6695'}
|
|||
|
|
34%|███████████████████████▍ | 40/118 [1:25:20<2:46:01, 127.71s/it][2026-05-14 15:42:22,458] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-40
|
|||
|
|
35%|███████████████████████▉ | 41/118 [1:27:30<2:44:46, 128.39s/it]
36%|████████████████████████▌ | 42/118 [1:29:37<2:42:14, 128.09s/it]
36%|█████████████████████████▏ | 43/118 [1:31:45<2:39:59, 127.99s/it]
37%|█████████████████████████▋ | 44/118 [1:33:53<2:37:48, 127.96s/it]
38%|██████████████████████████▎ | 45/118 [1:36:00<2:35:34, 127.87s/it]
39%|██████████████████████████▉ | 46/118 [1:38:08<2:33:22, 127.81s/it]
40%|███████████████████████████▍ | 47/118 [1:40:16<2:31:12, 127.79s/it]
41%|████████████████████████████ | 48/118 [1:42:23<2:28:52, 127.61s/it]
42%|████████████████████████████▋ | 49/118 [1:44:30<2:26:36, 127.49s/it]
42%|█████████████████████████████▏ | 50/118 [1:46:37<2:24:22, 127.39s/it]
{'loss': '1.269', 'grad_norm': '0.1839', 'learning_rate': '1.341e-05', 'ppl': '3.556', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '178.7', 'tokens/total': 19660800, 'tokens/trainable': 9224452, 'epoch': '0.8368'}
|
|||
|
|
42%|█████████████████████████████▏ | 50/118 [1:46:37<2:24:22, 127.39s/it]
43%|█████████████████████████████▊ | 51/118 [1:48:45<2:22:22, 127.50s/it]
44%|██████████████████████████████▍ | 52/118 [1:50:53<2:20:17, 127.54s/it]
45%|██████████████████████████████▉ | 53/118 [1:53:00<2:18:08, 127.52s/it]
46%|███████████████████████████████▌ | 54/118 [1:55:08<2:16:09, 127.66s/it]
47%|████████████████████████████████▏ | 55/118 [1:57:16<2:13:55, 127.55s/it]
47%|████████████████████████████████▋ | 56/118 [1:59:23<2:11:46, 127.52s/it]
48%|█████████████████████████████████▎ | 57/118 [2:01:30<2:09:35, 127.47s/it]
49%|█████████████████████████████████▉ | 58/118 [2:03:38<2:07:34, 127.57s/it]
50%|██████████████████████████████████▌ | 59/118 [2:05:46<2:05:32, 127.67s/it]
51%|███████████████████████████████████ | 60/118 [2:07:18<1:53:04, 116.97s/it]
{'loss': '1.242', 'grad_norm': '0.1736', 'learning_rate': '1.069e-05', 'ppl': '3.463', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '200.6', 'tokens/total': 23482368, 'tokens/trainable': 11016035, 'epoch': '1'}
|
|||
|
|
51%|███████████████████████████████████ | 60/118 [2:07:18<1:53:04, 116.97s/it][2026-05-14 16:24:20,726] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-60
|
|||
|
|
[2026-05-14 16:24:24,709] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27401] Using single process for pack_parallel, running sequentially.
|
|||
|
|
52%|███████████████████████████████████▋ | 61/118 [2:09:31<1:55:40, 121.76s/it]
53%|████████████████████████████████████▎ | 62/118 [2:11:38<1:55:11, 123.42s/it]
53%|████████████████████████████████████▊ | 63/118 [2:13:46<1:54:15, 124.64s/it]
54%|█████████████████████████████████████▍ | 64/118 [2:15:54<1:53:02, 125.60s/it]
55%|██████████████████████████████████████ | 65/118 [2:18:01<1:51:26, 126.16s/it]
56%|██████████████████████████████████████▌ | 66/118 [2:20:08<1:49:38, 126.50s/it]
57%|███████████████████████████████████████▏ | 67/118 [2:22:16<1:47:50, 126.88s/it]
58%|███████████████████████████████████████▊ | 68/118 [2:24:24<1:46:03, 127.26s/it]
58%|████████████████████████▌ | 69/118 [2:26:32<1:44:03, 127.42s/it]
59%|████████████████████████▉ | 70/118 [2:28:40<1:41:58, 127.47s/it]
{'loss': '1.222', 'grad_norm': '0.157', 'learning_rate': '7.93e-06', 'ppl': '3.394', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '178.7', 'tokens/total': 27414528, 'tokens/trainable': 12851399, 'epoch': '1.167'}
|
|||
|
|
59%|████████████████████████▉ | 70/118 [2:28:40<1:41:58, 127.47s/it]
60%|█████████████████████████▎ | 71/118 [2:30:48<1:40:00, 127.67s/it]
61%|█████████████████████████▋ | 72/118 [2:32:56<1:37:53, 127.68s/it]
62%|█████████████████████████▉ | 73/118 [2:35:04<1:35:50, 127.80s/it]
63%|██████████████████████████▎ | 74/118 [2:37:11<1:33:43, 127.81s/it]
64%|██████████████████████████▋ | 75/118 [2:39:20<1:31:41, 127.94s/it]
64%|███████████████████████████ | 76/118 [2:41:27<1:29:29, 127.85s/it]
65%|███████████████████████████▍ | 77/118 [2:43:35<1:27:21, 127.85s/it]
66%|███████████████████████████▊ | 78/118 [2:45:43<1:25:19, 127.98s/it]
67%|████████████████████████████ | 79/118 [2:47:51<1:23:09, 127.95s/it]
68%|████████████████████████████▍ | 80/118 [2:49:59<1:20:57, 127.82s/it]
{'loss': '1.215', 'grad_norm': '0.1533', 'learning_rate': '5.324e-06', 'ppl': '3.37', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '180', 'tokens/total': 31346688, 'tokens/trainable': 14712337, 'epoch': '1.335'}
|
|||
|
|
68%|████████████████████████████▍ | 80/118 [2:49:59<1:20:57, 127.82s/it][2026-05-14 17:07:01,536] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-80
|
|||
|
|
69%|████████████████████████████▊ | 81/118 [2:52:10<1:19:22, 128.72s/it]
69%|█████████████████████████████▏ | 82/118 [2:54:17<1:16:59, 128.33s/it]
70%|█████████████████████████████▌ | 83/118 [2:56:25<1:14:42, 128.08s/it]
71%|█████████████████████████████▉ | 84/118 [2:58:33<1:12:35, 128.11s/it]
72%|██████████████████████████████▎ | 85/118 [3:00:41<1:10:24, 128.01s/it]
73%|██████████████████████████████▌ | 86/118 [3:02:48<1:08:13, 127.92s/it]
74%|██████████████████████████████▉ | 87/118 [3:04:56<1:06:04, 127.88s/it]
75%|███████████████████████████████▎ | 88/118 [3:07:04<1:03:56, 127.90s/it]
75%|███████████████████████████████▋ | 89/118 [3:09:12<1:01:47, 127.84s/it]
76%|█████████████████████████████████▌ | 90/118 [3:11:19<59:38, 127.82s/it]
{'loss': '1.202', 'grad_norm': '0.1499', 'learning_rate': '3.078e-06', 'ppl': '3.326', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '176', 'tokens/total': 35278848, 'tokens/trainable': 16552748, 'epoch': '1.502'}
|
|||
|
|
76%|█████████████████████████████████▌ | 90/118 [3:11:19<59:38, 127.82s/it]
77%|█████████████████████████████████▉ | 91/118 [3:13:27<57:28, 127.72s/it]
78%|██████████████████████████████████▎ | 92/118 [3:15:34<55:18, 127.64s/it]
79%|██████████████████████████████████▋ | 93/118 [3:17:42<53:12, 127.71s/it]
80%|███████████████████████████████████ | 94/118 [3:19:50<51:07, 127.82s/it]
81%|███████████████████████████████████▍ | 95/118 [3:21:58<48:59, 127.82s/it]
81%|███████████████████████████████████▊ | 96/118 [3:24:06<46:51, 127.78s/it]
82%|████████████████████████████████████▏ | 97/118 [3:26:14<44:42, 127.76s/it]
83%|████████████████████████████████████▌ | 98/118 [3:28:21<42:35, 127.76s/it]
84%|████████████████████████████████████▉ | 99/118 [3:30:29<40:28, 127.84s/it]
85%|████████████████████████████████████▍ | 100/118 [3:32:37<38:21, 127.83s/it]
{'loss': '1.195', 'grad_norm': '0.1503', 'learning_rate': '1.363e-06', 'ppl': '3.304', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '177.9', 'tokens/total': 39211008, 'tokens/trainable': 18391700, 'epoch': '1.669'}
|
|||
|
|
85%|████████████████████████████████████▍ | 100/118 [3:32:37<38:21, 127.83s/it][2026-05-14 17:49:39,854] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-100
|
|||
|
|
86%|████████████████████████████████████▊ | 101/118 [3:34:48<36:26, 128.62s/it]
86%|█████████████████████████████████████▏ | 102/118 [3:36:55<34:11, 128.20s/it]
87%|█████████████████████████████████████▌ | 103/118 [3:39:02<31:59, 127.96s/it]
88%|█████████████████████████████████████▉ | 104/118 [3:41:10<29:52, 128.02s/it]
89%|██████████████████████████████████████▎ | 105/118 [3:43:18<27:44, 128.01s/it]
90%|██████████████████████████████████████▋ | 106/118 [3:45:26<25:33, 127.82s/it]
91%|██████████████████████████████████████▉ | 107/118 [3:47:33<23:24, 127.71s/it]
92%|███████████████████████████████████████▎ | 108/118 [3:49:41<21:17, 127.74s/it]
92%|███████████████████████████████████████▋ | 109/118 [3:51:49<19:09, 127.72s/it]
93%|████████████████████████████████████████ | 110/118 [3:53:57<17:02, 127.77s/it]
{'loss': '1.191', 'grad_norm': '0.1469', 'learning_rate': '3.114e-07', 'ppl': '3.292', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'tokens/train_per_sec_per_gpu': '179.6', 'tokens/total': 43143168, 'tokens/trainable': 20246224, 'epoch': '1.837'}
|
|||
|
|
93%|████████████████████████████████████████ | 110/118 [3:53:57<17:02, 127.77s/it]
94%|████████████████████████████████████████▍ | 111/118 [3:56:05<14:55, 127.97s/it]
95%|████████████████████████████████████████▊ | 112/118 [3:58:13<12:47, 127.92s/it]
96%|█████████████████████████████████████████▏ | 113/118 [4:00:20<10:39, 127.82s/it]
97%|█████████████████████████████████████████▌ | 114/118 [4:02:28<08:31, 127.77s/it]
97%|█████████████████████████████████████████▉ | 115/118 [4:04:36<06:23, 127.90s/it]
98%|██████████████████████████████████████████▎| 116/118 [4:06:44<04:15, 127.91s/it]
99%|██████████████████████████████████████████▋| 117/118 [4:08:52<02:07, 127.75s/it]
100%|███████████████████████████████████████████| 118/118 [4:10:59<00:00, 127.59s/it][2026-05-14 18:28:01,475] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2/checkpoint-118
|
|||
|
|
{'train_runtime': '1.506e+04', 'train_samples_per_second': '0.501', 'train_steps_per_second': '0.008', 'train_loss': '1.281', 'memory/max_active (GiB)': '32.17', 'memory/max_allocated (GiB)': '32.17', 'memory/device_reserved (GiB)': '33.31', 'epoch': '1.971', 'tokens/train_per_sec_per_gpu': '189.9'}
|
|||
|
|
100%|███████████████████████████████████████████| 118/118 [4:11:01<00:00, 127.59s/it]
100%|███████████████████████████████████████████| 118/118 [4:11:01<00:00, 127.64s/it]
|
|||
|
|
[2026-05-14 18:28:16,499] [INFO] [axolotl.train.save_trained_model:267] [PID:27401] Training completed! Saving trained model to ./outputs/finance-synthetic-sft-phase2.
|
|||
|
|
[2026-05-14 18:28:17,359] [INFO] [axolotl.train.save_trained_model:388] [PID:27401] Model successfully saved to ./outputs/finance-synthetic-sft-phase2
|
|||
|
|
[2026-05-14 18:28:17,446] [INFO] [axolotl.core.trainers.base._save:818] [PID:27401] Saving model checkpoint to ./outputs/finance-synthetic-sft-phase2
|
|||
|
|
Processing Files (0 / 0) : | | 0.00B / 0.00B
|
|||
|
|
New Data Upload : | | 0.00B / 0.00B [A
|
|||
|
|
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB [A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB [A[A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
...adapter_model.safetensors: 55%|██████████▍ | 192MB / 349MB [A[A[A[A
|
|||
|
|
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB [A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB [A[A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
...adapter_model.safetensors: 55%|██████████▍ | 192MB / 349MB [A[A[A[A
Processing Files (2 / 3) : 56%|██████████▋ | 203MB / 361MB, ???B/s
|
|||
|
|
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB [A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB [A[A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB [A[A[A[A
Processing Files (3 / 3) : 100%|███████████████████| 361MB / 361MB, 786MB/s
|
|||
|
|
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB [A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB [A[A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB [A[A[A[A
|
|||
|
|
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB [A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB [A[A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB [A[A[A[A
|
|||
|
|
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB [A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB [A[A[A
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB [A[A[A[A
Processing Files (3 / 3) : 100%|███████████████████| 361MB / 361MB, 262MB/s
|
|||
|
|
New Data Upload : | | 0.00B / 0.00B, 0.00B/s
|
|||
|
|
...-phase2/training_args.bin: 100%|███████████████████| 11.3kB / 11.3kB
|
|||
|
|
...sft-phase2/tokenizer.json: 100%|███████████████████| 11.4MB / 11.4MB
|
|||
|
|
...adapter_model.safetensors: 100%|███████████████████| 349MB / 349MB
|