The following values were not passed to `accelerate launch` and had defaults used instead: `--num_processes` was set to a value of `2` More than one GPU was found, enabling multi-GPU training. If this was unintended please pass in `--num_processes=1`. `--num_machines` was set to a value of `1` `--mixed_precision` was set to a value of `'no'` `--dynamo_backend` was set to a value of `'no'` To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. [2026-04-17 02:08:45,271] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128). [2026-04-17 02:08:45,439] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128). [2026-04-17 02:08:47,222] [WARNING] [axolotl.utils.schemas.validation] sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination. [2026-04-17 02:08:47,223] [INFO] [axolotl.utils.schemas.validation] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing [2026-04-17 02:08:47,223] [WARNING] [axolotl.utils.schemas.validation] Configuring FSDP fields with the `fsdp_` prefix is deprecated. Please omit the `fsdp_` prefix from the any fields in `fsdp_config`. [2026-04-17 02:08:47,467] [INFO] [axolotl.cli.config] config: { "activation_offloading": false, "axolotl_config_path": "/workspace/data/sage-classifier-train-scripts/qwen3/fft/qwen3-4B-train-v1-6-no-liger-flex-magnifi-module-classifier-04-17-relabelled-upsampled.yml", "base_model": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors", "base_model_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors", "batch_size": 2, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 2, "n_node": 1, "tf32": true }, "chat_template": "qwen3", "context_parallel_size": 1, "dataloader_num_workers": 2, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 128, "dataset_prepared_path": "/workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled", "datasets": [ { "chat_template": "tokenizer_default", "field_messages": "messages", "message_property_mappings": { "content": "content", "role": "role" }, "path": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled", "split": "train", "trust_remote_code": false, "type": "chat_template" } ], "ddp": true, "device": "cuda:0", "device_map": { "": 0 }, "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.10.0" }, "eval_batch_size": 1, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_steps": 0.25, "eval_table_size": 0, "evals_per_epoch": 2, "experimental_skip_move_to_device": true, "fp16": false, "fsdp": [ "full_shard", "auto_wrap" ], "fsdp_config": { "activation_checkpointing": true, "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "cpu_ram_efficient_loading": true, "fsdp_version": 2, "offload_params": false, "reshard_after_forward": true, "state_dict_type": "FULL_STATE_DICT", "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer" }, "fsdp_version": 2, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 1, "gradient_checkpointing": false, "hub_model_id": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled", "include_tkps": true, "layer_offloading": false, "learning_rate": 2e-05, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, "load_in_8bit": false, "local_rank": 0, "logging_steps": 1, "lora_dropout": 0.0, "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 1, "model_config_type": "qwen3", "num_epochs": 2.0, "num_generation_samples": 3, "optimizer": "adamw_torch_fused", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "/workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/", "pad_to_sequence_len": true, "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.5, "saves_per_epoch": 1, "sequence_len": 16000, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "use_wandb": true, "val_set_size": 0.1, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "wandb_name": "magnifi-module-classifier-04-17-relabelled-upsampled", "wandb_project": "sage-classifier", "warmup_ratio": 0.1, "weight_decay": 0.0, "world_size": 2 } [2026-04-17 02:08:51,607] [INFO] [axolotl.utils.data.shared] Loading prepared dataset from disk at /workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled/6241b9d0f4bdccc4ed4f52e5adefd1bc... [Gloo] Rank [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 [2026-04-17 02:08:57,019] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54] [2026-04-17 02:08:57,129] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.9649779796600342, 0.9649779796600342] [2026-04-17 02:09:01,870] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [478, 478] [2026-04-17 02:09:01,872] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.969444751739502, 0.9775572419166565] [2026-04-17 02:09:01,874] [INFO] [axolotl.utils.data.sft] Maximum number of steps set at 478 [2026-04-17 02:09:03,028] [INFO] [axolotl.loaders.patch_manager] Applying multipack dataloader patch for sample packing... Fetching 2 files: 0%| | 0/2 [00:00