commit ad612efc77dcbf35b0ed543befeeb716a3ece063 Author: ModelHub XC Date: Tue Jun 16 06:04:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: laion/Sera-4.5A-Full-T1-v3-316-axolotl__Qwen3-8B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..676191f --- /dev/null +++ b/README.md @@ -0,0 +1,133 @@ +--- +library_name: transformers +base_model: Qwen/Qwen3-8B +tags: +- generated_from_trainer +datasets: +- laion/Sera-4.5A-Full-T1-v3-316 +model-index: +- name: e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.16.0.dev0` +```yaml +# Mirror of SERA's axolotl_qwen3_8b.yaml, parameterized for Jupiter SFT on v3 data. +# Filled at render time via sed-substitution of 316. + +base_model: Qwen/Qwen3-8B +deepspeed: /e/scratch/jureap59/feuer1/code/axolotl/deepspeed_configs/zero3_bf16.json + +load_in_8bit: false +load_in_4bit: false + +# plugins disabled 2026-04-22: CCE + bf16 + flash-attn on aarch64/torch2.9 caused +# gradient explosion (grad_norm 9.8e+11) and loss -> 0 within first 3-7 steps. +# plugins: +# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +chat_template: chatml +datasets: + - path: laion/Sera-4.5A-Full-T1-v3-316 + data_files: + - sera-4.5a-full-t1_v3_316.jsonl + type: chat_template + field_messages: messages + ds_type: json + message_field_training: train + +dataset_prepared_path: /e/data1/datasets/playground/ot-baf/axolotl_dataset_cache/sera-v3-316 +output_dir: /e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B + +sequence_len: 32768 + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: sera-v3-316-axolotl__Qwen3-8B +wandb_log_model: + +gradient_accumulation_steps: 8 +micro_batch_size: 1 +num_epochs: 3 +optimizer: adamw_torch +lr_scheduler: cosine +learning_rate: 1e-5 +adam_beta1: 0.9 +adam_beta2: 0.95 + +bf16: auto +tf32: false + +gradient_checkpointing: true +activation_offloading: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +loss_watchdog_threshold: 5.0 +loss_watchdog_patience: 3 + +warmup_ratio: 0.1875 +evals_per_epoch: 0 +save_strategy: epoch + +weight_decay: 0.01 +max_grad_norm: 1.0 +special_tokens: + +``` + +

+ +# e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B + +This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the laion/Sera-4.5A-Full-T1-v3-316 dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 1 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 32 +- total_eval_batch_size: 4 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.95) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 2 +- training_steps: 10 + +### Training results + + + +### Framework versions + +- Transformers 5.5.0 +- Pytorch 2.9.1+cu130 +- Datasets 4.5.0 +- Tokenizers 0.22.2 diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..2116e45 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,4 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} diff --git a/config.json b/config.json new file mode 100644 index 0000000..f9fd364 --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.5.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/debug.log b/debug.log new file mode 100644 index 0000000..135eee3 --- /dev/null +++ b/debug.log @@ -0,0 +1,2698 @@ +[2026-04-22 17:08:14,463] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:738399] bf16 support detected, enabling for this configuration. +[2026-04-22 17:08:14,469] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:738399] baseline 0.000GB () +[2026-04-22 17:08:14,469] [INFO] [axolotl.cli.config.load_cfg:341] [PID:738399] config: +{ + "activation_offloading": true, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "axolotl_config_path": "/e/scratch/jureap59/feuer1/code/axolotl_configs/qwen3_8b_sera_v3_316.yaml", + "base_model": "Qwen/Qwen3-8B", + "base_model_config": "Qwen/Qwen3-8B", + "batch_size": 32, + "bf16": true, + "capabilities": { + "bf16": true, + "compute_capability": "sm_90", + "fp8": true, + "n_gpu": 4, + "n_node": 1, + "tf32": true + }, + "chat_template": "chatml", + "context_parallel_size": 1, + "dataloader_num_workers": 4, + "dataloader_pin_memory": true, + "dataloader_prefetch_factor": 256, + "dataset_num_proc": 288, + "dataset_prepared_path": "/e/data1/datasets/playground/ot-baf/axolotl_dataset_cache/sera-v3-316", + "datasets": [ + { + "chat_template": "tokenizer_default", + "data_files": [ + "sera-4.5a-full-t1_v3_316.jsonl" + ], + "ds_type": "json", + "field_messages": "messages", + "message_field_training": "train", + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "laion/Sera-4.5A-Full-T1-v3-316", + "trust_remote_code": false, + "type": "chat_template" + } + ], + "ddp": true, + "deepspeed": { + "bf16": { + "enabled": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false, + "zero_optimization": { + "contiguous_gradients": true, + "gather_16bit_weights_on_model_save": true, + "max_live_parameters": 0, + "max_reuse_distance": 0, + "overlap_comm": true, + "reduce_bucket_size": "auto", + "stage": 3, + "stage3_param_persistence_threshold": "auto", + "stage3_prefetch_bucket_size": "auto", + "sub_group_size": 0 + } + }, + "device": "cuda:0", + "device_map": { + "": 0 + }, + "dion_rank_fraction": 1.0, + "dion_rank_multiple_of": 1, + "eaft_alpha": 1.0, + "eaft_k": 20, + "env_capabilities": { + "torch_version": "2.9.1" + }, + "eval_batch_size": 1, + "eval_causal_lm_metrics": [ + "sacrebleu", + "comet", + "ter", + "chrf" + ], + "eval_max_new_tokens": 128, + "eval_table_size": 0, + "evals_per_epoch": 0, + "experimental_skip_move_to_device": true, + "flash_attention": true, + "fp16": false, + "generate_samples": false, + "generation_do_sample": true, + "generation_max_new_tokens": 50, + "generation_prompt_ratio": 0.5, + "generation_temperature": 0.7, + "gradient_accumulation_steps": 8, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": { + "use_reentrant": true + }, + "include_tkps": true, + "layer_offloading": false, + "learning_rate": 1e-05, + "lisa_layers_attribute": "model.layers", + "load_best_model_at_end": false, + "load_in_4bit": false, + "load_in_8bit": false, + "local_rank": 0, + "logging_steps": 1, + "lora_dropout": 0.0, + "loraplus_lr_embedding": 1e-06, + "loss_watchdog_patience": 3, + "loss_watchdog_threshold": 5.0, + "lr_scheduler": "cosine", + "max_grad_norm": 1.0, + "mean_resizing_embeddings": false, + "merge_method": "memory_efficient", + "micro_batch_size": 1, + "model_config_type": "qwen3", + "num_epochs": 3.0, + "num_generation_samples": 3, + "optimizer": "adamw_torch", + "otel_metrics_host": "localhost", + "otel_metrics_port": 8000, + "output_dir": "/e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B", + "pretrain_multipack_attn": true, + "profiler_steps_start": 0, + "qlora_sharded_model_loading": false, + "quantize_moe_experts": false, + "ray_num_workers": 1, + "resources_per_worker": { + "GPU": 1 + }, + "sample_packing_bin_size": 200, + "sample_packing_group_size": 100000, + "save_only_model": false, + "save_safetensors": true, + "save_strategy": "epoch", + "sequence_len": 32768, + "shuffle_before_merging_datasets": false, + "shuffle_merged_datasets": true, + "skip_prepare_dataset": false, + "streaming_multipack_buffer_size": 10000, + "strict": false, + "tensor_parallel_size": 1, + "tf32": false, + "tiled_mlp_use_original_mlp": true, + "tokenizer_config": "Qwen/Qwen3-8B", + "tokenizer_save_jinja_files": true, + "torch_dtype": "torch.bfloat16", + "train_on_inputs": false, + "trl": { + "async_prefetch": false, + "log_completions": false, + "mask_truncated_completions": false, + "ref_model_mixup_alpha": 0.9, + "ref_model_sync_steps": 64, + "replay_buffer_size": 0, + "replay_recompute_logps": true, + "reroll_max_groups": 1, + "reroll_start_fraction": 1.0, + "reward_num_workers": 1, + "scale_rewards": true, + "skip_zero_advantage_batches": true, + "sync_ref_model": false, + "use_data_producer": false, + "use_vllm": false, + "vllm_lora_sync": false, + "vllm_server_host": "0.0.0.0", + "vllm_server_port": 8000 + }, + "use_otel_metrics": false, + "use_ray": false, + "val_set_size": 0.0, + "vllm": { + "device": "auto", + "dtype": "auto", + "gpu_memory_utilization": 0.9, + "host": "0.0.0.0", + "port": 8000 + }, + "wandb_name": "sera-v3-316-axolotl__Qwen3-8B", + "warmup_ratio": 0.1875, + "weight_decay": 0.01, + "world_size": 4 +} +[2026-04-22 17:08:14,478] [INFO] [axolotl.cli.checks.check_user_token:37] [PID:738399] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. +[2026-04-22 17:08:14,964] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:738399] EOS: 151645 / <|im_end|> +[2026-04-22 17:08:14,964] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:738399] BOS: None / None +[2026-04-22 17:08:14,964] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:738399] PAD: 151643 / <|endoftext|> +[2026-04-22 17:08:14,964] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:738399] UNK: None / None +[2026-04-22 17:08:15,014] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:738399] Loading prepared dataset from disk at /e/data1/datasets/playground/ot-baf/axolotl_dataset_cache/sera-v3-316/7d33c987dbb7eda46f5b7e5dea4f0fe6... +[2026-04-22 17:08:15,099] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:738399] total_num_tokens: 2_624_301 +[2026-04-22 17:08:15,163] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:738399] `total_supervised_tokens: 494_705` +[2026-04-22 17:08:15,164] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:738399] total_num_steps: 10 +[2026-04-22 17:08:15,164] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:738399] Maximum number of steps set at 10 +[2026-04-22 17:08:15,201] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:738399] loading tokenizer... Qwen/Qwen3-8B +[2026-04-22 17:08:15,461] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:307] [PID:738399] EOS: 151645 / <|im_end|> +[2026-04-22 17:08:15,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:308] [PID:738399] BOS: None / None +[2026-04-22 17:08:15,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:309] [PID:738399] PAD: 151643 / <|endoftext|> +[2026-04-22 17:08:15,462] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:310] [PID:738399] UNK: None / None +[2026-04-22 17:08:15,462] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:738399] Loading model +[2026-04-22 17:08:15,474] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:738399] Patched Trainer.evaluation_loop with nanmean loss calculation +[2026-04-22 17:08:15,475] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:738399] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +[2026-04-22 17:08:15,638] [INFO] [axolotl.monkeypatch.attention.flash_attn_4.patch_flash_attn_4:52] [PID:738399] Flash Attention 4 is available for your GPU and offers faster training speeds. To enable: pip install flash-attn-4 +[2026-04-22 17:08:15,639] [INFO] [axolotl.monkeypatch.deepspeed_utils.patch_checkpoint_wrapper_setattr:52] [PID:738399] CheckpointWrapper patched to forward DeepSpeed attributes +[2026-04-22 17:08:48,719] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:738399] Converting modules to torch.bfloat16 +[2026-04-22 17:08:48,722] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:738399] Memory usage after model load 4.973GB (+4.973GB allocated, +5.080GB reserved) +[2026-04-22 17:08:51,616] [INFO] [axolotl.train.save_initial_configs:421] [PID:738399] Pre-saving tokenizer to /e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B... +[2026-04-22 17:08:51,718] [INFO] [axolotl.train.save_initial_configs:426] [PID:738399] Pre-saving model config to /e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B... +[2026-04-22 17:08:51,723] [INFO] [axolotl.train.execute_training:222] [PID:738399] Starting trainer... +[2026-04-22 17:08:52,156] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,156] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,156] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,156] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,157] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,160] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,161] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,161] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,161] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,161] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_external_parameters to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,164] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,166] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,167] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,168] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,170] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,171] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,172] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,173] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,175] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,176] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,177] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,178] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,179] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,181] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,182] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,183] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,184] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,185] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,186] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,188] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,189] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,190] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,191] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,192] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,193] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,195] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,196] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,197] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,198] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,199] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,200] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,201] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,203] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,204] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,205] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer +[2026-04-22 17:08:52,207] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_id to wrapped module Qwen3DecoderLayer + 0%| | 0/10 [00:00 self.virtual_memory_safe_pct=60% of virtual memory used +[2026-04-22 17:09:09,584] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,641] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,694] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,753] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,812] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,871] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,930] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:09,988] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,047] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,105] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,163] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,222] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,281] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,339] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,397] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,456] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,515] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,573] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,633] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,692] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,752] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,811] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,870] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,928] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:10,987] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,046] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,104] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,162] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,221] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,280] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,338] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,397] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,456] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,522] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,576] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:11,633] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,757] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,777] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,814] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,854] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,895] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,935] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:20,977] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,019] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,061] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,102] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,143] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,185] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,228] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,271] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,313] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,355] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,397] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,444] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,515] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,563] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,615] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,660] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,712] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,765] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,818] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,871] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,925] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:21,978] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,030] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,083] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,136] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,189] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,242] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,295] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,347] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:22,400] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:30,826] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:30,843] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:30,895] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:30,949] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,005] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,060] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,117] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,173] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,229] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,284] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,340] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,397] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,454] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,512] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,570] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,629] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,686] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,745] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,803] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,862] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,920] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:31,979] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,038] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,097] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,156] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,215] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,275] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,335] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,395] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,454] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,514] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,574] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,634] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,694] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,753] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:32,810] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:40,829] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:40,846] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:40,909] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:40,977] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,042] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,110] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,177] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,242] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,311] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,378] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,446] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,513] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,580] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,647] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,714] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,781] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,847] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,915] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:41,981] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,047] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,115] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,181] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,248] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,315] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,382] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,448] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,515] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,582] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,649] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,716] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,783] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,849] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,917] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:42,983] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:43,050] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:43,117] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:51,899] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:51,915] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:51,971] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,030] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,090] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,151] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,212] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,270] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,331] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,391] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,452] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,512] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,572] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,631] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,691] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,750] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,809] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,868] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,928] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:52,988] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,047] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,107] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,167] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,225] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,285] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,344] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,403] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,462] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,521] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,580] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,639] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,697] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,756] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,815] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,875] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:09:53,932] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:01,697] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:01,714] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:01,773] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:01,834] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:01,896] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:01,956] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,017] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,077] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,139] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,201] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,262] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,325] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,388] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,451] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,513] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,575] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,636] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,698] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,759] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,820] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,881] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:02,943] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,003] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,065] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,126] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,188] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,249] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,311] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,372] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,433] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,495] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,557] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,618] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,679] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,740] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:03,800] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:11,802] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:11,818] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:11,871] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:11,926] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:11,981] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,037] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,093] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,149] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,205] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,261] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,316] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,371] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,427] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,484] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,539] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,595] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,650] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,705] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,762] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,817] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,873] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,929] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:12,985] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,041] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,099] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,155] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,211] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,268] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,325] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,382] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,440] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,497] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:13,554] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:14,061] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:14,114] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:14,169] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer + 10%|█ | 1/10 [01:31<13:45, 91.72s/it] {'loss': '1.108', 'grad_norm': '19.17', 'learning_rate': '0', 'ppl': '3.028', 'memory/max_active (GiB)': '78.4', 'memory/max_allocated (GiB)': '77.05', 'memory/device_reserved (GiB)': '90.88', 'tokens/train_per_sec_per_gpu': '55.82', 'tokens/total': 847936, 'tokens/trainable': 170242, 'epoch': '0.32'} + 10%|█ | 1/10 [01:31<13:45, 91.72s/it][2026-04-22 17:10:25,045] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,081] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,142] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,204] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,266] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,329] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,392] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,453] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,516] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,578] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,641] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,705] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,768] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,831] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,894] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:25,958] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,021] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,083] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,147] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,210] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,274] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,337] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,400] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,463] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,527] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,590] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,654] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,717] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,780] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,843] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,907] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:26,971] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:27,034] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:27,098] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:27,161] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:27,225] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:35,730] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:35,752] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:35,810] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:35,873] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:35,938] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,003] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,066] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,131] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,197] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,262] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,326] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,392] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,456] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,520] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,584] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,648] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,714] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,778] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,843] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,909] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:36,973] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,036] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,099] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,162] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,226] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,290] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,354] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,417] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,480] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,544] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,608] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,673] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,737] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,814] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,865] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:37,926] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,281] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,300] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,351] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,405] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,461] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,515] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,569] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,626] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,683] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,742] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,799] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,858] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,914] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:48,971] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,024] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,081] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,138] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,195] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,253] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,311] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,369] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,423] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,478] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,533] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,590] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,650] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,708] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,772] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,837] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,901] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:49,965] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:50,031] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:50,095] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:50,161] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:50,224] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:50,289] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:59,795] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:59,814] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:59,867] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:59,923] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:10:59,980] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,039] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,096] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,153] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,209] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,268] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,327] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,389] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,450] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,512] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,572] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,632] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,693] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,754] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,814] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,870] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,928] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:00,986] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,046] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,106] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,167] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,228] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,290] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,352] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,413] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,473] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,534] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,591] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,648] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,706] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,765] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:01,825] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,525] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,542] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,603] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,669] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,736] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,799] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,863] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,930] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:12,993] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,060] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,125] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,190] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,258] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,323] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,387] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,453] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,518] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,582] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,647] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,712] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,777] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,841] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,906] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:13,971] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,036] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,100] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,165] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,231] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,296] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,360] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,425] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,491] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,556] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,621] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,687] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:14,751] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,655] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,673] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,722] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,777] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,831] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,885] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,940] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:24,993] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,048] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,104] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,213] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,269] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,325] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,379] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,435] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,491] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,547] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,603] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,659] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,715] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,771] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,828] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,883] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,939] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:25,995] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,052] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,110] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,168] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,230] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,292] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,354] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,416] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,481] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,539] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:26,601] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,249] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,267] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,323] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,384] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,446] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,507] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,568] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,629] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,690] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,750] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,811] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,874] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:37,936] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,000] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,063] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,127] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,192] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,257] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,321] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,385] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,451] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,516] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,581] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,644] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,710] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,775] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,839] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,904] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:38,969] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,035] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,099] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,163] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,229] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,295] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,359] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:39,424] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,636] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,654] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,714] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,779] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,843] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,908] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:49,971] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,035] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,099] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,164] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,229] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,296] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,361] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,426] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,491] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,556] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,620] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,684] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,749] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,813] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,877] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:50,940] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,004] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,068] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,133] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,197] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,260] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,324] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,389] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,453] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,517] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,581] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,645] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,710] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,775] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:11:51,837] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,475] [WARNING] [stage3.py:2497:step] 15 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time + 20%|██ | 2/10 [03:09<12:40, 95.10s/it] {'loss': '1.13', 'grad_norm': '19.85', 'learning_rate': '5e-06', 'ppl': '3.095', 'memory/max_active (GiB)': '91', 'memory/max_allocated (GiB)': '91', 'memory/device_reserved (GiB)': '92.61', 'tokens/train_per_sec_per_gpu': '57.46', 'tokens/total': 1742656, 'tokens/trainable': 329267, 'epoch': '0.64'} + 20%|██ | 2/10 [03:09<12:40, 95.10s/it][2026-04-22 17:12:02,502] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,538] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,598] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,662] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,726] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,790] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,853] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,916] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:02,979] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:03,043] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:03,106] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:03,169] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:03,233] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:03,297] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:03,770] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,189] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,249] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,312] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,375] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,438] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,499] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,561] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:04,624] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,032] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,096] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,159] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,221] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,284] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,348] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,411] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,474] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,538] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,601] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,665] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,731] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:05,795] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,002] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,020] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,050] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,083] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,119] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,154] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,189] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,224] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,259] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,313] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,376] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,439] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,504] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,568] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,633] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,698] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,761] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,825] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,890] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:15,954] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,018] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,083] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,146] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,211] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,275] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,339] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,403] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,468] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,532] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,596] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,662] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,727] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,791] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,867] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,919] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:16,984] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,104] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,121] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,162] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,205] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,249] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,293] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,337] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,381] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,426] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,471] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,514] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,557] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,602] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,646] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,694] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,738] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,783] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,827] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,872] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,915] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:26,959] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,005] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,050] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,094] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,138] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,185] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,232] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,276] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,320] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,365] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,411] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,455] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,499] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,544] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,591] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:27,634] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,770] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,798] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,817] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,848] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,879] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,909] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,940] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:33,970] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,009] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,066] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,129] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,183] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,244] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,302] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,363] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,424] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,484] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,544] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,603] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,663] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,723] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,784] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,844] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,903] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:34,963] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,024] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,084] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,145] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,203] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,264] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,325] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,384] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,445] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,505] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,565] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:35,626] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,158] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,175] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,223] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,276] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,328] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,381] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,434] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,488] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,541] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,594] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,646] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,700] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,755] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,807] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,861] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,915] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:44,969] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,023] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,077] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,131] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,186] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,240] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,294] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,348] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,401] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,456] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,509] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,563] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,617] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,671] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,726] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,780] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,834] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,888] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,943] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:45,995] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,759] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,776] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,812] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,848] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,886] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,925] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:53,962] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,000] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,037] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,090] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,154] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,212] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,273] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,332] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,397] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,462] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,525] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,590] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,658] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,720] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,785] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,853] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,916] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:54,994] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,102] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,174] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,237] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,272] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,336] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,414] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,505] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,571] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,612] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,670] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,731] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:12:55,797] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,471] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,488] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,524] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,563] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,603] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,642] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,683] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,725] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,764] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,803] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,849] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,923] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:06,978] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,042] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,107] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,184] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,236] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,302] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,365] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,429] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,493] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,556] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,621] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,684] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,747] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,811] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,875] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:07,937] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,002] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,066] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,131] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,195] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,259] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,324] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,387] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:08,451] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:17,870] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:17,887] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:17,928] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:17,972] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,015] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,060] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,106] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,153] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,199] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,244] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,289] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,332] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,378] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,424] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,474] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,545] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,603] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,654] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,710] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,770] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,829] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,888] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:18,947] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,007] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,066] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,126] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,187] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,246] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,308] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,367] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,426] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,486] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,546] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,608] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,665] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:19,725] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:28,886] [WARNING] [stage3.py:2497:step] 4 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time + 30%|███ | 3/10 [04:35<10:37, 91.13s/it] {'loss': '1.059', 'grad_norm': '12.71', 'learning_rate': '1e-05', 'ppl': '2.883', 'memory/max_active (GiB)': '85.71', 'memory/max_allocated (GiB)': '85.71', 'memory/device_reserved (GiB)': '92.9', 'tokens/train_per_sec_per_gpu': '70.55', 'tokens/total': 2548864, 'tokens/trainable': 479668, 'epoch': '0.96'} + 30%|███ | 3/10 [04:35<10:37, 91.13s/it][2026-04-22 17:13:28,932] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:28,966] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,028] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,091] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,154] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,216] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,278] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,342] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,406] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,470] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,533] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,597] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,661] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,724] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,788] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,851] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,915] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:29,978] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,043] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,107] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,171] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,234] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,298] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,361] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,424] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,488] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,551] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,614] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,678] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,742] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:30,807] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:31,250] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:31,313] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:31,375] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:31,799] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:31,859] [DEBUG] [axolotl.monkeypatch.deepspeed_utils.new_setattr:43] [PID:738399] Forwarded ds_grads_remaining to wrapped module Qwen3DecoderLayer +[2026-04-22 17:13:42,086] [WARNING] [stage3.py:2497:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time + 40%|████ | 4/10 [04:48<06:02, 60.36s/it] {'loss': '0.8501', 'grad_norm': '5.362', 'learning_rate': '9.619e-06', 'ppl': '2.34', 'memory/max_active (GiB)': '84.29', 'memory/max_allocated (GiB)': '84.29', 'memory/device_reserved (GiB)': '92.9', 'tokens/train_per_sec_per_gpu': '381', 'tokens/total': 2655680, 'tokens/trainable': 499753, 'epoch': '1'} + 40%|████ | 4/10 [04:48<06:02, 60.36s/it][2026-04-22 17:13:44,246] [INFO] [axolotl.core.trainers.base._save:741] [PID:738399] Saving model checkpoint to /e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B/checkpoint-4 + + Writing model shards: 0%| | 0/1 [00:00", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..996867e --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d35c4f09479b0b6c555c83f0f79bfd301bbe53325dc6da828d6a32a18a86a5 +size 8849