commit 5770ee00344c7996f9b50d9bbea0a4561734f679 Author: ModelHub XC Date: Sat May 23 01:43:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: DaDing777/qwen2.5-VL-3B-atm-finetune-cot-full-old Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b63a787 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9e1b333 --- /dev/null +++ b/README.md @@ -0,0 +1,65 @@ +--- +base_model: /home/xuke/dch/pretrained_models/Qwen2.5-VL-3B-Instruct +frameworks: +- "" +library_name: transformers +license: other +model-index: +- name: qwen2.5-VL-3B-finetune-cot-full + results: [] +tags: +- llama-factory +- full +- generated_from_trainer +tasks: [] +--- + + + +# qwen2.5-VL-3B-finetune-cot-full + +This model is a fine-tuned version of [/home/xuke/dch/pretrained_models/Qwen2.5-VL-3B-Instruct](https://huggingface.co//home/xuke/dch/pretrained_models/Qwen2.5-VL-3B-Instruct) on the atm_finetune_cot dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 2 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 64 +- total_eval_batch_size: 64 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 1.0 + +### Training results + + + +### Framework versions + +- Transformers 4.52.4 +- Pytorch 2.5.1+cu121 +- Datasets 3.6.0 +- Tokenizers 0.21.1 + diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..482ced4 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,24 @@ +{ + "": 151658, + "": 151657, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..f57fc3f --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 3.149345964351816e+17, + "train_loss": 0.4079481167808606, + "train_runtime": 5827.269, + "train_samples_per_second": 3.415, + "train_steps_per_second": 0.053 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..6c22663 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endif %}<|im_start|>{{ message['role'] }} +{% if message['content'] is string %}{{ message['content'] }}<|im_end|> +{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|> +{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant +{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..04c112a --- /dev/null +++ b/config.json @@ -0,0 +1,105 @@ +{ + "architectures": [ + "Qwen2_5_VLForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 2048, + "image_token_id": 151655, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 128000, + "max_window_layers": 70, + "model_type": "qwen2_5_vl", + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "text_config": { + "architectures": [ + "Qwen2_5_VLForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 2048, + "image_token_id": null, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 128000, + "max_window_layers": 70, + "model_type": "qwen2_5_vl_text", + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "use_cache": false, + "use_sliding_window": false, + "video_token_id": null, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 + }, + "torch_dtype": "float32", + "transformers_version": "4.52.4", + "use_cache": false, + "use_sliding_window": false, + "video_token_id": 151656, + "vision_config": { + "depth": 32, + "fullatt_block_indexes": [ + 7, + 15, + 23, + 31 + ], + "hidden_act": "silu", + "hidden_size": 1280, + "in_channels": 3, + "in_chans": 3, + "initializer_range": 0.02, + "intermediate_size": 3420, + "model_type": "qwen2_5_vl", + "num_heads": 16, + "out_hidden_size": 2048, + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + "tokens_per_second": 2, + "torch_dtype": "bfloat16", + "window_size": 112 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..3a6d425 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework":"Pytorch","task":"image-text-to-text"} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c110271 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 1e-06, + "transformers_version": "4.52.4" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..b21e2b1 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d06a84c304c48eb59ed5d929408d92c16b9001c2b182d216a038a84236f51a26 +size 4972304384 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..4c63087 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02eb44a235085b2adefa9580fc8262314ce476272feaee43ac7779e4e66c6dcc +size 4932949248 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..5261ea6 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90524294b65c08724060c055a31a744d6a270441c2f4f638ea9a1c4ee5dd8725 +size 4932949336 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..e84169c --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8822447c46176acb5281e451e42168359bf979b84a65223d06e6c053fd0b120e +size 1425040040 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..f7b913f --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,832 @@ +{ + "metadata": { + "total_size": 16263151616 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors", + "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors", + "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors", + "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors", + "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors", + "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors", + "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors", + "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors", + "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors", + "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors", + "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors" + } +} diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..1c234b7 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,36 @@ +{ + "crop_size": null, + "data_format": "channels_first", + "default_to_square": true, + "device": null, + "do_center_crop": null, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "Qwen2VLImageProcessorFast", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "input_data_format": null, + "max_pixels": 12845056, + "merge_size": 2, + "min_pixels": 3136, + "patch_size": 14, + "processor_class": "Qwen2_5_VLProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_tensors": null, + "size": { + "longest_edge": 12845056, + "shortest_edge": 3136 + }, + "temporal_patch_size": 2 +} diff --git a/runs/Dec01_02-50-11_A800Server/events.out.tfevents.1764557574.A800Server.312092.0 b/runs/Dec01_02-50-11_A800Server/events.out.tfevents.1764557574.A800Server.312092.0 new file mode 100644 index 0000000..5f46f11 --- /dev/null +++ b/runs/Dec01_02-50-11_A800Server/events.out.tfevents.1764557574.A800Server.312092.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51ba155fd3c77c26a2985660526163356516b07e718a38d3f36260632d005c8 +size 7763 diff --git a/runs/Dec01_02-55-55_A800Server/events.out.tfevents.1764557855.A800Server.323093.0 b/runs/Dec01_02-55-55_A800Server/events.out.tfevents.1764557855.A800Server.323093.0 new file mode 100644 index 0000000..d4141c2 --- /dev/null +++ b/runs/Dec01_02-55-55_A800Server/events.out.tfevents.1764557855.A800Server.323093.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:894ca00488b3dffcfd977b01c31055b7e980a1afbe3fab82a2265370b44756d4 +size 7556 diff --git a/runs/Dec01_02-59-24_A800Server/events.out.tfevents.1764558073.A800Server.328791.0 b/runs/Dec01_02-59-24_A800Server/events.out.tfevents.1764558073.A800Server.328791.0 new file mode 100644 index 0000000..7f2f5f4 --- /dev/null +++ b/runs/Dec01_02-59-24_A800Server/events.out.tfevents.1764558073.A800Server.328791.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4afd2e419d6982e8637b6e79ddf467f139e66568a77a91247f5da72369059eb8 +size 72816 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..51ebb3b --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa +size 11421896 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..230f071 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,209 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "processor_class": "Qwen2_5_VLProcessor", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..f57fc3f --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 3.149345964351816e+17, + "train_loss": 0.4079481167808606, + "train_runtime": 5827.269, + "train_samples_per_second": 3.415, + "train_steps_per_second": 0.053 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..00fe672 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,312 @@ +{"current_steps": 1, "total_steps": 311, "loss": 1.4509, "lr": 0.0, "epoch": 0.003215434083601286, "percentage": 0.32, "elapsed_time": "0:00:49", "remaining_time": "4:14:08"} +{"current_steps": 2, "total_steps": 311, "loss": 1.4192, "lr": 3.125e-07, "epoch": 0.006430868167202572, "percentage": 0.64, "elapsed_time": "0:01:14", "remaining_time": "3:10:55"} +{"current_steps": 3, "total_steps": 311, "loss": 1.4011, "lr": 6.25e-07, "epoch": 0.00964630225080386, "percentage": 0.96, "elapsed_time": "0:01:51", "remaining_time": "3:10:07"} +{"current_steps": 4, "total_steps": 311, "loss": 1.3749, "lr": 9.375000000000001e-07, "epoch": 0.012861736334405145, "percentage": 1.29, "elapsed_time": "0:02:13", "remaining_time": "2:50:36"} +{"current_steps": 5, "total_steps": 311, "loss": 1.3428, "lr": 1.25e-06, "epoch": 0.01607717041800643, "percentage": 1.61, "elapsed_time": "0:02:45", "remaining_time": "2:49:06"} +{"current_steps": 6, "total_steps": 311, "loss": 1.2579, "lr": 1.5625e-06, "epoch": 0.01929260450160772, "percentage": 1.93, "elapsed_time": "0:03:07", "remaining_time": "2:38:28"} +{"current_steps": 7, "total_steps": 311, "loss": 1.2162, "lr": 1.8750000000000003e-06, "epoch": 0.022508038585209004, "percentage": 2.25, "elapsed_time": "0:03:24", "remaining_time": "2:28:20"} +{"current_steps": 8, "total_steps": 311, "loss": 1.1246, "lr": 2.1875000000000002e-06, "epoch": 0.02572347266881029, "percentage": 2.57, "elapsed_time": "0:03:42", "remaining_time": "2:20:41"} +{"current_steps": 9, "total_steps": 311, "loss": 1.0988, "lr": 2.5e-06, "epoch": 0.028938906752411574, "percentage": 2.89, "elapsed_time": "0:04:00", "remaining_time": "2:14:34"} +{"current_steps": 10, "total_steps": 311, "loss": 1.0293, "lr": 2.8125e-06, "epoch": 0.03215434083601286, "percentage": 3.22, "elapsed_time": "0:04:18", "remaining_time": "2:09:27"} +{"current_steps": 11, "total_steps": 311, "loss": 1.0261, "lr": 3.125e-06, "epoch": 0.03536977491961415, "percentage": 3.54, "elapsed_time": "0:04:36", "remaining_time": "2:05:36"} +{"current_steps": 12, "total_steps": 311, "loss": 0.9731, "lr": 3.4375e-06, "epoch": 0.03858520900321544, "percentage": 3.86, "elapsed_time": "0:04:55", "remaining_time": "2:02:38"} +{"current_steps": 13, "total_steps": 311, "loss": 0.9391, "lr": 3.7500000000000005e-06, "epoch": 0.04180064308681672, "percentage": 4.18, "elapsed_time": "0:05:15", "remaining_time": "2:00:29"} +{"current_steps": 14, "total_steps": 311, "loss": 0.8457, "lr": 4.0625000000000005e-06, "epoch": 0.04501607717041801, "percentage": 4.5, "elapsed_time": "0:05:33", "remaining_time": "1:57:49"} +{"current_steps": 15, "total_steps": 311, "loss": 0.854, "lr": 4.3750000000000005e-06, "epoch": 0.04823151125401929, "percentage": 4.82, "elapsed_time": "0:05:51", "remaining_time": "1:55:35"} +{"current_steps": 16, "total_steps": 311, "loss": 0.7838, "lr": 4.6875000000000004e-06, "epoch": 0.05144694533762058, "percentage": 5.14, "elapsed_time": "0:06:10", "remaining_time": "1:53:43"} +{"current_steps": 17, "total_steps": 311, "loss": 0.8028, "lr": 5e-06, "epoch": 0.05466237942122187, "percentage": 5.47, "elapsed_time": "0:06:28", "remaining_time": "1:52:01"} +{"current_steps": 18, "total_steps": 311, "loss": 0.7586, "lr": 5.3125e-06, "epoch": 0.05787781350482315, "percentage": 5.79, "elapsed_time": "0:06:46", "remaining_time": "1:50:23"} +{"current_steps": 19, "total_steps": 311, "loss": 0.7981, "lr": 5.625e-06, "epoch": 0.06109324758842444, "percentage": 6.11, "elapsed_time": "0:07:05", "remaining_time": "1:48:52"} +{"current_steps": 20, "total_steps": 311, "loss": 0.6801, "lr": 5.9375e-06, "epoch": 0.06430868167202572, "percentage": 6.43, "elapsed_time": "0:07:23", "remaining_time": "1:47:28"} +{"current_steps": 21, "total_steps": 311, "loss": 0.7076, "lr": 6.25e-06, "epoch": 0.06752411575562701, "percentage": 6.75, "elapsed_time": "0:07:41", "remaining_time": "1:46:07"} +{"current_steps": 22, "total_steps": 311, "loss": 0.6911, "lr": 6.5625e-06, "epoch": 0.0707395498392283, "percentage": 7.07, "elapsed_time": "0:07:59", "remaining_time": "1:45:02"} +{"current_steps": 23, "total_steps": 311, "loss": 0.6124, "lr": 6.875e-06, "epoch": 0.07395498392282958, "percentage": 7.4, "elapsed_time": "0:08:17", "remaining_time": "1:43:45"} +{"current_steps": 24, "total_steps": 311, "loss": 0.6253, "lr": 7.1875e-06, "epoch": 0.07717041800643087, "percentage": 7.72, "elapsed_time": "0:08:35", "remaining_time": "1:42:39"} +{"current_steps": 25, "total_steps": 311, "loss": 0.6117, "lr": 7.500000000000001e-06, "epoch": 0.08038585209003216, "percentage": 8.04, "elapsed_time": "0:08:53", "remaining_time": "1:41:39"} +{"current_steps": 26, "total_steps": 311, "loss": 0.5949, "lr": 7.8125e-06, "epoch": 0.08360128617363344, "percentage": 8.36, "elapsed_time": "0:09:12", "remaining_time": "1:40:54"} +{"current_steps": 27, "total_steps": 311, "loss": 0.5983, "lr": 8.125000000000001e-06, "epoch": 0.08681672025723473, "percentage": 8.68, "elapsed_time": "0:09:29", "remaining_time": "1:39:51"} +{"current_steps": 28, "total_steps": 311, "loss": 0.5706, "lr": 8.4375e-06, "epoch": 0.09003215434083602, "percentage": 9.0, "elapsed_time": "0:09:47", "remaining_time": "1:39:02"} +{"current_steps": 29, "total_steps": 311, "loss": 0.536, "lr": 8.750000000000001e-06, "epoch": 0.0932475884244373, "percentage": 9.32, "elapsed_time": "0:10:05", "remaining_time": "1:38:12"} +{"current_steps": 30, "total_steps": 311, "loss": 0.5384, "lr": 9.0625e-06, "epoch": 0.09646302250803858, "percentage": 9.65, "elapsed_time": "0:10:24", "remaining_time": "1:37:27"} +{"current_steps": 31, "total_steps": 311, "loss": 0.5491, "lr": 9.375000000000001e-06, "epoch": 0.09967845659163987, "percentage": 9.97, "elapsed_time": "0:10:41", "remaining_time": "1:36:35"} +{"current_steps": 32, "total_steps": 311, "loss": 0.524, "lr": 9.6875e-06, "epoch": 0.10289389067524116, "percentage": 10.29, "elapsed_time": "0:10:59", "remaining_time": "1:35:49"} +{"current_steps": 33, "total_steps": 311, "loss": 0.5224, "lr": 1e-05, "epoch": 0.10610932475884244, "percentage": 10.61, "elapsed_time": "0:11:17", "remaining_time": "1:35:04"} +{"current_steps": 34, "total_steps": 311, "loss": 0.5185, "lr": 9.999683023724021e-06, "epoch": 0.10932475884244373, "percentage": 10.93, "elapsed_time": "0:11:35", "remaining_time": "1:34:23"} +{"current_steps": 35, "total_steps": 311, "loss": 0.5219, "lr": 9.998732135085665e-06, "epoch": 0.11254019292604502, "percentage": 11.25, "elapsed_time": "0:11:53", "remaining_time": "1:33:46"} +{"current_steps": 36, "total_steps": 311, "loss": 0.4951, "lr": 9.99714745464859e-06, "epoch": 0.1157556270096463, "percentage": 11.58, "elapsed_time": "0:12:11", "remaining_time": "1:33:10"} +{"current_steps": 37, "total_steps": 311, "loss": 0.5087, "lr": 9.994929183335237e-06, "epoch": 0.1189710610932476, "percentage": 11.9, "elapsed_time": "0:12:30", "remaining_time": "1:32:39"} +{"current_steps": 38, "total_steps": 311, "loss": 0.4586, "lr": 9.992077602401358e-06, "epoch": 0.12218649517684887, "percentage": 12.22, "elapsed_time": "0:12:49", "remaining_time": "1:32:06"} +{"current_steps": 39, "total_steps": 311, "loss": 0.4962, "lr": 9.988593073400354e-06, "epoch": 0.12540192926045016, "percentage": 12.54, "elapsed_time": "0:13:09", "remaining_time": "1:31:44"} +{"current_steps": 40, "total_steps": 311, "loss": 0.5232, "lr": 9.984476038137437e-06, "epoch": 0.12861736334405144, "percentage": 12.86, "elapsed_time": "0:13:27", "remaining_time": "1:31:08"} +{"current_steps": 41, "total_steps": 311, "loss": 0.4603, "lr": 9.979727018613607e-06, "epoch": 0.13183279742765272, "percentage": 13.18, "elapsed_time": "0:13:46", "remaining_time": "1:30:45"} +{"current_steps": 42, "total_steps": 311, "loss": 0.456, "lr": 9.974346616959476e-06, "epoch": 0.13504823151125403, "percentage": 13.5, "elapsed_time": "0:14:21", "remaining_time": "1:31:55"} +{"current_steps": 43, "total_steps": 311, "loss": 0.45, "lr": 9.968335515358916e-06, "epoch": 0.1382636655948553, "percentage": 13.83, "elapsed_time": "0:14:39", "remaining_time": "1:31:22"} +{"current_steps": 44, "total_steps": 311, "loss": 0.4605, "lr": 9.961694475962562e-06, "epoch": 0.1414790996784566, "percentage": 14.15, "elapsed_time": "0:14:57", "remaining_time": "1:30:48"} +{"current_steps": 45, "total_steps": 311, "loss": 0.4636, "lr": 9.954424340791195e-06, "epoch": 0.14469453376205788, "percentage": 14.47, "elapsed_time": "0:15:15", "remaining_time": "1:30:12"} +{"current_steps": 46, "total_steps": 311, "loss": 0.4533, "lr": 9.94652603162896e-06, "epoch": 0.14790996784565916, "percentage": 14.79, "elapsed_time": "0:15:34", "remaining_time": "1:29:43"} +{"current_steps": 47, "total_steps": 311, "loss": 0.442, "lr": 9.938000549906509e-06, "epoch": 0.15112540192926044, "percentage": 15.11, "elapsed_time": "0:15:52", "remaining_time": "1:29:10"} +{"current_steps": 48, "total_steps": 311, "loss": 0.477, "lr": 9.92884897657402e-06, "epoch": 0.15434083601286175, "percentage": 15.43, "elapsed_time": "0:16:10", "remaining_time": "1:28:36"} +{"current_steps": 49, "total_steps": 311, "loss": 0.4266, "lr": 9.919072471964146e-06, "epoch": 0.15755627009646303, "percentage": 15.76, "elapsed_time": "0:16:29", "remaining_time": "1:28:08"} +{"current_steps": 50, "total_steps": 311, "loss": 0.4375, "lr": 9.908672275644898e-06, "epoch": 0.1607717041800643, "percentage": 16.08, "elapsed_time": "0:16:47", "remaining_time": "1:27:39"} +{"current_steps": 51, "total_steps": 311, "loss": 0.4438, "lr": 9.897649706262474e-06, "epoch": 0.1639871382636656, "percentage": 16.4, "elapsed_time": "0:17:05", "remaining_time": "1:27:07"} +{"current_steps": 52, "total_steps": 311, "loss": 0.4388, "lr": 9.88600616137407e-06, "epoch": 0.16720257234726688, "percentage": 16.72, "elapsed_time": "0:17:25", "remaining_time": "1:26:45"} +{"current_steps": 53, "total_steps": 311, "loss": 0.4612, "lr": 9.873743117270691e-06, "epoch": 0.17041800643086816, "percentage": 17.04, "elapsed_time": "0:17:42", "remaining_time": "1:26:13"} +{"current_steps": 54, "total_steps": 311, "loss": 0.4513, "lr": 9.860862128789954e-06, "epoch": 0.17363344051446947, "percentage": 17.36, "elapsed_time": "0:18:00", "remaining_time": "1:25:41"} +{"current_steps": 55, "total_steps": 311, "loss": 0.4439, "lr": 9.847364829118963e-06, "epoch": 0.17684887459807075, "percentage": 17.68, "elapsed_time": "0:18:18", "remaining_time": "1:25:13"} +{"current_steps": 56, "total_steps": 311, "loss": 0.4484, "lr": 9.833252929587231e-06, "epoch": 0.18006430868167203, "percentage": 18.01, "elapsed_time": "0:18:36", "remaining_time": "1:24:45"} +{"current_steps": 57, "total_steps": 311, "loss": 0.4642, "lr": 9.818528219449705e-06, "epoch": 0.1832797427652733, "percentage": 18.33, "elapsed_time": "0:18:54", "remaining_time": "1:24:17"} +{"current_steps": 58, "total_steps": 311, "loss": 0.4289, "lr": 9.803192565659898e-06, "epoch": 0.1864951768488746, "percentage": 18.65, "elapsed_time": "0:19:13", "remaining_time": "1:23:49"} +{"current_steps": 59, "total_steps": 311, "loss": 0.416, "lr": 9.78724791263318e-06, "epoch": 0.18971061093247588, "percentage": 18.97, "elapsed_time": "0:19:30", "remaining_time": "1:23:20"} +{"current_steps": 60, "total_steps": 311, "loss": 0.4083, "lr": 9.770696282000245e-06, "epoch": 0.19292604501607716, "percentage": 19.29, "elapsed_time": "0:19:49", "remaining_time": "1:22:54"} +{"current_steps": 61, "total_steps": 311, "loss": 0.4177, "lr": 9.753539772350792e-06, "epoch": 0.19614147909967847, "percentage": 19.61, "elapsed_time": "0:20:06", "remaining_time": "1:22:26"} +{"current_steps": 62, "total_steps": 311, "loss": 0.4328, "lr": 9.735780558967434e-06, "epoch": 0.19935691318327975, "percentage": 19.94, "elapsed_time": "0:20:25", "remaining_time": "1:22:00"} +{"current_steps": 63, "total_steps": 311, "loss": 0.424, "lr": 9.717420893549902e-06, "epoch": 0.20257234726688103, "percentage": 20.26, "elapsed_time": "0:20:43", "remaining_time": "1:21:33"} +{"current_steps": 64, "total_steps": 311, "loss": 0.4254, "lr": 9.698463103929542e-06, "epoch": 0.2057877813504823, "percentage": 20.58, "elapsed_time": "0:21:01", "remaining_time": "1:21:10"} +{"current_steps": 65, "total_steps": 311, "loss": 0.4202, "lr": 9.67890959377418e-06, "epoch": 0.2090032154340836, "percentage": 20.9, "elapsed_time": "0:21:20", "remaining_time": "1:20:46"} +{"current_steps": 66, "total_steps": 311, "loss": 0.397, "lr": 9.658762842283343e-06, "epoch": 0.21221864951768488, "percentage": 21.22, "elapsed_time": "0:21:38", "remaining_time": "1:20:18"} +{"current_steps": 67, "total_steps": 311, "loss": 0.3912, "lr": 9.638025403873939e-06, "epoch": 0.21543408360128619, "percentage": 21.54, "elapsed_time": "0:21:56", "remaining_time": "1:19:54"} +{"current_steps": 68, "total_steps": 311, "loss": 0.3758, "lr": 9.616699907856368e-06, "epoch": 0.21864951768488747, "percentage": 21.86, "elapsed_time": "0:22:15", "remaining_time": "1:19:31"} +{"current_steps": 69, "total_steps": 311, "loss": 0.4368, "lr": 9.594789058101154e-06, "epoch": 0.22186495176848875, "percentage": 22.19, "elapsed_time": "0:22:33", "remaining_time": "1:19:06"} +{"current_steps": 70, "total_steps": 311, "loss": 0.4067, "lr": 9.57229563269612e-06, "epoch": 0.22508038585209003, "percentage": 22.51, "elapsed_time": "0:22:52", "remaining_time": "1:18:43"} +{"current_steps": 71, "total_steps": 311, "loss": 0.3884, "lr": 9.549222483594154e-06, "epoch": 0.2282958199356913, "percentage": 22.83, "elapsed_time": "0:23:10", "remaining_time": "1:18:19"} +{"current_steps": 72, "total_steps": 311, "loss": 0.3895, "lr": 9.525572536251608e-06, "epoch": 0.2315112540192926, "percentage": 23.15, "elapsed_time": "0:23:27", "remaining_time": "1:17:53"} +{"current_steps": 73, "total_steps": 311, "loss": 0.408, "lr": 9.501348789257373e-06, "epoch": 0.2347266881028939, "percentage": 23.47, "elapsed_time": "0:23:46", "remaining_time": "1:17:31"} +{"current_steps": 74, "total_steps": 311, "loss": 0.3862, "lr": 9.476554313952697e-06, "epoch": 0.2379421221864952, "percentage": 23.79, "elapsed_time": "0:24:04", "remaining_time": "1:17:05"} +{"current_steps": 75, "total_steps": 311, "loss": 0.4149, "lr": 9.451192254041759e-06, "epoch": 0.24115755627009647, "percentage": 24.12, "elapsed_time": "0:24:22", "remaining_time": "1:16:41"} +{"current_steps": 76, "total_steps": 311, "loss": 0.38, "lr": 9.425265825193077e-06, "epoch": 0.24437299035369775, "percentage": 24.44, "elapsed_time": "0:24:40", "remaining_time": "1:16:16"} +{"current_steps": 77, "total_steps": 311, "loss": 0.3799, "lr": 9.398778314631801e-06, "epoch": 0.24758842443729903, "percentage": 24.76, "elapsed_time": "0:24:59", "remaining_time": "1:15:57"} +{"current_steps": 78, "total_steps": 311, "loss": 0.3882, "lr": 9.371733080722911e-06, "epoch": 0.2508038585209003, "percentage": 25.08, "elapsed_time": "0:25:17", "remaining_time": "1:15:33"} +{"current_steps": 79, "total_steps": 311, "loss": 0.4123, "lr": 9.34413355254542e-06, "epoch": 0.2540192926045016, "percentage": 25.4, "elapsed_time": "0:25:35", "remaining_time": "1:15:10"} +{"current_steps": 80, "total_steps": 311, "loss": 0.3627, "lr": 9.31598322945759e-06, "epoch": 0.2572347266881029, "percentage": 25.72, "elapsed_time": "0:25:56", "remaining_time": "1:14:54"} +{"current_steps": 81, "total_steps": 311, "loss": 0.3747, "lr": 9.287285680653254e-06, "epoch": 0.2604501607717042, "percentage": 26.05, "elapsed_time": "0:26:14", "remaining_time": "1:14:31"} +{"current_steps": 82, "total_steps": 311, "loss": 0.399, "lr": 9.258044544709276e-06, "epoch": 0.26366559485530544, "percentage": 26.37, "elapsed_time": "0:26:32", "remaining_time": "1:14:08"} +{"current_steps": 83, "total_steps": 311, "loss": 0.3995, "lr": 9.228263529124199e-06, "epoch": 0.26688102893890675, "percentage": 26.69, "elapsed_time": "0:26:53", "remaining_time": "1:13:51"} +{"current_steps": 84, "total_steps": 311, "loss": 0.4221, "lr": 9.197946409848196e-06, "epoch": 0.27009646302250806, "percentage": 27.01, "elapsed_time": "0:27:11", "remaining_time": "1:13:30"} +{"current_steps": 85, "total_steps": 311, "loss": 0.3649, "lr": 9.167097030804289e-06, "epoch": 0.2733118971061093, "percentage": 27.33, "elapsed_time": "0:27:30", "remaining_time": "1:13:07"} +{"current_steps": 86, "total_steps": 311, "loss": 0.3638, "lr": 9.135719303400995e-06, "epoch": 0.2765273311897106, "percentage": 27.65, "elapsed_time": "0:27:48", "remaining_time": "1:12:46"} +{"current_steps": 87, "total_steps": 311, "loss": 0.3722, "lr": 9.103817206036383e-06, "epoch": 0.2797427652733119, "percentage": 27.97, "elapsed_time": "0:28:07", "remaining_time": "1:12:24"} +{"current_steps": 88, "total_steps": 311, "loss": 0.3656, "lr": 9.071394783593664e-06, "epoch": 0.2829581993569132, "percentage": 28.3, "elapsed_time": "0:28:25", "remaining_time": "1:12:02"} +{"current_steps": 89, "total_steps": 311, "loss": 0.3916, "lr": 9.038456146928325e-06, "epoch": 0.2861736334405145, "percentage": 28.62, "elapsed_time": "0:28:44", "remaining_time": "1:11:40"} +{"current_steps": 90, "total_steps": 311, "loss": 0.3903, "lr": 9.005005472346923e-06, "epoch": 0.28938906752411575, "percentage": 28.94, "elapsed_time": "0:29:02", "remaining_time": "1:11:18"} +{"current_steps": 91, "total_steps": 311, "loss": 0.3987, "lr": 8.971047001077561e-06, "epoch": 0.29260450160771706, "percentage": 29.26, "elapsed_time": "0:29:20", "remaining_time": "1:10:55"} +{"current_steps": 92, "total_steps": 311, "loss": 0.4044, "lr": 8.936585038732143e-06, "epoch": 0.2958199356913183, "percentage": 29.58, "elapsed_time": "0:29:38", "remaining_time": "1:10:33"} +{"current_steps": 93, "total_steps": 311, "loss": 0.3858, "lr": 8.90162395476046e-06, "epoch": 0.2990353697749196, "percentage": 29.9, "elapsed_time": "0:29:57", "remaining_time": "1:10:13"} +{"current_steps": 94, "total_steps": 311, "loss": 0.4002, "lr": 8.866168181896198e-06, "epoch": 0.3022508038585209, "percentage": 30.23, "elapsed_time": "0:30:15", "remaining_time": "1:09:50"} +{"current_steps": 95, "total_steps": 311, "loss": 0.375, "lr": 8.83022221559489e-06, "epoch": 0.3054662379421222, "percentage": 30.55, "elapsed_time": "0:30:32", "remaining_time": "1:09:27"} +{"current_steps": 96, "total_steps": 311, "loss": 0.3549, "lr": 8.793790613463956e-06, "epoch": 0.3086816720257235, "percentage": 30.87, "elapsed_time": "0:30:50", "remaining_time": "1:09:05"} +{"current_steps": 97, "total_steps": 311, "loss": 0.3798, "lr": 8.756877994684818e-06, "epoch": 0.31189710610932475, "percentage": 31.19, "elapsed_time": "0:31:08", "remaining_time": "1:08:43"} +{"current_steps": 98, "total_steps": 311, "loss": 0.3871, "lr": 8.719489039427256e-06, "epoch": 0.31511254019292606, "percentage": 31.51, "elapsed_time": "0:31:26", "remaining_time": "1:08:20"} +{"current_steps": 99, "total_steps": 311, "loss": 0.4025, "lr": 8.681628488255986e-06, "epoch": 0.3183279742765273, "percentage": 31.83, "elapsed_time": "0:31:45", "remaining_time": "1:07:59"} +{"current_steps": 100, "total_steps": 311, "loss": 0.3998, "lr": 8.643301141529619e-06, "epoch": 0.3215434083601286, "percentage": 32.15, "elapsed_time": "0:32:03", "remaining_time": "1:07:39"} +{"current_steps": 101, "total_steps": 311, "loss": 0.3714, "lr": 8.604511858792006e-06, "epoch": 0.3247588424437299, "percentage": 32.48, "elapsed_time": "0:32:22", "remaining_time": "1:07:18"} +{"current_steps": 102, "total_steps": 311, "loss": 0.3509, "lr": 8.565265558156101e-06, "epoch": 0.3279742765273312, "percentage": 32.8, "elapsed_time": "0:32:39", "remaining_time": "1:06:55"} +{"current_steps": 103, "total_steps": 311, "loss": 0.366, "lr": 8.525567215680397e-06, "epoch": 0.3311897106109325, "percentage": 33.12, "elapsed_time": "0:32:58", "remaining_time": "1:06:34"} +{"current_steps": 104, "total_steps": 311, "loss": 0.3919, "lr": 8.485421864737997e-06, "epoch": 0.33440514469453375, "percentage": 33.44, "elapsed_time": "0:33:16", "remaining_time": "1:06:14"} +{"current_steps": 105, "total_steps": 311, "loss": 0.3623, "lr": 8.444834595378434e-06, "epoch": 0.33762057877813506, "percentage": 33.76, "elapsed_time": "0:33:35", "remaining_time": "1:05:54"} +{"current_steps": 106, "total_steps": 311, "loss": 0.3758, "lr": 8.403810553682307e-06, "epoch": 0.3408360128617363, "percentage": 34.08, "elapsed_time": "0:33:53", "remaining_time": "1:05:32"} +{"current_steps": 107, "total_steps": 311, "loss": 0.3456, "lr": 8.362354941108803e-06, "epoch": 0.3440514469453376, "percentage": 34.41, "elapsed_time": "0:34:11", "remaining_time": "1:05:11"} +{"current_steps": 108, "total_steps": 311, "loss": 0.3754, "lr": 8.320473013836197e-06, "epoch": 0.34726688102893893, "percentage": 34.73, "elapsed_time": "0:34:29", "remaining_time": "1:04:49"} +{"current_steps": 109, "total_steps": 311, "loss": 0.3858, "lr": 8.278170082095422e-06, "epoch": 0.3504823151125402, "percentage": 35.05, "elapsed_time": "0:34:47", "remaining_time": "1:04:28"} +{"current_steps": 110, "total_steps": 311, "loss": 0.3941, "lr": 8.23545150949679e-06, "epoch": 0.3536977491961415, "percentage": 35.37, "elapsed_time": "0:35:05", "remaining_time": "1:04:07"} +{"current_steps": 111, "total_steps": 311, "loss": 0.3712, "lr": 8.192322712349917e-06, "epoch": 0.35691318327974275, "percentage": 35.69, "elapsed_time": "0:35:24", "remaining_time": "1:03:47"} +{"current_steps": 112, "total_steps": 311, "loss": 0.3532, "lr": 8.148789158977012e-06, "epoch": 0.36012861736334406, "percentage": 36.01, "elapsed_time": "0:35:41", "remaining_time": "1:03:25"} +{"current_steps": 113, "total_steps": 311, "loss": 0.3801, "lr": 8.104856369019525e-06, "epoch": 0.3633440514469453, "percentage": 36.33, "elapsed_time": "0:35:59", "remaining_time": "1:03:04"} +{"current_steps": 114, "total_steps": 311, "loss": 0.3594, "lr": 8.060529912738316e-06, "epoch": 0.3665594855305466, "percentage": 36.66, "elapsed_time": "0:36:18", "remaining_time": "1:02:43"} +{"current_steps": 115, "total_steps": 311, "loss": 0.3696, "lr": 8.0158154103074e-06, "epoch": 0.36977491961414793, "percentage": 36.98, "elapsed_time": "0:36:36", "remaining_time": "1:02:23"} +{"current_steps": 116, "total_steps": 311, "loss": 0.3553, "lr": 7.970718531101365e-06, "epoch": 0.3729903536977492, "percentage": 37.3, "elapsed_time": "0:36:54", "remaining_time": "1:02:02"} +{"current_steps": 117, "total_steps": 311, "loss": 0.3775, "lr": 7.925244992976538e-06, "epoch": 0.3762057877813505, "percentage": 37.62, "elapsed_time": "0:37:12", "remaining_time": "1:01:42"} +{"current_steps": 118, "total_steps": 311, "loss": 0.3591, "lr": 7.879400561546033e-06, "epoch": 0.37942122186495175, "percentage": 37.94, "elapsed_time": "0:37:31", "remaining_time": "1:01:22"} +{"current_steps": 119, "total_steps": 311, "loss": 0.3723, "lr": 7.833191049448706e-06, "epoch": 0.38263665594855306, "percentage": 38.26, "elapsed_time": "0:37:49", "remaining_time": "1:01:02"} +{"current_steps": 120, "total_steps": 311, "loss": 0.3566, "lr": 7.786622315612182e-06, "epoch": 0.3858520900321543, "percentage": 38.59, "elapsed_time": "0:38:06", "remaining_time": "1:00:39"} +{"current_steps": 121, "total_steps": 311, "loss": 0.3809, "lr": 7.739700264509993e-06, "epoch": 0.3890675241157556, "percentage": 38.91, "elapsed_time": "0:38:25", "remaining_time": "1:00:19"} +{"current_steps": 122, "total_steps": 311, "loss": 0.3707, "lr": 7.692430845412946e-06, "epoch": 0.39228295819935693, "percentage": 39.23, "elapsed_time": "0:38:43", "remaining_time": "0:59:58"} +{"current_steps": 123, "total_steps": 311, "loss": 0.3642, "lr": 7.644820051634813e-06, "epoch": 0.3954983922829582, "percentage": 39.55, "elapsed_time": "0:39:01", "remaining_time": "0:59:38"} +{"current_steps": 124, "total_steps": 311, "loss": 0.3605, "lr": 7.596873919772438e-06, "epoch": 0.3987138263665595, "percentage": 39.87, "elapsed_time": "0:39:20", "remaining_time": "0:59:19"} +{"current_steps": 125, "total_steps": 311, "loss": 0.3648, "lr": 7.548598528940354e-06, "epoch": 0.40192926045016075, "percentage": 40.19, "elapsed_time": "0:39:38", "remaining_time": "0:58:58"} +{"current_steps": 126, "total_steps": 311, "loss": 0.3735, "lr": 7.500000000000001e-06, "epoch": 0.40514469453376206, "percentage": 40.51, "elapsed_time": "0:39:55", "remaining_time": "0:58:37"} +{"current_steps": 127, "total_steps": 311, "loss": 0.3775, "lr": 7.451084494783668e-06, "epoch": 0.40836012861736337, "percentage": 40.84, "elapsed_time": "0:40:14", "remaining_time": "0:58:17"} +{"current_steps": 128, "total_steps": 311, "loss": 0.3646, "lr": 7.401858215313228e-06, "epoch": 0.4115755627009646, "percentage": 41.16, "elapsed_time": "0:40:31", "remaining_time": "0:57:56"} +{"current_steps": 129, "total_steps": 311, "loss": 0.3752, "lr": 7.352327403013779e-06, "epoch": 0.41479099678456594, "percentage": 41.48, "elapsed_time": "0:40:49", "remaining_time": "0:57:36"} +{"current_steps": 130, "total_steps": 311, "loss": 0.3567, "lr": 7.302498337922293e-06, "epoch": 0.4180064308681672, "percentage": 41.8, "elapsed_time": "0:41:08", "remaining_time": "0:57:16"} +{"current_steps": 131, "total_steps": 311, "loss": 0.3623, "lr": 7.2523773378913655e-06, "epoch": 0.4212218649517685, "percentage": 42.12, "elapsed_time": "0:41:26", "remaining_time": "0:56:56"} +{"current_steps": 132, "total_steps": 311, "loss": 0.3709, "lr": 7.201970757788172e-06, "epoch": 0.42443729903536975, "percentage": 42.44, "elapsed_time": "0:41:44", "remaining_time": "0:56:36"} +{"current_steps": 133, "total_steps": 311, "loss": 0.3518, "lr": 7.151284988688731e-06, "epoch": 0.42765273311897106, "percentage": 42.77, "elapsed_time": "0:42:03", "remaining_time": "0:56:17"} +{"current_steps": 134, "total_steps": 311, "loss": 0.3623, "lr": 7.100326457067576e-06, "epoch": 0.43086816720257237, "percentage": 43.09, "elapsed_time": "0:42:22", "remaining_time": "0:55:58"} +{"current_steps": 135, "total_steps": 311, "loss": 0.3518, "lr": 7.049101623982938e-06, "epoch": 0.4340836012861736, "percentage": 43.41, "elapsed_time": "0:42:40", "remaining_time": "0:55:38"} +{"current_steps": 136, "total_steps": 311, "loss": 0.3812, "lr": 6.9976169842575526e-06, "epoch": 0.43729903536977494, "percentage": 43.73, "elapsed_time": "0:42:58", "remaining_time": "0:55:17"} +{"current_steps": 137, "total_steps": 311, "loss": 0.3615, "lr": 6.945879065655164e-06, "epoch": 0.4405144694533762, "percentage": 44.05, "elapsed_time": "0:43:17", "remaining_time": "0:54:58"} +{"current_steps": 138, "total_steps": 311, "loss": 0.3898, "lr": 6.893894428052881e-06, "epoch": 0.4437299035369775, "percentage": 44.37, "elapsed_time": "0:43:35", "remaining_time": "0:54:38"} +{"current_steps": 139, "total_steps": 311, "loss": 0.3437, "lr": 6.841669662609437e-06, "epoch": 0.44694533762057875, "percentage": 44.69, "elapsed_time": "0:43:53", "remaining_time": "0:54:19"} +{"current_steps": 140, "total_steps": 311, "loss": 0.3523, "lr": 6.789211390929497e-06, "epoch": 0.45016077170418006, "percentage": 45.02, "elapsed_time": "0:44:12", "remaining_time": "0:53:59"} +{"current_steps": 141, "total_steps": 311, "loss": 0.3738, "lr": 6.736526264224101e-06, "epoch": 0.4533762057877814, "percentage": 45.34, "elapsed_time": "0:44:29", "remaining_time": "0:53:39"} +{"current_steps": 142, "total_steps": 311, "loss": 0.3726, "lr": 6.6836209624673575e-06, "epoch": 0.4565916398713826, "percentage": 45.66, "elapsed_time": "0:44:48", "remaining_time": "0:53:19"} +{"current_steps": 143, "total_steps": 311, "loss": 0.3322, "lr": 6.6305021935494755e-06, "epoch": 0.45980707395498394, "percentage": 45.98, "elapsed_time": "0:45:06", "remaining_time": "0:52:59"} +{"current_steps": 144, "total_steps": 311, "loss": 0.3328, "lr": 6.5771766924262795e-06, "epoch": 0.4630225080385852, "percentage": 46.3, "elapsed_time": "0:45:24", "remaining_time": "0:52:40"} +{"current_steps": 145, "total_steps": 311, "loss": 0.3492, "lr": 6.523651220265269e-06, "epoch": 0.4662379421221865, "percentage": 46.62, "elapsed_time": "0:45:43", "remaining_time": "0:52:20"} +{"current_steps": 146, "total_steps": 311, "loss": 0.3362, "lr": 6.469932563588386e-06, "epoch": 0.4694533762057878, "percentage": 46.95, "elapsed_time": "0:46:00", "remaining_time": "0:52:00"} +{"current_steps": 147, "total_steps": 311, "loss": 0.3438, "lr": 6.41602753341152e-06, "epoch": 0.47266881028938906, "percentage": 47.27, "elapsed_time": "0:46:18", "remaining_time": "0:51:40"} +{"current_steps": 148, "total_steps": 311, "loss": 0.3434, "lr": 6.361942964380967e-06, "epoch": 0.4758842443729904, "percentage": 47.59, "elapsed_time": "0:46:37", "remaining_time": "0:51:20"} +{"current_steps": 149, "total_steps": 311, "loss": 0.3487, "lr": 6.307685713906835e-06, "epoch": 0.4790996784565916, "percentage": 47.91, "elapsed_time": "0:46:55", "remaining_time": "0:51:01"} +{"current_steps": 150, "total_steps": 311, "loss": 0.3335, "lr": 6.2532626612936035e-06, "epoch": 0.48231511254019294, "percentage": 48.23, "elapsed_time": "0:47:14", "remaining_time": "0:50:42"} +{"current_steps": 151, "total_steps": 311, "loss": 0.3578, "lr": 6.1986807068678926e-06, "epoch": 0.4855305466237942, "percentage": 48.55, "elapsed_time": "0:47:32", "remaining_time": "0:50:22"} +{"current_steps": 152, "total_steps": 311, "loss": 0.3585, "lr": 6.143946771103561e-06, "epoch": 0.4887459807073955, "percentage": 48.87, "elapsed_time": "0:47:50", "remaining_time": "0:50:02"} +{"current_steps": 153, "total_steps": 311, "loss": 0.3163, "lr": 6.089067793744258e-06, "epoch": 0.4919614147909968, "percentage": 49.2, "elapsed_time": "0:48:08", "remaining_time": "0:49:43"} +{"current_steps": 154, "total_steps": 311, "loss": 0.3513, "lr": 6.034050732923538e-06, "epoch": 0.49517684887459806, "percentage": 49.52, "elapsed_time": "0:48:26", "remaining_time": "0:49:23"} +{"current_steps": 155, "total_steps": 311, "loss": 0.3436, "lr": 5.978902564282616e-06, "epoch": 0.4983922829581994, "percentage": 49.84, "elapsed_time": "0:48:44", "remaining_time": "0:49:03"} +{"current_steps": 156, "total_steps": 311, "loss": 0.3321, "lr": 5.923630280085948e-06, "epoch": 0.5016077170418006, "percentage": 50.16, "elapsed_time": "0:49:02", "remaining_time": "0:48:43"} +{"current_steps": 157, "total_steps": 311, "loss": 0.3464, "lr": 5.8682408883346535e-06, "epoch": 0.5048231511254019, "percentage": 50.48, "elapsed_time": "0:49:20", "remaining_time": "0:48:24"} +{"current_steps": 158, "total_steps": 311, "loss": 0.366, "lr": 5.8127414118779825e-06, "epoch": 0.5080385852090032, "percentage": 50.8, "elapsed_time": "0:49:38", "remaining_time": "0:48:04"} +{"current_steps": 159, "total_steps": 311, "loss": 0.3592, "lr": 5.757138887522884e-06, "epoch": 0.5112540192926045, "percentage": 51.13, "elapsed_time": "0:49:57", "remaining_time": "0:47:45"} +{"current_steps": 160, "total_steps": 311, "loss": 0.3374, "lr": 5.701440365141799e-06, "epoch": 0.5144694533762058, "percentage": 51.45, "elapsed_time": "0:50:15", "remaining_time": "0:47:25"} +{"current_steps": 161, "total_steps": 311, "loss": 0.3354, "lr": 5.645652906778808e-06, "epoch": 0.5176848874598071, "percentage": 51.77, "elapsed_time": "0:50:33", "remaining_time": "0:47:06"} +{"current_steps": 162, "total_steps": 311, "loss": 0.3402, "lr": 5.5897835857542315e-06, "epoch": 0.5209003215434084, "percentage": 52.09, "elapsed_time": "0:50:50", "remaining_time": "0:46:46"} +{"current_steps": 163, "total_steps": 311, "loss": 0.349, "lr": 5.533839485767795e-06, "epoch": 0.5241157556270096, "percentage": 52.41, "elapsed_time": "0:51:08", "remaining_time": "0:46:26"} +{"current_steps": 164, "total_steps": 311, "loss": 0.3314, "lr": 5.477827700000492e-06, "epoch": 0.5273311897106109, "percentage": 52.73, "elapsed_time": "0:51:26", "remaining_time": "0:46:06"} +{"current_steps": 165, "total_steps": 311, "loss": 0.3147, "lr": 5.421755330215223e-06, "epoch": 0.5305466237942122, "percentage": 53.05, "elapsed_time": "0:51:44", "remaining_time": "0:45:46"} +{"current_steps": 166, "total_steps": 311, "loss": 0.3427, "lr": 5.365629485856381e-06, "epoch": 0.5337620578778135, "percentage": 53.38, "elapsed_time": "0:52:02", "remaining_time": "0:45:27"} +{"current_steps": 167, "total_steps": 311, "loss": 0.3091, "lr": 5.30945728314841e-06, "epoch": 0.5369774919614148, "percentage": 53.7, "elapsed_time": "0:52:22", "remaining_time": "0:45:09"} +{"current_steps": 168, "total_steps": 311, "loss": 0.3197, "lr": 5.253245844193564e-06, "epoch": 0.5401929260450161, "percentage": 54.02, "elapsed_time": "0:52:40", "remaining_time": "0:44:49"} +{"current_steps": 169, "total_steps": 311, "loss": 0.3491, "lr": 5.197002296068878e-06, "epoch": 0.5434083601286174, "percentage": 54.34, "elapsed_time": "0:52:58", "remaining_time": "0:44:30"} +{"current_steps": 170, "total_steps": 311, "loss": 0.3323, "lr": 5.140733769922525e-06, "epoch": 0.5466237942122186, "percentage": 54.66, "elapsed_time": "0:53:16", "remaining_time": "0:44:10"} +{"current_steps": 171, "total_steps": 311, "loss": 0.3382, "lr": 5.084447400069656e-06, "epoch": 0.5498392282958199, "percentage": 54.98, "elapsed_time": "0:53:34", "remaining_time": "0:43:51"} +{"current_steps": 172, "total_steps": 311, "loss": 0.3424, "lr": 5.0281503230878304e-06, "epoch": 0.5530546623794212, "percentage": 55.31, "elapsed_time": "0:53:51", "remaining_time": "0:43:31"} +{"current_steps": 173, "total_steps": 311, "loss": 0.3357, "lr": 4.971849676912172e-06, "epoch": 0.5562700964630225, "percentage": 55.63, "elapsed_time": "0:54:10", "remaining_time": "0:43:12"} +{"current_steps": 174, "total_steps": 311, "loss": 0.3544, "lr": 4.915552599930345e-06, "epoch": 0.5594855305466238, "percentage": 55.95, "elapsed_time": "0:54:29", "remaining_time": "0:42:54"} +{"current_steps": 175, "total_steps": 311, "loss": 0.3134, "lr": 4.859266230077474e-06, "epoch": 0.5627009646302251, "percentage": 56.27, "elapsed_time": "0:54:48", "remaining_time": "0:42:35"} +{"current_steps": 176, "total_steps": 311, "loss": 0.3472, "lr": 4.802997703931124e-06, "epoch": 0.5659163987138264, "percentage": 56.59, "elapsed_time": "0:55:07", "remaining_time": "0:42:16"} +{"current_steps": 177, "total_steps": 311, "loss": 0.3484, "lr": 4.746754155806437e-06, "epoch": 0.5691318327974276, "percentage": 56.91, "elapsed_time": "0:55:25", "remaining_time": "0:41:57"} +{"current_steps": 178, "total_steps": 311, "loss": 0.3511, "lr": 4.6905427168515914e-06, "epoch": 0.572347266881029, "percentage": 57.23, "elapsed_time": "0:55:46", "remaining_time": "0:41:40"} +{"current_steps": 179, "total_steps": 311, "loss": 0.3823, "lr": 4.63437051414362e-06, "epoch": 0.5755627009646302, "percentage": 57.56, "elapsed_time": "0:56:05", "remaining_time": "0:41:21"} +{"current_steps": 180, "total_steps": 311, "loss": 0.3537, "lr": 4.5782446697847775e-06, "epoch": 0.5787781350482315, "percentage": 57.88, "elapsed_time": "0:56:22", "remaining_time": "0:41:01"} +{"current_steps": 181, "total_steps": 311, "loss": 0.3281, "lr": 4.52217229999951e-06, "epoch": 0.5819935691318328, "percentage": 58.2, "elapsed_time": "0:56:40", "remaining_time": "0:40:42"} +{"current_steps": 182, "total_steps": 311, "loss": 0.3307, "lr": 4.466160514232206e-06, "epoch": 0.5852090032154341, "percentage": 58.52, "elapsed_time": "0:56:58", "remaining_time": "0:40:22"} +{"current_steps": 183, "total_steps": 311, "loss": 0.3289, "lr": 4.410216414245771e-06, "epoch": 0.5884244372990354, "percentage": 58.84, "elapsed_time": "0:57:17", "remaining_time": "0:40:04"} +{"current_steps": 184, "total_steps": 311, "loss": 0.3139, "lr": 4.354347093221194e-06, "epoch": 0.5916398713826366, "percentage": 59.16, "elapsed_time": "0:57:35", "remaining_time": "0:39:44"} +{"current_steps": 185, "total_steps": 311, "loss": 0.3249, "lr": 4.298559634858202e-06, "epoch": 0.594855305466238, "percentage": 59.49, "elapsed_time": "0:57:53", "remaining_time": "0:39:25"} +{"current_steps": 186, "total_steps": 311, "loss": 0.3566, "lr": 4.2428611124771184e-06, "epoch": 0.5980707395498392, "percentage": 59.81, "elapsed_time": "0:58:11", "remaining_time": "0:39:06"} +{"current_steps": 187, "total_steps": 311, "loss": 0.3359, "lr": 4.187258588122019e-06, "epoch": 0.6012861736334405, "percentage": 60.13, "elapsed_time": "0:58:30", "remaining_time": "0:38:47"} +{"current_steps": 188, "total_steps": 311, "loss": 0.3231, "lr": 4.131759111665349e-06, "epoch": 0.6045016077170418, "percentage": 60.45, "elapsed_time": "0:58:48", "remaining_time": "0:38:28"} +{"current_steps": 189, "total_steps": 311, "loss": 0.3621, "lr": 4.076369719914055e-06, "epoch": 0.6077170418006431, "percentage": 60.77, "elapsed_time": "0:59:07", "remaining_time": "0:38:09"} +{"current_steps": 190, "total_steps": 311, "loss": 0.3263, "lr": 4.021097435717386e-06, "epoch": 0.6109324758842444, "percentage": 61.09, "elapsed_time": "0:59:25", "remaining_time": "0:37:50"} +{"current_steps": 191, "total_steps": 311, "loss": 0.3376, "lr": 3.965949267076465e-06, "epoch": 0.6141479099678456, "percentage": 61.41, "elapsed_time": "0:59:43", "remaining_time": "0:37:31"} +{"current_steps": 192, "total_steps": 311, "loss": 0.3161, "lr": 3.910932206255742e-06, "epoch": 0.617363344051447, "percentage": 61.74, "elapsed_time": "1:00:01", "remaining_time": "0:37:12"} +{"current_steps": 193, "total_steps": 311, "loss": 0.3241, "lr": 3.856053228896442e-06, "epoch": 0.6205787781350482, "percentage": 62.06, "elapsed_time": "1:00:19", "remaining_time": "0:36:52"} +{"current_steps": 194, "total_steps": 311, "loss": 0.3207, "lr": 3.8013192931321095e-06, "epoch": 0.6237942122186495, "percentage": 62.38, "elapsed_time": "1:00:37", "remaining_time": "0:36:33"} +{"current_steps": 195, "total_steps": 311, "loss": 0.3242, "lr": 3.7467373387063973e-06, "epoch": 0.6270096463022508, "percentage": 62.7, "elapsed_time": "1:00:55", "remaining_time": "0:36:14"} +{"current_steps": 196, "total_steps": 311, "loss": 0.3248, "lr": 3.692314286093167e-06, "epoch": 0.6302250803858521, "percentage": 63.02, "elapsed_time": "1:01:13", "remaining_time": "0:35:55"} +{"current_steps": 197, "total_steps": 311, "loss": 0.3291, "lr": 3.6380570356190346e-06, "epoch": 0.6334405144694534, "percentage": 63.34, "elapsed_time": "1:01:32", "remaining_time": "0:35:36"} +{"current_steps": 198, "total_steps": 311, "loss": 0.3078, "lr": 3.58397246658848e-06, "epoch": 0.6366559485530546, "percentage": 63.67, "elapsed_time": "1:01:51", "remaining_time": "0:35:17"} +{"current_steps": 199, "total_steps": 311, "loss": 0.3197, "lr": 3.5300674364116173e-06, "epoch": 0.639871382636656, "percentage": 63.99, "elapsed_time": "1:02:09", "remaining_time": "0:34:59"} +{"current_steps": 200, "total_steps": 311, "loss": 0.3141, "lr": 3.476348779734732e-06, "epoch": 0.6430868167202572, "percentage": 64.31, "elapsed_time": "1:02:27", "remaining_time": "0:34:40"} +{"current_steps": 201, "total_steps": 311, "loss": 0.3327, "lr": 3.4228233075737225e-06, "epoch": 0.6463022508038585, "percentage": 64.63, "elapsed_time": "1:02:46", "remaining_time": "0:34:21"} +{"current_steps": 202, "total_steps": 311, "loss": 0.3196, "lr": 3.3694978064505258e-06, "epoch": 0.6495176848874598, "percentage": 64.95, "elapsed_time": "1:03:03", "remaining_time": "0:34:01"} +{"current_steps": 203, "total_steps": 311, "loss": 0.355, "lr": 3.316379037532644e-06, "epoch": 0.6527331189710611, "percentage": 65.27, "elapsed_time": "1:03:22", "remaining_time": "0:33:42"} +{"current_steps": 204, "total_steps": 311, "loss": 0.3358, "lr": 3.2634737357758994e-06, "epoch": 0.6559485530546624, "percentage": 65.59, "elapsed_time": "1:03:41", "remaining_time": "0:33:24"} +{"current_steps": 205, "total_steps": 311, "loss": 0.3134, "lr": 3.2107886090705035e-06, "epoch": 0.6591639871382636, "percentage": 65.92, "elapsed_time": "1:03:58", "remaining_time": "0:33:04"} +{"current_steps": 206, "total_steps": 311, "loss": 0.3144, "lr": 3.158330337390565e-06, "epoch": 0.662379421221865, "percentage": 66.24, "elapsed_time": "1:04:17", "remaining_time": "0:32:46"} +{"current_steps": 207, "total_steps": 311, "loss": 0.3043, "lr": 3.10610557194712e-06, "epoch": 0.6655948553054662, "percentage": 66.56, "elapsed_time": "1:04:35", "remaining_time": "0:32:27"} +{"current_steps": 208, "total_steps": 311, "loss": 0.3121, "lr": 3.0541209343448373e-06, "epoch": 0.6688102893890675, "percentage": 66.88, "elapsed_time": "1:04:54", "remaining_time": "0:32:08"} +{"current_steps": 209, "total_steps": 311, "loss": 0.3454, "lr": 3.0023830157424504e-06, "epoch": 0.6720257234726688, "percentage": 67.2, "elapsed_time": "1:05:11", "remaining_time": "0:31:49"} +{"current_steps": 210, "total_steps": 311, "loss": 0.3175, "lr": 2.950898376017064e-06, "epoch": 0.6752411575562701, "percentage": 67.52, "elapsed_time": "1:05:29", "remaining_time": "0:31:30"} +{"current_steps": 211, "total_steps": 311, "loss": 0.319, "lr": 2.8996735429324256e-06, "epoch": 0.6784565916398714, "percentage": 67.85, "elapsed_time": "1:05:47", "remaining_time": "0:31:10"} +{"current_steps": 212, "total_steps": 311, "loss": 0.3085, "lr": 2.848715011311271e-06, "epoch": 0.6816720257234726, "percentage": 68.17, "elapsed_time": "1:06:06", "remaining_time": "0:30:52"} +{"current_steps": 213, "total_steps": 311, "loss": 0.3366, "lr": 2.7980292422118282e-06, "epoch": 0.684887459807074, "percentage": 68.49, "elapsed_time": "1:06:24", "remaining_time": "0:30:33"} +{"current_steps": 214, "total_steps": 311, "loss": 0.3171, "lr": 2.7476226621086354e-06, "epoch": 0.6881028938906752, "percentage": 68.81, "elapsed_time": "1:06:42", "remaining_time": "0:30:14"} +{"current_steps": 215, "total_steps": 311, "loss": 0.3158, "lr": 2.697501662077707e-06, "epoch": 0.6913183279742765, "percentage": 69.13, "elapsed_time": "1:07:01", "remaining_time": "0:29:55"} +{"current_steps": 216, "total_steps": 311, "loss": 0.3474, "lr": 2.6476725969862227e-06, "epoch": 0.6945337620578779, "percentage": 69.45, "elapsed_time": "1:07:18", "remaining_time": "0:29:36"} +{"current_steps": 217, "total_steps": 311, "loss": 0.3287, "lr": 2.5981417846867753e-06, "epoch": 0.6977491961414791, "percentage": 69.77, "elapsed_time": "1:07:37", "remaining_time": "0:29:17"} +{"current_steps": 218, "total_steps": 311, "loss": 0.3206, "lr": 2.548915505216333e-06, "epoch": 0.7009646302250804, "percentage": 70.1, "elapsed_time": "1:07:55", "remaining_time": "0:28:58"} +{"current_steps": 219, "total_steps": 311, "loss": 0.3228, "lr": 2.5000000000000015e-06, "epoch": 0.7041800643086816, "percentage": 70.42, "elapsed_time": "1:08:13", "remaining_time": "0:28:39"} +{"current_steps": 220, "total_steps": 311, "loss": 0.296, "lr": 2.4514014710596467e-06, "epoch": 0.707395498392283, "percentage": 70.74, "elapsed_time": "1:08:31", "remaining_time": "0:28:20"} +{"current_steps": 221, "total_steps": 311, "loss": 0.3358, "lr": 2.4031260802275623e-06, "epoch": 0.7106109324758842, "percentage": 71.06, "elapsed_time": "1:08:50", "remaining_time": "0:28:02"} +{"current_steps": 222, "total_steps": 311, "loss": 0.3332, "lr": 2.3551799483651894e-06, "epoch": 0.7138263665594855, "percentage": 71.38, "elapsed_time": "1:09:09", "remaining_time": "0:27:43"} +{"current_steps": 223, "total_steps": 311, "loss": 0.3356, "lr": 2.307569154587056e-06, "epoch": 0.7170418006430869, "percentage": 71.7, "elapsed_time": "1:09:26", "remaining_time": "0:27:24"} +{"current_steps": 224, "total_steps": 311, "loss": 0.315, "lr": 2.2602997354900075e-06, "epoch": 0.7202572347266881, "percentage": 72.03, "elapsed_time": "1:09:45", "remaining_time": "0:27:05"} +{"current_steps": 225, "total_steps": 311, "loss": 0.3355, "lr": 2.2133776843878185e-06, "epoch": 0.7234726688102894, "percentage": 72.35, "elapsed_time": "1:10:03", "remaining_time": "0:26:46"} +{"current_steps": 226, "total_steps": 311, "loss": 0.3301, "lr": 2.166808950551296e-06, "epoch": 0.7266881028938906, "percentage": 72.67, "elapsed_time": "1:10:20", "remaining_time": "0:26:27"} +{"current_steps": 227, "total_steps": 311, "loss": 0.3321, "lr": 2.120599438453968e-06, "epoch": 0.729903536977492, "percentage": 72.99, "elapsed_time": "1:10:38", "remaining_time": "0:26:08"} +{"current_steps": 228, "total_steps": 311, "loss": 0.3074, "lr": 2.074755007023461e-06, "epoch": 0.7331189710610932, "percentage": 73.31, "elapsed_time": "1:10:57", "remaining_time": "0:25:49"} +{"current_steps": 229, "total_steps": 311, "loss": 0.342, "lr": 2.0292814688986375e-06, "epoch": 0.7363344051446945, "percentage": 73.63, "elapsed_time": "1:11:15", "remaining_time": "0:25:30"} +{"current_steps": 230, "total_steps": 311, "loss": 0.3261, "lr": 1.9841845896926022e-06, "epoch": 0.7395498392282959, "percentage": 73.95, "elapsed_time": "1:11:33", "remaining_time": "0:25:12"} +{"current_steps": 231, "total_steps": 311, "loss": 0.3312, "lr": 1.9394700872616856e-06, "epoch": 0.7427652733118971, "percentage": 74.28, "elapsed_time": "1:11:52", "remaining_time": "0:24:53"} +{"current_steps": 232, "total_steps": 311, "loss": 0.341, "lr": 1.8951436309804766e-06, "epoch": 0.7459807073954984, "percentage": 74.6, "elapsed_time": "1:12:11", "remaining_time": "0:24:34"} +{"current_steps": 233, "total_steps": 311, "loss": 0.3039, "lr": 1.8512108410229878e-06, "epoch": 0.7491961414790996, "percentage": 74.92, "elapsed_time": "1:12:29", "remaining_time": "0:24:15"} +{"current_steps": 234, "total_steps": 311, "loss": 0.3003, "lr": 1.8076772876500831e-06, "epoch": 0.752411575562701, "percentage": 75.24, "elapsed_time": "1:12:46", "remaining_time": "0:23:56"} +{"current_steps": 235, "total_steps": 311, "loss": 0.3283, "lr": 1.7645484905032129e-06, "epoch": 0.7556270096463023, "percentage": 75.56, "elapsed_time": "1:13:05", "remaining_time": "0:23:38"} +{"current_steps": 236, "total_steps": 311, "loss": 0.3128, "lr": 1.7218299179045789e-06, "epoch": 0.7588424437299035, "percentage": 75.88, "elapsed_time": "1:13:23", "remaining_time": "0:23:19"} +{"current_steps": 237, "total_steps": 311, "loss": 0.3338, "lr": 1.6795269861638041e-06, "epoch": 0.7620578778135049, "percentage": 76.21, "elapsed_time": "1:13:40", "remaining_time": "0:23:00"} +{"current_steps": 238, "total_steps": 311, "loss": 0.2865, "lr": 1.6376450588911985e-06, "epoch": 0.7652733118971061, "percentage": 76.53, "elapsed_time": "1:13:59", "remaining_time": "0:22:41"} +{"current_steps": 239, "total_steps": 311, "loss": 0.3154, "lr": 1.5961894463176942e-06, "epoch": 0.7684887459807074, "percentage": 76.85, "elapsed_time": "1:14:16", "remaining_time": "0:22:22"} +{"current_steps": 240, "total_steps": 311, "loss": 0.3207, "lr": 1.555165404621567e-06, "epoch": 0.7717041800643086, "percentage": 77.17, "elapsed_time": "1:14:35", "remaining_time": "0:22:04"} +{"current_steps": 241, "total_steps": 311, "loss": 0.3403, "lr": 1.5145781352620054e-06, "epoch": 0.77491961414791, "percentage": 77.49, "elapsed_time": "1:14:53", "remaining_time": "0:21:45"} +{"current_steps": 242, "total_steps": 311, "loss": 0.2983, "lr": 1.4744327843196043e-06, "epoch": 0.7781350482315113, "percentage": 77.81, "elapsed_time": "1:15:10", "remaining_time": "0:21:26"} +{"current_steps": 243, "total_steps": 311, "loss": 0.3052, "lr": 1.434734441843899e-06, "epoch": 0.7813504823151125, "percentage": 78.14, "elapsed_time": "1:15:29", "remaining_time": "0:21:07"} +{"current_steps": 244, "total_steps": 311, "loss": 0.3155, "lr": 1.3954881412079945e-06, "epoch": 0.7845659163987139, "percentage": 78.46, "elapsed_time": "1:15:47", "remaining_time": "0:20:48"} +{"current_steps": 245, "total_steps": 311, "loss": 0.29, "lr": 1.3566988584703817e-06, "epoch": 0.7877813504823151, "percentage": 78.78, "elapsed_time": "1:16:07", "remaining_time": "0:20:30"} +{"current_steps": 246, "total_steps": 311, "loss": 0.303, "lr": 1.3183715117440143e-06, "epoch": 0.7909967845659164, "percentage": 79.1, "elapsed_time": "1:16:25", "remaining_time": "0:20:11"} +{"current_steps": 247, "total_steps": 311, "loss": 0.3258, "lr": 1.280510960572745e-06, "epoch": 0.7942122186495176, "percentage": 79.42, "elapsed_time": "1:16:43", "remaining_time": "0:19:52"} +{"current_steps": 248, "total_steps": 311, "loss": 0.3301, "lr": 1.2431220053151832e-06, "epoch": 0.797427652733119, "percentage": 79.74, "elapsed_time": "1:17:01", "remaining_time": "0:19:34"} +{"current_steps": 249, "total_steps": 311, "loss": 0.2993, "lr": 1.2062093865360458e-06, "epoch": 0.8006430868167203, "percentage": 80.06, "elapsed_time": "1:17:20", "remaining_time": "0:19:15"} +{"current_steps": 250, "total_steps": 311, "loss": 0.3119, "lr": 1.1697777844051105e-06, "epoch": 0.8038585209003215, "percentage": 80.39, "elapsed_time": "1:17:38", "remaining_time": "0:18:56"} +{"current_steps": 251, "total_steps": 311, "loss": 0.3017, "lr": 1.1338318181038037e-06, "epoch": 0.8070739549839229, "percentage": 80.71, "elapsed_time": "1:17:56", "remaining_time": "0:18:37"} +{"current_steps": 252, "total_steps": 311, "loss": 0.3205, "lr": 1.0983760452395415e-06, "epoch": 0.8102893890675241, "percentage": 81.03, "elapsed_time": "1:18:15", "remaining_time": "0:18:19"} +{"current_steps": 253, "total_steps": 311, "loss": 0.3265, "lr": 1.063414961267859e-06, "epoch": 0.8135048231511254, "percentage": 81.35, "elapsed_time": "1:18:33", "remaining_time": "0:18:00"} +{"current_steps": 254, "total_steps": 311, "loss": 0.3048, "lr": 1.02895299892244e-06, "epoch": 0.8167202572347267, "percentage": 81.67, "elapsed_time": "1:18:50", "remaining_time": "0:17:41"} +{"current_steps": 255, "total_steps": 311, "loss": 0.327, "lr": 9.949945276530782e-07, "epoch": 0.819935691318328, "percentage": 81.99, "elapsed_time": "1:19:08", "remaining_time": "0:17:22"} +{"current_steps": 256, "total_steps": 311, "loss": 0.304, "lr": 9.615438530716753e-07, "epoch": 0.8231511254019293, "percentage": 82.32, "elapsed_time": "1:19:27", "remaining_time": "0:17:04"} +{"current_steps": 257, "total_steps": 311, "loss": 0.3335, "lr": 9.286052164063369e-07, "epoch": 0.8263665594855305, "percentage": 82.64, "elapsed_time": "1:19:45", "remaining_time": "0:16:45"} +{"current_steps": 258, "total_steps": 311, "loss": 0.3438, "lr": 8.961827939636198e-07, "epoch": 0.8295819935691319, "percentage": 82.96, "elapsed_time": "1:20:03", "remaining_time": "0:16:26"} +{"current_steps": 259, "total_steps": 311, "loss": 0.3156, "lr": 8.64280696599008e-07, "epoch": 0.8327974276527331, "percentage": 83.28, "elapsed_time": "1:20:20", "remaining_time": "0:16:07"} +{"current_steps": 260, "total_steps": 311, "loss": 0.3126, "lr": 8.329029691957124e-07, "epoch": 0.8360128617363344, "percentage": 83.6, "elapsed_time": "1:20:38", "remaining_time": "0:15:49"} +{"current_steps": 261, "total_steps": 311, "loss": 0.3188, "lr": 8.02053590151805e-07, "epoch": 0.8392282958199357, "percentage": 83.92, "elapsed_time": "1:20:57", "remaining_time": "0:15:30"} +{"current_steps": 262, "total_steps": 311, "loss": 0.3211, "lr": 7.717364708758024e-07, "epoch": 0.842443729903537, "percentage": 84.24, "elapsed_time": "1:21:15", "remaining_time": "0:15:11"} +{"current_steps": 263, "total_steps": 311, "loss": 0.2978, "lr": 7.41955455290726e-07, "epoch": 0.8456591639871383, "percentage": 84.57, "elapsed_time": "1:21:33", "remaining_time": "0:14:53"} +{"current_steps": 264, "total_steps": 311, "loss": 0.3173, "lr": 7.127143193467445e-07, "epoch": 0.8488745980707395, "percentage": 84.89, "elapsed_time": "1:21:52", "remaining_time": "0:14:34"} +{"current_steps": 265, "total_steps": 311, "loss": 0.3002, "lr": 6.840167705424106e-07, "epoch": 0.8520900321543409, "percentage": 85.21, "elapsed_time": "1:22:11", "remaining_time": "0:14:15"} +{"current_steps": 266, "total_steps": 311, "loss": 0.3243, "lr": 6.558664474545817e-07, "epoch": 0.8553054662379421, "percentage": 85.53, "elapsed_time": "1:22:29", "remaining_time": "0:13:57"} +{"current_steps": 267, "total_steps": 311, "loss": 0.2968, "lr": 6.282669192770896e-07, "epoch": 0.8585209003215434, "percentage": 85.85, "elapsed_time": "1:22:47", "remaining_time": "0:13:38"} +{"current_steps": 268, "total_steps": 311, "loss": 0.32, "lr": 6.012216853682001e-07, "epoch": 0.8617363344051447, "percentage": 86.17, "elapsed_time": "1:23:05", "remaining_time": "0:13:19"} +{"current_steps": 269, "total_steps": 311, "loss": 0.309, "lr": 5.747341748069229e-07, "epoch": 0.864951768488746, "percentage": 86.5, "elapsed_time": "1:23:23", "remaining_time": "0:13:01"} +{"current_steps": 270, "total_steps": 311, "loss": 0.3231, "lr": 5.488077459582425e-07, "epoch": 0.8681672025723473, "percentage": 86.82, "elapsed_time": "1:23:41", "remaining_time": "0:12:42"} +{"current_steps": 271, "total_steps": 311, "loss": 0.292, "lr": 5.234456860473042e-07, "epoch": 0.8713826366559485, "percentage": 87.14, "elapsed_time": "1:24:00", "remaining_time": "0:12:23"} +{"current_steps": 272, "total_steps": 311, "loss": 0.3043, "lr": 4.986512107426283e-07, "epoch": 0.8745980707395499, "percentage": 87.46, "elapsed_time": "1:24:18", "remaining_time": "0:12:05"} +{"current_steps": 273, "total_steps": 311, "loss": 0.2818, "lr": 4.7442746374839363e-07, "epoch": 0.8778135048231511, "percentage": 87.78, "elapsed_time": "1:24:36", "remaining_time": "0:11:46"} +{"current_steps": 274, "total_steps": 311, "loss": 0.295, "lr": 4.50777516405847e-07, "epoch": 0.8810289389067524, "percentage": 88.1, "elapsed_time": "1:24:55", "remaining_time": "0:11:28"} +{"current_steps": 275, "total_steps": 311, "loss": 0.2951, "lr": 4.2770436730388166e-07, "epoch": 0.8842443729903537, "percentage": 88.42, "elapsed_time": "1:25:13", "remaining_time": "0:11:09"} +{"current_steps": 276, "total_steps": 311, "loss": 0.3191, "lr": 4.05210941898847e-07, "epoch": 0.887459807073955, "percentage": 88.75, "elapsed_time": "1:25:31", "remaining_time": "0:10:50"} +{"current_steps": 277, "total_steps": 311, "loss": 0.3118, "lr": 3.8330009214363197e-07, "epoch": 0.8906752411575563, "percentage": 89.07, "elapsed_time": "1:25:49", "remaining_time": "0:10:32"} +{"current_steps": 278, "total_steps": 311, "loss": 0.3207, "lr": 3.619745961260623e-07, "epoch": 0.8938906752411575, "percentage": 89.39, "elapsed_time": "1:26:07", "remaining_time": "0:10:13"} +{"current_steps": 279, "total_steps": 311, "loss": 0.3278, "lr": 3.4123715771665786e-07, "epoch": 0.8971061093247589, "percentage": 89.71, "elapsed_time": "1:26:25", "remaining_time": "0:09:54"} +{"current_steps": 280, "total_steps": 311, "loss": 0.2798, "lr": 3.2109040622582186e-07, "epoch": 0.9003215434083601, "percentage": 90.03, "elapsed_time": "1:26:43", "remaining_time": "0:09:36"} +{"current_steps": 281, "total_steps": 311, "loss": 0.307, "lr": 3.015368960704584e-07, "epoch": 0.9035369774919614, "percentage": 90.35, "elapsed_time": "1:27:01", "remaining_time": "0:09:17"} +{"current_steps": 282, "total_steps": 311, "loss": 0.2861, "lr": 2.8257910645009935e-07, "epoch": 0.9067524115755627, "percentage": 90.68, "elapsed_time": "1:27:19", "remaining_time": "0:08:58"} +{"current_steps": 283, "total_steps": 311, "loss": 0.3065, "lr": 2.6421944103256657e-07, "epoch": 0.909967845659164, "percentage": 91.0, "elapsed_time": "1:27:38", "remaining_time": "0:08:40"} +{"current_steps": 284, "total_steps": 311, "loss": 0.3013, "lr": 2.4646022764920843e-07, "epoch": 0.9131832797427653, "percentage": 91.32, "elapsed_time": "1:27:56", "remaining_time": "0:08:21"} +{"current_steps": 285, "total_steps": 311, "loss": 0.309, "lr": 2.2930371799975593e-07, "epoch": 0.9163987138263665, "percentage": 91.64, "elapsed_time": "1:28:14", "remaining_time": "0:08:03"} +{"current_steps": 286, "total_steps": 311, "loss": 0.2973, "lr": 2.1275208736682262e-07, "epoch": 0.9196141479099679, "percentage": 91.96, "elapsed_time": "1:28:32", "remaining_time": "0:07:44"} +{"current_steps": 287, "total_steps": 311, "loss": 0.307, "lr": 1.9680743434010385e-07, "epoch": 0.9228295819935691, "percentage": 92.28, "elapsed_time": "1:28:50", "remaining_time": "0:07:25"} +{"current_steps": 288, "total_steps": 311, "loss": 0.3035, "lr": 1.814717805502958e-07, "epoch": 0.9260450160771704, "percentage": 92.6, "elapsed_time": "1:29:08", "remaining_time": "0:07:07"} +{"current_steps": 289, "total_steps": 311, "loss": 0.2966, "lr": 1.667470704127694e-07, "epoch": 0.9292604501607717, "percentage": 92.93, "elapsed_time": "1:29:26", "remaining_time": "0:06:48"} +{"current_steps": 290, "total_steps": 311, "loss": 0.3021, "lr": 1.5263517088103862e-07, "epoch": 0.932475884244373, "percentage": 93.25, "elapsed_time": "1:29:45", "remaining_time": "0:06:29"} +{"current_steps": 291, "total_steps": 311, "loss": 0.3164, "lr": 1.3913787121004717e-07, "epoch": 0.9356913183279743, "percentage": 93.57, "elapsed_time": "1:30:03", "remaining_time": "0:06:11"} +{"current_steps": 292, "total_steps": 311, "loss": 0.3191, "lr": 1.2625688272930925e-07, "epoch": 0.9389067524115756, "percentage": 93.89, "elapsed_time": "1:30:21", "remaining_time": "0:05:52"} +{"current_steps": 293, "total_steps": 311, "loss": 0.3005, "lr": 1.1399383862592928e-07, "epoch": 0.9421221864951769, "percentage": 94.21, "elapsed_time": "1:30:40", "remaining_time": "0:05:34"} +{"current_steps": 294, "total_steps": 311, "loss": 0.3019, "lr": 1.0235029373752758e-07, "epoch": 0.9453376205787781, "percentage": 94.53, "elapsed_time": "1:30:58", "remaining_time": "0:05:15"} +{"current_steps": 295, "total_steps": 311, "loss": 0.2809, "lr": 9.132772435510362e-08, "epoch": 0.9485530546623794, "percentage": 94.86, "elapsed_time": "1:31:16", "remaining_time": "0:04:57"} +{"current_steps": 296, "total_steps": 311, "loss": 0.2959, "lr": 8.092752803585513e-08, "epoch": 0.9517684887459807, "percentage": 95.18, "elapsed_time": "1:31:34", "remaining_time": "0:04:38"} +{"current_steps": 297, "total_steps": 311, "loss": 0.2872, "lr": 7.115102342598101e-08, "epoch": 0.954983922829582, "percentage": 95.5, "elapsed_time": "1:31:52", "remaining_time": "0:04:19"} +{"current_steps": 298, "total_steps": 311, "loss": 0.3173, "lr": 6.199945009349173e-08, "epoch": 0.9581993569131833, "percentage": 95.82, "elapsed_time": "1:32:11", "remaining_time": "0:04:01"} +{"current_steps": 299, "total_steps": 311, "loss": 0.2896, "lr": 5.3473968371040575e-08, "epoch": 0.9614147909967846, "percentage": 96.14, "elapsed_time": "1:32:29", "remaining_time": "0:03:42"} +{"current_steps": 300, "total_steps": 311, "loss": 0.2965, "lr": 4.55756592088058e-08, "epoch": 0.9646302250803859, "percentage": 96.46, "elapsed_time": "1:32:47", "remaining_time": "0:03:24"} +{"current_steps": 301, "total_steps": 311, "loss": 0.3084, "lr": 3.8305524037438035e-08, "epoch": 0.9678456591639871, "percentage": 96.78, "elapsed_time": "1:33:05", "remaining_time": "0:03:05"} +{"current_steps": 302, "total_steps": 311, "loss": 0.328, "lr": 3.166448464108629e-08, "epoch": 0.9710610932475884, "percentage": 97.11, "elapsed_time": "1:33:24", "remaining_time": "0:02:47"} +{"current_steps": 303, "total_steps": 311, "loss": 0.2849, "lr": 2.5653383040524228e-08, "epoch": 0.9742765273311897, "percentage": 97.43, "elapsed_time": "1:33:42", "remaining_time": "0:02:28"} +{"current_steps": 304, "total_steps": 311, "loss": 0.3468, "lr": 2.0272981386393332e-08, "epoch": 0.977491961414791, "percentage": 97.75, "elapsed_time": "1:34:01", "remaining_time": "0:02:09"} +{"current_steps": 305, "total_steps": 311, "loss": 0.2976, "lr": 1.552396186256411e-08, "epoch": 0.9807073954983923, "percentage": 98.07, "elapsed_time": "1:34:20", "remaining_time": "0:01:51"} +{"current_steps": 306, "total_steps": 311, "loss": 0.3228, "lr": 1.1406926599646373e-08, "epoch": 0.9839228295819936, "percentage": 98.39, "elapsed_time": "1:34:38", "remaining_time": "0:01:32"} +{"current_steps": 307, "total_steps": 311, "loss": 0.2999, "lr": 7.922397598642551e-09, "epoch": 0.9871382636655949, "percentage": 98.71, "elapsed_time": "1:34:56", "remaining_time": "0:01:14"} +{"current_steps": 308, "total_steps": 311, "loss": 0.3042, "lr": 5.0708166647628345e-09, "epoch": 0.9903536977491961, "percentage": 99.04, "elapsed_time": "1:35:14", "remaining_time": "0:00:55"} +{"current_steps": 309, "total_steps": 311, "loss": 0.3057, "lr": 2.8525453514099966e-09, "epoch": 0.9935691318327974, "percentage": 99.36, "elapsed_time": "1:35:33", "remaining_time": "0:00:37"} +{"current_steps": 310, "total_steps": 311, "loss": 0.3086, "lr": 1.2678649143349485e-09, "epoch": 0.9967845659163987, "percentage": 99.68, "elapsed_time": "1:35:51", "remaining_time": "0:00:18"} +{"current_steps": 311, "total_steps": 311, "loss": 0.3017, "lr": 3.1697627597970794e-10, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "1:36:09", "remaining_time": "0:00:00"} +{"current_steps": 311, "total_steps": 311, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "1:37:07", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..57fb2ef --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2220 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 311, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003215434083601286, + "grad_norm": 7.461493968963623, + "learning_rate": 0.0, + "loss": 1.4509, + "step": 1 + }, + { + "epoch": 0.006430868167202572, + "grad_norm": 7.841219425201416, + "learning_rate": 3.125e-07, + "loss": 1.4192, + "step": 2 + }, + { + "epoch": 0.00964630225080386, + "grad_norm": 8.970290184020996, + "learning_rate": 6.25e-07, + "loss": 1.4011, + "step": 3 + }, + { + "epoch": 0.012861736334405145, + "grad_norm": 6.137041091918945, + "learning_rate": 9.375000000000001e-07, + "loss": 1.3749, + "step": 4 + }, + { + "epoch": 0.01607717041800643, + "grad_norm": 5.494370460510254, + "learning_rate": 1.25e-06, + "loss": 1.3428, + "step": 5 + }, + { + "epoch": 0.01929260450160772, + "grad_norm": 4.324854373931885, + "learning_rate": 1.5625e-06, + "loss": 1.2579, + "step": 6 + }, + { + "epoch": 0.022508038585209004, + "grad_norm": 5.789402008056641, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.2162, + "step": 7 + }, + { + "epoch": 0.02572347266881029, + "grad_norm": 4.819009304046631, + "learning_rate": 2.1875000000000002e-06, + "loss": 1.1246, + "step": 8 + }, + { + "epoch": 0.028938906752411574, + "grad_norm": 4.173788070678711, + "learning_rate": 2.5e-06, + "loss": 1.0988, + "step": 9 + }, + { + "epoch": 0.03215434083601286, + "grad_norm": 3.964716911315918, + "learning_rate": 2.8125e-06, + "loss": 1.0293, + "step": 10 + }, + { + "epoch": 0.03536977491961415, + "grad_norm": 3.7624258995056152, + "learning_rate": 3.125e-06, + "loss": 1.0261, + "step": 11 + }, + { + "epoch": 0.03858520900321544, + "grad_norm": 2.7567455768585205, + "learning_rate": 3.4375e-06, + "loss": 0.9731, + "step": 12 + }, + { + "epoch": 0.04180064308681672, + "grad_norm": 11.352989196777344, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9391, + "step": 13 + }, + { + "epoch": 0.04501607717041801, + "grad_norm": 2.96602201461792, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.8457, + "step": 14 + }, + { + "epoch": 0.04823151125401929, + "grad_norm": 3.602654218673706, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.854, + "step": 15 + }, + { + "epoch": 0.05144694533762058, + "grad_norm": 3.3031013011932373, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.7838, + "step": 16 + }, + { + "epoch": 0.05466237942122187, + "grad_norm": 3.8468689918518066, + "learning_rate": 5e-06, + "loss": 0.8028, + "step": 17 + }, + { + "epoch": 0.05787781350482315, + "grad_norm": 5.588563919067383, + "learning_rate": 5.3125e-06, + "loss": 0.7586, + "step": 18 + }, + { + "epoch": 0.06109324758842444, + "grad_norm": 3.002431631088257, + "learning_rate": 5.625e-06, + "loss": 0.7981, + "step": 19 + }, + { + "epoch": 0.06430868167202572, + "grad_norm": 2.289362668991089, + "learning_rate": 5.9375e-06, + "loss": 0.6801, + "step": 20 + }, + { + "epoch": 0.06752411575562701, + "grad_norm": 2.4948556423187256, + "learning_rate": 6.25e-06, + "loss": 0.7076, + "step": 21 + }, + { + "epoch": 0.0707395498392283, + "grad_norm": 3.449002742767334, + "learning_rate": 6.5625e-06, + "loss": 0.6911, + "step": 22 + }, + { + "epoch": 0.07395498392282958, + "grad_norm": 2.8027281761169434, + "learning_rate": 6.875e-06, + "loss": 0.6124, + "step": 23 + }, + { + "epoch": 0.07717041800643087, + "grad_norm": 3.1212947368621826, + "learning_rate": 7.1875e-06, + "loss": 0.6253, + "step": 24 + }, + { + "epoch": 0.08038585209003216, + "grad_norm": 2.3612632751464844, + "learning_rate": 7.500000000000001e-06, + "loss": 0.6117, + "step": 25 + }, + { + "epoch": 0.08360128617363344, + "grad_norm": 3.0025129318237305, + "learning_rate": 7.8125e-06, + "loss": 0.5949, + "step": 26 + }, + { + "epoch": 0.08681672025723473, + "grad_norm": 2.812004566192627, + "learning_rate": 8.125000000000001e-06, + "loss": 0.5983, + "step": 27 + }, + { + "epoch": 0.09003215434083602, + "grad_norm": 3.4180963039398193, + "learning_rate": 8.4375e-06, + "loss": 0.5706, + "step": 28 + }, + { + "epoch": 0.0932475884244373, + "grad_norm": 3.952913522720337, + "learning_rate": 8.750000000000001e-06, + "loss": 0.536, + "step": 29 + }, + { + "epoch": 0.09646302250803858, + "grad_norm": 2.5324349403381348, + "learning_rate": 9.0625e-06, + "loss": 0.5384, + "step": 30 + }, + { + "epoch": 0.09967845659163987, + "grad_norm": 2.9339852333068848, + "learning_rate": 9.375000000000001e-06, + "loss": 0.5491, + "step": 31 + }, + { + "epoch": 0.10289389067524116, + "grad_norm": 2.1388156414031982, + "learning_rate": 9.6875e-06, + "loss": 0.524, + "step": 32 + }, + { + "epoch": 0.10610932475884244, + "grad_norm": 2.724695920944214, + "learning_rate": 1e-05, + "loss": 0.5224, + "step": 33 + }, + { + "epoch": 0.10932475884244373, + "grad_norm": 2.3847827911376953, + "learning_rate": 9.999683023724021e-06, + "loss": 0.5185, + "step": 34 + }, + { + "epoch": 0.11254019292604502, + "grad_norm": 5.435914039611816, + "learning_rate": 9.998732135085665e-06, + "loss": 0.5219, + "step": 35 + }, + { + "epoch": 0.1157556270096463, + "grad_norm": 2.3512284755706787, + "learning_rate": 9.99714745464859e-06, + "loss": 0.4951, + "step": 36 + }, + { + "epoch": 0.1189710610932476, + "grad_norm": 2.4358675479888916, + "learning_rate": 9.994929183335237e-06, + "loss": 0.5087, + "step": 37 + }, + { + "epoch": 0.12218649517684887, + "grad_norm": 4.703172206878662, + "learning_rate": 9.992077602401358e-06, + "loss": 0.4586, + "step": 38 + }, + { + "epoch": 0.12540192926045016, + "grad_norm": 1.9784342050552368, + "learning_rate": 9.988593073400354e-06, + "loss": 0.4962, + "step": 39 + }, + { + "epoch": 0.12861736334405144, + "grad_norm": 2.9739573001861572, + "learning_rate": 9.984476038137437e-06, + "loss": 0.5232, + "step": 40 + }, + { + "epoch": 0.13183279742765272, + "grad_norm": 4.0715413093566895, + "learning_rate": 9.979727018613607e-06, + "loss": 0.4603, + "step": 41 + }, + { + "epoch": 0.13504823151125403, + "grad_norm": 3.099459409713745, + "learning_rate": 9.974346616959476e-06, + "loss": 0.456, + "step": 42 + }, + { + "epoch": 0.1382636655948553, + "grad_norm": 3.039191246032715, + "learning_rate": 9.968335515358916e-06, + "loss": 0.45, + "step": 43 + }, + { + "epoch": 0.1414790996784566, + "grad_norm": 22.894182205200195, + "learning_rate": 9.961694475962562e-06, + "loss": 0.4605, + "step": 44 + }, + { + "epoch": 0.14469453376205788, + "grad_norm": 2.549868106842041, + "learning_rate": 9.954424340791195e-06, + "loss": 0.4636, + "step": 45 + }, + { + "epoch": 0.14790996784565916, + "grad_norm": 2.1484735012054443, + "learning_rate": 9.94652603162896e-06, + "loss": 0.4533, + "step": 46 + }, + { + "epoch": 0.15112540192926044, + "grad_norm": 2.6148245334625244, + "learning_rate": 9.938000549906509e-06, + "loss": 0.442, + "step": 47 + }, + { + "epoch": 0.15434083601286175, + "grad_norm": 3.580359697341919, + "learning_rate": 9.92884897657402e-06, + "loss": 0.477, + "step": 48 + }, + { + "epoch": 0.15755627009646303, + "grad_norm": 2.5778746604919434, + "learning_rate": 9.919072471964146e-06, + "loss": 0.4266, + "step": 49 + }, + { + "epoch": 0.1607717041800643, + "grad_norm": 9.264074325561523, + "learning_rate": 9.908672275644898e-06, + "loss": 0.4375, + "step": 50 + }, + { + "epoch": 0.1639871382636656, + "grad_norm": 3.2539267539978027, + "learning_rate": 9.897649706262474e-06, + "loss": 0.4438, + "step": 51 + }, + { + "epoch": 0.16720257234726688, + "grad_norm": 3.373600721359253, + "learning_rate": 9.88600616137407e-06, + "loss": 0.4388, + "step": 52 + }, + { + "epoch": 0.17041800643086816, + "grad_norm": 5.1715898513793945, + "learning_rate": 9.873743117270691e-06, + "loss": 0.4612, + "step": 53 + }, + { + "epoch": 0.17363344051446947, + "grad_norm": 4.894754409790039, + "learning_rate": 9.860862128789954e-06, + "loss": 0.4513, + "step": 54 + }, + { + "epoch": 0.17684887459807075, + "grad_norm": 3.1228713989257812, + "learning_rate": 9.847364829118963e-06, + "loss": 0.4439, + "step": 55 + }, + { + "epoch": 0.18006430868167203, + "grad_norm": 5.72307014465332, + "learning_rate": 9.833252929587231e-06, + "loss": 0.4484, + "step": 56 + }, + { + "epoch": 0.1832797427652733, + "grad_norm": 7.6115336418151855, + "learning_rate": 9.818528219449705e-06, + "loss": 0.4642, + "step": 57 + }, + { + "epoch": 0.1864951768488746, + "grad_norm": 4.580008506774902, + "learning_rate": 9.803192565659898e-06, + "loss": 0.4289, + "step": 58 + }, + { + "epoch": 0.18971061093247588, + "grad_norm": 4.601083278656006, + "learning_rate": 9.78724791263318e-06, + "loss": 0.416, + "step": 59 + }, + { + "epoch": 0.19292604501607716, + "grad_norm": 5.066440105438232, + "learning_rate": 9.770696282000245e-06, + "loss": 0.4083, + "step": 60 + }, + { + "epoch": 0.19614147909967847, + "grad_norm": 4.051520824432373, + "learning_rate": 9.753539772350792e-06, + "loss": 0.4177, + "step": 61 + }, + { + "epoch": 0.19935691318327975, + "grad_norm": 3.406569242477417, + "learning_rate": 9.735780558967434e-06, + "loss": 0.4328, + "step": 62 + }, + { + "epoch": 0.20257234726688103, + "grad_norm": 4.951582908630371, + "learning_rate": 9.717420893549902e-06, + "loss": 0.424, + "step": 63 + }, + { + "epoch": 0.2057877813504823, + "grad_norm": 2.5664072036743164, + "learning_rate": 9.698463103929542e-06, + "loss": 0.4254, + "step": 64 + }, + { + "epoch": 0.2090032154340836, + "grad_norm": 2.8900935649871826, + "learning_rate": 9.67890959377418e-06, + "loss": 0.4202, + "step": 65 + }, + { + "epoch": 0.21221864951768488, + "grad_norm": 3.5653440952301025, + "learning_rate": 9.658762842283343e-06, + "loss": 0.397, + "step": 66 + }, + { + "epoch": 0.21543408360128619, + "grad_norm": 2.600797414779663, + "learning_rate": 9.638025403873939e-06, + "loss": 0.3912, + "step": 67 + }, + { + "epoch": 0.21864951768488747, + "grad_norm": 7.3537397384643555, + "learning_rate": 9.616699907856368e-06, + "loss": 0.3758, + "step": 68 + }, + { + "epoch": 0.22186495176848875, + "grad_norm": 5.298691272735596, + "learning_rate": 9.594789058101154e-06, + "loss": 0.4368, + "step": 69 + }, + { + "epoch": 0.22508038585209003, + "grad_norm": 2.9018328189849854, + "learning_rate": 9.57229563269612e-06, + "loss": 0.4067, + "step": 70 + }, + { + "epoch": 0.2282958199356913, + "grad_norm": 2.2843472957611084, + "learning_rate": 9.549222483594154e-06, + "loss": 0.3884, + "step": 71 + }, + { + "epoch": 0.2315112540192926, + "grad_norm": 4.242974281311035, + "learning_rate": 9.525572536251608e-06, + "loss": 0.3895, + "step": 72 + }, + { + "epoch": 0.2347266881028939, + "grad_norm": 3.73633074760437, + "learning_rate": 9.501348789257373e-06, + "loss": 0.408, + "step": 73 + }, + { + "epoch": 0.2379421221864952, + "grad_norm": 7.407820224761963, + "learning_rate": 9.476554313952697e-06, + "loss": 0.3862, + "step": 74 + }, + { + "epoch": 0.24115755627009647, + "grad_norm": 4.708957672119141, + "learning_rate": 9.451192254041759e-06, + "loss": 0.4149, + "step": 75 + }, + { + "epoch": 0.24437299035369775, + "grad_norm": 6.709017276763916, + "learning_rate": 9.425265825193077e-06, + "loss": 0.38, + "step": 76 + }, + { + "epoch": 0.24758842443729903, + "grad_norm": 2.1756374835968018, + "learning_rate": 9.398778314631801e-06, + "loss": 0.3799, + "step": 77 + }, + { + "epoch": 0.2508038585209003, + "grad_norm": 3.1432833671569824, + "learning_rate": 9.371733080722911e-06, + "loss": 0.3882, + "step": 78 + }, + { + "epoch": 0.2540192926045016, + "grad_norm": 2.0718796253204346, + "learning_rate": 9.34413355254542e-06, + "loss": 0.4123, + "step": 79 + }, + { + "epoch": 0.2572347266881029, + "grad_norm": 3.231426954269409, + "learning_rate": 9.31598322945759e-06, + "loss": 0.3627, + "step": 80 + }, + { + "epoch": 0.2604501607717042, + "grad_norm": 3.3169357776641846, + "learning_rate": 9.287285680653254e-06, + "loss": 0.3747, + "step": 81 + }, + { + "epoch": 0.26366559485530544, + "grad_norm": 1.9841314554214478, + "learning_rate": 9.258044544709276e-06, + "loss": 0.399, + "step": 82 + }, + { + "epoch": 0.26688102893890675, + "grad_norm": 2.6931841373443604, + "learning_rate": 9.228263529124199e-06, + "loss": 0.3995, + "step": 83 + }, + { + "epoch": 0.27009646302250806, + "grad_norm": 2.48873233795166, + "learning_rate": 9.197946409848196e-06, + "loss": 0.4221, + "step": 84 + }, + { + "epoch": 0.2733118971061093, + "grad_norm": 20.441673278808594, + "learning_rate": 9.167097030804289e-06, + "loss": 0.3649, + "step": 85 + }, + { + "epoch": 0.2765273311897106, + "grad_norm": 2.6681602001190186, + "learning_rate": 9.135719303400995e-06, + "loss": 0.3638, + "step": 86 + }, + { + "epoch": 0.2797427652733119, + "grad_norm": 4.435401439666748, + "learning_rate": 9.103817206036383e-06, + "loss": 0.3722, + "step": 87 + }, + { + "epoch": 0.2829581993569132, + "grad_norm": 5.914163589477539, + "learning_rate": 9.071394783593664e-06, + "loss": 0.3656, + "step": 88 + }, + { + "epoch": 0.2861736334405145, + "grad_norm": 6.216729640960693, + "learning_rate": 9.038456146928325e-06, + "loss": 0.3916, + "step": 89 + }, + { + "epoch": 0.28938906752411575, + "grad_norm": 2.873570442199707, + "learning_rate": 9.005005472346923e-06, + "loss": 0.3903, + "step": 90 + }, + { + "epoch": 0.29260450160771706, + "grad_norm": 4.470005035400391, + "learning_rate": 8.971047001077561e-06, + "loss": 0.3987, + "step": 91 + }, + { + "epoch": 0.2958199356913183, + "grad_norm": 2.5284571647644043, + "learning_rate": 8.936585038732143e-06, + "loss": 0.4044, + "step": 92 + }, + { + "epoch": 0.2990353697749196, + "grad_norm": 2.339695692062378, + "learning_rate": 8.90162395476046e-06, + "loss": 0.3858, + "step": 93 + }, + { + "epoch": 0.3022508038585209, + "grad_norm": 2.0064709186553955, + "learning_rate": 8.866168181896198e-06, + "loss": 0.4002, + "step": 94 + }, + { + "epoch": 0.3054662379421222, + "grad_norm": 3.07234525680542, + "learning_rate": 8.83022221559489e-06, + "loss": 0.375, + "step": 95 + }, + { + "epoch": 0.3086816720257235, + "grad_norm": 2.4521424770355225, + "learning_rate": 8.793790613463956e-06, + "loss": 0.3549, + "step": 96 + }, + { + "epoch": 0.31189710610932475, + "grad_norm": 2.3006341457366943, + "learning_rate": 8.756877994684818e-06, + "loss": 0.3798, + "step": 97 + }, + { + "epoch": 0.31511254019292606, + "grad_norm": 3.3463075160980225, + "learning_rate": 8.719489039427256e-06, + "loss": 0.3871, + "step": 98 + }, + { + "epoch": 0.3183279742765273, + "grad_norm": 2.5349507331848145, + "learning_rate": 8.681628488255986e-06, + "loss": 0.4025, + "step": 99 + }, + { + "epoch": 0.3215434083601286, + "grad_norm": 2.8825855255126953, + "learning_rate": 8.643301141529619e-06, + "loss": 0.3998, + "step": 100 + }, + { + "epoch": 0.3247588424437299, + "grad_norm": 4.388237953186035, + "learning_rate": 8.604511858792006e-06, + "loss": 0.3714, + "step": 101 + }, + { + "epoch": 0.3279742765273312, + "grad_norm": 2.6666557788848877, + "learning_rate": 8.565265558156101e-06, + "loss": 0.3509, + "step": 102 + }, + { + "epoch": 0.3311897106109325, + "grad_norm": 2.7230324745178223, + "learning_rate": 8.525567215680397e-06, + "loss": 0.366, + "step": 103 + }, + { + "epoch": 0.33440514469453375, + "grad_norm": 2.4554688930511475, + "learning_rate": 8.485421864737997e-06, + "loss": 0.3919, + "step": 104 + }, + { + "epoch": 0.33762057877813506, + "grad_norm": 7.866596221923828, + "learning_rate": 8.444834595378434e-06, + "loss": 0.3623, + "step": 105 + }, + { + "epoch": 0.3408360128617363, + "grad_norm": 2.528653144836426, + "learning_rate": 8.403810553682307e-06, + "loss": 0.3758, + "step": 106 + }, + { + "epoch": 0.3440514469453376, + "grad_norm": 2.836378335952759, + "learning_rate": 8.362354941108803e-06, + "loss": 0.3456, + "step": 107 + }, + { + "epoch": 0.34726688102893893, + "grad_norm": 1.8620100021362305, + "learning_rate": 8.320473013836197e-06, + "loss": 0.3754, + "step": 108 + }, + { + "epoch": 0.3504823151125402, + "grad_norm": 2.056680679321289, + "learning_rate": 8.278170082095422e-06, + "loss": 0.3858, + "step": 109 + }, + { + "epoch": 0.3536977491961415, + "grad_norm": 1.9714686870574951, + "learning_rate": 8.23545150949679e-06, + "loss": 0.3941, + "step": 110 + }, + { + "epoch": 0.35691318327974275, + "grad_norm": 2.2530500888824463, + "learning_rate": 8.192322712349917e-06, + "loss": 0.3712, + "step": 111 + }, + { + "epoch": 0.36012861736334406, + "grad_norm": 1.7236007452011108, + "learning_rate": 8.148789158977012e-06, + "loss": 0.3532, + "step": 112 + }, + { + "epoch": 0.3633440514469453, + "grad_norm": 1.8990964889526367, + "learning_rate": 8.104856369019525e-06, + "loss": 0.3801, + "step": 113 + }, + { + "epoch": 0.3665594855305466, + "grad_norm": 5.287169933319092, + "learning_rate": 8.060529912738316e-06, + "loss": 0.3594, + "step": 114 + }, + { + "epoch": 0.36977491961414793, + "grad_norm": 2.917484998703003, + "learning_rate": 8.0158154103074e-06, + "loss": 0.3696, + "step": 115 + }, + { + "epoch": 0.3729903536977492, + "grad_norm": 2.5253026485443115, + "learning_rate": 7.970718531101365e-06, + "loss": 0.3553, + "step": 116 + }, + { + "epoch": 0.3762057877813505, + "grad_norm": 2.7132797241210938, + "learning_rate": 7.925244992976538e-06, + "loss": 0.3775, + "step": 117 + }, + { + "epoch": 0.37942122186495175, + "grad_norm": 5.237837791442871, + "learning_rate": 7.879400561546033e-06, + "loss": 0.3591, + "step": 118 + }, + { + "epoch": 0.38263665594855306, + "grad_norm": 2.0805959701538086, + "learning_rate": 7.833191049448706e-06, + "loss": 0.3723, + "step": 119 + }, + { + "epoch": 0.3858520900321543, + "grad_norm": 1.8187751770019531, + "learning_rate": 7.786622315612182e-06, + "loss": 0.3566, + "step": 120 + }, + { + "epoch": 0.3890675241157556, + "grad_norm": 2.222515821456909, + "learning_rate": 7.739700264509993e-06, + "loss": 0.3809, + "step": 121 + }, + { + "epoch": 0.39228295819935693, + "grad_norm": 8.328165054321289, + "learning_rate": 7.692430845412946e-06, + "loss": 0.3707, + "step": 122 + }, + { + "epoch": 0.3954983922829582, + "grad_norm": 2.218949317932129, + "learning_rate": 7.644820051634813e-06, + "loss": 0.3642, + "step": 123 + }, + { + "epoch": 0.3987138263665595, + "grad_norm": 1.9735389947891235, + "learning_rate": 7.596873919772438e-06, + "loss": 0.3605, + "step": 124 + }, + { + "epoch": 0.40192926045016075, + "grad_norm": 3.412888526916504, + "learning_rate": 7.548598528940354e-06, + "loss": 0.3648, + "step": 125 + }, + { + "epoch": 0.40514469453376206, + "grad_norm": 4.399238109588623, + "learning_rate": 7.500000000000001e-06, + "loss": 0.3735, + "step": 126 + }, + { + "epoch": 0.40836012861736337, + "grad_norm": 1.8429063558578491, + "learning_rate": 7.451084494783668e-06, + "loss": 0.3775, + "step": 127 + }, + { + "epoch": 0.4115755627009646, + "grad_norm": 2.099372386932373, + "learning_rate": 7.401858215313228e-06, + "loss": 0.3646, + "step": 128 + }, + { + "epoch": 0.41479099678456594, + "grad_norm": 2.8833494186401367, + "learning_rate": 7.352327403013779e-06, + "loss": 0.3752, + "step": 129 + }, + { + "epoch": 0.4180064308681672, + "grad_norm": 2.006443500518799, + "learning_rate": 7.302498337922293e-06, + "loss": 0.3567, + "step": 130 + }, + { + "epoch": 0.4212218649517685, + "grad_norm": 2.024747371673584, + "learning_rate": 7.2523773378913655e-06, + "loss": 0.3623, + "step": 131 + }, + { + "epoch": 0.42443729903536975, + "grad_norm": 1.9539835453033447, + "learning_rate": 7.201970757788172e-06, + "loss": 0.3709, + "step": 132 + }, + { + "epoch": 0.42765273311897106, + "grad_norm": 1.9126152992248535, + "learning_rate": 7.151284988688731e-06, + "loss": 0.3518, + "step": 133 + }, + { + "epoch": 0.43086816720257237, + "grad_norm": 1.9806180000305176, + "learning_rate": 7.100326457067576e-06, + "loss": 0.3623, + "step": 134 + }, + { + "epoch": 0.4340836012861736, + "grad_norm": 4.260410785675049, + "learning_rate": 7.049101623982938e-06, + "loss": 0.3518, + "step": 135 + }, + { + "epoch": 0.43729903536977494, + "grad_norm": 2.0884203910827637, + "learning_rate": 6.9976169842575526e-06, + "loss": 0.3812, + "step": 136 + }, + { + "epoch": 0.4405144694533762, + "grad_norm": 4.204238414764404, + "learning_rate": 6.945879065655164e-06, + "loss": 0.3615, + "step": 137 + }, + { + "epoch": 0.4437299035369775, + "grad_norm": 1.9794977903366089, + "learning_rate": 6.893894428052881e-06, + "loss": 0.3898, + "step": 138 + }, + { + "epoch": 0.44694533762057875, + "grad_norm": 2.9515440464019775, + "learning_rate": 6.841669662609437e-06, + "loss": 0.3437, + "step": 139 + }, + { + "epoch": 0.45016077170418006, + "grad_norm": 2.980576992034912, + "learning_rate": 6.789211390929497e-06, + "loss": 0.3523, + "step": 140 + }, + { + "epoch": 0.4533762057877814, + "grad_norm": 4.675036907196045, + "learning_rate": 6.736526264224101e-06, + "loss": 0.3738, + "step": 141 + }, + { + "epoch": 0.4565916398713826, + "grad_norm": 3.4226956367492676, + "learning_rate": 6.6836209624673575e-06, + "loss": 0.3726, + "step": 142 + }, + { + "epoch": 0.45980707395498394, + "grad_norm": 2.1817691326141357, + "learning_rate": 6.6305021935494755e-06, + "loss": 0.3322, + "step": 143 + }, + { + "epoch": 0.4630225080385852, + "grad_norm": 2.0901007652282715, + "learning_rate": 6.5771766924262795e-06, + "loss": 0.3328, + "step": 144 + }, + { + "epoch": 0.4662379421221865, + "grad_norm": 1.8397691249847412, + "learning_rate": 6.523651220265269e-06, + "loss": 0.3492, + "step": 145 + }, + { + "epoch": 0.4694533762057878, + "grad_norm": 2.156468391418457, + "learning_rate": 6.469932563588386e-06, + "loss": 0.3362, + "step": 146 + }, + { + "epoch": 0.47266881028938906, + "grad_norm": 2.963684320449829, + "learning_rate": 6.41602753341152e-06, + "loss": 0.3438, + "step": 147 + }, + { + "epoch": 0.4758842443729904, + "grad_norm": 1.6273006200790405, + "learning_rate": 6.361942964380967e-06, + "loss": 0.3434, + "step": 148 + }, + { + "epoch": 0.4790996784565916, + "grad_norm": 2.0362226963043213, + "learning_rate": 6.307685713906835e-06, + "loss": 0.3487, + "step": 149 + }, + { + "epoch": 0.48231511254019294, + "grad_norm": 1.889363169670105, + "learning_rate": 6.2532626612936035e-06, + "loss": 0.3335, + "step": 150 + }, + { + "epoch": 0.4855305466237942, + "grad_norm": 5.194770336151123, + "learning_rate": 6.1986807068678926e-06, + "loss": 0.3578, + "step": 151 + }, + { + "epoch": 0.4887459807073955, + "grad_norm": 2.6607186794281006, + "learning_rate": 6.143946771103561e-06, + "loss": 0.3585, + "step": 152 + }, + { + "epoch": 0.4919614147909968, + "grad_norm": 1.6458920240402222, + "learning_rate": 6.089067793744258e-06, + "loss": 0.3163, + "step": 153 + }, + { + "epoch": 0.49517684887459806, + "grad_norm": 1.745608925819397, + "learning_rate": 6.034050732923538e-06, + "loss": 0.3513, + "step": 154 + }, + { + "epoch": 0.4983922829581994, + "grad_norm": 2.75510835647583, + "learning_rate": 5.978902564282616e-06, + "loss": 0.3436, + "step": 155 + }, + { + "epoch": 0.5016077170418006, + "grad_norm": 3.112760543823242, + "learning_rate": 5.923630280085948e-06, + "loss": 0.3321, + "step": 156 + }, + { + "epoch": 0.5048231511254019, + "grad_norm": 1.7312897443771362, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.3464, + "step": 157 + }, + { + "epoch": 0.5080385852090032, + "grad_norm": 3.926618814468384, + "learning_rate": 5.8127414118779825e-06, + "loss": 0.366, + "step": 158 + }, + { + "epoch": 0.5112540192926045, + "grad_norm": 2.1648647785186768, + "learning_rate": 5.757138887522884e-06, + "loss": 0.3592, + "step": 159 + }, + { + "epoch": 0.5144694533762058, + "grad_norm": 3.066451072692871, + "learning_rate": 5.701440365141799e-06, + "loss": 0.3374, + "step": 160 + }, + { + "epoch": 0.5176848874598071, + "grad_norm": 1.5853915214538574, + "learning_rate": 5.645652906778808e-06, + "loss": 0.3354, + "step": 161 + }, + { + "epoch": 0.5209003215434084, + "grad_norm": 1.4135924577713013, + "learning_rate": 5.5897835857542315e-06, + "loss": 0.3402, + "step": 162 + }, + { + "epoch": 0.5241157556270096, + "grad_norm": 2.379409074783325, + "learning_rate": 5.533839485767795e-06, + "loss": 0.349, + "step": 163 + }, + { + "epoch": 0.5273311897106109, + "grad_norm": 1.670160174369812, + "learning_rate": 5.477827700000492e-06, + "loss": 0.3314, + "step": 164 + }, + { + "epoch": 0.5305466237942122, + "grad_norm": 2.1935579776763916, + "learning_rate": 5.421755330215223e-06, + "loss": 0.3147, + "step": 165 + }, + { + "epoch": 0.5337620578778135, + "grad_norm": 3.3353383541107178, + "learning_rate": 5.365629485856381e-06, + "loss": 0.3427, + "step": 166 + }, + { + "epoch": 0.5369774919614148, + "grad_norm": 1.6737383604049683, + "learning_rate": 5.30945728314841e-06, + "loss": 0.3091, + "step": 167 + }, + { + "epoch": 0.5401929260450161, + "grad_norm": 1.815943717956543, + "learning_rate": 5.253245844193564e-06, + "loss": 0.3197, + "step": 168 + }, + { + "epoch": 0.5434083601286174, + "grad_norm": 2.064694404602051, + "learning_rate": 5.197002296068878e-06, + "loss": 0.3491, + "step": 169 + }, + { + "epoch": 0.5466237942122186, + "grad_norm": 2.5412817001342773, + "learning_rate": 5.140733769922525e-06, + "loss": 0.3323, + "step": 170 + }, + { + "epoch": 0.5498392282958199, + "grad_norm": 1.7079787254333496, + "learning_rate": 5.084447400069656e-06, + "loss": 0.3382, + "step": 171 + }, + { + "epoch": 0.5530546623794212, + "grad_norm": 1.8138501644134521, + "learning_rate": 5.0281503230878304e-06, + "loss": 0.3424, + "step": 172 + }, + { + "epoch": 0.5562700964630225, + "grad_norm": 3.9634087085723877, + "learning_rate": 4.971849676912172e-06, + "loss": 0.3357, + "step": 173 + }, + { + "epoch": 0.5594855305466238, + "grad_norm": 2.114734172821045, + "learning_rate": 4.915552599930345e-06, + "loss": 0.3544, + "step": 174 + }, + { + "epoch": 0.5627009646302251, + "grad_norm": 1.3108409643173218, + "learning_rate": 4.859266230077474e-06, + "loss": 0.3134, + "step": 175 + }, + { + "epoch": 0.5659163987138264, + "grad_norm": 1.882356882095337, + "learning_rate": 4.802997703931124e-06, + "loss": 0.3472, + "step": 176 + }, + { + "epoch": 0.5691318327974276, + "grad_norm": 3.0320799350738525, + "learning_rate": 4.746754155806437e-06, + "loss": 0.3484, + "step": 177 + }, + { + "epoch": 0.572347266881029, + "grad_norm": 1.5372339487075806, + "learning_rate": 4.6905427168515914e-06, + "loss": 0.3511, + "step": 178 + }, + { + "epoch": 0.5755627009646302, + "grad_norm": 2.2504475116729736, + "learning_rate": 4.63437051414362e-06, + "loss": 0.3823, + "step": 179 + }, + { + "epoch": 0.5787781350482315, + "grad_norm": 2.124473810195923, + "learning_rate": 4.5782446697847775e-06, + "loss": 0.3537, + "step": 180 + }, + { + "epoch": 0.5819935691318328, + "grad_norm": 1.7836929559707642, + "learning_rate": 4.52217229999951e-06, + "loss": 0.3281, + "step": 181 + }, + { + "epoch": 0.5852090032154341, + "grad_norm": 1.819919228553772, + "learning_rate": 4.466160514232206e-06, + "loss": 0.3307, + "step": 182 + }, + { + "epoch": 0.5884244372990354, + "grad_norm": 2.2056925296783447, + "learning_rate": 4.410216414245771e-06, + "loss": 0.3289, + "step": 183 + }, + { + "epoch": 0.5916398713826366, + "grad_norm": 1.8819239139556885, + "learning_rate": 4.354347093221194e-06, + "loss": 0.3139, + "step": 184 + }, + { + "epoch": 0.594855305466238, + "grad_norm": 1.8985276222229004, + "learning_rate": 4.298559634858202e-06, + "loss": 0.3249, + "step": 185 + }, + { + "epoch": 0.5980707395498392, + "grad_norm": 3.259624481201172, + "learning_rate": 4.2428611124771184e-06, + "loss": 0.3566, + "step": 186 + }, + { + "epoch": 0.6012861736334405, + "grad_norm": 2.25924015045166, + "learning_rate": 4.187258588122019e-06, + "loss": 0.3359, + "step": 187 + }, + { + "epoch": 0.6045016077170418, + "grad_norm": 1.6317423582077026, + "learning_rate": 4.131759111665349e-06, + "loss": 0.3231, + "step": 188 + }, + { + "epoch": 0.6077170418006431, + "grad_norm": 1.4268584251403809, + "learning_rate": 4.076369719914055e-06, + "loss": 0.3621, + "step": 189 + }, + { + "epoch": 0.6109324758842444, + "grad_norm": 1.7488436698913574, + "learning_rate": 4.021097435717386e-06, + "loss": 0.3263, + "step": 190 + }, + { + "epoch": 0.6141479099678456, + "grad_norm": 2.6326963901519775, + "learning_rate": 3.965949267076465e-06, + "loss": 0.3376, + "step": 191 + }, + { + "epoch": 0.617363344051447, + "grad_norm": 1.5136549472808838, + "learning_rate": 3.910932206255742e-06, + "loss": 0.3161, + "step": 192 + }, + { + "epoch": 0.6205787781350482, + "grad_norm": 1.4793072938919067, + "learning_rate": 3.856053228896442e-06, + "loss": 0.3241, + "step": 193 + }, + { + "epoch": 0.6237942122186495, + "grad_norm": 2.289064884185791, + "learning_rate": 3.8013192931321095e-06, + "loss": 0.3207, + "step": 194 + }, + { + "epoch": 0.6270096463022508, + "grad_norm": 1.8162267208099365, + "learning_rate": 3.7467373387063973e-06, + "loss": 0.3242, + "step": 195 + }, + { + "epoch": 0.6302250803858521, + "grad_norm": 1.8329249620437622, + "learning_rate": 3.692314286093167e-06, + "loss": 0.3248, + "step": 196 + }, + { + "epoch": 0.6334405144694534, + "grad_norm": 1.6766780614852905, + "learning_rate": 3.6380570356190346e-06, + "loss": 0.3291, + "step": 197 + }, + { + "epoch": 0.6366559485530546, + "grad_norm": 66.03868865966797, + "learning_rate": 3.58397246658848e-06, + "loss": 0.3078, + "step": 198 + }, + { + "epoch": 0.639871382636656, + "grad_norm": 2.198519706726074, + "learning_rate": 3.5300674364116173e-06, + "loss": 0.3197, + "step": 199 + }, + { + "epoch": 0.6430868167202572, + "grad_norm": 2.2276482582092285, + "learning_rate": 3.476348779734732e-06, + "loss": 0.3141, + "step": 200 + }, + { + "epoch": 0.6463022508038585, + "grad_norm": 1.4550248384475708, + "learning_rate": 3.4228233075737225e-06, + "loss": 0.3327, + "step": 201 + }, + { + "epoch": 0.6495176848874598, + "grad_norm": 1.832467794418335, + "learning_rate": 3.3694978064505258e-06, + "loss": 0.3196, + "step": 202 + }, + { + "epoch": 0.6527331189710611, + "grad_norm": 1.8890045881271362, + "learning_rate": 3.316379037532644e-06, + "loss": 0.355, + "step": 203 + }, + { + "epoch": 0.6559485530546624, + "grad_norm": 2.667874813079834, + "learning_rate": 3.2634737357758994e-06, + "loss": 0.3358, + "step": 204 + }, + { + "epoch": 0.6591639871382636, + "grad_norm": 1.426277756690979, + "learning_rate": 3.2107886090705035e-06, + "loss": 0.3134, + "step": 205 + }, + { + "epoch": 0.662379421221865, + "grad_norm": 2.3840363025665283, + "learning_rate": 3.158330337390565e-06, + "loss": 0.3144, + "step": 206 + }, + { + "epoch": 0.6655948553054662, + "grad_norm": 1.9086871147155762, + "learning_rate": 3.10610557194712e-06, + "loss": 0.3043, + "step": 207 + }, + { + "epoch": 0.6688102893890675, + "grad_norm": 1.7807660102844238, + "learning_rate": 3.0541209343448373e-06, + "loss": 0.3121, + "step": 208 + }, + { + "epoch": 0.6720257234726688, + "grad_norm": 1.6139692068099976, + "learning_rate": 3.0023830157424504e-06, + "loss": 0.3454, + "step": 209 + }, + { + "epoch": 0.6752411575562701, + "grad_norm": 1.9122083187103271, + "learning_rate": 2.950898376017064e-06, + "loss": 0.3175, + "step": 210 + }, + { + "epoch": 0.6784565916398714, + "grad_norm": 2.084561586380005, + "learning_rate": 2.8996735429324256e-06, + "loss": 0.319, + "step": 211 + }, + { + "epoch": 0.6816720257234726, + "grad_norm": 1.411837100982666, + "learning_rate": 2.848715011311271e-06, + "loss": 0.3085, + "step": 212 + }, + { + "epoch": 0.684887459807074, + "grad_norm": 1.4443124532699585, + "learning_rate": 2.7980292422118282e-06, + "loss": 0.3366, + "step": 213 + }, + { + "epoch": 0.6881028938906752, + "grad_norm": 1.447710394859314, + "learning_rate": 2.7476226621086354e-06, + "loss": 0.3171, + "step": 214 + }, + { + "epoch": 0.6913183279742765, + "grad_norm": 1.3682136535644531, + "learning_rate": 2.697501662077707e-06, + "loss": 0.3158, + "step": 215 + }, + { + "epoch": 0.6945337620578779, + "grad_norm": 8.954407691955566, + "learning_rate": 2.6476725969862227e-06, + "loss": 0.3474, + "step": 216 + }, + { + "epoch": 0.6977491961414791, + "grad_norm": 1.565764307975769, + "learning_rate": 2.5981417846867753e-06, + "loss": 0.3287, + "step": 217 + }, + { + "epoch": 0.7009646302250804, + "grad_norm": 1.5535356998443604, + "learning_rate": 2.548915505216333e-06, + "loss": 0.3206, + "step": 218 + }, + { + "epoch": 0.7041800643086816, + "grad_norm": 1.5272207260131836, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.3228, + "step": 219 + }, + { + "epoch": 0.707395498392283, + "grad_norm": 1.4562255144119263, + "learning_rate": 2.4514014710596467e-06, + "loss": 0.296, + "step": 220 + }, + { + "epoch": 0.7106109324758842, + "grad_norm": 1.6484540700912476, + "learning_rate": 2.4031260802275623e-06, + "loss": 0.3358, + "step": 221 + }, + { + "epoch": 0.7138263665594855, + "grad_norm": 1.4094409942626953, + "learning_rate": 2.3551799483651894e-06, + "loss": 0.3332, + "step": 222 + }, + { + "epoch": 0.7170418006430869, + "grad_norm": 1.8686710596084595, + "learning_rate": 2.307569154587056e-06, + "loss": 0.3356, + "step": 223 + }, + { + "epoch": 0.7202572347266881, + "grad_norm": 1.5808024406433105, + "learning_rate": 2.2602997354900075e-06, + "loss": 0.315, + "step": 224 + }, + { + "epoch": 0.7234726688102894, + "grad_norm": 1.5298748016357422, + "learning_rate": 2.2133776843878185e-06, + "loss": 0.3355, + "step": 225 + }, + { + "epoch": 0.7266881028938906, + "grad_norm": 12.085217475891113, + "learning_rate": 2.166808950551296e-06, + "loss": 0.3301, + "step": 226 + }, + { + "epoch": 0.729903536977492, + "grad_norm": 1.5451043844223022, + "learning_rate": 2.120599438453968e-06, + "loss": 0.3321, + "step": 227 + }, + { + "epoch": 0.7331189710610932, + "grad_norm": 1.8228951692581177, + "learning_rate": 2.074755007023461e-06, + "loss": 0.3074, + "step": 228 + }, + { + "epoch": 0.7363344051446945, + "grad_norm": 1.5614581108093262, + "learning_rate": 2.0292814688986375e-06, + "loss": 0.342, + "step": 229 + }, + { + "epoch": 0.7395498392282959, + "grad_norm": 1.4238361120224, + "learning_rate": 1.9841845896926022e-06, + "loss": 0.3261, + "step": 230 + }, + { + "epoch": 0.7427652733118971, + "grad_norm": 1.7577193975448608, + "learning_rate": 1.9394700872616856e-06, + "loss": 0.3312, + "step": 231 + }, + { + "epoch": 0.7459807073954984, + "grad_norm": 2.6132872104644775, + "learning_rate": 1.8951436309804766e-06, + "loss": 0.341, + "step": 232 + }, + { + "epoch": 0.7491961414790996, + "grad_norm": 1.9483258724212646, + "learning_rate": 1.8512108410229878e-06, + "loss": 0.3039, + "step": 233 + }, + { + "epoch": 0.752411575562701, + "grad_norm": 1.289170742034912, + "learning_rate": 1.8076772876500831e-06, + "loss": 0.3003, + "step": 234 + }, + { + "epoch": 0.7556270096463023, + "grad_norm": 2.7016942501068115, + "learning_rate": 1.7645484905032129e-06, + "loss": 0.3283, + "step": 235 + }, + { + "epoch": 0.7588424437299035, + "grad_norm": 1.3249751329421997, + "learning_rate": 1.7218299179045789e-06, + "loss": 0.3128, + "step": 236 + }, + { + "epoch": 0.7620578778135049, + "grad_norm": 2.7184793949127197, + "learning_rate": 1.6795269861638041e-06, + "loss": 0.3338, + "step": 237 + }, + { + "epoch": 0.7652733118971061, + "grad_norm": 1.27732253074646, + "learning_rate": 1.6376450588911985e-06, + "loss": 0.2865, + "step": 238 + }, + { + "epoch": 0.7684887459807074, + "grad_norm": 1.7141942977905273, + "learning_rate": 1.5961894463176942e-06, + "loss": 0.3154, + "step": 239 + }, + { + "epoch": 0.7717041800643086, + "grad_norm": 4.537605285644531, + "learning_rate": 1.555165404621567e-06, + "loss": 0.3207, + "step": 240 + }, + { + "epoch": 0.77491961414791, + "grad_norm": 1.5014591217041016, + "learning_rate": 1.5145781352620054e-06, + "loss": 0.3403, + "step": 241 + }, + { + "epoch": 0.7781350482315113, + "grad_norm": 1.7332754135131836, + "learning_rate": 1.4744327843196043e-06, + "loss": 0.2983, + "step": 242 + }, + { + "epoch": 0.7813504823151125, + "grad_norm": 1.8262887001037598, + "learning_rate": 1.434734441843899e-06, + "loss": 0.3052, + "step": 243 + }, + { + "epoch": 0.7845659163987139, + "grad_norm": 3.564021348953247, + "learning_rate": 1.3954881412079945e-06, + "loss": 0.3155, + "step": 244 + }, + { + "epoch": 0.7877813504823151, + "grad_norm": 1.3386446237564087, + "learning_rate": 1.3566988584703817e-06, + "loss": 0.29, + "step": 245 + }, + { + "epoch": 0.7909967845659164, + "grad_norm": 14.863595962524414, + "learning_rate": 1.3183715117440143e-06, + "loss": 0.303, + "step": 246 + }, + { + "epoch": 0.7942122186495176, + "grad_norm": 10.220840454101562, + "learning_rate": 1.280510960572745e-06, + "loss": 0.3258, + "step": 247 + }, + { + "epoch": 0.797427652733119, + "grad_norm": 1.610312819480896, + "learning_rate": 1.2431220053151832e-06, + "loss": 0.3301, + "step": 248 + }, + { + "epoch": 0.8006430868167203, + "grad_norm": 1.4349790811538696, + "learning_rate": 1.2062093865360458e-06, + "loss": 0.2993, + "step": 249 + }, + { + "epoch": 0.8038585209003215, + "grad_norm": 3.160371780395508, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.3119, + "step": 250 + }, + { + "epoch": 0.8070739549839229, + "grad_norm": 1.5855354070663452, + "learning_rate": 1.1338318181038037e-06, + "loss": 0.3017, + "step": 251 + }, + { + "epoch": 0.8102893890675241, + "grad_norm": 2.18475341796875, + "learning_rate": 1.0983760452395415e-06, + "loss": 0.3205, + "step": 252 + }, + { + "epoch": 0.8135048231511254, + "grad_norm": 1.4034461975097656, + "learning_rate": 1.063414961267859e-06, + "loss": 0.3265, + "step": 253 + }, + { + "epoch": 0.8167202572347267, + "grad_norm": 1.7966201305389404, + "learning_rate": 1.02895299892244e-06, + "loss": 0.3048, + "step": 254 + }, + { + "epoch": 0.819935691318328, + "grad_norm": 1.4181287288665771, + "learning_rate": 9.949945276530782e-07, + "loss": 0.327, + "step": 255 + }, + { + "epoch": 0.8231511254019293, + "grad_norm": 1.8698980808258057, + "learning_rate": 9.615438530716753e-07, + "loss": 0.304, + "step": 256 + }, + { + "epoch": 0.8263665594855305, + "grad_norm": 1.4744179248809814, + "learning_rate": 9.286052164063369e-07, + "loss": 0.3335, + "step": 257 + }, + { + "epoch": 0.8295819935691319, + "grad_norm": 1.7521005868911743, + "learning_rate": 8.961827939636198e-07, + "loss": 0.3438, + "step": 258 + }, + { + "epoch": 0.8327974276527331, + "grad_norm": 2.057781934738159, + "learning_rate": 8.64280696599008e-07, + "loss": 0.3156, + "step": 259 + }, + { + "epoch": 0.8360128617363344, + "grad_norm": 1.6581817865371704, + "learning_rate": 8.329029691957124e-07, + "loss": 0.3126, + "step": 260 + }, + { + "epoch": 0.8392282958199357, + "grad_norm": 2.1192777156829834, + "learning_rate": 8.02053590151805e-07, + "loss": 0.3188, + "step": 261 + }, + { + "epoch": 0.842443729903537, + "grad_norm": 1.4133118391036987, + "learning_rate": 7.717364708758024e-07, + "loss": 0.3211, + "step": 262 + }, + { + "epoch": 0.8456591639871383, + "grad_norm": 1.5940773487091064, + "learning_rate": 7.41955455290726e-07, + "loss": 0.2978, + "step": 263 + }, + { + "epoch": 0.8488745980707395, + "grad_norm": 2.956995725631714, + "learning_rate": 7.127143193467445e-07, + "loss": 0.3173, + "step": 264 + }, + { + "epoch": 0.8520900321543409, + "grad_norm": 2.5843443870544434, + "learning_rate": 6.840167705424106e-07, + "loss": 0.3002, + "step": 265 + }, + { + "epoch": 0.8553054662379421, + "grad_norm": 1.8771467208862305, + "learning_rate": 6.558664474545817e-07, + "loss": 0.3243, + "step": 266 + }, + { + "epoch": 0.8585209003215434, + "grad_norm": 2.241569757461548, + "learning_rate": 6.282669192770896e-07, + "loss": 0.2968, + "step": 267 + }, + { + "epoch": 0.8617363344051447, + "grad_norm": 1.668226957321167, + "learning_rate": 6.012216853682001e-07, + "loss": 0.32, + "step": 268 + }, + { + "epoch": 0.864951768488746, + "grad_norm": 1.5583616495132446, + "learning_rate": 5.747341748069229e-07, + "loss": 0.309, + "step": 269 + }, + { + "epoch": 0.8681672025723473, + "grad_norm": 1.433117389678955, + "learning_rate": 5.488077459582425e-07, + "loss": 0.3231, + "step": 270 + }, + { + "epoch": 0.8713826366559485, + "grad_norm": 1.5444004535675049, + "learning_rate": 5.234456860473042e-07, + "loss": 0.292, + "step": 271 + }, + { + "epoch": 0.8745980707395499, + "grad_norm": 1.4406189918518066, + "learning_rate": 4.986512107426283e-07, + "loss": 0.3043, + "step": 272 + }, + { + "epoch": 0.8778135048231511, + "grad_norm": 1.3344569206237793, + "learning_rate": 4.7442746374839363e-07, + "loss": 0.2818, + "step": 273 + }, + { + "epoch": 0.8810289389067524, + "grad_norm": 1.5688618421554565, + "learning_rate": 4.50777516405847e-07, + "loss": 0.295, + "step": 274 + }, + { + "epoch": 0.8842443729903537, + "grad_norm": 1.4727739095687866, + "learning_rate": 4.2770436730388166e-07, + "loss": 0.2951, + "step": 275 + }, + { + "epoch": 0.887459807073955, + "grad_norm": 2.1879146099090576, + "learning_rate": 4.05210941898847e-07, + "loss": 0.3191, + "step": 276 + }, + { + "epoch": 0.8906752411575563, + "grad_norm": 1.7236080169677734, + "learning_rate": 3.8330009214363197e-07, + "loss": 0.3118, + "step": 277 + }, + { + "epoch": 0.8938906752411575, + "grad_norm": 1.4538291692733765, + "learning_rate": 3.619745961260623e-07, + "loss": 0.3207, + "step": 278 + }, + { + "epoch": 0.8971061093247589, + "grad_norm": 1.4028717279434204, + "learning_rate": 3.4123715771665786e-07, + "loss": 0.3278, + "step": 279 + }, + { + "epoch": 0.9003215434083601, + "grad_norm": 1.755362629890442, + "learning_rate": 3.2109040622582186e-07, + "loss": 0.2798, + "step": 280 + }, + { + "epoch": 0.9035369774919614, + "grad_norm": 2.1135966777801514, + "learning_rate": 3.015368960704584e-07, + "loss": 0.307, + "step": 281 + }, + { + "epoch": 0.9067524115755627, + "grad_norm": 1.4916549921035767, + "learning_rate": 2.8257910645009935e-07, + "loss": 0.2861, + "step": 282 + }, + { + "epoch": 0.909967845659164, + "grad_norm": 1.6314096450805664, + "learning_rate": 2.6421944103256657e-07, + "loss": 0.3065, + "step": 283 + }, + { + "epoch": 0.9131832797427653, + "grad_norm": 2.6644771099090576, + "learning_rate": 2.4646022764920843e-07, + "loss": 0.3013, + "step": 284 + }, + { + "epoch": 0.9163987138263665, + "grad_norm": 1.4383426904678345, + "learning_rate": 2.2930371799975593e-07, + "loss": 0.309, + "step": 285 + }, + { + "epoch": 0.9196141479099679, + "grad_norm": 1.4383573532104492, + "learning_rate": 2.1275208736682262e-07, + "loss": 0.2973, + "step": 286 + }, + { + "epoch": 0.9228295819935691, + "grad_norm": 1.249383568763733, + "learning_rate": 1.9680743434010385e-07, + "loss": 0.307, + "step": 287 + }, + { + "epoch": 0.9260450160771704, + "grad_norm": 3.4106099605560303, + "learning_rate": 1.814717805502958e-07, + "loss": 0.3035, + "step": 288 + }, + { + "epoch": 0.9292604501607717, + "grad_norm": 2.925081729888916, + "learning_rate": 1.667470704127694e-07, + "loss": 0.2966, + "step": 289 + }, + { + "epoch": 0.932475884244373, + "grad_norm": 3.2131919860839844, + "learning_rate": 1.5263517088103862e-07, + "loss": 0.3021, + "step": 290 + }, + { + "epoch": 0.9356913183279743, + "grad_norm": 1.7186495065689087, + "learning_rate": 1.3913787121004717e-07, + "loss": 0.3164, + "step": 291 + }, + { + "epoch": 0.9389067524115756, + "grad_norm": 1.486461877822876, + "learning_rate": 1.2625688272930925e-07, + "loss": 0.3191, + "step": 292 + }, + { + "epoch": 0.9421221864951769, + "grad_norm": 1.584938406944275, + "learning_rate": 1.1399383862592928e-07, + "loss": 0.3005, + "step": 293 + }, + { + "epoch": 0.9453376205787781, + "grad_norm": 3.820082426071167, + "learning_rate": 1.0235029373752758e-07, + "loss": 0.3019, + "step": 294 + }, + { + "epoch": 0.9485530546623794, + "grad_norm": 1.3474429845809937, + "learning_rate": 9.132772435510362e-08, + "loss": 0.2809, + "step": 295 + }, + { + "epoch": 0.9517684887459807, + "grad_norm": 1.2956693172454834, + "learning_rate": 8.092752803585513e-08, + "loss": 0.2959, + "step": 296 + }, + { + "epoch": 0.954983922829582, + "grad_norm": 7.804889678955078, + "learning_rate": 7.115102342598101e-08, + "loss": 0.2872, + "step": 297 + }, + { + "epoch": 0.9581993569131833, + "grad_norm": 3.1022164821624756, + "learning_rate": 6.199945009349173e-08, + "loss": 0.3173, + "step": 298 + }, + { + "epoch": 0.9614147909967846, + "grad_norm": 4.706262111663818, + "learning_rate": 5.3473968371040575e-08, + "loss": 0.2896, + "step": 299 + }, + { + "epoch": 0.9646302250803859, + "grad_norm": 1.4467219114303589, + "learning_rate": 4.55756592088058e-08, + "loss": 0.2965, + "step": 300 + }, + { + "epoch": 0.9678456591639871, + "grad_norm": 1.4120930433273315, + "learning_rate": 3.8305524037438035e-08, + "loss": 0.3084, + "step": 301 + }, + { + "epoch": 0.9710610932475884, + "grad_norm": 1.5261682271957397, + "learning_rate": 3.166448464108629e-08, + "loss": 0.328, + "step": 302 + }, + { + "epoch": 0.9742765273311897, + "grad_norm": 1.4508821964263916, + "learning_rate": 2.5653383040524228e-08, + "loss": 0.2849, + "step": 303 + }, + { + "epoch": 0.977491961414791, + "grad_norm": 6.040746212005615, + "learning_rate": 2.0272981386393332e-08, + "loss": 0.3468, + "step": 304 + }, + { + "epoch": 0.9807073954983923, + "grad_norm": 2.0974323749542236, + "learning_rate": 1.552396186256411e-08, + "loss": 0.2976, + "step": 305 + }, + { + "epoch": 0.9839228295819936, + "grad_norm": 1.473928451538086, + "learning_rate": 1.1406926599646373e-08, + "loss": 0.3228, + "step": 306 + }, + { + "epoch": 0.9871382636655949, + "grad_norm": 2.54904842376709, + "learning_rate": 7.922397598642551e-09, + "loss": 0.2999, + "step": 307 + }, + { + "epoch": 0.9903536977491961, + "grad_norm": 1.7829190492630005, + "learning_rate": 5.0708166647628345e-09, + "loss": 0.3042, + "step": 308 + }, + { + "epoch": 0.9935691318327974, + "grad_norm": 1.8181827068328857, + "learning_rate": 2.8525453514099966e-09, + "loss": 0.3057, + "step": 309 + }, + { + "epoch": 0.9967845659163987, + "grad_norm": 3.868682384490967, + "learning_rate": 1.2678649143349485e-09, + "loss": 0.3086, + "step": 310 + }, + { + "epoch": 1.0, + "grad_norm": 1.5854872465133667, + "learning_rate": 3.1697627597970794e-10, + "loss": 0.3017, + "step": 311 + }, + { + "epoch": 1.0, + "step": 311, + "total_flos": 3.149345964351816e+17, + "train_loss": 0.4079481167808606, + "train_runtime": 5827.269, + "train_samples_per_second": 3.415, + "train_steps_per_second": 0.053 + } + ], + "logging_steps": 1, + "max_steps": 311, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.149345964351816e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..ae9c057 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338ff32564c2345f34bf07979b403f80feca2260a2e9be630f7634e1c004641d +size 7416 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..e3a028f Binary files /dev/null and b/training_loss.png differ diff --git a/video_preprocessor_config.json b/video_preprocessor_config.json new file mode 100644 index 0000000..b64d80b --- /dev/null +++ b/video_preprocessor_config.json @@ -0,0 +1,86 @@ +{ + "_valid_kwargs_names": [ + "do_convert_rgb", + "do_resize", + "size", + "size_divisor", + "default_to_square", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "do_center_crop", + "crop_size", + "data_format", + "input_data_format", + "device", + "min_pixels", + "max_pixels", + "patch_size", + "temporal_patch_size", + "merge_size" + ], + "crop_size": null, + "data_format": "channels_first", + "default_to_square": true, + "device": null, + "do_center_crop": null, + "do_convert_rgb": true, + "do_normalize": true, + "do_pad": null, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "Qwen2VLImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "input_data_format": null, + "max_pixels": 12845056, + "merge_size": 2, + "min_pixels": 3136, + "model_valid_processing_keys": [ + "do_convert_rgb", + "do_resize", + "size", + "size_divisor", + "default_to_square", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "do_center_crop", + "crop_size", + "data_format", + "input_data_format", + "device", + "min_pixels", + "max_pixels", + "patch_size", + "temporal_patch_size", + "merge_size" + ], + "patch_size": 14, + "processor_class": "Qwen2_5_VLProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "longest_edge": 12845056, + "shortest_edge": 3136 + }, + "size_divisor": null, + "temporal_patch_size": 2, + "video_processor_type": "Qwen2VLVideoProcessor" +} diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833