commit 002044d1b4d6e5f03a7c4a4668e8b8a4ade3fc5c Author: ModelHub XC Date: Wed Jun 3 01:44:14 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: laion/r2egym-gpt5-codex-160ep-1M Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2b385b5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs 
merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5dbbb9b --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +library_name: transformers +license: apache-2.0 +base_model: Qwen/Qwen3-8B +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: r2egym-gpt5-codex-160ep-1M + results: [] +--- + + + +# r2egym-gpt5-codex-160ep-1M + +This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the penfever/r2egym_gpt5_codex_solve_traces dataset. 
+ +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 4e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 16 +- total_train_batch_size: 16 +- total_eval_batch_size: 128 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.98) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 7.0 + +### Training results + + + +### Framework versions + +- Transformers 4.56.0 +- Pytorch 2.9.0+cu128 +- Datasets 4.4.1 +- Tokenizers 0.22.1 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "</think>": 151668, + "</tool_call>": 151658, + "</tool_response>": 151666, + "<think>": 151667, + "<tool_call>": 151657, + "<tool_response>": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..ec0ed39 --- /dev/null +++ b/all_results.json @@ -0,0 +1,16 @@ +{ + "achieved_tflops_per_gpu": 0.0003518731812688662, + "achieved_tflops_per_gpu_theoretical": 1439.8594391998463, + "epoch": 7.0, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3174639642238617, + "mfu_percent": 
2.4867362633842135e-05, + "mfu_percent_theoretical": 101.75685082684426, + "total_flos": 22338300149760.0, + "train_loss": 0.652452801803053, + "train_runtime": 3967.747, + "train_samples_per_second": 6.796, + "train_steps_per_second": 0.425, + "valid_targets_mean": 94.8, + "valid_targets_min": 62 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + 
{%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..d04042c --- /dev/null +++ b/config.json @@ -0,0 +1,68 @@ +{ + "architectures": [ + 
"Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "4.56.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..eff07c5 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + 
"top_p": 0.95, + "transformers_version": "4.56.0" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..8fd74a2 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519c6b37badcec1380b216ceb35356fea8116fc0df98fed40565c324ab386b1c +size 4902257696 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..4ab6204 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030e5abc1559c53db6d09a4e26ecfd379fe6451dca3fe5456b509007513ab450 +size 4915960368 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..a2efec8 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219f709b9d0bd613cade3605efc85a6be9a471a2be4bf2adc80b4edd4d928238 +size 4983068496 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..feb5c66 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0b92f0b7e9642c183ea177ff9692279fa8f72042900f0d326979c550a0e7b8 +size 1580230264 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..ba886c0 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,407 @@ +{ + "metadata": { + "total_parameters": 308224, + "total_size": 16381470720 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + 
"model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_norm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_norm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_norm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/run_summary.json b/run_summary.json new file mode 100644 index 0000000..2def0f7 --- /dev/null +++ b/run_summary.json @@ -0,0 +1,12 @@ +{ + "agent_name": null, + "training_start": null, + "training_end": null, + "created_by": "DCAgent", + "base_model_name": "Qwen/Qwen3-8B", + "dataset_name": "penfever/r2egym_gpt5_codex_solve_traces", + "training_type": "SFT", + "training_parameters": "https://huggingface.co/laion/r2egym-gpt5-codex-160ep-1M/blob/main/config.json", + "wandb_link": "https://wandb.ai/dogml/dc-agent/runs/r2egym_gpt5_codex_solve_traces_hub-model-id_r2egym-gpt5-codex-160ep-1M_Qwen3-8B", + "traces_location_s3": null +} \ No newline at end of file diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/tokenizer_config.json 
b/tokenizer_config.json new file mode 100644 index 0000000..e9dc937 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,240 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + 
"content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..ec0ed39 --- /dev/null +++ b/train_results.json @@ -0,0 +1,16 @@ +{ + "achieved_tflops_per_gpu": 0.0003518731812688662, + "achieved_tflops_per_gpu_theoretical": 1439.8594391998463, + "epoch": 7.0, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3174639642238617, + "mfu_percent": 2.4867362633842135e-05, + "mfu_percent_theoretical": 101.75685082684426, + "total_flos": 22338300149760.0, + "train_loss": 0.652452801803053, + "train_runtime": 3967.747, + "train_samples_per_second": 6.796, + "train_steps_per_second": 0.425, + "valid_targets_mean": 94.8, + "valid_targets_min": 62 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..957053d --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,338 @@ +{"current_steps": 5, "total_steps": 1687, "loss": 3.331, "lr": 9.467455621301776e-07, "epoch": 0.02074688796680498, "percentage": 0.3, "elapsed_time": "0:00:23", "remaining_time": "2:12:07"} +{"current_steps": 10, "total_steps": 1687, "loss": 2.722, "lr": 2.1301775147929e-06, "epoch": 0.04149377593360996, "percentage": 0.59, "elapsed_time": "0:00:36", "remaining_time": "1:43:05"} +{"current_steps": 
15, "total_steps": 1687, "loss": 3.1562, "lr": 3.313609467455622e-06, "epoch": 0.06224066390041494, "percentage": 0.89, "elapsed_time": "0:00:49", "remaining_time": "1:31:45"} +{"current_steps": 20, "total_steps": 1687, "loss": 2.9616, "lr": 4.497041420118343e-06, "epoch": 0.08298755186721991, "percentage": 1.19, "elapsed_time": "0:01:01", "remaining_time": "1:25:50"} +{"current_steps": 25, "total_steps": 1687, "loss": 2.4025, "lr": 5.680473372781066e-06, "epoch": 0.1037344398340249, "percentage": 1.48, "elapsed_time": "0:01:12", "remaining_time": "1:20:48"} +{"current_steps": 30, "total_steps": 1687, "loss": 2.3199, "lr": 6.863905325443787e-06, "epoch": 0.12448132780082988, "percentage": 1.78, "elapsed_time": "0:01:24", "remaining_time": "1:17:32"} +{"current_steps": 35, "total_steps": 1687, "loss": 2.0466, "lr": 8.04733727810651e-06, "epoch": 0.14522821576763487, "percentage": 2.07, "elapsed_time": "0:01:35", "remaining_time": "1:14:49"} +{"current_steps": 40, "total_steps": 1687, "loss": 2.0725, "lr": 9.230769230769232e-06, "epoch": 0.16597510373443983, "percentage": 2.37, "elapsed_time": "0:01:46", "remaining_time": "1:13:03"} +{"current_steps": 45, "total_steps": 1687, "loss": 1.7968, "lr": 1.0414201183431953e-05, "epoch": 0.18672199170124482, "percentage": 2.67, "elapsed_time": "0:01:57", "remaining_time": "1:11:35"} +{"current_steps": 50, "total_steps": 1687, "loss": 1.6917, "lr": 1.1597633136094675e-05, "epoch": 0.2074688796680498, "percentage": 2.96, "elapsed_time": "0:02:08", "remaining_time": "1:10:09"} +{"current_steps": 55, "total_steps": 1687, "loss": 1.5291, "lr": 1.2781065088757399e-05, "epoch": 0.22821576763485477, "percentage": 3.26, "elapsed_time": "0:02:20", "remaining_time": "1:09:31"} +{"current_steps": 60, "total_steps": 1687, "loss": 1.5082, "lr": 1.396449704142012e-05, "epoch": 0.24896265560165975, "percentage": 3.56, "elapsed_time": "0:02:31", "remaining_time": "1:08:26"} +{"current_steps": 65, "total_steps": 1687, "loss": 1.3951, "lr": 
1.5147928994082842e-05, "epoch": 0.2697095435684647, "percentage": 3.85, "elapsed_time": "0:02:42", "remaining_time": "1:07:36"} +{"current_steps": 70, "total_steps": 1687, "loss": 1.4802, "lr": 1.6331360946745562e-05, "epoch": 0.29045643153526973, "percentage": 4.15, "elapsed_time": "0:02:53", "remaining_time": "1:06:36"} +{"current_steps": 75, "total_steps": 1687, "loss": 1.4573, "lr": 1.7514792899408286e-05, "epoch": 0.3112033195020747, "percentage": 4.45, "elapsed_time": "0:03:03", "remaining_time": "1:05:41"} +{"current_steps": 80, "total_steps": 1687, "loss": 1.3158, "lr": 1.8698224852071007e-05, "epoch": 0.33195020746887965, "percentage": 4.74, "elapsed_time": "0:03:13", "remaining_time": "1:04:52"} +{"current_steps": 85, "total_steps": 1687, "loss": 1.3183, "lr": 1.9881656804733727e-05, "epoch": 0.35269709543568467, "percentage": 5.04, "elapsed_time": "0:03:24", "remaining_time": "1:04:19"} +{"current_steps": 90, "total_steps": 1687, "loss": 1.1263, "lr": 2.106508875739645e-05, "epoch": 0.37344398340248963, "percentage": 5.33, "elapsed_time": "0:03:35", "remaining_time": "1:03:45"} +{"current_steps": 95, "total_steps": 1687, "loss": 1.2161, "lr": 2.224852071005917e-05, "epoch": 0.3941908713692946, "percentage": 5.63, "elapsed_time": "0:03:46", "remaining_time": "1:03:12"} +{"current_steps": 100, "total_steps": 1687, "loss": 1.179, "lr": 2.3431952662721896e-05, "epoch": 0.4149377593360996, "percentage": 5.93, "elapsed_time": "0:03:57", "remaining_time": "1:02:42"} +{"current_steps": 105, "total_steps": 1687, "loss": 1.2358, "lr": 2.461538461538462e-05, "epoch": 0.43568464730290457, "percentage": 6.22, "elapsed_time": "0:04:07", "remaining_time": "1:02:15"} +{"current_steps": 110, "total_steps": 1687, "loss": 1.2222, "lr": 2.5798816568047337e-05, "epoch": 0.45643153526970953, "percentage": 6.52, "elapsed_time": "0:04:18", "remaining_time": "1:01:53"} +{"current_steps": 115, "total_steps": 1687, "loss": 1.1238, "lr": 2.698224852071006e-05, "epoch": 
0.47717842323651455, "percentage": 6.82, "elapsed_time": "0:04:30", "remaining_time": "1:01:34"} +{"current_steps": 120, "total_steps": 1687, "loss": 1.1292, "lr": 2.8165680473372784e-05, "epoch": 0.4979253112033195, "percentage": 7.11, "elapsed_time": "0:04:41", "remaining_time": "1:01:10"} +{"current_steps": 125, "total_steps": 1687, "loss": 1.2792, "lr": 2.9349112426035505e-05, "epoch": 0.5186721991701245, "percentage": 7.41, "elapsed_time": "0:04:51", "remaining_time": "1:00:44"} +{"current_steps": 130, "total_steps": 1687, "loss": 1.2547, "lr": 3.0532544378698226e-05, "epoch": 0.5394190871369294, "percentage": 7.71, "elapsed_time": "0:05:02", "remaining_time": "1:00:24"} +{"current_steps": 135, "total_steps": 1687, "loss": 1.1989, "lr": 3.171597633136095e-05, "epoch": 0.5601659751037344, "percentage": 8.0, "elapsed_time": "0:05:13", "remaining_time": "1:00:05"} +{"current_steps": 140, "total_steps": 1687, "loss": 1.291, "lr": 3.289940828402367e-05, "epoch": 0.5809128630705395, "percentage": 8.3, "elapsed_time": "0:05:24", "remaining_time": "0:59:40"} +{"current_steps": 145, "total_steps": 1687, "loss": 1.2244, "lr": 3.40828402366864e-05, "epoch": 0.6016597510373444, "percentage": 8.6, "elapsed_time": "0:05:34", "remaining_time": "0:59:20"} +{"current_steps": 150, "total_steps": 1687, "loss": 1.1273, "lr": 3.5266272189349114e-05, "epoch": 0.6224066390041494, "percentage": 8.89, "elapsed_time": "0:05:45", "remaining_time": "0:58:56"} +{"current_steps": 155, "total_steps": 1687, "loss": 1.0308, "lr": 3.644970414201184e-05, "epoch": 0.6431535269709544, "percentage": 9.19, "elapsed_time": "0:05:55", "remaining_time": "0:58:38"} +{"current_steps": 160, "total_steps": 1687, "loss": 0.9736, "lr": 3.763313609467456e-05, "epoch": 0.6639004149377593, "percentage": 9.48, "elapsed_time": "0:06:06", "remaining_time": "0:58:18"} +{"current_steps": 165, "total_steps": 1687, "loss": 1.166, "lr": 3.881656804733728e-05, "epoch": 0.6846473029045643, "percentage": 9.78, 
"elapsed_time": "0:06:17", "remaining_time": "0:58:06"} +{"current_steps": 170, "total_steps": 1687, "loss": 1.0946, "lr": 4e-05, "epoch": 0.7053941908713693, "percentage": 10.08, "elapsed_time": "0:06:28", "remaining_time": "0:57:48"} +{"current_steps": 175, "total_steps": 1687, "loss": 1.0991, "lr": 3.999892923951514e-05, "epoch": 0.7261410788381742, "percentage": 10.37, "elapsed_time": "0:06:38", "remaining_time": "0:57:24"} +{"current_steps": 180, "total_steps": 1687, "loss": 1.1184, "lr": 3.999571707271335e-05, "epoch": 0.7468879668049793, "percentage": 10.67, "elapsed_time": "0:06:49", "remaining_time": "0:57:06"} +{"current_steps": 185, "total_steps": 1687, "loss": 1.0885, "lr": 3.999036384354076e-05, "epoch": 0.7676348547717843, "percentage": 10.97, "elapsed_time": "0:06:59", "remaining_time": "0:56:47"} +{"current_steps": 190, "total_steps": 1687, "loss": 1.1776, "lr": 3.99828701252e-05, "epoch": 0.7883817427385892, "percentage": 11.26, "elapsed_time": "0:07:10", "remaining_time": "0:56:30"} +{"current_steps": 195, "total_steps": 1687, "loss": 1.1762, "lr": 3.997323672008881e-05, "epoch": 0.8091286307053942, "percentage": 11.56, "elapsed_time": "0:07:21", "remaining_time": "0:56:15"} +{"current_steps": 200, "total_steps": 1687, "loss": 1.0519, "lr": 3.9961464659714154e-05, "epoch": 0.8298755186721992, "percentage": 11.86, "elapsed_time": "0:07:31", "remaining_time": "0:55:56"} +{"current_steps": 205, "total_steps": 1687, "loss": 1.2221, "lr": 3.994755520458173e-05, "epoch": 0.8506224066390041, "percentage": 12.15, "elapsed_time": "0:08:40", "remaining_time": "1:02:44"} +{"current_steps": 210, "total_steps": 1687, "loss": 1.1059, "lr": 3.9931509844061034e-05, "epoch": 0.8713692946058091, "percentage": 12.45, "elapsed_time": "0:08:51", "remaining_time": "1:02:17"} +{"current_steps": 215, "total_steps": 1687, "loss": 1.0881, "lr": 3.991333029622587e-05, "epoch": 0.8921161825726142, "percentage": 12.74, "elapsed_time": "0:09:01", "remaining_time": "1:01:50"} 
+{"current_steps": 220, "total_steps": 1687, "loss": 1.0801, "lr": 3.9893018507670384e-05, "epoch": 0.9128630705394191, "percentage": 13.04, "elapsed_time": "0:09:12", "remaining_time": "1:01:21"} +{"current_steps": 225, "total_steps": 1687, "loss": 1.1093, "lr": 3.987057665330063e-05, "epoch": 0.9336099585062241, "percentage": 13.34, "elapsed_time": "0:09:22", "remaining_time": "1:00:55"} +{"current_steps": 230, "total_steps": 1687, "loss": 1.043, "lr": 3.984600713610169e-05, "epoch": 0.9543568464730291, "percentage": 13.63, "elapsed_time": "0:09:32", "remaining_time": "1:00:27"} +{"current_steps": 235, "total_steps": 1687, "loss": 1.1468, "lr": 3.981931258688038e-05, "epoch": 0.975103734439834, "percentage": 13.93, "elapsed_time": "0:09:42", "remaining_time": "1:00:00"} +{"current_steps": 240, "total_steps": 1687, "loss": 1.1073, "lr": 3.979049586398355e-05, "epoch": 0.995850622406639, "percentage": 14.23, "elapsed_time": "0:09:52", "remaining_time": "0:59:34"} +{"current_steps": 245, "total_steps": 1687, "loss": 1.1062, "lr": 3.975956005299202e-05, "epoch": 1.016597510373444, "percentage": 14.52, "elapsed_time": "0:10:03", "remaining_time": "0:59:13"} +{"current_steps": 250, "total_steps": 1687, "loss": 0.9987, "lr": 3.972650846639019e-05, "epoch": 1.037344398340249, "percentage": 14.82, "elapsed_time": "0:10:14", "remaining_time": "0:58:52"} +{"current_steps": 255, "total_steps": 1687, "loss": 0.9541, "lr": 3.9691344643211346e-05, "epoch": 1.058091286307054, "percentage": 15.12, "elapsed_time": "0:10:24", "remaining_time": "0:58:29"} +{"current_steps": 260, "total_steps": 1687, "loss": 1.0478, "lr": 3.965407234865871e-05, "epoch": 1.0788381742738589, "percentage": 15.41, "elapsed_time": "0:10:35", "remaining_time": "0:58:05"} +{"current_steps": 265, "total_steps": 1687, "loss": 0.9397, "lr": 3.9614695573702325e-05, "epoch": 1.099585062240664, "percentage": 15.71, "elapsed_time": "0:10:46", "remaining_time": "0:57:47"} +{"current_steps": 270, "total_steps": 
1687, "loss": 0.9561, "lr": 3.957321853465163e-05, "epoch": 1.120331950207469, "percentage": 16.0, "elapsed_time": "0:10:57", "remaining_time": "0:57:28"} +{"current_steps": 275, "total_steps": 1687, "loss": 0.9533, "lr": 3.952964567270409e-05, "epoch": 1.1410788381742738, "percentage": 16.3, "elapsed_time": "0:11:07", "remaining_time": "0:57:06"} +{"current_steps": 280, "total_steps": 1687, "loss": 0.9866, "lr": 3.9483981653469586e-05, "epoch": 1.161825726141079, "percentage": 16.6, "elapsed_time": "0:11:17", "remaining_time": "0:56:45"} +{"current_steps": 285, "total_steps": 1687, "loss": 1.0861, "lr": 3.9436231366470836e-05, "epoch": 1.1825726141078838, "percentage": 16.89, "elapsed_time": "0:11:28", "remaining_time": "0:56:25"} +{"current_steps": 290, "total_steps": 1687, "loss": 0.8913, "lr": 3.93863999246199e-05, "epoch": 1.2033195020746887, "percentage": 17.19, "elapsed_time": "0:11:38", "remaining_time": "0:56:06"} +{"current_steps": 295, "total_steps": 1687, "loss": 0.9027, "lr": 3.933449266367066e-05, "epoch": 1.2240663900414939, "percentage": 17.49, "elapsed_time": "0:11:49", "remaining_time": "0:55:46"} +{"current_steps": 300, "total_steps": 1687, "loss": 0.984, "lr": 3.92805151416475e-05, "epoch": 1.2448132780082988, "percentage": 17.78, "elapsed_time": "0:11:59", "remaining_time": "0:55:25"} +{"current_steps": 305, "total_steps": 1687, "loss": 0.9891, "lr": 3.9224473138250186e-05, "epoch": 1.2655601659751037, "percentage": 18.08, "elapsed_time": "0:12:09", "remaining_time": "0:55:07"} +{"current_steps": 310, "total_steps": 1687, "loss": 0.8473, "lr": 3.9166372654235e-05, "epoch": 1.2863070539419086, "percentage": 18.38, "elapsed_time": "0:12:20", "remaining_time": "0:54:50"} +{"current_steps": 315, "total_steps": 1687, "loss": 0.9896, "lr": 3.9106219910772184e-05, "epoch": 1.3070539419087137, "percentage": 18.67, "elapsed_time": "0:12:30", "remaining_time": "0:54:30"} +{"current_steps": 320, "total_steps": 1687, "loss": 0.9571, "lr": 
3.90440213487798e-05, "epoch": 1.3278008298755186, "percentage": 18.97, "elapsed_time": "0:12:41", "remaining_time": "0:54:13"} +{"current_steps": 325, "total_steps": 1687, "loss": 0.9725, "lr": 3.897978362823411e-05, "epoch": 1.3485477178423237, "percentage": 19.26, "elapsed_time": "0:12:52", "remaining_time": "0:53:56"} +{"current_steps": 330, "total_steps": 1687, "loss": 1.0897, "lr": 3.8913513627456374e-05, "epoch": 1.3692946058091287, "percentage": 19.56, "elapsed_time": "0:13:02", "remaining_time": "0:53:38"} +{"current_steps": 335, "total_steps": 1687, "loss": 1.0984, "lr": 3.8845218442376416e-05, "epoch": 1.3900414937759336, "percentage": 19.86, "elapsed_time": "0:13:13", "remaining_time": "0:53:21"} +{"current_steps": 340, "total_steps": 1687, "loss": 0.9807, "lr": 3.877490538577278e-05, "epoch": 1.4107883817427385, "percentage": 20.15, "elapsed_time": "0:13:23", "remaining_time": "0:53:04"} +{"current_steps": 345, "total_steps": 1687, "loss": 0.8938, "lr": 3.870258198648974e-05, "epoch": 1.4315352697095436, "percentage": 20.45, "elapsed_time": "0:13:34", "remaining_time": "0:52:47"} +{"current_steps": 350, "total_steps": 1687, "loss": 0.9289, "lr": 3.862825598863108e-05, "epoch": 1.4522821576763485, "percentage": 20.75, "elapsed_time": "0:13:44", "remaining_time": "0:52:30"} +{"current_steps": 355, "total_steps": 1687, "loss": 0.9448, "lr": 3.855193535073097e-05, "epoch": 1.4730290456431536, "percentage": 21.04, "elapsed_time": "0:13:55", "remaining_time": "0:52:13"} +{"current_steps": 360, "total_steps": 1687, "loss": 0.9142, "lr": 3.847362824490173e-05, "epoch": 1.4937759336099585, "percentage": 21.34, "elapsed_time": "0:14:05", "remaining_time": "0:51:56"} +{"current_steps": 365, "total_steps": 1687, "loss": 0.9044, "lr": 3.839334305595881e-05, "epoch": 1.5145228215767634, "percentage": 21.64, "elapsed_time": "0:14:15", "remaining_time": "0:51:38"} +{"current_steps": 370, "total_steps": 1687, "loss": 1.0486, "lr": 3.831108838052301e-05, "epoch": 
1.5352697095435683, "percentage": 21.93, "elapsed_time": "0:14:25", "remaining_time": "0:51:19"} +{"current_steps": 375, "total_steps": 1687, "loss": 1.0625, "lr": 3.822687302609994e-05, "epoch": 1.5560165975103735, "percentage": 22.23, "elapsed_time": "0:14:35", "remaining_time": "0:51:02"} +{"current_steps": 380, "total_steps": 1687, "loss": 0.99, "lr": 3.814070601013697e-05, "epoch": 1.5767634854771784, "percentage": 22.53, "elapsed_time": "0:14:45", "remaining_time": "0:50:45"} +{"current_steps": 385, "total_steps": 1687, "loss": 1.0098, "lr": 3.8052596559057674e-05, "epoch": 1.5975103734439835, "percentage": 22.82, "elapsed_time": "0:14:55", "remaining_time": "0:50:28"} +{"current_steps": 390, "total_steps": 1687, "loss": 0.9996, "lr": 3.7962554107273926e-05, "epoch": 1.6182572614107884, "percentage": 23.12, "elapsed_time": "0:15:05", "remaining_time": "0:50:12"} +{"current_steps": 395, "total_steps": 1687, "loss": 0.9104, "lr": 3.7870588296175644e-05, "epoch": 1.6390041493775933, "percentage": 23.41, "elapsed_time": "0:15:16", "remaining_time": "0:49:57"} +{"current_steps": 400, "total_steps": 1687, "loss": 0.9904, "lr": 3.7776708973098476e-05, "epoch": 1.6597510373443982, "percentage": 23.71, "elapsed_time": "0:15:26", "remaining_time": "0:49:41"} +{"current_steps": 405, "total_steps": 1687, "loss": 1.0073, "lr": 3.768092619026937e-05, "epoch": 1.6804979253112033, "percentage": 24.01, "elapsed_time": "0:16:32", "remaining_time": "0:52:21"} +{"current_steps": 410, "total_steps": 1687, "loss": 0.9565, "lr": 3.7583250203730234e-05, "epoch": 1.7012448132780082, "percentage": 24.3, "elapsed_time": "0:16:42", "remaining_time": "0:52:02"} +{"current_steps": 415, "total_steps": 1687, "loss": 1.0132, "lr": 3.7483691472239744e-05, "epoch": 1.7219917012448134, "percentage": 24.6, "elapsed_time": "0:16:52", "remaining_time": "0:51:44"} +{"current_steps": 420, "total_steps": 1687, "loss": 0.8144, "lr": 3.7382260656153436e-05, "epoch": 1.7427385892116183, "percentage": 
24.9, "elapsed_time": "0:17:03", "remaining_time": "0:51:27"} +{"current_steps": 425, "total_steps": 1687, "loss": 0.9513, "lr": 3.727896861628231e-05, "epoch": 1.7634854771784232, "percentage": 25.19, "elapsed_time": "0:17:13", "remaining_time": "0:51:09"} +{"current_steps": 430, "total_steps": 1687, "loss": 0.9952, "lr": 3.717382641272984e-05, "epoch": 1.784232365145228, "percentage": 25.49, "elapsed_time": "0:17:24", "remaining_time": "0:50:52"} +{"current_steps": 435, "total_steps": 1687, "loss": 0.8269, "lr": 3.7066845303707694e-05, "epoch": 1.8049792531120332, "percentage": 25.79, "elapsed_time": "0:17:35", "remaining_time": "0:50:37"} +{"current_steps": 440, "total_steps": 1687, "loss": 1.0451, "lr": 3.6958036744330297e-05, "epoch": 1.8257261410788381, "percentage": 26.08, "elapsed_time": "0:17:46", "remaining_time": "0:50:21"} +{"current_steps": 445, "total_steps": 1687, "loss": 0.8965, "lr": 3.6847412385388236e-05, "epoch": 1.8464730290456433, "percentage": 26.38, "elapsed_time": "0:17:56", "remaining_time": "0:50:04"} +{"current_steps": 450, "total_steps": 1687, "loss": 0.9751, "lr": 3.673498407210073e-05, "epoch": 1.8672199170124482, "percentage": 26.67, "elapsed_time": "0:18:06", "remaining_time": "0:49:46"} +{"current_steps": 455, "total_steps": 1687, "loss": 0.9165, "lr": 3.662076384284729e-05, "epoch": 1.887966804979253, "percentage": 26.97, "elapsed_time": "0:18:16", "remaining_time": "0:49:28"} +{"current_steps": 460, "total_steps": 1687, "loss": 0.9068, "lr": 3.650476392787873e-05, "epoch": 1.908713692946058, "percentage": 27.27, "elapsed_time": "0:18:26", "remaining_time": "0:49:11"} +{"current_steps": 465, "total_steps": 1687, "loss": 0.9429, "lr": 3.638699674800758e-05, "epoch": 1.929460580912863, "percentage": 27.56, "elapsed_time": "0:18:36", "remaining_time": "0:48:54"} +{"current_steps": 470, "total_steps": 1687, "loss": 1.0307, "lr": 3.6267474913278086e-05, "epoch": 1.950207468879668, "percentage": 27.86, "elapsed_time": "0:18:47", 
"remaining_time": "0:48:39"} +{"current_steps": 475, "total_steps": 1687, "loss": 1.0214, "lr": 3.614621122161603e-05, "epoch": 1.9709543568464731, "percentage": 28.16, "elapsed_time": "0:18:57", "remaining_time": "0:48:22"} +{"current_steps": 480, "total_steps": 1687, "loss": 1.0285, "lr": 3.6023218657458334e-05, "epoch": 1.991701244813278, "percentage": 28.45, "elapsed_time": "0:19:08", "remaining_time": "0:48:07"} +{"current_steps": 485, "total_steps": 1687, "loss": 0.8016, "lr": 3.589851039036277e-05, "epoch": 2.012448132780083, "percentage": 28.75, "elapsed_time": "0:19:18", "remaining_time": "0:47:50"} +{"current_steps": 490, "total_steps": 1687, "loss": 0.697, "lr": 3.577209977359778e-05, "epoch": 2.033195020746888, "percentage": 29.05, "elapsed_time": "0:19:29", "remaining_time": "0:47:35"} +{"current_steps": 495, "total_steps": 1687, "loss": 0.803, "lr": 3.5644000342712695e-05, "epoch": 2.0539419087136928, "percentage": 29.34, "elapsed_time": "0:19:39", "remaining_time": "0:47:20"} +{"current_steps": 500, "total_steps": 1687, "loss": 0.6451, "lr": 3.55142258140884e-05, "epoch": 2.074688796680498, "percentage": 29.64, "elapsed_time": "0:19:50", "remaining_time": "0:47:05"} +{"current_steps": 505, "total_steps": 1687, "loss": 0.7953, "lr": 3.538279008346861e-05, "epoch": 2.095435684647303, "percentage": 29.93, "elapsed_time": "0:20:00", "remaining_time": "0:46:49"} +{"current_steps": 510, "total_steps": 1687, "loss": 0.6491, "lr": 3.524970722447197e-05, "epoch": 2.116182572614108, "percentage": 30.23, "elapsed_time": "0:20:10", "remaining_time": "0:46:33"} +{"current_steps": 515, "total_steps": 1687, "loss": 0.8269, "lr": 3.511499148708517e-05, "epoch": 2.136929460580913, "percentage": 30.53, "elapsed_time": "0:20:20", "remaining_time": "0:46:17"} +{"current_steps": 520, "total_steps": 1687, "loss": 0.7598, "lr": 3.497865729613702e-05, "epoch": 2.1576763485477177, "percentage": 30.82, "elapsed_time": "0:20:30", "remaining_time": "0:46:00"} +{"current_steps": 
525, "total_steps": 1687, "loss": 0.8091, "lr": 3.484071924975398e-05, "epoch": 2.1784232365145226, "percentage": 31.12, "elapsed_time": "0:20:40", "remaining_time": "0:45:44"} +{"current_steps": 530, "total_steps": 1687, "loss": 0.7631, "lr": 3.4701192117796964e-05, "epoch": 2.199170124481328, "percentage": 31.42, "elapsed_time": "0:20:50", "remaining_time": "0:45:29"} +{"current_steps": 535, "total_steps": 1687, "loss": 0.696, "lr": 3.456009084027995e-05, "epoch": 2.219917012448133, "percentage": 31.71, "elapsed_time": "0:21:00", "remaining_time": "0:45:13"} +{"current_steps": 540, "total_steps": 1687, "loss": 0.7411, "lr": 3.441743052577014e-05, "epoch": 2.240663900414938, "percentage": 32.01, "elapsed_time": "0:21:10", "remaining_time": "0:44:59"} +{"current_steps": 545, "total_steps": 1687, "loss": 0.7664, "lr": 3.4273226449770314e-05, "epoch": 2.2614107883817427, "percentage": 32.31, "elapsed_time": "0:21:20", "remaining_time": "0:44:42"} +{"current_steps": 550, "total_steps": 1687, "loss": 0.6762, "lr": 3.4127494053083086e-05, "epoch": 2.2821576763485476, "percentage": 32.6, "elapsed_time": "0:21:30", "remaining_time": "0:44:28"} +{"current_steps": 555, "total_steps": 1687, "loss": 0.6597, "lr": 3.398024894015764e-05, "epoch": 2.3029045643153525, "percentage": 32.9, "elapsed_time": "0:21:41", "remaining_time": "0:44:14"} +{"current_steps": 560, "total_steps": 1687, "loss": 0.8247, "lr": 3.383150687741883e-05, "epoch": 2.323651452282158, "percentage": 33.2, "elapsed_time": "0:21:51", "remaining_time": "0:43:59"} +{"current_steps": 565, "total_steps": 1687, "loss": 0.7557, "lr": 3.368128379157897e-05, "epoch": 2.3443983402489628, "percentage": 33.49, "elapsed_time": "0:22:01", "remaining_time": "0:43:43"} +{"current_steps": 570, "total_steps": 1687, "loss": 0.7067, "lr": 3.3529595767932496e-05, "epoch": 2.3651452282157677, "percentage": 33.79, "elapsed_time": "0:22:11", "remaining_time": "0:43:29"} +{"current_steps": 575, "total_steps": 1687, "loss": 0.8143, 
"lr": 3.3376459048633565e-05, "epoch": 2.3858921161825726, "percentage": 34.08, "elapsed_time": "0:22:21", "remaining_time": "0:43:14"} +{"current_steps": 580, "total_steps": 1687, "loss": 0.7022, "lr": 3.322189003095696e-05, "epoch": 2.4066390041493775, "percentage": 34.38, "elapsed_time": "0:22:31", "remaining_time": "0:43:00"} +{"current_steps": 585, "total_steps": 1687, "loss": 0.8357, "lr": 3.306590526554233e-05, "epoch": 2.4273858921161824, "percentage": 34.68, "elapsed_time": "0:22:41", "remaining_time": "0:42:44"} +{"current_steps": 590, "total_steps": 1687, "loss": 0.8137, "lr": 3.290852145462196e-05, "epoch": 2.4481327800829877, "percentage": 34.97, "elapsed_time": "0:22:51", "remaining_time": "0:42:30"} +{"current_steps": 595, "total_steps": 1687, "loss": 0.8062, "lr": 3.274975545023242e-05, "epoch": 2.4688796680497926, "percentage": 35.27, "elapsed_time": "0:23:01", "remaining_time": "0:42:16"} +{"current_steps": 600, "total_steps": 1687, "loss": 0.8078, "lr": 3.258962425241011e-05, "epoch": 2.4896265560165975, "percentage": 35.57, "elapsed_time": "0:23:12", "remaining_time": "0:42:02"} +{"current_steps": 605, "total_steps": 1687, "loss": 0.7583, "lr": 3.242814500737092e-05, "epoch": 2.5103734439834025, "percentage": 35.86, "elapsed_time": "0:24:16", "remaining_time": "0:43:24"} +{"current_steps": 610, "total_steps": 1687, "loss": 0.8238, "lr": 3.226533500567433e-05, "epoch": 2.5311203319502074, "percentage": 36.16, "elapsed_time": "0:24:27", "remaining_time": "0:43:10"} +{"current_steps": 615, "total_steps": 1687, "loss": 0.8316, "lr": 3.2101211680371965e-05, "epoch": 2.5518672199170123, "percentage": 36.46, "elapsed_time": "0:24:37", "remaining_time": "0:42:55"} +{"current_steps": 620, "total_steps": 1687, "loss": 0.8382, "lr": 3.193579260514097e-05, "epoch": 2.572614107883817, "percentage": 36.75, "elapsed_time": "0:24:47", "remaining_time": "0:42:40"} +{"current_steps": 625, "total_steps": 1687, "loss": 0.797, "lr": 3.176909549240226e-05, "epoch": 
2.5933609958506225, "percentage": 37.05, "elapsed_time": "0:24:58", "remaining_time": "0:42:26"} +{"current_steps": 630, "total_steps": 1687, "loss": 0.7022, "lr": 3.1601138191423966e-05, "epoch": 2.6141078838174274, "percentage": 37.34, "elapsed_time": "0:25:08", "remaining_time": "0:42:11"} +{"current_steps": 635, "total_steps": 1687, "loss": 0.754, "lr": 3.143193868641019e-05, "epoch": 2.6348547717842323, "percentage": 37.64, "elapsed_time": "0:25:18", "remaining_time": "0:41:56"} +{"current_steps": 640, "total_steps": 1687, "loss": 0.7797, "lr": 3.1261515094575335e-05, "epoch": 2.6556016597510372, "percentage": 37.94, "elapsed_time": "0:25:29", "remaining_time": "0:41:42"} +{"current_steps": 645, "total_steps": 1687, "loss": 0.6938, "lr": 3.108988566420417e-05, "epoch": 2.6763485477178426, "percentage": 38.23, "elapsed_time": "0:25:39", "remaining_time": "0:41:27"} +{"current_steps": 650, "total_steps": 1687, "loss": 0.7476, "lr": 3.0917068772697934e-05, "epoch": 2.6970954356846475, "percentage": 38.53, "elapsed_time": "0:25:49", "remaining_time": "0:41:12"} +{"current_steps": 655, "total_steps": 1687, "loss": 0.6439, "lr": 3.074308292460646e-05, "epoch": 2.7178423236514524, "percentage": 38.83, "elapsed_time": "0:26:00", "remaining_time": "0:40:58"} +{"current_steps": 660, "total_steps": 1687, "loss": 0.6437, "lr": 3.056794674964685e-05, "epoch": 2.7385892116182573, "percentage": 39.12, "elapsed_time": "0:26:10", "remaining_time": "0:40:43"} +{"current_steps": 665, "total_steps": 1687, "loss": 0.7794, "lr": 3.0391679000708673e-05, "epoch": 2.759336099585062, "percentage": 39.42, "elapsed_time": "0:26:20", "remaining_time": "0:40:29"} +{"current_steps": 670, "total_steps": 1687, "loss": 0.7425, "lr": 3.0214298551845967e-05, "epoch": 2.780082987551867, "percentage": 39.72, "elapsed_time": "0:26:31", "remaining_time": "0:40:15"} +{"current_steps": 675, "total_steps": 1687, "loss": 0.799, "lr": 3.0035824396256267e-05, "epoch": 2.800829875518672, "percentage": 
40.01, "elapsed_time": "0:26:41", "remaining_time": "0:40:00"} +{"current_steps": 680, "total_steps": 1687, "loss": 0.7134, "lr": 2.9856275644246903e-05, "epoch": 2.821576763485477, "percentage": 40.31, "elapsed_time": "0:26:51", "remaining_time": "0:39:46"} +{"current_steps": 685, "total_steps": 1687, "loss": 0.7767, "lr": 2.9675671521188766e-05, "epoch": 2.8423236514522823, "percentage": 40.6, "elapsed_time": "0:27:01", "remaining_time": "0:39:32"} +{"current_steps": 690, "total_steps": 1687, "loss": 0.8742, "lr": 2.949403136545769e-05, "epoch": 2.863070539419087, "percentage": 40.9, "elapsed_time": "0:27:11", "remaining_time": "0:39:17"} +{"current_steps": 695, "total_steps": 1687, "loss": 0.6692, "lr": 2.9311374626363793e-05, "epoch": 2.883817427385892, "percentage": 41.2, "elapsed_time": "0:27:21", "remaining_time": "0:39:03"} +{"current_steps": 700, "total_steps": 1687, "loss": 0.7173, "lr": 2.9127720862068928e-05, "epoch": 2.904564315352697, "percentage": 41.49, "elapsed_time": "0:27:31", "remaining_time": "0:38:49"} +{"current_steps": 705, "total_steps": 1687, "loss": 0.7826, "lr": 2.8943089737492465e-05, "epoch": 2.9253112033195023, "percentage": 41.79, "elapsed_time": "0:27:41", "remaining_time": "0:38:34"} +{"current_steps": 710, "total_steps": 1687, "loss": 0.7881, "lr": 2.8757501022205653e-05, "epoch": 2.9460580912863072, "percentage": 42.09, "elapsed_time": "0:27:52", "remaining_time": "0:38:21"} +{"current_steps": 715, "total_steps": 1687, "loss": 0.638, "lr": 2.8570974588314767e-05, "epoch": 2.966804979253112, "percentage": 42.38, "elapsed_time": "0:28:02", "remaining_time": "0:38:07"} +{"current_steps": 720, "total_steps": 1687, "loss": 0.7669, "lr": 2.8383530408333285e-05, "epoch": 2.987551867219917, "percentage": 42.68, "elapsed_time": "0:28:13", "remaining_time": "0:37:53"} +{"current_steps": 725, "total_steps": 1687, "loss": 0.6422, "lr": 2.8195188553043317e-05, "epoch": 3.008298755186722, "percentage": 42.98, "elapsed_time": "0:28:23", 
"remaining_time": "0:37:39"} +{"current_steps": 730, "total_steps": 1687, "loss": 0.5641, "lr": 2.800596918934648e-05, "epoch": 3.029045643153527, "percentage": 43.27, "elapsed_time": "0:28:33", "remaining_time": "0:37:25"} +{"current_steps": 735, "total_steps": 1687, "loss": 0.6125, "lr": 2.7815892578104554e-05, "epoch": 3.0497925311203318, "percentage": 43.57, "elapsed_time": "0:28:42", "remaining_time": "0:37:11"} +{"current_steps": 740, "total_steps": 1687, "loss": 0.4968, "lr": 2.762497907196996e-05, "epoch": 3.070539419087137, "percentage": 43.86, "elapsed_time": "0:28:52", "remaining_time": "0:36:57"} +{"current_steps": 745, "total_steps": 1687, "loss": 0.591, "lr": 2.743324911320655e-05, "epoch": 3.091286307053942, "percentage": 44.16, "elapsed_time": "0:29:02", "remaining_time": "0:36:43"} +{"current_steps": 750, "total_steps": 1687, "loss": 0.5391, "lr": 2.724072323150069e-05, "epoch": 3.112033195020747, "percentage": 44.46, "elapsed_time": "0:29:12", "remaining_time": "0:36:29"} +{"current_steps": 755, "total_steps": 1687, "loss": 0.5404, "lr": 2.704742204176301e-05, "epoch": 3.132780082987552, "percentage": 44.75, "elapsed_time": "0:29:22", "remaining_time": "0:36:15"} +{"current_steps": 760, "total_steps": 1687, "loss": 0.546, "lr": 2.6853366241921083e-05, "epoch": 3.1535269709543567, "percentage": 45.05, "elapsed_time": "0:29:32", "remaining_time": "0:36:01"} +{"current_steps": 765, "total_steps": 1687, "loss": 0.5017, "lr": 2.6658576610703142e-05, "epoch": 3.1742738589211617, "percentage": 45.35, "elapsed_time": "0:29:42", "remaining_time": "0:35:48"} +{"current_steps": 770, "total_steps": 1687, "loss": 0.4833, "lr": 2.6463074005413187e-05, "epoch": 3.195020746887967, "percentage": 45.64, "elapsed_time": "0:29:53", "remaining_time": "0:35:35"} +{"current_steps": 775, "total_steps": 1687, "loss": 0.5946, "lr": 2.6266879359697647e-05, "epoch": 3.215767634854772, "percentage": 45.94, "elapsed_time": "0:30:03", "remaining_time": "0:35:22"} 
+{"current_steps": 780, "total_steps": 1687, "loss": 0.5182, "lr": 2.6070013681303933e-05, "epoch": 3.236514522821577, "percentage": 46.24, "elapsed_time": "0:30:13", "remaining_time": "0:35:08"} +{"current_steps": 785, "total_steps": 1687, "loss": 0.5421, "lr": 2.5872498049830973e-05, "epoch": 3.2572614107883817, "percentage": 46.53, "elapsed_time": "0:30:23", "remaining_time": "0:34:55"} +{"current_steps": 790, "total_steps": 1687, "loss": 0.4724, "lr": 2.5674353614472084e-05, "epoch": 3.2780082987551866, "percentage": 46.83, "elapsed_time": "0:30:33", "remaining_time": "0:34:42"} +{"current_steps": 795, "total_steps": 1687, "loss": 0.4999, "lr": 2.5475601591750448e-05, "epoch": 3.2987551867219915, "percentage": 47.13, "elapsed_time": "0:30:43", "remaining_time": "0:34:28"} +{"current_steps": 800, "total_steps": 1687, "loss": 0.5116, "lr": 2.5276263263247282e-05, "epoch": 3.3195020746887964, "percentage": 47.42, "elapsed_time": "0:30:54", "remaining_time": "0:34:15"} +{"current_steps": 805, "total_steps": 1687, "loss": 0.6154, "lr": 2.5076359973323107e-05, "epoch": 3.340248962655602, "percentage": 47.72, "elapsed_time": "0:31:58", "remaining_time": "0:35:01"} +{"current_steps": 810, "total_steps": 1687, "loss": 0.5952, "lr": 2.4875913126832297e-05, "epoch": 3.3609958506224067, "percentage": 48.01, "elapsed_time": "0:32:08", "remaining_time": "0:34:48"} +{"current_steps": 815, "total_steps": 1687, "loss": 0.4865, "lr": 2.4674944186831108e-05, "epoch": 3.3817427385892116, "percentage": 48.31, "elapsed_time": "0:32:19", "remaining_time": "0:34:34"} +{"current_steps": 820, "total_steps": 1687, "loss": 0.504, "lr": 2.4473474672279497e-05, "epoch": 3.4024896265560165, "percentage": 48.61, "elapsed_time": "0:32:29", "remaining_time": "0:34:21"} +{"current_steps": 825, "total_steps": 1687, "loss": 0.5078, "lr": 2.427152615573697e-05, "epoch": 3.4232365145228214, "percentage": 48.9, "elapsed_time": "0:32:40", "remaining_time": "0:34:08"} +{"current_steps": 830, 
"total_steps": 1687, "loss": 0.5275, "lr": 2.4069120261052682e-05, "epoch": 3.4439834024896268, "percentage": 49.2, "elapsed_time": "0:32:49", "remaining_time": "0:33:53"} +{"current_steps": 835, "total_steps": 1687, "loss": 0.5831, "lr": 2.386627866105002e-05, "epoch": 3.4647302904564317, "percentage": 49.5, "elapsed_time": "0:32:59", "remaining_time": "0:33:39"} +{"current_steps": 840, "total_steps": 1687, "loss": 0.5253, "lr": 2.3663023075205992e-05, "epoch": 3.4854771784232366, "percentage": 49.79, "elapsed_time": "0:33:09", "remaining_time": "0:33:26"} +{"current_steps": 845, "total_steps": 1687, "loss": 0.5637, "lr": 2.3459375267325552e-05, "epoch": 3.5062240663900415, "percentage": 50.09, "elapsed_time": "0:33:19", "remaining_time": "0:33:12"} +{"current_steps": 850, "total_steps": 1687, "loss": 0.612, "lr": 2.325535704321126e-05, "epoch": 3.5269709543568464, "percentage": 50.39, "elapsed_time": "0:33:29", "remaining_time": "0:32:58"} +{"current_steps": 855, "total_steps": 1687, "loss": 0.5303, "lr": 2.3050990248328365e-05, "epoch": 3.5477178423236513, "percentage": 50.68, "elapsed_time": "0:33:39", "remaining_time": "0:32:44"} +{"current_steps": 860, "total_steps": 1687, "loss": 0.5785, "lr": 2.2846296765465708e-05, "epoch": 3.568464730290456, "percentage": 50.98, "elapsed_time": "0:33:49", "remaining_time": "0:32:31"} +{"current_steps": 865, "total_steps": 1687, "loss": 0.5289, "lr": 2.2641298512392585e-05, "epoch": 3.5892116182572615, "percentage": 51.27, "elapsed_time": "0:33:58", "remaining_time": "0:32:17"} +{"current_steps": 870, "total_steps": 1687, "loss": 0.519, "lr": 2.2436017439511878e-05, "epoch": 3.6099585062240664, "percentage": 51.57, "elapsed_time": "0:34:08", "remaining_time": "0:32:03"} +{"current_steps": 875, "total_steps": 1687, "loss": 0.5643, "lr": 2.2230475527509712e-05, "epoch": 3.6307053941908713, "percentage": 51.87, "elapsed_time": "0:34:18", "remaining_time": "0:31:50"} +{"current_steps": 880, "total_steps": 1687, "loss": 0.5333, 
"lr": 2.2024694785001814e-05, "epoch": 3.6514522821576763, "percentage": 52.16, "elapsed_time": "0:34:28", "remaining_time": "0:31:36"} +{"current_steps": 885, "total_steps": 1687, "loss": 0.5145, "lr": 2.1818697246176943e-05, "epoch": 3.6721991701244816, "percentage": 52.46, "elapsed_time": "0:34:37", "remaining_time": "0:31:22"} +{"current_steps": 890, "total_steps": 1687, "loss": 0.6124, "lr": 2.161250496843756e-05, "epoch": 3.6929460580912865, "percentage": 52.76, "elapsed_time": "0:34:48", "remaining_time": "0:31:10"} +{"current_steps": 895, "total_steps": 1687, "loss": 0.5131, "lr": 2.1406140030037988e-05, "epoch": 3.7136929460580914, "percentage": 53.05, "elapsed_time": "0:34:58", "remaining_time": "0:30:56"} +{"current_steps": 900, "total_steps": 1687, "loss": 0.6005, "lr": 2.119962452772039e-05, "epoch": 3.7344398340248963, "percentage": 53.35, "elapsed_time": "0:35:08", "remaining_time": "0:30:43"} +{"current_steps": 905, "total_steps": 1687, "loss": 0.637, "lr": 2.0992980574348687e-05, "epoch": 3.7551867219917012, "percentage": 53.65, "elapsed_time": "0:35:17", "remaining_time": "0:30:30"} +{"current_steps": 910, "total_steps": 1687, "loss": 0.562, "lr": 2.0786230296540864e-05, "epoch": 3.775933609958506, "percentage": 53.94, "elapsed_time": "0:35:28", "remaining_time": "0:30:17"} +{"current_steps": 915, "total_steps": 1687, "loss": 0.4917, "lr": 2.0579395832299688e-05, "epoch": 3.796680497925311, "percentage": 54.24, "elapsed_time": "0:35:38", "remaining_time": "0:30:04"} +{"current_steps": 920, "total_steps": 1687, "loss": 0.6087, "lr": 2.0372499328642277e-05, "epoch": 3.817427385892116, "percentage": 54.53, "elapsed_time": "0:35:48", "remaining_time": "0:29:50"} +{"current_steps": 925, "total_steps": 1687, "loss": 0.54, "lr": 2.016556293922869e-05, "epoch": 3.8381742738589213, "percentage": 54.83, "elapsed_time": "0:35:58", "remaining_time": "0:29:37"} +{"current_steps": 930, "total_steps": 1687, "loss": 0.5028, "lr": 1.9958608821989792e-05, "epoch": 
3.858921161825726, "percentage": 55.13, "elapsed_time": "0:36:08", "remaining_time": "0:29:24"} +{"current_steps": 935, "total_steps": 1687, "loss": 0.4334, "lr": 1.9751659136754686e-05, "epoch": 3.879668049792531, "percentage": 55.42, "elapsed_time": "0:36:19", "remaining_time": "0:29:12"} +{"current_steps": 940, "total_steps": 1687, "loss": 0.3988, "lr": 1.9544736042877886e-05, "epoch": 3.900414937759336, "percentage": 55.72, "elapsed_time": "0:36:29", "remaining_time": "0:29:00"} +{"current_steps": 945, "total_steps": 1687, "loss": 0.5395, "lr": 1.9337861696866643e-05, "epoch": 3.921161825726141, "percentage": 56.02, "elapsed_time": "0:36:39", "remaining_time": "0:28:47"} +{"current_steps": 950, "total_steps": 1687, "loss": 0.6749, "lr": 1.913105825000844e-05, "epoch": 3.9419087136929463, "percentage": 56.31, "elapsed_time": "0:36:50", "remaining_time": "0:28:34"} +{"current_steps": 955, "total_steps": 1687, "loss": 0.5692, "lr": 1.8924347845999197e-05, "epoch": 3.962655601659751, "percentage": 56.61, "elapsed_time": "0:37:00", "remaining_time": "0:28:21"} +{"current_steps": 960, "total_steps": 1687, "loss": 0.5391, "lr": 1.871775261857215e-05, "epoch": 3.983402489626556, "percentage": 56.91, "elapsed_time": "0:37:10", "remaining_time": "0:28:08"} +{"current_steps": 965, "total_steps": 1687, "loss": 0.5623, "lr": 1.8511294689127887e-05, "epoch": 4.004149377593361, "percentage": 57.2, "elapsed_time": "0:37:20", "remaining_time": "0:27:56"} +{"current_steps": 970, "total_steps": 1687, "loss": 0.4125, "lr": 1.830499616436567e-05, "epoch": 4.024896265560166, "percentage": 57.5, "elapsed_time": "0:37:30", "remaining_time": "0:27:43"} +{"current_steps": 975, "total_steps": 1687, "loss": 0.4243, "lr": 1.8098879133916352e-05, "epoch": 4.045643153526971, "percentage": 57.79, "elapsed_time": "0:37:40", "remaining_time": "0:27:30"} +{"current_steps": 980, "total_steps": 1687, "loss": 0.4303, "lr": 1.789296566797706e-05, "epoch": 4.066390041493776, "percentage": 58.09, 
"elapsed_time": "0:37:50", "remaining_time": "0:27:17"} +{"current_steps": 985, "total_steps": 1687, "loss": 0.418, "lr": 1.768727781494807e-05, "epoch": 4.087136929460581, "percentage": 58.39, "elapsed_time": "0:38:00", "remaining_time": "0:27:05"} +{"current_steps": 990, "total_steps": 1687, "loss": 0.3992, "lr": 1.7481837599071903e-05, "epoch": 4.1078838174273855, "percentage": 58.68, "elapsed_time": "0:38:10", "remaining_time": "0:26:52"} +{"current_steps": 995, "total_steps": 1687, "loss": 0.379, "lr": 1.7276667018075073e-05, "epoch": 4.12863070539419, "percentage": 58.98, "elapsed_time": "0:38:20", "remaining_time": "0:26:39"} +{"current_steps": 1000, "total_steps": 1687, "loss": 0.3745, "lr": 1.7071788040812655e-05, "epoch": 4.149377593360996, "percentage": 59.28, "elapsed_time": "0:38:31", "remaining_time": "0:26:27"} +{"current_steps": 1005, "total_steps": 1687, "loss": 0.3669, "lr": 1.686722260491597e-05, "epoch": 4.170124481327801, "percentage": 59.57, "elapsed_time": "0:39:36", "remaining_time": "0:26:52"} +{"current_steps": 1010, "total_steps": 1687, "loss": 0.3601, "lr": 1.6662992614443525e-05, "epoch": 4.190871369294606, "percentage": 59.87, "elapsed_time": "0:39:46", "remaining_time": "0:26:39"} +{"current_steps": 1015, "total_steps": 1687, "loss": 0.3598, "lr": 1.6459119937535702e-05, "epoch": 4.211618257261411, "percentage": 60.17, "elapsed_time": "0:39:57", "remaining_time": "0:26:27"} +{"current_steps": 1020, "total_steps": 1687, "loss": 0.3857, "lr": 1.6255626404073132e-05, "epoch": 4.232365145228216, "percentage": 60.46, "elapsed_time": "0:40:07", "remaining_time": "0:26:14"} +{"current_steps": 1025, "total_steps": 1687, "loss": 0.3545, "lr": 1.605253380333927e-05, "epoch": 4.253112033195021, "percentage": 60.76, "elapsed_time": "0:40:17", "remaining_time": "0:26:01"} +{"current_steps": 1030, "total_steps": 1687, "loss": 0.327, "lr": 1.584986388168728e-05, "epoch": 4.273858921161826, "percentage": 61.06, "elapsed_time": "0:40:27", 
"remaining_time": "0:25:48"} +{"current_steps": 1035, "total_steps": 1687, "loss": 0.4217, "lr": 1.5647638340211525e-05, "epoch": 4.2946058091286305, "percentage": 61.35, "elapsed_time": "0:40:37", "remaining_time": "0:25:35"} +{"current_steps": 1040, "total_steps": 1687, "loss": 0.3972, "lr": 1.5445878832423876e-05, "epoch": 4.3153526970954355, "percentage": 61.65, "elapsed_time": "0:40:47", "remaining_time": "0:25:22"} +{"current_steps": 1045, "total_steps": 1687, "loss": 0.4227, "lr": 1.5244606961935187e-05, "epoch": 4.33609958506224, "percentage": 61.94, "elapsed_time": "0:40:57", "remaining_time": "0:25:09"} +{"current_steps": 1050, "total_steps": 1687, "loss": 0.3818, "lr": 1.5043844280142005e-05, "epoch": 4.356846473029045, "percentage": 62.24, "elapsed_time": "0:41:08", "remaining_time": "0:24:57"} +{"current_steps": 1055, "total_steps": 1687, "loss": 0.3707, "lr": 1.4843612283918995e-05, "epoch": 4.377593360995851, "percentage": 62.54, "elapsed_time": "0:41:18", "remaining_time": "0:24:44"} +{"current_steps": 1060, "total_steps": 1687, "loss": 0.3399, "lr": 1.4643932413317079e-05, "epoch": 4.398340248962656, "percentage": 62.83, "elapsed_time": "0:41:28", "remaining_time": "0:24:32"} +{"current_steps": 1065, "total_steps": 1687, "loss": 0.3923, "lr": 1.4444826049267784e-05, "epoch": 4.419087136929461, "percentage": 63.13, "elapsed_time": "0:41:38", "remaining_time": "0:24:19"} +{"current_steps": 1070, "total_steps": 1687, "loss": 0.3607, "lr": 1.4246314511293777e-05, "epoch": 4.439834024896266, "percentage": 63.43, "elapsed_time": "0:41:48", "remaining_time": "0:24:06"} +{"current_steps": 1075, "total_steps": 1687, "loss": 0.4178, "lr": 1.4048419055226146e-05, "epoch": 4.460580912863071, "percentage": 63.72, "elapsed_time": "0:41:58", "remaining_time": "0:23:53"} +{"current_steps": 1080, "total_steps": 1687, "loss": 0.4052, "lr": 1.3851160870928317e-05, "epoch": 4.481327800829876, "percentage": 64.02, "elapsed_time": "0:42:08", "remaining_time": "0:23:41"} 
+{"current_steps": 1085, "total_steps": 1687, "loss": 0.343, "lr": 1.3654561080027213e-05, "epoch": 4.5020746887966805, "percentage": 64.32, "elapsed_time": "0:42:19", "remaining_time": "0:23:28"} +{"current_steps": 1090, "total_steps": 1687, "loss": 0.393, "lr": 1.345864073365157e-05, "epoch": 4.522821576763485, "percentage": 64.61, "elapsed_time": "0:42:29", "remaining_time": "0:23:16"} +{"current_steps": 1095, "total_steps": 1687, "loss": 0.4336, "lr": 1.3263420810177902e-05, "epoch": 4.54356846473029, "percentage": 64.91, "elapsed_time": "0:42:38", "remaining_time": "0:23:03"} +{"current_steps": 1100, "total_steps": 1687, "loss": 0.4127, "lr": 1.3068922212984188e-05, "epoch": 4.564315352697095, "percentage": 65.2, "elapsed_time": "0:42:48", "remaining_time": "0:22:50"} +{"current_steps": 1105, "total_steps": 1687, "loss": 0.4029, "lr": 1.287516576821167e-05, "epoch": 4.5850622406639, "percentage": 65.5, "elapsed_time": "0:42:58", "remaining_time": "0:22:38"} +{"current_steps": 1110, "total_steps": 1687, "loss": 0.3313, "lr": 1.2682172222534805e-05, "epoch": 4.605809128630705, "percentage": 65.8, "elapsed_time": "0:43:08", "remaining_time": "0:22:25"} +{"current_steps": 1115, "total_steps": 1687, "loss": 0.3613, "lr": 1.2489962240939857e-05, "epoch": 4.62655601659751, "percentage": 66.09, "elapsed_time": "0:43:18", "remaining_time": "0:22:13"} +{"current_steps": 1120, "total_steps": 1687, "loss": 0.3179, "lr": 1.229855640451213e-05, "epoch": 4.647302904564316, "percentage": 66.39, "elapsed_time": "0:43:29", "remaining_time": "0:22:00"} +{"current_steps": 1125, "total_steps": 1687, "loss": 0.4198, "lr": 1.2107975208232259e-05, "epoch": 4.668049792531121, "percentage": 66.69, "elapsed_time": "0:43:38", "remaining_time": "0:21:48"} +{"current_steps": 1130, "total_steps": 1687, "loss": 0.3607, "lr": 1.1918239058781636e-05, "epoch": 4.6887966804979255, "percentage": 66.98, "elapsed_time": "0:43:48", "remaining_time": "0:21:35"} +{"current_steps": 1135, "total_steps": 
1687, "loss": 0.3285, "lr": 1.1729368272357419e-05, "epoch": 4.70954356846473, "percentage": 67.28, "elapsed_time": "0:43:58", "remaining_time": "0:21:23"} +{"current_steps": 1140, "total_steps": 1687, "loss": 0.3855, "lr": 1.1541383072497077e-05, "epoch": 4.730290456431535, "percentage": 67.58, "elapsed_time": "0:44:08", "remaining_time": "0:21:10"} +{"current_steps": 1145, "total_steps": 1687, "loss": 0.337, "lr": 1.1354303587913003e-05, "epoch": 4.75103734439834, "percentage": 67.87, "elapsed_time": "0:44:18", "remaining_time": "0:20:58"} +{"current_steps": 1150, "total_steps": 1687, "loss": 0.3401, "lr": 1.1168149850337136e-05, "epoch": 4.771784232365145, "percentage": 68.17, "elapsed_time": "0:44:28", "remaining_time": "0:20:46"} +{"current_steps": 1155, "total_steps": 1687, "loss": 0.3953, "lr": 1.0982941792376125e-05, "epoch": 4.79253112033195, "percentage": 68.46, "elapsed_time": "0:44:38", "remaining_time": "0:20:33"} +{"current_steps": 1160, "total_steps": 1687, "loss": 0.41, "lr": 1.0798699245376959e-05, "epoch": 4.813278008298755, "percentage": 68.76, "elapsed_time": "0:44:47", "remaining_time": "0:20:21"} +{"current_steps": 1165, "total_steps": 1687, "loss": 0.3275, "lr": 1.0615441937303534e-05, "epoch": 4.83402489626556, "percentage": 69.06, "elapsed_time": "0:44:58", "remaining_time": "0:20:08"} +{"current_steps": 1170, "total_steps": 1687, "loss": 0.3779, "lr": 1.0433189490624253e-05, "epoch": 4.854771784232365, "percentage": 69.35, "elapsed_time": "0:45:08", "remaining_time": "0:19:56"} +{"current_steps": 1175, "total_steps": 1687, "loss": 0.3465, "lr": 1.0251961420210937e-05, "epoch": 4.875518672199171, "percentage": 69.65, "elapsed_time": "0:45:18", "remaining_time": "0:19:44"} +{"current_steps": 1180, "total_steps": 1687, "loss": 0.3008, "lr": 1.0071777131249237e-05, "epoch": 4.8962655601659755, "percentage": 69.95, "elapsed_time": "0:45:28", "remaining_time": "0:19:32"} +{"current_steps": 1185, "total_steps": 1687, "loss": 0.3767, "lr": 
9.892655917160814e-06, "epoch": 4.91701244813278, "percentage": 70.24, "elapsed_time": "0:45:39", "remaining_time": "0:19:20"} +{"current_steps": 1190, "total_steps": 1687, "loss": 0.3356, "lr": 9.714616957537466e-06, "epoch": 4.937759336099585, "percentage": 70.54, "elapsed_time": "0:45:49", "remaining_time": "0:19:08"} +{"current_steps": 1195, "total_steps": 1687, "loss": 0.3299, "lr": 9.537679316087491e-06, "epoch": 4.95850622406639, "percentage": 70.84, "elapsed_time": "0:45:59", "remaining_time": "0:18:56"} +{"current_steps": 1200, "total_steps": 1687, "loss": 0.3432, "lr": 9.361861938594332e-06, "epoch": 4.979253112033195, "percentage": 71.13, "elapsed_time": "0:46:09", "remaining_time": "0:18:43"} +{"current_steps": 1205, "total_steps": 1687, "loss": 0.3926, "lr": 9.187183650888056e-06, "epoch": 5.0, "percentage": 71.43, "elapsed_time": "0:47:18", "remaining_time": "0:18:55"} +{"current_steps": 1210, "total_steps": 1687, "loss": 0.2897, "lr": 9.013663156829438e-06, "epoch": 5.020746887966805, "percentage": 71.72, "elapsed_time": "0:47:28", "remaining_time": "0:18:43"} +{"current_steps": 1215, "total_steps": 1687, "loss": 0.253, "lr": 8.841319036307334e-06, "epoch": 5.04149377593361, "percentage": 72.02, "elapsed_time": "0:47:38", "remaining_time": "0:18:30"} +{"current_steps": 1220, "total_steps": 1687, "loss": 0.2583, "lr": 8.670169743249143e-06, "epoch": 5.062240663900415, "percentage": 72.32, "elapsed_time": "0:47:48", "remaining_time": "0:18:18"} +{"current_steps": 1225, "total_steps": 1687, "loss": 0.2949, "lr": 8.50023360364487e-06, "epoch": 5.08298755186722, "percentage": 72.61, "elapsed_time": "0:47:59", "remaining_time": "0:18:05"} +{"current_steps": 1230, "total_steps": 1687, "loss": 0.2688, "lr": 8.331528813584832e-06, "epoch": 5.1037344398340245, "percentage": 72.91, "elapsed_time": "0:48:09", "remaining_time": "0:17:53"} +{"current_steps": 1235, "total_steps": 1687, "loss": 0.3074, "lr": 8.164073437311315e-06, "epoch": 5.124481327800829, 
"percentage": 73.21, "elapsed_time": "0:48:18", "remaining_time": "0:17:40"} +{"current_steps": 1240, "total_steps": 1687, "loss": 0.2572, "lr": 7.997885405284305e-06, "epoch": 5.145228215767635, "percentage": 73.5, "elapsed_time": "0:48:28", "remaining_time": "0:17:28"} +{"current_steps": 1245, "total_steps": 1687, "loss": 0.2603, "lr": 7.83298251226158e-06, "epoch": 5.16597510373444, "percentage": 73.8, "elapsed_time": "0:48:39", "remaining_time": "0:17:16"} +{"current_steps": 1250, "total_steps": 1687, "loss": 0.2724, "lr": 7.669382415393298e-06, "epoch": 5.186721991701245, "percentage": 74.1, "elapsed_time": "0:48:48", "remaining_time": "0:17:03"} +{"current_steps": 1255, "total_steps": 1687, "loss": 0.2, "lr": 7.507102632331382e-06, "epoch": 5.20746887966805, "percentage": 74.39, "elapsed_time": "0:48:59", "remaining_time": "0:16:51"} +{"current_steps": 1260, "total_steps": 1687, "loss": 0.2709, "lr": 7.3461605393537415e-06, "epoch": 5.228215767634855, "percentage": 74.69, "elapsed_time": "0:49:08", "remaining_time": "0:16:39"} +{"current_steps": 1265, "total_steps": 1687, "loss": 0.2643, "lr": 7.186573369503731e-06, "epoch": 5.24896265560166, "percentage": 74.99, "elapsed_time": "0:49:18", "remaining_time": "0:16:26"} +{"current_steps": 1270, "total_steps": 1687, "loss": 0.2853, "lr": 7.028358210744881e-06, "epoch": 5.269709543568465, "percentage": 75.28, "elapsed_time": "0:49:28", "remaining_time": "0:16:14"} +{"current_steps": 1275, "total_steps": 1687, "loss": 0.2802, "lr": 6.8715320041312095e-06, "epoch": 5.29045643153527, "percentage": 75.58, "elapsed_time": "0:49:38", "remaining_time": "0:16:02"} +{"current_steps": 1280, "total_steps": 1687, "loss": 0.2687, "lr": 6.716111541993213e-06, "epoch": 5.3112033195020745, "percentage": 75.87, "elapsed_time": "0:49:48", "remaining_time": "0:15:50"} +{"current_steps": 1285, "total_steps": 1687, "loss": 0.273, "lr": 6.562113466139836e-06, "epoch": 5.331950207468879, "percentage": 76.17, "elapsed_time": "0:49:57", 
"remaining_time": "0:15:37"} +{"current_steps": 1290, "total_steps": 1687, "loss": 0.2853, "lr": 6.4095542660765145e-06, "epoch": 5.352697095435684, "percentage": 76.47, "elapsed_time": "0:50:07", "remaining_time": "0:15:25"} +{"current_steps": 1295, "total_steps": 1687, "loss": 0.2691, "lr": 6.258450277239545e-06, "epoch": 5.37344398340249, "percentage": 76.76, "elapsed_time": "0:50:17", "remaining_time": "0:15:13"} +{"current_steps": 1300, "total_steps": 1687, "loss": 0.2528, "lr": 6.108817679246979e-06, "epoch": 5.394190871369295, "percentage": 77.06, "elapsed_time": "0:50:27", "remaining_time": "0:15:01"} +{"current_steps": 1305, "total_steps": 1687, "loss": 0.2519, "lr": 5.960672494166113e-06, "epoch": 5.4149377593361, "percentage": 77.36, "elapsed_time": "0:50:36", "remaining_time": "0:14:48"} +{"current_steps": 1310, "total_steps": 1687, "loss": 0.2879, "lr": 5.8140305847979895e-06, "epoch": 5.435684647302905, "percentage": 77.65, "elapsed_time": "0:50:46", "remaining_time": "0:14:36"} +{"current_steps": 1315, "total_steps": 1687, "loss": 0.287, "lr": 5.668907652978783e-06, "epoch": 5.45643153526971, "percentage": 77.95, "elapsed_time": "0:50:57", "remaining_time": "0:14:24"} +{"current_steps": 1320, "total_steps": 1687, "loss": 0.2254, "lr": 5.5253192378985966e-06, "epoch": 5.477178423236515, "percentage": 78.25, "elapsed_time": "0:51:07", "remaining_time": "0:14:12"} +{"current_steps": 1325, "total_steps": 1687, "loss": 0.2503, "lr": 5.383280714437518e-06, "epoch": 5.4979253112033195, "percentage": 78.54, "elapsed_time": "0:51:17", "remaining_time": "0:14:00"} +{"current_steps": 1330, "total_steps": 1687, "loss": 0.2999, "lr": 5.242807291519374e-06, "epoch": 5.518672199170124, "percentage": 78.84, "elapsed_time": "0:51:27", "remaining_time": "0:13:48"} +{"current_steps": 1335, "total_steps": 1687, "loss": 0.1811, "lr": 5.103914010483206e-06, "epoch": 5.539419087136929, "percentage": 79.13, "elapsed_time": "0:51:37", "remaining_time": "0:13:36"} 
+{"current_steps": 1340, "total_steps": 1687, "loss": 0.2922, "lr": 4.966615743472709e-06, "epoch": 5.560165975103734, "percentage": 79.43, "elapsed_time": "0:51:47", "remaining_time": "0:13:24"} +{"current_steps": 1345, "total_steps": 1687, "loss": 0.2274, "lr": 4.830927191843779e-06, "epoch": 5.580912863070539, "percentage": 79.73, "elapsed_time": "0:51:57", "remaining_time": "0:13:12"} +{"current_steps": 1350, "total_steps": 1687, "loss": 0.2435, "lr": 4.696862884590349e-06, "epoch": 5.601659751037344, "percentage": 80.02, "elapsed_time": "0:52:07", "remaining_time": "0:13:00"} +{"current_steps": 1355, "total_steps": 1687, "loss": 0.2842, "lr": 4.564437176788681e-06, "epoch": 5.622406639004149, "percentage": 80.32, "elapsed_time": "0:52:17", "remaining_time": "0:12:48"} +{"current_steps": 1360, "total_steps": 1687, "loss": 0.229, "lr": 4.433664248060295e-06, "epoch": 5.643153526970955, "percentage": 80.62, "elapsed_time": "0:52:27", "remaining_time": "0:12:36"} +{"current_steps": 1365, "total_steps": 1687, "loss": 0.2755, "lr": 4.304558101053629e-06, "epoch": 5.66390041493776, "percentage": 80.91, "elapsed_time": "0:52:37", "remaining_time": "0:12:24"} +{"current_steps": 1370, "total_steps": 1687, "loss": 0.2214, "lr": 4.177132559944761e-06, "epoch": 5.6846473029045645, "percentage": 81.21, "elapsed_time": "0:52:47", "remaining_time": "0:12:12"} +{"current_steps": 1375, "total_steps": 1687, "loss": 0.27, "lr": 4.051401268957087e-06, "epoch": 5.7053941908713695, "percentage": 81.51, "elapsed_time": "0:52:57", "remaining_time": "0:12:00"} +{"current_steps": 1380, "total_steps": 1687, "loss": 0.2684, "lr": 3.927377690900436e-06, "epoch": 5.726141078838174, "percentage": 81.8, "elapsed_time": "0:53:06", "remaining_time": "0:11:48"} +{"current_steps": 1385, "total_steps": 1687, "loss": 0.2639, "lr": 3.805075105729459e-06, "epoch": 5.746887966804979, "percentage": 82.1, "elapsed_time": "0:53:16", "remaining_time": "0:11:37"} +{"current_steps": 1390, "total_steps": 
1687, "loss": 0.2862, "lr": 3.6845066091216917e-06, "epoch": 5.767634854771784, "percentage": 82.39, "elapsed_time": "0:53:26", "remaining_time": "0:11:25"} +{"current_steps": 1395, "total_steps": 1687, "loss": 0.2365, "lr": 3.56568511107533e-06, "epoch": 5.788381742738589, "percentage": 82.69, "elapsed_time": "0:53:36", "remaining_time": "0:11:13"} +{"current_steps": 1400, "total_steps": 1687, "loss": 0.3138, "lr": 3.448623334526853e-06, "epoch": 5.809128630705394, "percentage": 82.99, "elapsed_time": "0:53:46", "remaining_time": "0:11:01"} +{"current_steps": 1405, "total_steps": 1687, "loss": 0.2836, "lr": 3.333333813988726e-06, "epoch": 5.829875518672199, "percentage": 83.28, "elapsed_time": "0:54:55", "remaining_time": "0:11:01"} +{"current_steps": 1410, "total_steps": 1687, "loss": 0.2612, "lr": 3.219828894207242e-06, "epoch": 5.850622406639004, "percentage": 83.58, "elapsed_time": "0:55:04", "remaining_time": "0:10:49"} +{"current_steps": 1415, "total_steps": 1687, "loss": 0.2353, "lr": 3.1081207288406846e-06, "epoch": 5.87136929460581, "percentage": 83.88, "elapsed_time": "0:55:14", "remaining_time": "0:10:37"} +{"current_steps": 1420, "total_steps": 1687, "loss": 0.2309, "lr": 2.9982212791580044e-06, "epoch": 5.8921161825726145, "percentage": 84.17, "elapsed_time": "0:55:24", "remaining_time": "0:10:25"} +{"current_steps": 1425, "total_steps": 1687, "loss": 0.2902, "lr": 2.890142312757982e-06, "epoch": 5.912863070539419, "percentage": 84.47, "elapsed_time": "0:55:34", "remaining_time": "0:10:13"} +{"current_steps": 1430, "total_steps": 1687, "loss": 0.2352, "lr": 2.7838954023092845e-06, "epoch": 5.933609958506224, "percentage": 84.77, "elapsed_time": "0:55:45", "remaining_time": "0:10:01"} +{"current_steps": 1435, "total_steps": 1687, "loss": 0.254, "lr": 2.679491924311226e-06, "epoch": 5.954356846473029, "percentage": 85.06, "elapsed_time": "0:55:55", "remaining_time": "0:09:49"} +{"current_steps": 1440, "total_steps": 1687, "loss": 0.2627, "lr": 
2.576943057875696e-06, "epoch": 5.975103734439834, "percentage": 85.36, "elapsed_time": "0:56:05", "remaining_time": "0:09:37"} +{"current_steps": 1445, "total_steps": 1687, "loss": 0.2422, "lr": 2.4762597835300815e-06, "epoch": 5.995850622406639, "percentage": 85.66, "elapsed_time": "0:56:15", "remaining_time": "0:09:25"} +{"current_steps": 1450, "total_steps": 1687, "loss": 0.2238, "lr": 2.377452882041551e-06, "epoch": 6.016597510373444, "percentage": 85.95, "elapsed_time": "0:56:24", "remaining_time": "0:09:13"} +{"current_steps": 1455, "total_steps": 1687, "loss": 0.1914, "lr": 2.280532933262678e-06, "epoch": 6.037344398340249, "percentage": 86.25, "elapsed_time": "0:56:35", "remaining_time": "0:09:01"} +{"current_steps": 1460, "total_steps": 1687, "loss": 0.2352, "lr": 2.1855103149985934e-06, "epoch": 6.058091286307054, "percentage": 86.54, "elapsed_time": "0:56:45", "remaining_time": "0:08:49"} +{"current_steps": 1465, "total_steps": 1687, "loss": 0.2003, "lr": 2.0923952018957826e-06, "epoch": 6.078838174273859, "percentage": 86.84, "elapsed_time": "0:56:55", "remaining_time": "0:08:37"} +{"current_steps": 1470, "total_steps": 1687, "loss": 0.185, "lr": 2.0011975643526106e-06, "epoch": 6.0995850622406635, "percentage": 87.14, "elapsed_time": "0:57:06", "remaining_time": "0:08:25"} +{"current_steps": 1475, "total_steps": 1687, "loss": 0.1719, "lr": 1.9119271674517305e-06, "epoch": 6.1203319502074685, "percentage": 87.43, "elapsed_time": "0:57:15", "remaining_time": "0:08:13"} +{"current_steps": 1480, "total_steps": 1687, "loss": 0.2254, "lr": 1.8245935699145035e-06, "epoch": 6.141078838174274, "percentage": 87.73, "elapsed_time": "0:57:25", "remaining_time": "0:08:01"} +{"current_steps": 1485, "total_steps": 1687, "loss": 0.1786, "lr": 1.7392061230774371e-06, "epoch": 6.161825726141079, "percentage": 88.03, "elapsed_time": "0:57:35", "remaining_time": "0:07:50"} +{"current_steps": 1490, "total_steps": 1687, "loss": 0.2062, "lr": 1.6557739698909436e-06, 
"epoch": 6.182572614107884, "percentage": 88.32, "elapsed_time": "0:57:45", "remaining_time": "0:07:38"} +{"current_steps": 1495, "total_steps": 1687, "loss": 0.1752, "lr": 1.574306043940288e-06, "epoch": 6.203319502074689, "percentage": 88.62, "elapsed_time": "0:57:55", "remaining_time": "0:07:26"} +{"current_steps": 1500, "total_steps": 1687, "loss": 0.2002, "lr": 1.4948110684890726e-06, "epoch": 6.224066390041494, "percentage": 88.92, "elapsed_time": "0:58:05", "remaining_time": "0:07:14"} +{"current_steps": 1505, "total_steps": 1687, "loss": 0.2326, "lr": 1.4172975555451363e-06, "epoch": 6.244813278008299, "percentage": 89.21, "elapsed_time": "0:58:15", "remaining_time": "0:07:02"} +{"current_steps": 1510, "total_steps": 1687, "loss": 0.2402, "lr": 1.3417738049491536e-06, "epoch": 6.265560165975104, "percentage": 89.51, "elapsed_time": "0:58:25", "remaining_time": "0:06:50"} +{"current_steps": 1515, "total_steps": 1687, "loss": 0.198, "lr": 1.268247903485902e-06, "epoch": 6.286307053941909, "percentage": 89.8, "elapsed_time": "0:58:36", "remaining_time": "0:06:39"} +{"current_steps": 1520, "total_steps": 1687, "loss": 0.21, "lr": 1.1967277240183716e-06, "epoch": 6.3070539419087135, "percentage": 90.1, "elapsed_time": "0:58:45", "remaining_time": "0:06:27"} +{"current_steps": 1525, "total_steps": 1687, "loss": 0.2066, "lr": 1.1272209246447696e-06, "epoch": 6.327800829875518, "percentage": 90.4, "elapsed_time": "0:58:55", "remaining_time": "0:06:15"} +{"current_steps": 1530, "total_steps": 1687, "loss": 0.199, "lr": 1.0597349478785123e-06, "epoch": 6.348547717842323, "percentage": 90.69, "elapsed_time": "0:59:05", "remaining_time": "0:06:03"} +{"current_steps": 1535, "total_steps": 1687, "loss": 0.2284, "lr": 9.942770198513218e-07, "epoch": 6.369294605809129, "percentage": 90.99, "elapsed_time": "0:59:15", "remaining_time": "0:05:52"} +{"current_steps": 1540, "total_steps": 1687, "loss": 0.2092, "lr": 9.308541495394751e-07, "epoch": 6.390041493775934, 
"percentage": 91.29, "elapsed_time": "0:59:25", "remaining_time": "0:05:40"} +{"current_steps": 1545, "total_steps": 1687, "loss": 0.2104, "lr": 8.694731280133051e-07, "epoch": 6.410788381742739, "percentage": 91.58, "elapsed_time": "0:59:35", "remaining_time": "0:05:28"} +{"current_steps": 1550, "total_steps": 1687, "loss": 0.2306, "lr": 8.101405277100549e-07, "epoch": 6.431535269709544, "percentage": 91.88, "elapsed_time": "0:59:44", "remaining_time": "0:05:16"} +{"current_steps": 1555, "total_steps": 1687, "loss": 0.1801, "lr": 7.528627017301016e-07, "epoch": 6.452282157676349, "percentage": 92.18, "elapsed_time": "0:59:54", "remaining_time": "0:05:05"} +{"current_steps": 1560, "total_steps": 1687, "loss": 0.19, "lr": 6.976457831567262e-07, "epoch": 6.473029045643154, "percentage": 92.47, "elapsed_time": "1:00:04", "remaining_time": "0:04:53"} +{"current_steps": 1565, "total_steps": 1687, "loss": 0.2115, "lr": 6.444956843993754e-07, "epoch": 6.4937759336099585, "percentage": 92.77, "elapsed_time": "1:00:14", "remaining_time": "0:04:41"} +{"current_steps": 1570, "total_steps": 1687, "loss": 0.2254, "lr": 5.934180965606007e-07, "epoch": 6.514522821576763, "percentage": 93.06, "elapsed_time": "1:00:24", "remaining_time": "0:04:30"} +{"current_steps": 1575, "total_steps": 1687, "loss": 0.2009, "lr": 5.444184888266768e-07, "epoch": 6.535269709543568, "percentage": 93.36, "elapsed_time": "1:00:34", "remaining_time": "0:04:18"} +{"current_steps": 1580, "total_steps": 1687, "loss": 0.1839, "lr": 4.975021078819731e-07, "epoch": 6.556016597510373, "percentage": 93.66, "elapsed_time": "1:00:44", "remaining_time": "0:04:06"} +{"current_steps": 1585, "total_steps": 1687, "loss": 0.2637, "lr": 4.5267397734717113e-07, "epoch": 6.576763485477178, "percentage": 93.95, "elapsed_time": "1:00:53", "remaining_time": "0:03:55"} +{"current_steps": 1590, "total_steps": 1687, "loss": 0.2036, "lr": 4.0993889724135314e-07, "epoch": 6.597510373443983, "percentage": 94.25, "elapsed_time": 
"1:01:03", "remaining_time": "0:03:43"} +{"current_steps": 1595, "total_steps": 1687, "loss": 0.2089, "lr": 3.693014434680242e-07, "epoch": 6.618257261410788, "percentage": 94.55, "elapsed_time": "1:01:14", "remaining_time": "0:03:31"} +{"current_steps": 1600, "total_steps": 1687, "loss": 0.2244, "lr": 3.307659673251595e-07, "epoch": 6.639004149377593, "percentage": 94.84, "elapsed_time": "1:01:23", "remaining_time": "0:03:20"} +{"current_steps": 1605, "total_steps": 1687, "loss": 0.2327, "lr": 2.9433659503926623e-07, "epoch": 6.659751037344399, "percentage": 95.14, "elapsed_time": "1:02:29", "remaining_time": "0:03:11"} +{"current_steps": 1610, "total_steps": 1687, "loss": 0.2035, "lr": 2.6001722732358127e-07, "epoch": 6.680497925311204, "percentage": 95.44, "elapsed_time": "1:02:38", "remaining_time": "0:02:59"} +{"current_steps": 1615, "total_steps": 1687, "loss": 0.1913, "lr": 2.27811538960383e-07, "epoch": 6.7012448132780085, "percentage": 95.73, "elapsed_time": "1:02:48", "remaining_time": "0:02:48"} +{"current_steps": 1620, "total_steps": 1687, "loss": 0.1967, "lr": 1.9772297840752407e-07, "epoch": 6.721991701244813, "percentage": 96.03, "elapsed_time": "1:02:58", "remaining_time": "0:02:36"} +{"current_steps": 1625, "total_steps": 1687, "loss": 0.2021, "lr": 1.6975476742916886e-07, "epoch": 6.742738589211618, "percentage": 96.32, "elapsed_time": "1:03:08", "remaining_time": "0:02:24"} +{"current_steps": 1630, "total_steps": 1687, "loss": 0.2313, "lr": 1.43909900750836e-07, "epoch": 6.763485477178423, "percentage": 96.62, "elapsed_time": "1:03:18", "remaining_time": "0:02:12"} +{"current_steps": 1635, "total_steps": 1687, "loss": 0.1937, "lr": 1.2019114573871947e-07, "epoch": 6.784232365145228, "percentage": 96.92, "elapsed_time": "1:03:28", "remaining_time": "0:02:01"} +{"current_steps": 1640, "total_steps": 1687, "loss": 0.1573, "lr": 9.860104210338562e-08, "epoch": 6.804979253112033, "percentage": 97.21, "elapsed_time": "1:03:38", "remaining_time": 
"0:01:49"} +{"current_steps": 1645, "total_steps": 1687, "loss": 0.1822, "lr": 7.914190162781277e-08, "epoch": 6.825726141078838, "percentage": 97.51, "elapsed_time": "1:03:48", "remaining_time": "0:01:37"} +{"current_steps": 1650, "total_steps": 1687, "loss": 0.2129, "lr": 6.181580791987385e-08, "epoch": 6.846473029045643, "percentage": 97.81, "elapsed_time": "1:03:58", "remaining_time": "0:01:26"} +{"current_steps": 1655, "total_steps": 1687, "loss": 0.1678, "lr": 4.6624616189214765e-08, "epoch": 6.867219917012449, "percentage": 98.1, "elapsed_time": "1:04:08", "remaining_time": "0:01:14"} +{"current_steps": 1660, "total_steps": 1687, "loss": 0.1979, "lr": 3.3569953048624426e-08, "epoch": 6.8879668049792535, "percentage": 98.4, "elapsed_time": "1:04:18", "remaining_time": "0:01:02"} +{"current_steps": 1665, "total_steps": 1687, "loss": 0.2183, "lr": 2.2653216339840746e-08, "epoch": 6.908713692946058, "percentage": 98.7, "elapsed_time": "1:04:28", "remaining_time": "0:00:51"} +{"current_steps": 1670, "total_steps": 1687, "loss": 0.1777, "lr": 1.3875574983894802e-08, "epoch": 6.929460580912863, "percentage": 98.99, "elapsed_time": "1:04:37", "remaining_time": "0:00:39"} +{"current_steps": 1675, "total_steps": 1687, "loss": 0.1835, "lr": 7.237968855937638e-09, "epoch": 6.950207468879668, "percentage": 99.29, "elapsed_time": "1:04:47", "remaining_time": "0:00:27"} +{"current_steps": 1680, "total_steps": 1687, "loss": 0.2133, "lr": 2.7411086846051984e-09, "epoch": 6.970954356846473, "percentage": 99.59, "elapsed_time": "1:04:57", "remaining_time": "0:00:16"} +{"current_steps": 1685, "total_steps": 1687, "loss": 0.2164, "lr": 3.8547597591254147e-10, "epoch": 6.991701244813278, "percentage": 99.88, "elapsed_time": "1:05:07", "remaining_time": "0:00:04"} +{"current_steps": 1687, "total_steps": 1687, "epoch": 7.0, "percentage": 100.0, "elapsed_time": "1:06:05", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 
0000000..ea938e4 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3754 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 1687, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02074688796680498, + "grad_norm": 41.397027140234286, + "learning_rate": 9.467455621301776e-07, + "loss": 3.331, + "loss_nan_ranks": 0, + "loss_rank_avg": 2.9795444011688232, + "step": 5, + "valid_targets_mean": 134.4, + "valid_targets_min": 41 + }, + { + "epoch": 0.04149377593360996, + "grad_norm": 35.61947240445416, + "learning_rate": 2.1301775147929e-06, + "loss": 2.722, + "loss_nan_ranks": 0, + "loss_rank_avg": 2.5642552375793457, + "step": 10, + "valid_targets_mean": 161.3, + "valid_targets_min": 48 + }, + { + "epoch": 0.06224066390041494, + "grad_norm": 52.370048634783274, + "learning_rate": 3.313609467455622e-06, + "loss": 3.1562, + "loss_nan_ranks": 0, + "loss_rank_avg": 3.396860122680664, + "step": 15, + "valid_targets_mean": 97.2, + "valid_targets_min": 66 + }, + { + "epoch": 0.08298755186721991, + "grad_norm": 25.358721084807172, + "learning_rate": 4.497041420118343e-06, + "loss": 2.9616, + "loss_nan_ranks": 0, + "loss_rank_avg": 3.084906578063965, + "step": 20, + "valid_targets_mean": 104.8, + "valid_targets_min": 68 + }, + { + "epoch": 0.1037344398340249, + "grad_norm": 14.282450323196407, + "learning_rate": 5.680473372781066e-06, + "loss": 2.4025, + "loss_nan_ranks": 0, + "loss_rank_avg": 2.6719250679016113, + "step": 25, + "valid_targets_mean": 96.6, + "valid_targets_min": 54 + }, + { + "epoch": 0.12448132780082988, + "grad_norm": 6.589012266835091, + "learning_rate": 6.863905325443787e-06, + "loss": 2.3199, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.578616976737976, + "step": 30, + "valid_targets_mean": 189.2, + "valid_targets_min": 59 + }, + { + "epoch": 0.14522821576763487, + "grad_norm": 
8.720894250976365, + "learning_rate": 8.04733727810651e-06, + "loss": 2.0466, + "loss_nan_ranks": 0, + "loss_rank_avg": 2.2473416328430176, + "step": 35, + "valid_targets_mean": 108.4, + "valid_targets_min": 59 + }, + { + "epoch": 0.16597510373443983, + "grad_norm": 6.572593557764174, + "learning_rate": 9.230769230769232e-06, + "loss": 2.0725, + "loss_nan_ranks": 0, + "loss_rank_avg": 2.0674421787261963, + "step": 40, + "valid_targets_mean": 113.9, + "valid_targets_min": 58 + }, + { + "epoch": 0.18672199170124482, + "grad_norm": 5.530699449121817, + "learning_rate": 1.0414201183431953e-05, + "loss": 1.7968, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.739060878753662, + "step": 45, + "valid_targets_mean": 115.5, + "valid_targets_min": 62 + }, + { + "epoch": 0.2074688796680498, + "grad_norm": 4.37749124574813, + "learning_rate": 1.1597633136094675e-05, + "loss": 1.6917, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.5368684530258179, + "step": 50, + "valid_targets_mean": 139.1, + "valid_targets_min": 57 + }, + { + "epoch": 0.22821576763485477, + "grad_norm": 4.295701199410857, + "learning_rate": 1.2781065088757399e-05, + "loss": 1.5291, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.4562065601348877, + "step": 55, + "valid_targets_mean": 132.5, + "valid_targets_min": 55 + }, + { + "epoch": 0.24896265560165975, + "grad_norm": 4.084731189779167, + "learning_rate": 1.396449704142012e-05, + "loss": 1.5082, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.390439510345459, + "step": 60, + "valid_targets_mean": 141.6, + "valid_targets_min": 47 + }, + { + "epoch": 0.2697095435684647, + "grad_norm": 4.96787833573562, + "learning_rate": 1.5147928994082842e-05, + "loss": 1.3951, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.4382585287094116, + "step": 65, + "valid_targets_mean": 94.7, + "valid_targets_min": 64 + }, + { + "epoch": 0.29045643153526973, + "grad_norm": 4.26457221140469, + "learning_rate": 1.6331360946745562e-05, + "loss": 1.4802, + "loss_nan_ranks": 0, + "loss_rank_avg": 
1.4862592220306396, + "step": 70, + "valid_targets_mean": 133.1, + "valid_targets_min": 75 + }, + { + "epoch": 0.3112033195020747, + "grad_norm": 4.762762510462149, + "learning_rate": 1.7514792899408286e-05, + "loss": 1.4573, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.6193333864212036, + "step": 75, + "valid_targets_mean": 95.4, + "valid_targets_min": 62 + }, + { + "epoch": 0.33195020746887965, + "grad_norm": 5.211587363436592, + "learning_rate": 1.8698224852071007e-05, + "loss": 1.3158, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.5069483518600464, + "step": 80, + "valid_targets_mean": 85.4, + "valid_targets_min": 51 + }, + { + "epoch": 0.35269709543568467, + "grad_norm": 5.167018764133475, + "learning_rate": 1.9881656804733727e-05, + "loss": 1.3183, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.5408308506011963, + "step": 85, + "valid_targets_mean": 111.9, + "valid_targets_min": 78 + }, + { + "epoch": 0.37344398340248963, + "grad_norm": 4.19223324707348, + "learning_rate": 2.106508875739645e-05, + "loss": 1.1263, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0536243915557861, + "step": 90, + "valid_targets_mean": 109.9, + "valid_targets_min": 54 + }, + { + "epoch": 0.3941908713692946, + "grad_norm": 4.327426360795543, + "learning_rate": 2.224852071005917e-05, + "loss": 1.2161, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.2340021133422852, + "step": 95, + "valid_targets_mean": 108.8, + "valid_targets_min": 47 + }, + { + "epoch": 0.4149377593360996, + "grad_norm": 4.407589788387759, + "learning_rate": 2.3431952662721896e-05, + "loss": 1.179, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.28630530834198, + "step": 100, + "valid_targets_mean": 123.1, + "valid_targets_min": 78 + }, + { + "epoch": 0.43568464730290457, + "grad_norm": 4.476057603578654, + "learning_rate": 2.461538461538462e-05, + "loss": 1.2358, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.2336297035217285, + "step": 105, + "valid_targets_mean": 96.8, + "valid_targets_min": 61 + }, + { + "epoch": 
0.45643153526970953, + "grad_norm": 4.275689963533272, + "learning_rate": 2.5798816568047337e-05, + "loss": 1.2222, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.3048899173736572, + "step": 110, + "valid_targets_mean": 119.9, + "valid_targets_min": 60 + }, + { + "epoch": 0.47717842323651455, + "grad_norm": 4.114686647723986, + "learning_rate": 2.698224852071006e-05, + "loss": 1.1238, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1282589435577393, + "step": 115, + "valid_targets_mean": 119.6, + "valid_targets_min": 57 + }, + { + "epoch": 0.4979253112033195, + "grad_norm": 2.848802193178965, + "learning_rate": 2.8165680473372784e-05, + "loss": 1.1292, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9970792531967163, + "step": 120, + "valid_targets_mean": 197.2, + "valid_targets_min": 70 + }, + { + "epoch": 0.5186721991701245, + "grad_norm": 3.917339058459324, + "learning_rate": 2.9349112426035505e-05, + "loss": 1.2792, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0999289751052856, + "step": 125, + "valid_targets_mean": 106.1, + "valid_targets_min": 50 + }, + { + "epoch": 0.5394190871369294, + "grad_norm": 4.973393373968101, + "learning_rate": 3.0532544378698226e-05, + "loss": 1.2547, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.5572994947433472, + "step": 130, + "valid_targets_mean": 93.6, + "valid_targets_min": 48 + }, + { + "epoch": 0.5601659751037344, + "grad_norm": 4.093849799808453, + "learning_rate": 3.171597633136095e-05, + "loss": 1.1989, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.289677381515503, + "step": 135, + "valid_targets_mean": 117.5, + "valid_targets_min": 56 + }, + { + "epoch": 0.5809128630705395, + "grad_norm": 4.130110837411625, + "learning_rate": 3.289940828402367e-05, + "loss": 1.291, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.284778356552124, + "step": 140, + "valid_targets_mean": 104.9, + "valid_targets_min": 59 + }, + { + "epoch": 0.6016597510373444, + "grad_norm": 5.236925712153894, + "learning_rate": 3.40828402366864e-05, + "loss": 1.2244, + 
"loss_nan_ranks": 0, + "loss_rank_avg": 1.3275096416473389, + "step": 145, + "valid_targets_mean": 93.9, + "valid_targets_min": 52 + }, + { + "epoch": 0.6224066390041494, + "grad_norm": 4.0627237296792185, + "learning_rate": 3.5266272189349114e-05, + "loss": 1.1273, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.3301905393600464, + "step": 150, + "valid_targets_mean": 118.0, + "valid_targets_min": 82 + }, + { + "epoch": 0.6431535269709544, + "grad_norm": 3.6554328860688985, + "learning_rate": 3.644970414201184e-05, + "loss": 1.0308, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0490658283233643, + "step": 155, + "valid_targets_mean": 149.6, + "valid_targets_min": 53 + }, + { + "epoch": 0.6639004149377593, + "grad_norm": 4.388884052961649, + "learning_rate": 3.763313609467456e-05, + "loss": 0.9736, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.3347870111465454, + "step": 160, + "valid_targets_mean": 87.1, + "valid_targets_min": 46 + }, + { + "epoch": 0.6846473029045643, + "grad_norm": 4.283544953396232, + "learning_rate": 3.881656804733728e-05, + "loss": 1.166, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.3210680484771729, + "step": 165, + "valid_targets_mean": 119.6, + "valid_targets_min": 61 + }, + { + "epoch": 0.7053941908713693, + "grad_norm": 4.3275598749896504, + "learning_rate": 4e-05, + "loss": 1.0946, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.173753023147583, + "step": 170, + "valid_targets_mean": 94.1, + "valid_targets_min": 63 + }, + { + "epoch": 0.7261410788381742, + "grad_norm": 3.133490340424578, + "learning_rate": 3.999892923951514e-05, + "loss": 1.0991, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9916005730628967, + "step": 175, + "valid_targets_mean": 175.0, + "valid_targets_min": 65 + }, + { + "epoch": 0.7468879668049793, + "grad_norm": 3.820724592316591, + "learning_rate": 3.999571707271335e-05, + "loss": 1.1184, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.162503719329834, + "step": 180, + "valid_targets_mean": 116.2, + "valid_targets_min": 66 + }, + { 
+ "epoch": 0.7676348547717843, + "grad_norm": 3.1153583861325584, + "learning_rate": 3.999036384354076e-05, + "loss": 1.0885, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.132462501525879, + "step": 185, + "valid_targets_mean": 155.9, + "valid_targets_min": 55 + }, + { + "epoch": 0.7883817427385892, + "grad_norm": 3.2268572394739867, + "learning_rate": 3.99828701252e-05, + "loss": 1.1776, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0295939445495605, + "step": 190, + "valid_targets_mean": 146.7, + "valid_targets_min": 68 + }, + { + "epoch": 0.8091286307053942, + "grad_norm": 4.014232602275577, + "learning_rate": 3.997323672008881e-05, + "loss": 1.1762, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.3584688901901245, + "step": 195, + "valid_targets_mean": 106.2, + "valid_targets_min": 56 + }, + { + "epoch": 0.8298755186721992, + "grad_norm": 2.894292449308229, + "learning_rate": 3.9961464659714154e-05, + "loss": 1.0519, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8366074562072754, + "step": 200, + "valid_targets_mean": 144.6, + "valid_targets_min": 63 + }, + { + "epoch": 0.8506224066390041, + "grad_norm": 3.346195548737324, + "learning_rate": 3.994755520458173e-05, + "loss": 1.2221, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.131117343902588, + "step": 205, + "valid_targets_mean": 118.6, + "valid_targets_min": 61 + }, + { + "epoch": 0.8713692946058091, + "grad_norm": 2.861208912009083, + "learning_rate": 3.9931509844061034e-05, + "loss": 1.1059, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.914353609085083, + "step": 210, + "valid_targets_mean": 161.1, + "valid_targets_min": 68 + }, + { + "epoch": 0.8921161825726142, + "grad_norm": 2.4635047614518433, + "learning_rate": 3.991333029622587e-05, + "loss": 1.0881, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7619673609733582, + "step": 215, + "valid_targets_mean": 187.9, + "valid_targets_min": 49 + }, + { + "epoch": 0.9128630705394191, + "grad_norm": 3.439388327039743, + "learning_rate": 3.9893018507670384e-05, + "loss": 1.0801, 
+ "loss_nan_ranks": 0, + "loss_rank_avg": 1.1252914667129517, + "step": 220, + "valid_targets_mean": 128.8, + "valid_targets_min": 66 + }, + { + "epoch": 0.9336099585062241, + "grad_norm": 3.0479402830613016, + "learning_rate": 3.987057665330063e-05, + "loss": 1.1093, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.052093505859375, + "step": 225, + "valid_targets_mean": 155.6, + "valid_targets_min": 63 + }, + { + "epoch": 0.9543568464730291, + "grad_norm": 3.4173751722238928, + "learning_rate": 3.984600713610169e-05, + "loss": 1.043, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9762424230575562, + "step": 230, + "valid_targets_mean": 115.1, + "valid_targets_min": 66 + }, + { + "epoch": 0.975103734439834, + "grad_norm": 3.203880033800484, + "learning_rate": 3.981931258688038e-05, + "loss": 1.1468, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.121521234512329, + "step": 235, + "valid_targets_mean": 137.5, + "valid_targets_min": 68 + }, + { + "epoch": 0.995850622406639, + "grad_norm": 3.4770939413356463, + "learning_rate": 3.979049586398355e-05, + "loss": 1.1073, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1996042728424072, + "step": 240, + "valid_targets_mean": 117.0, + "valid_targets_min": 46 + }, + { + "epoch": 1.016597510373444, + "grad_norm": 3.930284483527447, + "learning_rate": 3.975956005299202e-05, + "loss": 1.1062, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1427793502807617, + "step": 245, + "valid_targets_mean": 93.7, + "valid_targets_min": 50 + }, + { + "epoch": 1.037344398340249, + "grad_norm": 3.4787980480099634, + "learning_rate": 3.972650846639019e-05, + "loss": 0.9987, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8831626176834106, + "step": 250, + "valid_targets_mean": 117.0, + "valid_targets_min": 51 + }, + { + "epoch": 1.058091286307054, + "grad_norm": 5.765404317940215, + "learning_rate": 3.9691344643211346e-05, + "loss": 0.9541, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1344108581542969, + "step": 255, + "valid_targets_mean": 104.3, + 
"valid_targets_min": 63 + }, + { + "epoch": 1.0788381742738589, + "grad_norm": 3.77909701623929, + "learning_rate": 3.965407234865871e-05, + "loss": 1.0478, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0921099185943604, + "step": 260, + "valid_targets_mean": 114.2, + "valid_targets_min": 59 + }, + { + "epoch": 1.099585062240664, + "grad_norm": 3.026006206344685, + "learning_rate": 3.9614695573702325e-05, + "loss": 0.9397, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7856882810592651, + "step": 265, + "valid_targets_mean": 144.1, + "valid_targets_min": 64 + }, + { + "epoch": 1.120331950207469, + "grad_norm": 2.4834572794776446, + "learning_rate": 3.957321853465163e-05, + "loss": 0.9561, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7817927598953247, + "step": 270, + "valid_targets_mean": 190.4, + "valid_targets_min": 55 + }, + { + "epoch": 1.1410788381742738, + "grad_norm": 3.056967616081349, + "learning_rate": 3.952964567270409e-05, + "loss": 0.9533, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8634785413742065, + "step": 275, + "valid_targets_mean": 134.5, + "valid_targets_min": 49 + }, + { + "epoch": 1.161825726141079, + "grad_norm": 2.5906209217918117, + "learning_rate": 3.9483981653469586e-05, + "loss": 0.9866, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7464009523391724, + "step": 280, + "valid_targets_mean": 179.6, + "valid_targets_min": 57 + }, + { + "epoch": 1.1825726141078838, + "grad_norm": 3.2705129782461686, + "learning_rate": 3.9436231366470836e-05, + "loss": 1.0861, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9500118494033813, + "step": 285, + "valid_targets_mean": 130.6, + "valid_targets_min": 64 + }, + { + "epoch": 1.2033195020746887, + "grad_norm": 2.8686194044847473, + "learning_rate": 3.93863999246199e-05, + "loss": 0.8913, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7544863224029541, + "step": 290, + "valid_targets_mean": 159.8, + "valid_targets_min": 63 + }, + { + "epoch": 1.2240663900414939, + "grad_norm": 2.4839411870748935, + "learning_rate": 
3.933449266367066e-05, + "loss": 0.9027, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6904275417327881, + "step": 295, + "valid_targets_mean": 166.6, + "valid_targets_min": 44 + }, + { + "epoch": 1.2448132780082988, + "grad_norm": 3.1275981909890938, + "learning_rate": 3.92805151416475e-05, + "loss": 0.984, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0172083377838135, + "step": 300, + "valid_targets_mean": 148.0, + "valid_targets_min": 68 + }, + { + "epoch": 1.2655601659751037, + "grad_norm": 3.6194049895575517, + "learning_rate": 3.9224473138250186e-05, + "loss": 0.9891, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9883549213409424, + "step": 305, + "valid_targets_mean": 116.4, + "valid_targets_min": 52 + }, + { + "epoch": 1.2863070539419086, + "grad_norm": 3.426981706024395, + "learning_rate": 3.9166372654235e-05, + "loss": 0.8473, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7937560677528381, + "step": 310, + "valid_targets_mean": 118.4, + "valid_targets_min": 46 + }, + { + "epoch": 1.3070539419087137, + "grad_norm": 3.152855357782168, + "learning_rate": 3.9106219910772184e-05, + "loss": 0.9896, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8297359943389893, + "step": 315, + "valid_targets_mean": 124.4, + "valid_targets_min": 68 + }, + { + "epoch": 1.3278008298755186, + "grad_norm": 3.069176743069222, + "learning_rate": 3.90440213487798e-05, + "loss": 0.9571, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8529037237167358, + "step": 320, + "valid_targets_mean": 140.6, + "valid_targets_min": 58 + }, + { + "epoch": 1.3485477178423237, + "grad_norm": 3.3034611009209978, + "learning_rate": 3.897978362823411e-05, + "loss": 0.9725, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0133411884307861, + "step": 325, + "valid_targets_mean": 133.8, + "valid_targets_min": 65 + }, + { + "epoch": 1.3692946058091287, + "grad_norm": 3.5392784144202873, + "learning_rate": 3.8913513627456374e-05, + "loss": 1.0897, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0982413291931152, + "step": 330, + 
"valid_targets_mean": 108.6, + "valid_targets_min": 47 + }, + { + "epoch": 1.3900414937759336, + "grad_norm": 4.133104453790177, + "learning_rate": 3.8845218442376416e-05, + "loss": 1.0984, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1011841297149658, + "step": 335, + "valid_targets_mean": 92.4, + "valid_targets_min": 58 + }, + { + "epoch": 1.4107883817427385, + "grad_norm": 2.911694581277071, + "learning_rate": 3.877490538577278e-05, + "loss": 0.9807, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8329280614852905, + "step": 340, + "valid_targets_mean": 146.0, + "valid_targets_min": 69 + }, + { + "epoch": 1.4315352697095436, + "grad_norm": 3.7192204333227976, + "learning_rate": 3.870258198648974e-05, + "loss": 0.8938, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9809222221374512, + "step": 345, + "valid_targets_mean": 112.0, + "valid_targets_min": 73 + }, + { + "epoch": 1.4522821576763485, + "grad_norm": 4.515200149712003, + "learning_rate": 3.862825598863108e-05, + "loss": 0.9289, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.774828314781189, + "step": 350, + "valid_targets_mean": 201.0, + "valid_targets_min": 64 + }, + { + "epoch": 1.4730290456431536, + "grad_norm": 3.36292777919316, + "learning_rate": 3.855193535073097e-05, + "loss": 0.9448, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.003603458404541, + "step": 355, + "valid_targets_mean": 131.2, + "valid_targets_min": 58 + }, + { + "epoch": 1.4937759336099585, + "grad_norm": 3.3587994781305976, + "learning_rate": 3.847362824490173e-05, + "loss": 0.9142, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8775392174720764, + "step": 360, + "valid_targets_mean": 119.1, + "valid_targets_min": 60 + }, + { + "epoch": 1.5145228215767634, + "grad_norm": 4.141295017158325, + "learning_rate": 3.839334305595881e-05, + "loss": 0.9044, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9412416219711304, + "step": 365, + "valid_targets_mean": 143.9, + "valid_targets_min": 57 + }, + { + "epoch": 1.5352697095435683, + "grad_norm": 
2.9785125564157107, + "learning_rate": 3.831108838052301e-05, + "loss": 1.0486, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7641946077346802, + "step": 370, + "valid_targets_mean": 147.4, + "valid_targets_min": 64 + }, + { + "epoch": 1.5560165975103735, + "grad_norm": 4.1021332056584, + "learning_rate": 3.822687302609994e-05, + "loss": 1.0625, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.2731382846832275, + "step": 375, + "valid_targets_mean": 96.9, + "valid_targets_min": 59 + }, + { + "epoch": 1.5767634854771784, + "grad_norm": 3.4014528210970743, + "learning_rate": 3.814070601013697e-05, + "loss": 0.99, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0162218809127808, + "step": 380, + "valid_targets_mean": 117.1, + "valid_targets_min": 67 + }, + { + "epoch": 1.5975103734439835, + "grad_norm": 3.998857883960084, + "learning_rate": 3.8052596559057674e-05, + "loss": 1.0098, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1427373886108398, + "step": 385, + "valid_targets_mean": 94.8, + "valid_targets_min": 63 + }, + { + "epoch": 1.6182572614107884, + "grad_norm": 3.573313288898261, + "learning_rate": 3.7962554107273926e-05, + "loss": 0.9996, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9803476333618164, + "step": 390, + "valid_targets_mean": 125.8, + "valid_targets_min": 64 + }, + { + "epoch": 1.6390041493775933, + "grad_norm": 2.627525409761934, + "learning_rate": 3.7870588296175644e-05, + "loss": 0.9104, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8428837060928345, + "step": 395, + "valid_targets_mean": 185.5, + "valid_targets_min": 54 + }, + { + "epoch": 1.6597510373443982, + "grad_norm": 3.8364107288625786, + "learning_rate": 3.7776708973098476e-05, + "loss": 0.9904, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.069312334060669, + "step": 400, + "valid_targets_mean": 105.6, + "valid_targets_min": 63 + }, + { + "epoch": 1.6804979253112033, + "grad_norm": 2.548205520795793, + "learning_rate": 3.768092619026937e-05, + "loss": 1.0073, + "loss_nan_ranks": 0, + "loss_rank_avg": 
0.7841141223907471, + "step": 405, + "valid_targets_mean": 155.5, + "valid_targets_min": 62 + }, + { + "epoch": 1.7012448132780082, + "grad_norm": 3.382526642855817, + "learning_rate": 3.7583250203730234e-05, + "loss": 0.9565, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8623814582824707, + "step": 410, + "valid_targets_mean": 124.7, + "valid_targets_min": 69 + }, + { + "epoch": 1.7219917012448134, + "grad_norm": 4.314083908492832, + "learning_rate": 3.7483691472239744e-05, + "loss": 1.0132, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0424532890319824, + "step": 415, + "valid_targets_mean": 93.2, + "valid_targets_min": 60 + }, + { + "epoch": 1.7427385892116183, + "grad_norm": 2.5482144454375693, + "learning_rate": 3.7382260656153436e-05, + "loss": 0.8144, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6370407342910767, + "step": 420, + "valid_targets_mean": 142.7, + "valid_targets_min": 61 + }, + { + "epoch": 1.7634854771784232, + "grad_norm": 2.6674353520861014, + "learning_rate": 3.727896861628231e-05, + "loss": 0.9513, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7452382445335388, + "step": 425, + "valid_targets_mean": 178.5, + "valid_targets_min": 51 + }, + { + "epoch": 1.784232365145228, + "grad_norm": 3.494532029292696, + "learning_rate": 3.717382641272984e-05, + "loss": 0.9952, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.999313235282898, + "step": 430, + "valid_targets_mean": 100.6, + "valid_targets_min": 41 + }, + { + "epoch": 1.8049792531120332, + "grad_norm": 2.11108790003265, + "learning_rate": 3.7066845303707694e-05, + "loss": 0.8269, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5311782360076904, + "step": 435, + "valid_targets_mean": 236.6, + "valid_targets_min": 57 + }, + { + "epoch": 1.8257261410788381, + "grad_norm": 3.9114163853143737, + "learning_rate": 3.6958036744330297e-05, + "loss": 1.0451, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1796157360076904, + "step": 440, + "valid_targets_mean": 100.8, + "valid_targets_min": 65 + }, + { + "epoch": 
1.8464730290456433, + "grad_norm": 2.966917119742618, + "learning_rate": 3.6847412385388236e-05, + "loss": 0.8965, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8532978892326355, + "step": 445, + "valid_targets_mean": 140.1, + "valid_targets_min": 61 + }, + { + "epoch": 1.8672199170124482, + "grad_norm": 3.599518479860152, + "learning_rate": 3.673498407210073e-05, + "loss": 0.9751, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9784261584281921, + "step": 450, + "valid_targets_mean": 105.6, + "valid_targets_min": 53 + }, + { + "epoch": 1.887966804979253, + "grad_norm": 3.940460990497638, + "learning_rate": 3.662076384284729e-05, + "loss": 0.9165, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.0794143676757812, + "step": 455, + "valid_targets_mean": 93.3, + "valid_targets_min": 56 + }, + { + "epoch": 1.908713692946058, + "grad_norm": 2.4631143719909656, + "learning_rate": 3.650476392787873e-05, + "loss": 0.9068, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7837638258934021, + "step": 460, + "valid_targets_mean": 185.1, + "valid_targets_min": 69 + }, + { + "epoch": 1.929460580912863, + "grad_norm": 3.725991171844277, + "learning_rate": 3.638699674800758e-05, + "loss": 0.9429, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1260584592819214, + "step": 465, + "valid_targets_mean": 108.4, + "valid_targets_min": 66 + }, + { + "epoch": 1.950207468879668, + "grad_norm": 3.3343720514690145, + "learning_rate": 3.6267474913278086e-05, + "loss": 1.0307, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9506238698959351, + "step": 470, + "valid_targets_mean": 127.8, + "valid_targets_min": 50 + }, + { + "epoch": 1.9709543568464731, + "grad_norm": 3.827770704987103, + "learning_rate": 3.614621122161603e-05, + "loss": 1.0214, + "loss_nan_ranks": 0, + "loss_rank_avg": 1.1068814992904663, + "step": 475, + "valid_targets_mean": 105.6, + "valid_targets_min": 41 + }, + { + "epoch": 1.991701244813278, + "grad_norm": 3.861407678400857, + "learning_rate": 3.6023218657458334e-05, + "loss": 1.0285, + 
"loss_nan_ranks": 0, + "loss_rank_avg": 1.1757726669311523, + "step": 480, + "valid_targets_mean": 103.8, + "valid_targets_min": 64 + }, + { + "epoch": 2.012448132780083, + "grad_norm": 3.1202708919801934, + "learning_rate": 3.589851039036277e-05, + "loss": 0.8016, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7584832310676575, + "step": 485, + "valid_targets_mean": 110.2, + "valid_targets_min": 56 + }, + { + "epoch": 2.033195020746888, + "grad_norm": 3.8354655021211514, + "learning_rate": 3.577209977359778e-05, + "loss": 0.697, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8398362398147583, + "step": 490, + "valid_targets_mean": 103.0, + "valid_targets_min": 57 + }, + { + "epoch": 2.0539419087136928, + "grad_norm": 4.329441765322654, + "learning_rate": 3.5644000342712695e-05, + "loss": 0.803, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9269231557846069, + "step": 495, + "valid_targets_mean": 100.8, + "valid_targets_min": 67 + }, + { + "epoch": 2.074688796680498, + "grad_norm": 4.4746136552596845, + "learning_rate": 3.55142258140884e-05, + "loss": 0.6451, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8513693809509277, + "step": 500, + "valid_targets_mean": 110.4, + "valid_targets_min": 59 + }, + { + "epoch": 2.095435684647303, + "grad_norm": 4.6090358834641565, + "learning_rate": 3.538279008346861e-05, + "loss": 0.7953, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8998103141784668, + "step": 505, + "valid_targets_mean": 84.5, + "valid_targets_min": 63 + }, + { + "epoch": 2.116182572614108, + "grad_norm": 3.5628641412692224, + "learning_rate": 3.524970722447197e-05, + "loss": 0.6491, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6803164482116699, + "step": 510, + "valid_targets_mean": 125.2, + "valid_targets_min": 55 + }, + { + "epoch": 2.136929460580913, + "grad_norm": 4.445807529078514, + "learning_rate": 3.511499148708517e-05, + "loss": 0.8269, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9091898202896118, + "step": 515, + "valid_targets_mean": 89.8, + "valid_targets_min": 
55 + }, + { + "epoch": 2.1576763485477177, + "grad_norm": 3.696118063258985, + "learning_rate": 3.497865729613702e-05, + "loss": 0.7598, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.738641619682312, + "step": 520, + "valid_targets_mean": 128.2, + "valid_targets_min": 60 + }, + { + "epoch": 2.1784232365145226, + "grad_norm": 3.9014408920828285, + "learning_rate": 3.484071924975398e-05, + "loss": 0.8091, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7708579301834106, + "step": 525, + "valid_targets_mean": 115.0, + "valid_targets_min": 44 + }, + { + "epoch": 2.199170124481328, + "grad_norm": 4.305295137986893, + "learning_rate": 3.4701192117796964e-05, + "loss": 0.7631, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7537413835525513, + "step": 530, + "valid_targets_mean": 99.2, + "valid_targets_min": 61 + }, + { + "epoch": 2.219917012448133, + "grad_norm": 3.4539393109108762, + "learning_rate": 3.456009084027995e-05, + "loss": 0.696, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6141202449798584, + "step": 535, + "valid_targets_mean": 142.5, + "valid_targets_min": 67 + }, + { + "epoch": 2.240663900414938, + "grad_norm": 3.499033553308627, + "learning_rate": 3.441743052577014e-05, + "loss": 0.7411, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6117064952850342, + "step": 540, + "valid_targets_mean": 134.6, + "valid_targets_min": 66 + }, + { + "epoch": 2.2614107883817427, + "grad_norm": 4.012715094473001, + "learning_rate": 3.4273226449770314e-05, + "loss": 0.7664, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.848983645439148, + "step": 545, + "valid_targets_mean": 108.5, + "valid_targets_min": 66 + }, + { + "epoch": 2.2821576763485476, + "grad_norm": 4.829214040777758, + "learning_rate": 3.4127494053083086e-05, + "loss": 0.6762, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8868389129638672, + "step": 550, + "valid_targets_mean": 86.9, + "valid_targets_min": 62 + }, + { + "epoch": 2.3029045643153525, + "grad_norm": 3.9749139794203345, + "learning_rate": 3.398024894015764e-05, + 
"loss": 0.6597, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8123997449874878, + "step": 555, + "valid_targets_mean": 119.9, + "valid_targets_min": 72 + }, + { + "epoch": 2.323651452282158, + "grad_norm": 4.34876414676235, + "learning_rate": 3.383150687741883e-05, + "loss": 0.8247, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9735461473464966, + "step": 560, + "valid_targets_mean": 99.9, + "valid_targets_min": 52 + }, + { + "epoch": 2.3443983402489628, + "grad_norm": 3.912739845823611, + "learning_rate": 3.368128379157897e-05, + "loss": 0.7557, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7604776620864868, + "step": 565, + "valid_targets_mean": 119.3, + "valid_targets_min": 64 + }, + { + "epoch": 2.3651452282157677, + "grad_norm": 4.185361795583942, + "learning_rate": 3.3529595767932496e-05, + "loss": 0.7067, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.722848653793335, + "step": 570, + "valid_targets_mean": 133.8, + "valid_targets_min": 71 + }, + { + "epoch": 2.3858921161825726, + "grad_norm": 3.7895338790081166, + "learning_rate": 3.3376459048633565e-05, + "loss": 0.8143, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7524533271789551, + "step": 575, + "valid_targets_mean": 126.1, + "valid_targets_min": 53 + }, + { + "epoch": 2.4066390041493775, + "grad_norm": 4.1719327780746225, + "learning_rate": 3.322189003095696e-05, + "loss": 0.7022, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7856139540672302, + "step": 580, + "valid_targets_mean": 121.4, + "valid_targets_min": 55 + }, + { + "epoch": 2.4273858921161824, + "grad_norm": 3.7130870658654795, + "learning_rate": 3.306590526554233e-05, + "loss": 0.8357, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8768580555915833, + "step": 585, + "valid_targets_mean": 129.4, + "valid_targets_min": 47 + }, + { + "epoch": 2.4481327800829877, + "grad_norm": 4.362956565915498, + "learning_rate": 3.290852145462196e-05, + "loss": 0.8137, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8842595815658569, + "step": 590, + "valid_targets_mean": 99.6, 
+ "valid_targets_min": 66 + }, + { + "epoch": 2.4688796680497926, + "grad_norm": 4.836605175013781, + "learning_rate": 3.274975545023242e-05, + "loss": 0.8062, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9917868375778198, + "step": 595, + "valid_targets_mean": 92.1, + "valid_targets_min": 61 + }, + { + "epoch": 2.4896265560165975, + "grad_norm": 4.355290465359613, + "learning_rate": 3.258962425241011e-05, + "loss": 0.8078, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9537732601165771, + "step": 600, + "valid_targets_mean": 113.8, + "valid_targets_min": 60 + }, + { + "epoch": 2.5103734439834025, + "grad_norm": 3.605491208223676, + "learning_rate": 3.242814500737092e-05, + "loss": 0.7583, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6148207187652588, + "step": 605, + "valid_targets_mean": 158.1, + "valid_targets_min": 37 + }, + { + "epoch": 2.5311203319502074, + "grad_norm": 4.3713729869832845, + "learning_rate": 3.226533500567433e-05, + "loss": 0.8238, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8725237846374512, + "step": 610, + "valid_targets_mean": 104.8, + "valid_targets_min": 58 + }, + { + "epoch": 2.5518672199170123, + "grad_norm": 4.489442159400303, + "learning_rate": 3.2101211680371965e-05, + "loss": 0.8316, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9407273530960083, + "step": 615, + "valid_targets_mean": 98.4, + "valid_targets_min": 57 + }, + { + "epoch": 2.572614107883817, + "grad_norm": 4.291817799537676, + "learning_rate": 3.193579260514097e-05, + "loss": 0.8382, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8661283254623413, + "step": 620, + "valid_targets_mean": 88.4, + "valid_targets_min": 62 + }, + { + "epoch": 2.5933609958506225, + "grad_norm": 3.951278391034252, + "learning_rate": 3.176909549240226e-05, + "loss": 0.797, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6531973481178284, + "step": 625, + "valid_targets_mean": 127.5, + "valid_targets_min": 56 + }, + { + "epoch": 2.6141078838174274, + "grad_norm": 3.6399446812749994, + "learning_rate": 
3.1601138191423966e-05, + "loss": 0.7022, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.821446418762207, + "step": 630, + "valid_targets_mean": 158.7, + "valid_targets_min": 64 + }, + { + "epoch": 2.6348547717842323, + "grad_norm": 3.783299086234017, + "learning_rate": 3.143193868641019e-05, + "loss": 0.754, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7142607569694519, + "step": 635, + "valid_targets_mean": 128.2, + "valid_targets_min": 61 + }, + { + "epoch": 2.6556016597510372, + "grad_norm": 4.253882499474148, + "learning_rate": 3.1261515094575335e-05, + "loss": 0.7797, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8119747638702393, + "step": 640, + "valid_targets_mean": 93.5, + "valid_targets_min": 59 + }, + { + "epoch": 2.6763485477178426, + "grad_norm": 4.113442911406122, + "learning_rate": 3.108988566420417e-05, + "loss": 0.6938, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7523815631866455, + "step": 645, + "valid_targets_mean": 119.0, + "valid_targets_min": 64 + }, + { + "epoch": 2.6970954356846475, + "grad_norm": 3.6131999603541405, + "learning_rate": 3.0917068772697934e-05, + "loss": 0.7476, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6980721950531006, + "step": 650, + "valid_targets_mean": 146.2, + "valid_targets_min": 78 + }, + { + "epoch": 2.7178423236514524, + "grad_norm": 3.732929812664138, + "learning_rate": 3.074308292460646e-05, + "loss": 0.6439, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8016985654830933, + "step": 655, + "valid_targets_mean": 153.6, + "valid_targets_min": 57 + }, + { + "epoch": 2.7385892116182573, + "grad_norm": 3.0389793975855532, + "learning_rate": 3.056794674964685e-05, + "loss": 0.6437, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5522057414054871, + "step": 660, + "valid_targets_mean": 211.7, + "valid_targets_min": 72 + }, + { + "epoch": 2.759336099585062, + "grad_norm": 3.985890415354317, + "learning_rate": 3.0391679000708673e-05, + "loss": 0.7794, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7437437176704407, + "step": 665, + 
"valid_targets_mean": 115.2, + "valid_targets_min": 55 + }, + { + "epoch": 2.780082987551867, + "grad_norm": 4.76701659295223, + "learning_rate": 3.0214298551845967e-05, + "loss": 0.7425, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.880343496799469, + "step": 670, + "valid_targets_mean": 113.9, + "valid_targets_min": 71 + }, + { + "epoch": 2.800829875518672, + "grad_norm": 3.7563533860394323, + "learning_rate": 3.0035824396256267e-05, + "loss": 0.799, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7248696684837341, + "step": 675, + "valid_targets_mean": 106.1, + "valid_targets_min": 61 + }, + { + "epoch": 2.821576763485477, + "grad_norm": 3.761242240389271, + "learning_rate": 2.9856275644246903e-05, + "loss": 0.7134, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8497253656387329, + "step": 680, + "valid_targets_mean": 139.9, + "valid_targets_min": 69 + }, + { + "epoch": 2.8423236514522823, + "grad_norm": 4.290575353844001, + "learning_rate": 2.9675671521188766e-05, + "loss": 0.7767, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8362669944763184, + "step": 685, + "valid_targets_mean": 109.0, + "valid_targets_min": 57 + }, + { + "epoch": 2.863070539419087, + "grad_norm": 4.343959204903679, + "learning_rate": 2.949403136545769e-05, + "loss": 0.8742, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.9363041520118713, + "step": 690, + "valid_targets_mean": 118.2, + "valid_targets_min": 63 + }, + { + "epoch": 2.883817427385892, + "grad_norm": 3.8671929065161064, + "learning_rate": 2.9311374626363793e-05, + "loss": 0.6692, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7386797666549683, + "step": 695, + "valid_targets_mean": 117.1, + "valid_targets_min": 62 + }, + { + "epoch": 2.904564315352697, + "grad_norm": 2.4421362640071878, + "learning_rate": 2.9127720862068928e-05, + "loss": 0.7173, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.43712466955184937, + "step": 700, + "valid_targets_mean": 225.9, + "valid_targets_min": 63 + }, + { + "epoch": 2.9253112033195023, + "grad_norm": 
3.79384332274057, + "learning_rate": 2.8943089737492465e-05, + "loss": 0.7826, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6691803932189941, + "step": 705, + "valid_targets_mean": 134.9, + "valid_targets_min": 58 + }, + { + "epoch": 2.9460580912863072, + "grad_norm": 5.106290015763064, + "learning_rate": 2.8757501022205653e-05, + "loss": 0.7881, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6905944347381592, + "step": 710, + "valid_targets_mean": 113.7, + "valid_targets_min": 65 + }, + { + "epoch": 2.966804979253112, + "grad_norm": 2.3434102426062147, + "learning_rate": 2.8570974588314767e-05, + "loss": 0.638, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.38373029232025146, + "step": 715, + "valid_targets_mean": 209.8, + "valid_targets_min": 68 + }, + { + "epoch": 2.987551867219917, + "grad_norm": 4.177527208829087, + "learning_rate": 2.8383530408333285e-05, + "loss": 0.7669, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.8700437545776367, + "step": 720, + "valid_targets_mean": 106.6, + "valid_targets_min": 65 + }, + { + "epoch": 3.008298755186722, + "grad_norm": 3.3708836828475635, + "learning_rate": 2.8195188553043317e-05, + "loss": 0.6422, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5712267160415649, + "step": 725, + "valid_targets_mean": 125.0, + "valid_targets_min": 48 + }, + { + "epoch": 3.029045643153527, + "grad_norm": 4.331342732584377, + "learning_rate": 2.800596918934648e-05, + "loss": 0.5641, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6667015552520752, + "step": 730, + "valid_targets_mean": 108.5, + "valid_targets_min": 57 + }, + { + "epoch": 3.0497925311203318, + "grad_norm": 7.744354080794957, + "learning_rate": 2.7815892578104554e-05, + "loss": 0.6125, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6798431873321533, + "step": 735, + "valid_targets_mean": 95.9, + "valid_targets_min": 46 + }, + { + "epoch": 3.070539419087137, + "grad_norm": 4.7346730931654974, + "learning_rate": 2.762497907196996e-05, + "loss": 0.4968, + "loss_nan_ranks": 0, + "loss_rank_avg": 
0.4088674783706665, + "step": 740, + "valid_targets_mean": 149.1, + "valid_targets_min": 61 + }, + { + "epoch": 3.091286307053942, + "grad_norm": 3.8372167911375676, + "learning_rate": 2.743324911320655e-05, + "loss": 0.591, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4837598204612732, + "step": 745, + "valid_targets_mean": 155.9, + "valid_targets_min": 48 + }, + { + "epoch": 3.112033195020747, + "grad_norm": 4.928852485615693, + "learning_rate": 2.724072323150069e-05, + "loss": 0.5391, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6401872038841248, + "step": 750, + "valid_targets_mean": 109.9, + "valid_targets_min": 62 + }, + { + "epoch": 3.132780082987552, + "grad_norm": 5.373658977233531, + "learning_rate": 2.704742204176301e-05, + "loss": 0.5404, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.7401503324508667, + "step": 755, + "valid_targets_mean": 104.4, + "valid_targets_min": 75 + }, + { + "epoch": 3.1535269709543567, + "grad_norm": 4.695430238717871, + "learning_rate": 2.6853366241921083e-05, + "loss": 0.546, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.507964015007019, + "step": 760, + "valid_targets_mean": 131.6, + "valid_targets_min": 52 + }, + { + "epoch": 3.1742738589211617, + "grad_norm": 4.940895414049785, + "learning_rate": 2.6658576610703142e-05, + "loss": 0.5017, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.44809240102767944, + "step": 765, + "valid_targets_mean": 160.5, + "valid_targets_min": 59 + }, + { + "epoch": 3.195020746887967, + "grad_norm": 4.475353236399673, + "learning_rate": 2.6463074005413187e-05, + "loss": 0.4833, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4449211061000824, + "step": 770, + "valid_targets_mean": 152.4, + "valid_targets_min": 72 + }, + { + "epoch": 3.215767634854772, + "grad_norm": 4.684709304749235, + "learning_rate": 2.6266879359697647e-05, + "loss": 0.5946, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5302928686141968, + "step": 775, + "valid_targets_mean": 129.9, + "valid_targets_min": 72 + }, + { + "epoch": 
3.236514522821577, + "grad_norm": 4.642771055970237, + "learning_rate": 2.6070013681303933e-05, + "loss": 0.5182, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5693954229354858, + "step": 780, + "valid_targets_mean": 121.1, + "valid_targets_min": 63 + }, + { + "epoch": 3.2572614107883817, + "grad_norm": 4.02364920565193, + "learning_rate": 2.5872498049830973e-05, + "loss": 0.5421, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4645788073539734, + "step": 785, + "valid_targets_mean": 152.3, + "valid_targets_min": 61 + }, + { + "epoch": 3.2780082987551866, + "grad_norm": 4.552089103946345, + "learning_rate": 2.5674353614472084e-05, + "loss": 0.4724, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5232946276664734, + "step": 790, + "valid_targets_mean": 110.2, + "valid_targets_min": 63 + }, + { + "epoch": 3.2987551867219915, + "grad_norm": 3.7131472795548244, + "learning_rate": 2.5475601591750448e-05, + "loss": 0.4999, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.40184569358825684, + "step": 795, + "valid_targets_mean": 262.1, + "valid_targets_min": 49 + }, + { + "epoch": 3.3195020746887964, + "grad_norm": 5.143076340286301, + "learning_rate": 2.5276263263247282e-05, + "loss": 0.5116, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.527146577835083, + "step": 800, + "valid_targets_mean": 110.8, + "valid_targets_min": 62 + }, + { + "epoch": 3.340248962655602, + "grad_norm": 5.319558834403425, + "learning_rate": 2.5076359973323107e-05, + "loss": 0.6154, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6301993131637573, + "step": 805, + "valid_targets_mean": 96.8, + "valid_targets_min": 56 + }, + { + "epoch": 3.3609958506224067, + "grad_norm": 3.8762909428008725, + "learning_rate": 2.4875913126832297e-05, + "loss": 0.5952, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4537183940410614, + "step": 810, + "valid_targets_mean": 173.0, + "valid_targets_min": 57 + }, + { + "epoch": 3.3817427385892116, + "grad_norm": 7.622394767794957, + "learning_rate": 2.4674944186831108e-05, + "loss": 0.4865, + 
"loss_nan_ranks": 0, + "loss_rank_avg": 0.46812310814857483, + "step": 815, + "valid_targets_mean": 114.8, + "valid_targets_min": 51 + }, + { + "epoch": 3.4024896265560165, + "grad_norm": 4.432218117678233, + "learning_rate": 2.4473474672279497e-05, + "loss": 0.504, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4488857686519623, + "step": 820, + "valid_targets_mean": 124.0, + "valid_targets_min": 61 + }, + { + "epoch": 3.4232365145228214, + "grad_norm": 4.55996161463432, + "learning_rate": 2.427152615573697e-05, + "loss": 0.5078, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.34916043281555176, + "step": 825, + "valid_targets_mean": 176.0, + "valid_targets_min": 59 + }, + { + "epoch": 3.4439834024896268, + "grad_norm": 5.114951004982656, + "learning_rate": 2.4069120261052682e-05, + "loss": 0.5275, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6819808483123779, + "step": 830, + "valid_targets_mean": 105.2, + "valid_targets_min": 71 + }, + { + "epoch": 3.4647302904564317, + "grad_norm": 5.28455932841826, + "learning_rate": 2.386627866105002e-05, + "loss": 0.5831, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6226446628570557, + "step": 835, + "valid_targets_mean": 134.7, + "valid_targets_min": 60 + }, + { + "epoch": 3.4854771784232366, + "grad_norm": 4.473738543991443, + "learning_rate": 2.3663023075205992e-05, + "loss": 0.5253, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5064611434936523, + "step": 840, + "valid_targets_mean": 167.0, + "valid_targets_min": 76 + }, + { + "epoch": 3.5062240663900415, + "grad_norm": 6.42051749883382, + "learning_rate": 2.3459375267325552e-05, + "loss": 0.5637, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.517977237701416, + "step": 845, + "valid_targets_mean": 111.2, + "valid_targets_min": 54 + }, + { + "epoch": 3.5269709543568464, + "grad_norm": 4.595241350402068, + "learning_rate": 2.325535704321126e-05, + "loss": 0.612, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5615861415863037, + "step": 850, + "valid_targets_mean": 147.6, + 
"valid_targets_min": 70 + }, + { + "epoch": 3.5477178423236513, + "grad_norm": 5.192972612859483, + "learning_rate": 2.3050990248328365e-05, + "loss": 0.5303, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5923424959182739, + "step": 855, + "valid_targets_mean": 103.9, + "valid_targets_min": 20 + }, + { + "epoch": 3.568464730290456, + "grad_norm": 5.3495277210724295, + "learning_rate": 2.2846296765465708e-05, + "loss": 0.5785, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6900249719619751, + "step": 860, + "valid_targets_mean": 98.2, + "valid_targets_min": 48 + }, + { + "epoch": 3.5892116182572615, + "grad_norm": 4.61930376588509, + "learning_rate": 2.2641298512392585e-05, + "loss": 0.5289, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5852305293083191, + "step": 865, + "valid_targets_mean": 107.1, + "valid_targets_min": 68 + }, + { + "epoch": 3.6099585062240664, + "grad_norm": 5.361219727707543, + "learning_rate": 2.2436017439511878e-05, + "loss": 0.519, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.630679190158844, + "step": 870, + "valid_targets_mean": 99.9, + "valid_targets_min": 48 + }, + { + "epoch": 3.6307053941908713, + "grad_norm": 4.72558048492487, + "learning_rate": 2.2230475527509712e-05, + "loss": 0.5643, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5127944350242615, + "step": 875, + "valid_targets_mean": 135.3, + "valid_targets_min": 69 + }, + { + "epoch": 3.6514522821576763, + "grad_norm": 5.72644206689818, + "learning_rate": 2.2024694785001814e-05, + "loss": 0.5333, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5506484508514404, + "step": 880, + "valid_targets_mean": 91.6, + "valid_targets_min": 56 + }, + { + "epoch": 3.6721991701244816, + "grad_norm": 5.396922771454308, + "learning_rate": 2.1818697246176943e-05, + "loss": 0.5145, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6444652676582336, + "step": 885, + "valid_targets_mean": 89.5, + "valid_targets_min": 69 + }, + { + "epoch": 3.6929460580912865, + "grad_norm": 4.709143641817256, + "learning_rate": 
2.161250496843756e-05, + "loss": 0.6124, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5286765098571777, + "step": 890, + "valid_targets_mean": 118.1, + "valid_targets_min": 61 + }, + { + "epoch": 3.7136929460580914, + "grad_norm": 4.279711593845015, + "learning_rate": 2.1406140030037988e-05, + "loss": 0.5131, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.508799135684967, + "step": 895, + "valid_targets_mean": 155.9, + "valid_targets_min": 47 + }, + { + "epoch": 3.7344398340248963, + "grad_norm": 5.507064174030223, + "learning_rate": 2.119962452772039e-05, + "loss": 0.6005, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6413220763206482, + "step": 900, + "valid_targets_mean": 102.9, + "valid_targets_min": 51 + }, + { + "epoch": 3.7551867219917012, + "grad_norm": 4.977065659139609, + "learning_rate": 2.0992980574348687e-05, + "loss": 0.637, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.660423994064331, + "step": 905, + "valid_targets_mean": 114.2, + "valid_targets_min": 63 + }, + { + "epoch": 3.775933609958506, + "grad_norm": 5.507287449132665, + "learning_rate": 2.0786230296540864e-05, + "loss": 0.562, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6502384543418884, + "step": 910, + "valid_targets_mean": 95.9, + "valid_targets_min": 57 + }, + { + "epoch": 3.796680497925311, + "grad_norm": 3.542244490137563, + "learning_rate": 2.0579395832299688e-05, + "loss": 0.4917, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.38029953837394714, + "step": 915, + "valid_targets_mean": 189.2, + "valid_targets_min": 52 + }, + { + "epoch": 3.817427385892116, + "grad_norm": 5.057195244837334, + "learning_rate": 2.0372499328642277e-05, + "loss": 0.6087, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6501518487930298, + "step": 920, + "valid_targets_mean": 115.8, + "valid_targets_min": 55 + }, + { + "epoch": 3.8381742738589213, + "grad_norm": 3.87850879824177, + "learning_rate": 2.016556293922869e-05, + "loss": 0.54, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.45556652545928955, + "step": 925, + 
"valid_targets_mean": 136.7, + "valid_targets_min": 61 + }, + { + "epoch": 3.858921161825726, + "grad_norm": 4.15102620173551, + "learning_rate": 1.9958608821989792e-05, + "loss": 0.5028, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5197992324829102, + "step": 930, + "valid_targets_mean": 172.0, + "valid_targets_min": 43 + }, + { + "epoch": 3.879668049792531, + "grad_norm": 3.9883443533699623, + "learning_rate": 1.9751659136754686e-05, + "loss": 0.4334, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4564608335494995, + "step": 935, + "valid_targets_mean": 171.6, + "valid_targets_min": 68 + }, + { + "epoch": 3.900414937759336, + "grad_norm": 3.8229294981435196, + "learning_rate": 1.9544736042877886e-05, + "loss": 0.3988, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3890151381492615, + "step": 940, + "valid_targets_mean": 203.3, + "valid_targets_min": 58 + }, + { + "epoch": 3.921161825726141, + "grad_norm": 4.770214843535073, + "learning_rate": 1.9337861696866643e-05, + "loss": 0.5395, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5733540058135986, + "step": 945, + "valid_targets_mean": 121.2, + "valid_targets_min": 58 + }, + { + "epoch": 3.9419087136929463, + "grad_norm": 5.715733351887051, + "learning_rate": 1.913105825000844e-05, + "loss": 0.6749, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.6428124904632568, + "step": 950, + "valid_targets_mean": 99.4, + "valid_targets_min": 63 + }, + { + "epoch": 3.962655601659751, + "grad_norm": 5.247147480424706, + "learning_rate": 1.8924347845999197e-05, + "loss": 0.5692, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4043826162815094, + "step": 955, + "valid_targets_mean": 186.8, + "valid_targets_min": 55 + }, + { + "epoch": 3.983402489626556, + "grad_norm": 4.693398960678804, + "learning_rate": 1.871775261857215e-05, + "loss": 0.5391, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5111123323440552, + "step": 960, + "valid_targets_mean": 134.2, + "valid_targets_min": 49 + }, + { + "epoch": 4.004149377593361, + "grad_norm": 
4.562791344546384, + "learning_rate": 1.8511294689127887e-05, + "loss": 0.5623, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4835764467716217, + "step": 965, + "valid_targets_mean": 91.6, + "valid_targets_min": 46 + }, + { + "epoch": 4.024896265560166, + "grad_norm": 4.231590917124419, + "learning_rate": 1.830499616436567e-05, + "loss": 0.4125, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3629613518714905, + "step": 970, + "valid_targets_mean": 133.6, + "valid_targets_min": 56 + }, + { + "epoch": 4.045643153526971, + "grad_norm": 6.451063449554833, + "learning_rate": 1.8098879133916352e-05, + "loss": 0.4243, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4441721737384796, + "step": 975, + "valid_targets_mean": 104.6, + "valid_targets_min": 48 + }, + { + "epoch": 4.066390041493776, + "grad_norm": 7.27315245214559, + "learning_rate": 1.789296566797706e-05, + "loss": 0.4303, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.44909948110580444, + "step": 980, + "valid_targets_mean": 116.8, + "valid_targets_min": 59 + }, + { + "epoch": 4.087136929460581, + "grad_norm": 5.423253701096964, + "learning_rate": 1.768727781494807e-05, + "loss": 0.418, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4044135510921478, + "step": 985, + "valid_targets_mean": 116.2, + "valid_targets_min": 59 + }, + { + "epoch": 4.1078838174273855, + "grad_norm": 6.060822859449416, + "learning_rate": 1.7481837599071903e-05, + "loss": 0.3992, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4017379879951477, + "step": 990, + "valid_targets_mean": 126.2, + "valid_targets_min": 63 + }, + { + "epoch": 4.12863070539419, + "grad_norm": 6.047434247325548, + "learning_rate": 1.7276667018075073e-05, + "loss": 0.379, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4426099956035614, + "step": 995, + "valid_targets_mean": 105.3, + "valid_targets_min": 68 + }, + { + "epoch": 4.149377593360996, + "grad_norm": 4.238813714812382, + "learning_rate": 1.7071788040812655e-05, + "loss": 0.3745, + "loss_nan_ranks": 0, + "loss_rank_avg": 
0.30655837059020996, + "step": 1000, + "valid_targets_mean": 140.9, + "valid_targets_min": 59 + }, + { + "epoch": 4.170124481327801, + "grad_norm": 6.8069774791469015, + "learning_rate": 1.686722260491597e-05, + "loss": 0.3669, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.36141982674598694, + "step": 1005, + "valid_targets_mean": 105.2, + "valid_targets_min": 55 + }, + { + "epoch": 4.190871369294606, + "grad_norm": 6.0106894978076975, + "learning_rate": 1.6662992614443525e-05, + "loss": 0.3601, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.44953832030296326, + "step": 1010, + "valid_targets_mean": 99.1, + "valid_targets_min": 50 + }, + { + "epoch": 4.211618257261411, + "grad_norm": 4.13388889521017, + "learning_rate": 1.6459119937535702e-05, + "loss": 0.3598, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2533540725708008, + "step": 1015, + "valid_targets_mean": 210.5, + "valid_targets_min": 42 + }, + { + "epoch": 4.232365145228216, + "grad_norm": 4.270769059639924, + "learning_rate": 1.6255626404073132e-05, + "loss": 0.3857, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.35333698987960815, + "step": 1020, + "valid_targets_mean": 151.1, + "valid_targets_min": 71 + }, + { + "epoch": 4.253112033195021, + "grad_norm": 3.928272352429347, + "learning_rate": 1.605253380333927e-05, + "loss": 0.3545, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23659725487232208, + "step": 1025, + "valid_targets_mean": 209.1, + "valid_targets_min": 58 + }, + { + "epoch": 4.273858921161826, + "grad_norm": 6.520862768289659, + "learning_rate": 1.584986388168728e-05, + "loss": 0.327, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3713211119174957, + "step": 1030, + "valid_targets_mean": 106.5, + "valid_targets_min": 68 + }, + { + "epoch": 4.2946058091286305, + "grad_norm": 6.088618896351517, + "learning_rate": 1.5647638340211525e-05, + "loss": 0.4217, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4184168577194214, + "step": 1035, + "valid_targets_mean": 123.2, + "valid_targets_min": 57 + }, + { + "epoch": 
4.3153526970954355, + "grad_norm": 6.739392196333286, + "learning_rate": 1.5445878832423876e-05, + "loss": 0.3972, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4932287633419037, + "step": 1040, + "valid_targets_mean": 93.1, + "valid_targets_min": 55 + }, + { + "epoch": 4.33609958506224, + "grad_norm": 4.816460152015017, + "learning_rate": 1.5244606961935187e-05, + "loss": 0.4227, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.33327189087867737, + "step": 1045, + "valid_targets_mean": 126.0, + "valid_targets_min": 63 + }, + { + "epoch": 4.356846473029045, + "grad_norm": 4.85922792161953, + "learning_rate": 1.5043844280142005e-05, + "loss": 0.3818, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3491777181625366, + "step": 1050, + "valid_targets_mean": 144.3, + "valid_targets_min": 66 + }, + { + "epoch": 4.377593360995851, + "grad_norm": 5.503313733342627, + "learning_rate": 1.4843612283918995e-05, + "loss": 0.3707, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3861674666404724, + "step": 1055, + "valid_targets_mean": 145.2, + "valid_targets_min": 68 + }, + { + "epoch": 4.398340248962656, + "grad_norm": 3.744381810663879, + "learning_rate": 1.4643932413317079e-05, + "loss": 0.3399, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2514806091785431, + "step": 1060, + "valid_targets_mean": 166.9, + "valid_targets_min": 69 + }, + { + "epoch": 4.419087136929461, + "grad_norm": 4.739596531448335, + "learning_rate": 1.4444826049267784e-05, + "loss": 0.3923, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.32454103231430054, + "step": 1065, + "valid_targets_mean": 124.9, + "valid_targets_min": 56 + }, + { + "epoch": 4.439834024896266, + "grad_norm": 5.668748766294243, + "learning_rate": 1.4246314511293777e-05, + "loss": 0.3607, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3947162330150604, + "step": 1070, + "valid_targets_mean": 139.6, + "valid_targets_min": 58 + }, + { + "epoch": 4.460580912863071, + "grad_norm": 5.417321720671694, + "learning_rate": 1.4048419055226146e-05, + "loss": 0.4178, + 
"loss_nan_ranks": 0, + "loss_rank_avg": 0.3876382112503052, + "step": 1075, + "valid_targets_mean": 137.8, + "valid_targets_min": 64 + }, + { + "epoch": 4.481327800829876, + "grad_norm": 5.060908418832114, + "learning_rate": 1.3851160870928317e-05, + "loss": 0.4052, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.38143742084503174, + "step": 1080, + "valid_targets_mean": 117.8, + "valid_targets_min": 66 + }, + { + "epoch": 4.5020746887966805, + "grad_norm": 4.781433991361493, + "learning_rate": 1.3654561080027213e-05, + "loss": 0.343, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3337147533893585, + "step": 1085, + "valid_targets_mean": 156.2, + "valid_targets_min": 66 + }, + { + "epoch": 4.522821576763485, + "grad_norm": 6.7756272116807175, + "learning_rate": 1.345864073365157e-05, + "loss": 0.393, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.5130104422569275, + "step": 1090, + "valid_targets_mean": 103.8, + "valid_targets_min": 55 + }, + { + "epoch": 4.54356846473029, + "grad_norm": 6.000927566190182, + "learning_rate": 1.3263420810177902e-05, + "loss": 0.4336, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4507676661014557, + "step": 1095, + "valid_targets_mean": 105.2, + "valid_targets_min": 55 + }, + { + "epoch": 4.564315352697095, + "grad_norm": 6.058238312428124, + "learning_rate": 1.3068922212984188e-05, + "loss": 0.4127, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4390409588813782, + "step": 1100, + "valid_targets_mean": 97.3, + "valid_targets_min": 64 + }, + { + "epoch": 4.5850622406639, + "grad_norm": 4.203708752511155, + "learning_rate": 1.287516576821167e-05, + "loss": 0.4029, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.332113116979599, + "step": 1105, + "valid_targets_mean": 178.4, + "valid_targets_min": 69 + }, + { + "epoch": 4.605809128630705, + "grad_norm": 5.207374310292064, + "learning_rate": 1.2682172222534805e-05, + "loss": 0.3313, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4056367874145508, + "step": 1110, + "valid_targets_mean": 100.6, + 
"valid_targets_min": 60 + }, + { + "epoch": 4.62655601659751, + "grad_norm": 4.98336947480142, + "learning_rate": 1.2489962240939857e-05, + "loss": 0.3613, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2771909534931183, + "step": 1115, + "valid_targets_mean": 159.0, + "valid_targets_min": 61 + }, + { + "epoch": 4.647302904564316, + "grad_norm": 3.6594235056108, + "learning_rate": 1.229855640451213e-05, + "loss": 0.3179, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.16788652539253235, + "step": 1120, + "valid_targets_mean": 306.2, + "valid_targets_min": 68 + }, + { + "epoch": 4.668049792531121, + "grad_norm": 8.687907658737734, + "learning_rate": 1.2107975208232259e-05, + "loss": 0.4198, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.379871666431427, + "step": 1125, + "valid_targets_mean": 133.1, + "valid_targets_min": 61 + }, + { + "epoch": 4.6887966804979255, + "grad_norm": 5.836548120395633, + "learning_rate": 1.1918239058781636e-05, + "loss": 0.3607, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3218669891357422, + "step": 1130, + "valid_targets_mean": 127.9, + "valid_targets_min": 70 + }, + { + "epoch": 4.70954356846473, + "grad_norm": 4.201638283395644, + "learning_rate": 1.1729368272357419e-05, + "loss": 0.3285, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23533934354782104, + "step": 1135, + "valid_targets_mean": 199.4, + "valid_targets_min": 64 + }, + { + "epoch": 4.730290456431535, + "grad_norm": 5.095855157677563, + "learning_rate": 1.1541383072497077e-05, + "loss": 0.3855, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.38103365898132324, + "step": 1140, + "valid_targets_mean": 140.2, + "valid_targets_min": 59 + }, + { + "epoch": 4.75103734439834, + "grad_norm": 6.043002128231546, + "learning_rate": 1.1354303587913003e-05, + "loss": 0.337, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.39897701144218445, + "step": 1145, + "valid_targets_mean": 124.3, + "valid_targets_min": 51 + }, + { + "epoch": 4.771784232365145, + "grad_norm": 5.958399057291772, + "learning_rate": 
1.1168149850337136e-05, + "loss": 0.3401, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.44645410776138306, + "step": 1150, + "valid_targets_mean": 117.4, + "valid_targets_min": 55 + }, + { + "epoch": 4.79253112033195, + "grad_norm": 4.690423833511766, + "learning_rate": 1.0982941792376125e-05, + "loss": 0.3953, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.35815998911857605, + "step": 1155, + "valid_targets_mean": 129.4, + "valid_targets_min": 57 + }, + { + "epoch": 4.813278008298755, + "grad_norm": 5.286714186295266, + "learning_rate": 1.0798699245376959e-05, + "loss": 0.41, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.34337374567985535, + "step": 1160, + "valid_targets_mean": 98.7, + "valid_targets_min": 50 + }, + { + "epoch": 4.83402489626556, + "grad_norm": 4.8262261611258985, + "learning_rate": 1.0615441937303534e-05, + "loss": 0.3275, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.32752180099487305, + "step": 1165, + "valid_targets_mean": 132.2, + "valid_targets_min": 69 + }, + { + "epoch": 4.854771784232365, + "grad_norm": 5.7535756694573665, + "learning_rate": 1.0433189490624253e-05, + "loss": 0.3779, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4661560356616974, + "step": 1170, + "valid_targets_mean": 97.8, + "valid_targets_min": 64 + }, + { + "epoch": 4.875518672199171, + "grad_norm": 4.2819905844779775, + "learning_rate": 1.0251961420210937e-05, + "loss": 0.3465, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.33195045590400696, + "step": 1175, + "valid_targets_mean": 179.1, + "valid_targets_min": 72 + }, + { + "epoch": 4.8962655601659755, + "grad_norm": 3.688190995957866, + "learning_rate": 1.0071777131249237e-05, + "loss": 0.3008, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2772628664970398, + "step": 1180, + "valid_targets_mean": 161.6, + "valid_targets_min": 72 + }, + { + "epoch": 4.91701244813278, + "grad_norm": 5.965502494236917, + "learning_rate": 9.892655917160814e-06, + "loss": 0.3767, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.4376785159111023, + "step": 
1185, + "valid_targets_mean": 114.5, + "valid_targets_min": 63 + }, + { + "epoch": 4.937759336099585, + "grad_norm": 5.200859052929826, + "learning_rate": 9.714616957537466e-06, + "loss": 0.3356, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.35819631814956665, + "step": 1190, + "valid_targets_mean": 124.4, + "valid_targets_min": 53 + }, + { + "epoch": 4.95850622406639, + "grad_norm": 4.905971401096997, + "learning_rate": 9.537679316087491e-06, + "loss": 0.3299, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.33606231212615967, + "step": 1195, + "valid_targets_mean": 158.6, + "valid_targets_min": 69 + }, + { + "epoch": 4.979253112033195, + "grad_norm": 5.932174753958537, + "learning_rate": 9.361861938594332e-06, + "loss": 0.3432, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.44172829389572144, + "step": 1200, + "valid_targets_mean": 106.7, + "valid_targets_min": 62 + }, + { + "epoch": 5.0, + "grad_norm": 5.482278806871232, + "learning_rate": 9.187183650888056e-06, + "loss": 0.3926, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.35782986879348755, + "step": 1205, + "valid_targets_mean": 116.6, + "valid_targets_min": 46 + }, + { + "epoch": 5.020746887966805, + "grad_norm": 4.030939056748361, + "learning_rate": 9.013663156829438e-06, + "loss": 0.2897, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.24931949377059937, + "step": 1210, + "valid_targets_mean": 123.8, + "valid_targets_min": 60 + }, + { + "epoch": 5.04149377593361, + "grad_norm": 3.738822362764231, + "learning_rate": 8.841319036307334e-06, + "loss": 0.253, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.17903779447078705, + "step": 1215, + "valid_targets_mean": 230.9, + "valid_targets_min": 47 + }, + { + "epoch": 5.062240663900415, + "grad_norm": 7.551485917813854, + "learning_rate": 8.670169743249143e-06, + "loss": 0.2583, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.32046300172805786, + "step": 1220, + "valid_targets_mean": 85.4, + "valid_targets_min": 54 + }, + { + "epoch": 5.08298755186722, + "grad_norm": 5.108187510802179, 
+ "learning_rate": 8.50023360364487e-06, + "loss": 0.2949, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.19512224197387695, + "step": 1225, + "valid_targets_mean": 146.3, + "valid_targets_min": 53 + }, + { + "epoch": 5.1037344398340245, + "grad_norm": 6.161943015701863, + "learning_rate": 8.331528813584832e-06, + "loss": 0.2688, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3473849594593048, + "step": 1230, + "valid_targets_mean": 102.7, + "valid_targets_min": 59 + }, + { + "epoch": 5.124481327800829, + "grad_norm": 5.4077143243461325, + "learning_rate": 8.164073437311315e-06, + "loss": 0.3074, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.329645037651062, + "step": 1235, + "valid_targets_mean": 109.8, + "valid_targets_min": 50 + }, + { + "epoch": 5.145228215767635, + "grad_norm": 3.9507322454113507, + "learning_rate": 7.997885405284305e-06, + "loss": 0.2572, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.1828637272119522, + "step": 1240, + "valid_targets_mean": 137.2, + "valid_targets_min": 64 + }, + { + "epoch": 5.16597510373444, + "grad_norm": 4.418001305578677, + "learning_rate": 7.83298251226158e-06, + "loss": 0.2603, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22215382754802704, + "step": 1245, + "valid_targets_mean": 156.5, + "valid_targets_min": 64 + }, + { + "epoch": 5.186721991701245, + "grad_norm": 4.865514725466622, + "learning_rate": 7.669382415393298e-06, + "loss": 0.2724, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.21926715970039368, + "step": 1250, + "valid_targets_mean": 142.2, + "valid_targets_min": 62 + }, + { + "epoch": 5.20746887966805, + "grad_norm": 4.671773078750636, + "learning_rate": 7.507102632331382e-06, + "loss": 0.2, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.21363668143749237, + "step": 1255, + "valid_targets_mean": 119.7, + "valid_targets_min": 59 + }, + { + "epoch": 5.228215767634855, + "grad_norm": 7.747309257352988, + "learning_rate": 7.3461605393537415e-06, + "loss": 0.2709, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.36064910888671875, 
+ "step": 1260, + "valid_targets_mean": 103.8, + "valid_targets_min": 54 + }, + { + "epoch": 5.24896265560166, + "grad_norm": 5.836162881737733, + "learning_rate": 7.186573369503731e-06, + "loss": 0.2643, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22864724695682526, + "step": 1265, + "valid_targets_mean": 155.4, + "valid_targets_min": 52 + }, + { + "epoch": 5.269709543568465, + "grad_norm": 6.109299844374152, + "learning_rate": 7.028358210744881e-06, + "loss": 0.2853, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3166709840297699, + "step": 1270, + "valid_targets_mean": 109.1, + "valid_targets_min": 51 + }, + { + "epoch": 5.29045643153527, + "grad_norm": 4.832159172426314, + "learning_rate": 6.8715320041312095e-06, + "loss": 0.2802, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3006090521812439, + "step": 1275, + "valid_targets_mean": 115.7, + "valid_targets_min": 66 + }, + { + "epoch": 5.3112033195020745, + "grad_norm": 3.182299417076334, + "learning_rate": 6.716111541993213e-06, + "loss": 0.2687, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.1729133427143097, + "step": 1280, + "valid_targets_mean": 199.3, + "valid_targets_min": 68 + }, + { + "epoch": 5.331950207468879, + "grad_norm": 5.136124561619918, + "learning_rate": 6.562113466139836e-06, + "loss": 0.273, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2921794056892395, + "step": 1285, + "valid_targets_mean": 98.2, + "valid_targets_min": 55 + }, + { + "epoch": 5.352697095435684, + "grad_norm": 6.387158477899295, + "learning_rate": 6.4095542660765145e-06, + "loss": 0.2853, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.33419498801231384, + "step": 1290, + "valid_targets_mean": 112.9, + "valid_targets_min": 49 + }, + { + "epoch": 5.37344398340249, + "grad_norm": 4.174496355970623, + "learning_rate": 6.258450277239545e-06, + "loss": 0.2691, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2561451494693756, + "step": 1295, + "valid_targets_mean": 127.8, + "valid_targets_min": 56 + }, + { + "epoch": 5.394190871369295, + 
"grad_norm": 4.555697459816092, + "learning_rate": 6.108817679246979e-06, + "loss": 0.2528, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22688698768615723, + "step": 1300, + "valid_targets_mean": 113.3, + "valid_targets_min": 72 + }, + { + "epoch": 5.4149377593361, + "grad_norm": 3.7328921616493482, + "learning_rate": 5.960672494166113e-06, + "loss": 0.2519, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.18989375233650208, + "step": 1305, + "valid_targets_mean": 173.1, + "valid_targets_min": 64 + }, + { + "epoch": 5.435684647302905, + "grad_norm": 4.693648138618525, + "learning_rate": 5.8140305847979895e-06, + "loss": 0.2879, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.24130411446094513, + "step": 1310, + "valid_targets_mean": 112.8, + "valid_targets_min": 58 + }, + { + "epoch": 5.45643153526971, + "grad_norm": 6.1245996173536295, + "learning_rate": 5.668907652978783e-06, + "loss": 0.287, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3436064124107361, + "step": 1315, + "valid_targets_mean": 101.9, + "valid_targets_min": 65 + }, + { + "epoch": 5.477178423236515, + "grad_norm": 4.880336940097785, + "learning_rate": 5.5253192378985966e-06, + "loss": 0.2254, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2438402771949768, + "step": 1320, + "valid_targets_mean": 172.6, + "valid_targets_min": 56 + }, + { + "epoch": 5.4979253112033195, + "grad_norm": 5.184523533528152, + "learning_rate": 5.383280714437518e-06, + "loss": 0.2503, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3002464175224304, + "step": 1325, + "valid_targets_mean": 124.2, + "valid_targets_min": 41 + }, + { + "epoch": 5.518672199170124, + "grad_norm": 5.31824198148948, + "learning_rate": 5.242807291519374e-06, + "loss": 0.2999, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2796890139579773, + "step": 1330, + "valid_targets_mean": 105.6, + "valid_targets_min": 52 + }, + { + "epoch": 5.539419087136929, + "grad_norm": 4.406452877804828, + "learning_rate": 5.103914010483206e-06, + "loss": 0.1811, + "loss_nan_ranks": 0, + 
"loss_rank_avg": 0.16050797700881958, + "step": 1335, + "valid_targets_mean": 169.6, + "valid_targets_min": 51 + }, + { + "epoch": 5.560165975103734, + "grad_norm": 5.526397123023929, + "learning_rate": 4.966615743472709e-06, + "loss": 0.2922, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.302845299243927, + "step": 1340, + "valid_targets_mean": 102.0, + "valid_targets_min": 61 + }, + { + "epoch": 5.580912863070539, + "grad_norm": 4.475088302223254, + "learning_rate": 4.830927191843779e-06, + "loss": 0.2274, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22871732711791992, + "step": 1345, + "valid_targets_mean": 112.8, + "valid_targets_min": 66 + }, + { + "epoch": 5.601659751037344, + "grad_norm": 3.827600528927148, + "learning_rate": 4.696862884590349e-06, + "loss": 0.2435, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.17285335063934326, + "step": 1350, + "valid_targets_mean": 158.9, + "valid_targets_min": 56 + }, + { + "epoch": 5.622406639004149, + "grad_norm": 5.486601988943072, + "learning_rate": 4.564437176788681e-06, + "loss": 0.2842, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2588198482990265, + "step": 1355, + "valid_targets_mean": 116.6, + "valid_targets_min": 55 + }, + { + "epoch": 5.643153526970955, + "grad_norm": 4.279231303644842, + "learning_rate": 4.433664248060295e-06, + "loss": 0.229, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2061527669429779, + "step": 1360, + "valid_targets_mean": 153.4, + "valid_targets_min": 81 + }, + { + "epoch": 5.66390041493776, + "grad_norm": 5.487514913708906, + "learning_rate": 4.304558101053629e-06, + "loss": 0.2755, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23738700151443481, + "step": 1365, + "valid_targets_mean": 129.2, + "valid_targets_min": 53 + }, + { + "epoch": 5.6846473029045645, + "grad_norm": 4.05314290527769, + "learning_rate": 4.177132559944761e-06, + "loss": 0.2214, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.196598082780838, + "step": 1370, + "valid_targets_mean": 164.5, + "valid_targets_min": 64 + }, + { + 
"epoch": 5.7053941908713695, + "grad_norm": 6.2913597721448, + "learning_rate": 4.051401268957087e-06, + "loss": 0.27, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3582676351070404, + "step": 1375, + "valid_targets_mean": 107.6, + "valid_targets_min": 65 + }, + { + "epoch": 5.726141078838174, + "grad_norm": 6.031743246784628, + "learning_rate": 3.927377690900436e-06, + "loss": 0.2684, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.25066938996315, + "step": 1380, + "valid_targets_mean": 127.6, + "valid_targets_min": 59 + }, + { + "epoch": 5.746887966804979, + "grad_norm": 5.214705770189645, + "learning_rate": 3.805075105729459e-06, + "loss": 0.2639, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.25866299867630005, + "step": 1385, + "valid_targets_mean": 123.2, + "valid_targets_min": 55 + }, + { + "epoch": 5.767634854771784, + "grad_norm": 4.524404310503156, + "learning_rate": 3.6845066091216917e-06, + "loss": 0.2862, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23066316545009613, + "step": 1390, + "valid_targets_mean": 128.7, + "valid_targets_min": 63 + }, + { + "epoch": 5.788381742738589, + "grad_norm": 5.097443863683129, + "learning_rate": 3.56568511107533e-06, + "loss": 0.2365, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.28775590658187866, + "step": 1395, + "valid_targets_mean": 93.3, + "valid_targets_min": 55 + }, + { + "epoch": 5.809128630705394, + "grad_norm": 5.670717377040933, + "learning_rate": 3.448623334526853e-06, + "loss": 0.3138, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3401385545730591, + "step": 1400, + "valid_targets_mean": 94.4, + "valid_targets_min": 59 + }, + { + "epoch": 5.829875518672199, + "grad_norm": 4.62592863868813, + "learning_rate": 3.333333813988726e-06, + "loss": 0.2836, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.24296891689300537, + "step": 1405, + "valid_targets_mean": 131.2, + "valid_targets_min": 55 + }, + { + "epoch": 5.850622406639004, + "grad_norm": 5.086749493331627, + "learning_rate": 3.219828894207242e-06, + "loss": 0.2612, + 
"loss_nan_ranks": 0, + "loss_rank_avg": 0.2650941014289856, + "step": 1410, + "valid_targets_mean": 124.6, + "valid_targets_min": 78 + }, + { + "epoch": 5.87136929460581, + "grad_norm": 4.85811557310476, + "learning_rate": 3.1081207288406846e-06, + "loss": 0.2353, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2586798667907715, + "step": 1415, + "valid_targets_mean": 111.1, + "valid_targets_min": 41 + }, + { + "epoch": 5.8921161825726145, + "grad_norm": 2.889073243641731, + "learning_rate": 2.9982212791580044e-06, + "loss": 0.2309, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.15041622519493103, + "step": 1420, + "valid_targets_mean": 211.2, + "valid_targets_min": 67 + }, + { + "epoch": 5.912863070539419, + "grad_norm": 4.7851155337037925, + "learning_rate": 2.890142312757982e-06, + "loss": 0.2902, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.26748985052108765, + "step": 1425, + "valid_targets_mean": 115.7, + "valid_targets_min": 47 + }, + { + "epoch": 5.933609958506224, + "grad_norm": 4.580807918668048, + "learning_rate": 2.7838954023092845e-06, + "loss": 0.2352, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2532392144203186, + "step": 1430, + "valid_targets_mean": 122.4, + "valid_targets_min": 60 + }, + { + "epoch": 5.954356846473029, + "grad_norm": 5.472491432141598, + "learning_rate": 2.679491924311226e-06, + "loss": 0.254, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.27012646198272705, + "step": 1435, + "valid_targets_mean": 116.7, + "valid_targets_min": 57 + }, + { + "epoch": 5.975103734439834, + "grad_norm": 4.968702559787331, + "learning_rate": 2.576943057875696e-06, + "loss": 0.2627, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23716619610786438, + "step": 1440, + "valid_targets_mean": 146.7, + "valid_targets_min": 61 + }, + { + "epoch": 5.995850622406639, + "grad_norm": 3.6763933325276374, + "learning_rate": 2.4762597835300815e-06, + "loss": 0.2422, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.17048585414886475, + "step": 1445, + "valid_targets_mean": 182.2, + 
"valid_targets_min": 62 + }, + { + "epoch": 6.016597510373444, + "grad_norm": 3.4439805728949118, + "learning_rate": 2.377452882041551e-06, + "loss": 0.2238, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.20334039628505707, + "step": 1450, + "valid_targets_mean": 121.8, + "valid_targets_min": 48 + }, + { + "epoch": 6.037344398340249, + "grad_norm": 4.169982176027096, + "learning_rate": 2.280532933262678e-06, + "loss": 0.1914, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22471293807029724, + "step": 1455, + "valid_targets_mean": 117.9, + "valid_targets_min": 55 + }, + { + "epoch": 6.058091286307054, + "grad_norm": 4.451395467337526, + "learning_rate": 2.1855103149985934e-06, + "loss": 0.2352, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.26145243644714355, + "step": 1460, + "valid_targets_mean": 95.1, + "valid_targets_min": 59 + }, + { + "epoch": 6.078838174273859, + "grad_norm": 4.720931193547068, + "learning_rate": 2.0923952018957826e-06, + "loss": 0.2003, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2084977924823761, + "step": 1465, + "valid_targets_mean": 112.5, + "valid_targets_min": 70 + }, + { + "epoch": 6.0995850622406635, + "grad_norm": 4.908093674427741, + "learning_rate": 2.0011975643526106e-06, + "loss": 0.185, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2433365285396576, + "step": 1470, + "valid_targets_mean": 120.3, + "valid_targets_min": 67 + }, + { + "epoch": 6.1203319502074685, + "grad_norm": 4.960759939432554, + "learning_rate": 1.9119271674517305e-06, + "loss": 0.1719, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2257917821407318, + "step": 1475, + "valid_targets_mean": 112.6, + "valid_targets_min": 61 + }, + { + "epoch": 6.141078838174274, + "grad_norm": 6.960952455067493, + "learning_rate": 1.8245935699145035e-06, + "loss": 0.2254, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2723103165626526, + "step": 1480, + "valid_targets_mean": 106.7, + "valid_targets_min": 66 + }, + { + "epoch": 6.161825726141079, + "grad_norm": 4.632315701623262, + 
"learning_rate": 1.7392061230774371e-06, + "loss": 0.1786, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23765739798545837, + "step": 1485, + "valid_targets_mean": 107.6, + "valid_targets_min": 65 + }, + { + "epoch": 6.182572614107884, + "grad_norm": 5.756675590953208, + "learning_rate": 1.6557739698909436e-06, + "loss": 0.2062, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.26241517066955566, + "step": 1490, + "valid_targets_mean": 104.4, + "valid_targets_min": 62 + }, + { + "epoch": 6.203319502074689, + "grad_norm": 3.871596830434273, + "learning_rate": 1.574306043940288e-06, + "loss": 0.1752, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.15879438817501068, + "step": 1495, + "valid_targets_mean": 137.4, + "valid_targets_min": 58 + }, + { + "epoch": 6.224066390041494, + "grad_norm": 4.069174001456709, + "learning_rate": 1.4948110684890726e-06, + "loss": 0.2002, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.12537790834903717, + "step": 1500, + "valid_targets_mean": 171.4, + "valid_targets_min": 64 + }, + { + "epoch": 6.244813278008299, + "grad_norm": 5.581161273740655, + "learning_rate": 1.4172975555451363e-06, + "loss": 0.2326, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2820102572441101, + "step": 1505, + "valid_targets_mean": 102.7, + "valid_targets_min": 68 + }, + { + "epoch": 6.265560165975104, + "grad_norm": 5.524451235658226, + "learning_rate": 1.3417738049491536e-06, + "loss": 0.2402, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.28135719895362854, + "step": 1510, + "valid_targets_mean": 96.0, + "valid_targets_min": 61 + }, + { + "epoch": 6.286307053941909, + "grad_norm": 4.869973029745467, + "learning_rate": 1.268247903485902e-06, + "loss": 0.198, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.25688737630844116, + "step": 1515, + "valid_targets_mean": 101.5, + "valid_targets_min": 62 + }, + { + "epoch": 6.3070539419087135, + "grad_norm": 4.314906507746581, + "learning_rate": 1.1967277240183716e-06, + "loss": 0.21, + "loss_nan_ranks": 0, + "loss_rank_avg": 
0.191103994846344, + "step": 1520, + "valid_targets_mean": 120.8, + "valid_targets_min": 60 + }, + { + "epoch": 6.327800829875518, + "grad_norm": 4.3035104208842085, + "learning_rate": 1.1272209246447696e-06, + "loss": 0.2066, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.17241765558719635, + "step": 1525, + "valid_targets_mean": 130.2, + "valid_targets_min": 55 + }, + { + "epoch": 6.348547717842323, + "grad_norm": 3.91462234843459, + "learning_rate": 1.0597349478785123e-06, + "loss": 0.199, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22027219831943512, + "step": 1530, + "valid_targets_mean": 157.9, + "valid_targets_min": 59 + }, + { + "epoch": 6.369294605809129, + "grad_norm": 4.161379790290467, + "learning_rate": 9.942770198513218e-07, + "loss": 0.2284, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.19286894798278809, + "step": 1535, + "valid_targets_mean": 130.5, + "valid_targets_min": 55 + }, + { + "epoch": 6.390041493775934, + "grad_norm": 3.979992570097402, + "learning_rate": 9.308541495394751e-07, + "loss": 0.2092, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.1797044277191162, + "step": 1540, + "valid_targets_mean": 127.6, + "valid_targets_min": 63 + }, + { + "epoch": 6.410788381742739, + "grad_norm": 4.578593444010601, + "learning_rate": 8.694731280133051e-07, + "loss": 0.2104, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.25974029302597046, + "step": 1545, + "valid_targets_mean": 120.4, + "valid_targets_min": 66 + }, + { + "epoch": 6.431535269709544, + "grad_norm": 14.736273677745213, + "learning_rate": 8.101405277100549e-07, + "loss": 0.2306, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2802034020423889, + "step": 1550, + "valid_targets_mean": 92.9, + "valid_targets_min": 61 + }, + { + "epoch": 6.452282157676349, + "grad_norm": 6.111463784813684, + "learning_rate": 7.528627017301016e-07, + "loss": 0.1801, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.29020482301712036, + "step": 1555, + "valid_targets_mean": 98.5, + "valid_targets_min": 69 + }, + { + "epoch": 
6.473029045643154, + "grad_norm": 4.419863069023317, + "learning_rate": 6.976457831567262e-07, + "loss": 0.19, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22686433792114258, + "step": 1560, + "valid_targets_mean": 121.2, + "valid_targets_min": 57 + }, + { + "epoch": 6.4937759336099585, + "grad_norm": 5.3614306604918, + "learning_rate": 6.444956843993754e-07, + "loss": 0.2115, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.24041424691677094, + "step": 1565, + "valid_targets_mean": 99.8, + "valid_targets_min": 52 + }, + { + "epoch": 6.514522821576763, + "grad_norm": 10.164349086292562, + "learning_rate": 5.934180965606007e-07, + "loss": 0.2254, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2823329567909241, + "step": 1570, + "valid_targets_mean": 99.8, + "valid_targets_min": 69 + }, + { + "epoch": 6.535269709543568, + "grad_norm": 5.2042900023987055, + "learning_rate": 5.444184888266768e-07, + "loss": 0.2009, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.244424968957901, + "step": 1575, + "valid_targets_mean": 88.3, + "valid_targets_min": 51 + }, + { + "epoch": 6.556016597510373, + "grad_norm": 4.462057874376993, + "learning_rate": 4.975021078819731e-07, + "loss": 0.1839, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22408881783485413, + "step": 1580, + "valid_targets_mean": 126.3, + "valid_targets_min": 50 + }, + { + "epoch": 6.576763485477178, + "grad_norm": 5.410684806378169, + "learning_rate": 4.5267397734717113e-07, + "loss": 0.2637, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.27906614542007446, + "step": 1585, + "valid_targets_mean": 92.4, + "valid_targets_min": 57 + }, + { + "epoch": 6.597510373443983, + "grad_norm": 4.119111648737928, + "learning_rate": 4.0993889724135314e-07, + "loss": 0.2036, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.1697758138179779, + "step": 1590, + "valid_targets_mean": 143.6, + "valid_targets_min": 61 + }, + { + "epoch": 6.618257261410788, + "grad_norm": 5.179582255288888, + "learning_rate": 3.693014434680242e-07, + "loss": 0.2089, + 
"loss_nan_ranks": 0, + "loss_rank_avg": 0.24674925208091736, + "step": 1595, + "valid_targets_mean": 138.8, + "valid_targets_min": 54 + }, + { + "epoch": 6.639004149377593, + "grad_norm": 5.561101995379885, + "learning_rate": 3.307659673251595e-07, + "loss": 0.2244, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.177412211894989, + "step": 1600, + "valid_targets_mean": 117.6, + "valid_targets_min": 60 + }, + { + "epoch": 6.659751037344399, + "grad_norm": 5.195861980789606, + "learning_rate": 2.9433659503926623e-07, + "loss": 0.2327, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.25790226459503174, + "step": 1605, + "valid_targets_mean": 110.1, + "valid_targets_min": 53 + }, + { + "epoch": 6.680497925311204, + "grad_norm": 3.911906512279716, + "learning_rate": 2.6001722732358127e-07, + "loss": 0.2035, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.14186334609985352, + "step": 1610, + "valid_targets_mean": 141.2, + "valid_targets_min": 67 + }, + { + "epoch": 6.7012448132780085, + "grad_norm": 5.042285108929284, + "learning_rate": 2.27811538960383e-07, + "loss": 0.1913, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.2191961407661438, + "step": 1615, + "valid_targets_mean": 123.7, + "valid_targets_min": 44 + }, + { + "epoch": 6.721991701244813, + "grad_norm": 5.239166381330431, + "learning_rate": 1.9772297840752407e-07, + "loss": 0.1967, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.19531618058681488, + "step": 1620, + "valid_targets_mean": 111.1, + "valid_targets_min": 61 + }, + { + "epoch": 6.742738589211618, + "grad_norm": 4.934480610164203, + "learning_rate": 1.6975476742916886e-07, + "loss": 0.2021, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.21878521144390106, + "step": 1625, + "valid_targets_mean": 101.4, + "valid_targets_min": 50 + }, + { + "epoch": 6.763485477178423, + "grad_norm": 4.895769325490044, + "learning_rate": 1.43909900750836e-07, + "loss": 0.2313, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.23424255847930908, + "step": 1630, + "valid_targets_mean": 120.0, + 
"valid_targets_min": 60 + }, + { + "epoch": 6.784232365145228, + "grad_norm": 3.8671281371679322, + "learning_rate": 1.2019114573871947e-07, + "loss": 0.1937, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.17272061109542847, + "step": 1635, + "valid_targets_mean": 148.1, + "valid_targets_min": 62 + }, + { + "epoch": 6.804979253112033, + "grad_norm": 4.4379640165652345, + "learning_rate": 9.860104210338562e-08, + "loss": 0.1573, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.18832272291183472, + "step": 1640, + "valid_targets_mean": 119.2, + "valid_targets_min": 70 + }, + { + "epoch": 6.825726141078838, + "grad_norm": 3.06556856776517, + "learning_rate": 7.914190162781277e-08, + "loss": 0.1822, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.12091746181249619, + "step": 1645, + "valid_targets_mean": 182.6, + "valid_targets_min": 54 + }, + { + "epoch": 6.846473029045643, + "grad_norm": 4.099005450803784, + "learning_rate": 6.181580791987385e-08, + "loss": 0.2129, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.1655721664428711, + "step": 1650, + "valid_targets_mean": 148.9, + "valid_targets_min": 56 + }, + { + "epoch": 6.867219917012449, + "grad_norm": 4.310902974974436, + "learning_rate": 4.6624616189214765e-08, + "loss": 0.1678, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.15827462077140808, + "step": 1655, + "valid_targets_mean": 143.8, + "valid_targets_min": 61 + }, + { + "epoch": 6.8879668049792535, + "grad_norm": 5.3122495662963685, + "learning_rate": 3.3569953048624426e-08, + "loss": 0.1979, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.25778818130493164, + "step": 1660, + "valid_targets_mean": 134.8, + "valid_targets_min": 45 + }, + { + "epoch": 6.908713692946058, + "grad_norm": 7.693391989415569, + "learning_rate": 2.2653216339840746e-08, + "loss": 0.2183, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.21810811758041382, + "step": 1665, + "valid_targets_mean": 94.8, + "valid_targets_min": 20 + }, + { + "epoch": 6.929460580912863, + "grad_norm": 5.236163106459915, + 
"learning_rate": 1.3875574983894802e-08, + "loss": 0.1777, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.22439393401145935, + "step": 1670, + "valid_targets_mean": 117.6, + "valid_targets_min": 62 + }, + { + "epoch": 6.950207468879668, + "grad_norm": 3.8797498698175685, + "learning_rate": 7.237968855937638e-09, + "loss": 0.1835, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.14645695686340332, + "step": 1675, + "valid_targets_mean": 146.5, + "valid_targets_min": 64 + }, + { + "epoch": 6.970954356846473, + "grad_norm": 4.609262542128345, + "learning_rate": 2.7411086846051984e-09, + "loss": 0.2133, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.1896577924489975, + "step": 1680, + "valid_targets_mean": 98.5, + "valid_targets_min": 46 + }, + { + "epoch": 6.991701244813278, + "grad_norm": 2.6565371089348058, + "learning_rate": 3.8547597591254147e-10, + "loss": 0.2164, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.11129122227430344, + "step": 1685, + "valid_targets_mean": 239.2, + "valid_targets_min": 57 + }, + { + "epoch": 7.0, + "loss_nan_ranks": 0, + "loss_rank_avg": 0.3174639642238617, + "step": 1687, + "total_flos": 22338300149760.0, + "train_loss": 0.652452801803053, + "train_runtime": 3967.747, + "train_samples_per_second": 6.796, + "train_steps_per_second": 0.425, + "valid_targets_mean": 94.8, + "valid_targets_min": 62 + } + ], + "logging_steps": 5, + "max_steps": 1687, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 22338300149760.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..96acabd --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0170b9f48358835966e9d5b36e1ca171de58de86a537ecba59350cd4ad78c1d5 +size 8721 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..19566d8 Binary files /dev/null and b/training_loss.png differ diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833