commit 478dd77780fe799caa2197889a3c22300d64937e Author: ModelHub XC Date: Thu Jun 18 12:34:13 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: IIGroup/X-Coder-SFT-Qwen2.5-7B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..cad16db --- /dev/null +++ b/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs 
merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +vocab.json filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..234f9db --- /dev/null +++ b/README.md @@ -0,0 +1,100 @@ +--- +license: apache-2.0 +base_model: + - Qwen/Qwen2.5-Coder-7B-Instruct +datasets: + - IIGroup/X-Coder-SFT-376k +language: + - en +tags: + - code + - sft + - competitive-programming +--- + +# X-Coder-SFT-Qwen2.5-7B + +X-Coder-SFT-Qwen2.5-7B is a code generation model fine-tuned on fully synthetic instruction data, designed for competitive programming tasks. It can serve as the foundation for subsequent RLVR training. + +## Model Description + +- **Base Model**: [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) +- **Training Method**: Supervised Fine-Tuning (SFT) +- **Training Data**: [IIGroup/X-Coder-SFT-376k](https://huggingface.co/datasets/IIGroup/X-Coder-SFT-376k) +- **Parameters**: 7B + +## Training + +SFT training can be performed using various frameworks such as [ms-swift](https://github.com/modelscope/ms-swift), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), or [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). 
For training details and code, please refer to the [X-Coder GitHub repository](https://github.com/JieWu02/X-Coder). + +### Training Hyperparameters + +| Parameter | Value | +|-----------|-------| +| Base Model | Qwen/Qwen2.5-Coder-7B-Instruct | +| Training Type | Full Parameter | +| Epochs | 8 | +| Global Batch Size | 128 | +| Learning Rate | 5e-5 | +| Max Grad Norm | 1.0 | +| Max Length | 32768 | +| Torch Dtype | bfloat16 | +| DeepSpeed | Zero3 Offload (80GB VRAM) / Zero2 (142GB VRAM) | +| Packing | True (2x faster training, slightly worse performance) | + +## Performance on LiveCodeBench v5 + +![Results](results.png) + +## Recommended Inference Parameters + +| Parameter | Value | +|-----------|-------| +| temperature | 0.6 | +| top_p | 0.95 | +| top_k | 20 (or -1 to disable) | +| max_new_tokens | 32768 | + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "IIGroup/X-Coder-SFT-Qwen2.5-7B" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") + +prompt = "Write a Python function to solve the two sum problem." 
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +outputs = model.generate( + **inputs, + max_new_tokens=32768, + temperature=0.6, + top_p=0.95, + top_k=20, + do_sample=True +) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +## Related Models + +- **RL Model**: [IIGroup/X-Coder-RL-Qwen2.5-7B](https://huggingface.co/IIGroup/X-Coder-RL-Qwen2.5-7B) - RLVR trained version achieving 62.9 on LiveCodeBench + +## Citation + +```bibtex +@misc{wu2026xcoderadvancingcompetitiveprogramming, + title={X-Coder: Advancing Competitive Programming with Fully Synthetic Tasks, Solutions, and Tests}, + author={Jie Wu and Haoling Li and Xin Zhang and Jiani Guo and Jane Luo and Steven Liu and Yangyu Huang and Ruihang Chu and Scarlett Li and Yujiu Yang}, + year={2026}, + eprint={2601.06953}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2601.06953}, +} +``` + +## License + +This project is licensed under the Apache License 2.0. diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..482ced4 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,24 @@ +{ + "</tool_call>": 151658, + "<tool_call>": 151657, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/args.json b/args.json new file mode 100644 index 0000000..84adf33 --- /dev/null +++ b/args.json @@ -0,0 +1,366 @@ +{ + "model": "Qwen2.5-Coder-7B-Instruct", + "model_type": "qwen2_5", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": 
"bfloat16", + "attn_impl": "flash_attn", + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "init_strategy": null, + "template": "qwen2_5", + "system": null, + "max_length": 25000, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "376k_sft.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "load_from_cache_file": true, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": true, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "full", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ddp_timeout": 1800, + "ddp_backend": "nccl", + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 5e-05, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 8.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.03, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 20, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 1000.0, + "save_total_limit": 40, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": true, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 1000.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
"/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426", + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": false, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": 
null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "max_epochs": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "eval_use_evalscope": false, + "eval_datasets": [], + "eval_limit": null, + "eval_datasets_args": null, + "eval_generation_config": null, + "freeze_parameters": [], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "trainable_parameters_regex": null, + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, 
+ "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": true, + "loss_type": null, + "optimizer": null, + "metric": null, + "zero_hpz_partition_size": null, + "rank": 0, + "global_world_size": 80, + "local_world_size": 8, + "model_suffix": "Qwen2.5-Coder-7B-Instruct", + "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/openpai_config/sft/Long_Cot_data/Qwen2.5-Coder-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + 
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen2_5', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=['coding'])", + "model_dir": "/openpai_config/sft/Long_Cot_data/Qwen2.5-Coder-7B-Instruct", + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=8.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.03, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426/runs', logging_strategy=, logging_first_step=True, logging_steps=20, logging_nan_inf_filter=True, save_strategy=, save_steps=1000, save_total_limit=40, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, 
fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend='nccl', tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=1000, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', 
report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', optimizer=None, local_repo_path=None, galore_config=None)" +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
}} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/config.json b/config.json new file mode 100644 index 0000000..ffc49a5 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ 
+{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..159097f --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "others", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..169796a --- /dev/null +++ b/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.52.3" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..319fe23 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a07153324ed1c56a7e71b83ca368d54551eaad5aa712ec6555edfe9406f2952 +size 4877660776 diff --git 
a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..186e535 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f60b0b04e0de5700e724ea25ec5b0904099e8500aebf31d66c1e7d5d856f79 +size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..fbe2e91 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b96527169299b7eedaae2cf8eb6ce49b24b48d30725cf75f26cf5955441039b +size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..ac955e9 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:831bb6e76bcde4ab682c75cdc23a2effafe8e718fe3a6ce71b01813e7fbe9d20 +size 1089994880 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..6ca5084 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 15231233024 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + 
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.bias": 
"model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + 
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", 
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", + 
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00003-of-00004.safetensors" + } +} diff --git a/results.png b/results.png new file mode 100644 index 0000000..7126e89 Binary files /dev/null and b/results.png differ diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..ac23c0a --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..51ebb3b --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa +size 11421896 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..bb55e0a --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,207 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..0f72cf8 --- /dev/null +++ 
b/trainer_state.json @@ -0,0 +1,6152 @@ +{ + "best_global_step": 9000, + "best_metric": 0.44325256, + "best_model_checkpoint": "/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426/checkpoint-9000", + "epoch": 5.150246834084568, + "eval_steps": 1000, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00042927666881305, + "grad_norm": 1.6802181005477905, + "learning_rate": 8.928571428571429e-08, + "loss": 0.9940392374992371, + "memory(GiB)": 47.57, + "step": 1, + "token_acc": 0.7531317395493966, + "train_speed(iter/s)": 0.017784 + }, + { + "epoch": 0.008585533376261001, + "grad_norm": 0.8075768947601318, + "learning_rate": 1.7857142857142857e-06, + "loss": 0.9834547544780531, + "memory(GiB)": 72.72, + "step": 20, + "token_acc": 0.7346348107371886, + "train_speed(iter/s)": 0.071123 + }, + { + "epoch": 0.017171066752522002, + "grad_norm": 0.4582468867301941, + "learning_rate": 3.5714285714285714e-06, + "loss": 0.8647201538085938, + "memory(GiB)": 72.72, + "step": 40, + "token_acc": 0.7403293957393929, + "train_speed(iter/s)": 0.078614 + }, + { + "epoch": 0.025756600128783, + "grad_norm": 0.22638651728630066, + "learning_rate": 5.357142857142857e-06, + "loss": 0.756020975112915, + "memory(GiB)": 72.72, + "step": 60, + "token_acc": 0.7670503626615286, + "train_speed(iter/s)": 0.081671 + }, + { + "epoch": 0.034342133505044004, + "grad_norm": 0.22832736372947693, + "learning_rate": 7.142857142857143e-06, + "loss": 0.6885409832000733, + "memory(GiB)": 72.72, + "step": 80, + "token_acc": 0.7765815619910137, + "train_speed(iter/s)": 0.083578 + }, + { + "epoch": 0.042927666881305004, + "grad_norm": 0.1798371970653534, + "learning_rate": 8.92857142857143e-06, + "loss": 0.6466886043548584, + "memory(GiB)": 72.72, + "step": 100, + "token_acc": 0.7900423674902669, + "train_speed(iter/s)": 0.084818 + }, + { + "epoch": 
0.051513200257566, + "grad_norm": 0.17644034326076508, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.6123067378997803, + "memory(GiB)": 72.72, + "step": 120, + "token_acc": 0.7967146967110779, + "train_speed(iter/s)": 0.086053 + }, + { + "epoch": 0.060098733633827, + "grad_norm": 0.20387020707130432, + "learning_rate": 1.25e-05, + "loss": 0.6003653049468994, + "memory(GiB)": 72.72, + "step": 140, + "token_acc": 0.8265060359403877, + "train_speed(iter/s)": 0.086654 + }, + { + "epoch": 0.06868426701008801, + "grad_norm": 0.24960927665233612, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.5757434368133545, + "memory(GiB)": 72.72, + "step": 160, + "token_acc": 0.7987813737966625, + "train_speed(iter/s)": 0.087161 + }, + { + "epoch": 0.07726980038634901, + "grad_norm": 0.2726881504058838, + "learning_rate": 1.6071428571428572e-05, + "loss": 0.5653277397155761, + "memory(GiB)": 72.72, + "step": 180, + "token_acc": 0.8027387420432055, + "train_speed(iter/s)": 0.087596 + }, + { + "epoch": 0.08585533376261001, + "grad_norm": 0.2119862288236618, + "learning_rate": 1.785714285714286e-05, + "loss": 0.5523943424224853, + "memory(GiB)": 72.72, + "step": 200, + "token_acc": 0.8231761512065608, + "train_speed(iter/s)": 0.087961 + }, + { + "epoch": 0.094440867138871, + "grad_norm": 0.24396856129169464, + "learning_rate": 1.9642857142857145e-05, + "loss": 0.5498331546783447, + "memory(GiB)": 72.72, + "step": 220, + "token_acc": 0.7940026244174245, + "train_speed(iter/s)": 0.088292 + }, + { + "epoch": 0.103026400515132, + "grad_norm": 0.2601749002933502, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.5398545265197754, + "memory(GiB)": 72.72, + "step": 240, + "token_acc": 0.80989644710031, + "train_speed(iter/s)": 0.088502 + }, + { + "epoch": 0.111611933891393, + "grad_norm": 0.42718759179115295, + "learning_rate": 2.3214285714285715e-05, + "loss": 0.5296700477600098, + "memory(GiB)": 72.72, + "step": 260, + "token_acc": 0.8186448573942751, + 
"train_speed(iter/s)": 0.088739 + }, + { + "epoch": 0.120197467267654, + "grad_norm": 0.2564183175563812, + "learning_rate": 2.5e-05, + "loss": 0.5314459323883056, + "memory(GiB)": 72.72, + "step": 280, + "token_acc": 0.8186711788362628, + "train_speed(iter/s)": 0.088928 + }, + { + "epoch": 0.128783000643915, + "grad_norm": 0.42152953147888184, + "learning_rate": 2.6785714285714288e-05, + "loss": 0.5304059028625489, + "memory(GiB)": 72.72, + "step": 300, + "token_acc": 0.8313805341388459, + "train_speed(iter/s)": 0.089116 + }, + { + "epoch": 0.13736853402017601, + "grad_norm": 0.44018375873565674, + "learning_rate": 2.857142857142857e-05, + "loss": 0.5269341945648194, + "memory(GiB)": 72.72, + "step": 320, + "token_acc": 0.8261322879913329, + "train_speed(iter/s)": 0.089293 + }, + { + "epoch": 0.14595406739643701, + "grad_norm": 0.332704097032547, + "learning_rate": 3.0357142857142857e-05, + "loss": 0.5224681854248047, + "memory(GiB)": 72.72, + "step": 340, + "token_acc": 0.8305089071105363, + "train_speed(iter/s)": 0.08944 + }, + { + "epoch": 0.15453960077269802, + "grad_norm": 0.2763151526451111, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.5171589374542236, + "memory(GiB)": 72.72, + "step": 360, + "token_acc": 0.8298510336859191, + "train_speed(iter/s)": 0.08953 + }, + { + "epoch": 0.16312513414895902, + "grad_norm": 0.49729594588279724, + "learning_rate": 3.392857142857143e-05, + "loss": 0.5136796474456787, + "memory(GiB)": 72.72, + "step": 380, + "token_acc": 0.8310140069023154, + "train_speed(iter/s)": 0.089637 + }, + { + "epoch": 0.17171066752522002, + "grad_norm": 0.3252655267715454, + "learning_rate": 3.571428571428572e-05, + "loss": 0.5128469944000245, + "memory(GiB)": 72.72, + "step": 400, + "token_acc": 0.8536899287574551, + "train_speed(iter/s)": 0.089729 + }, + { + "epoch": 0.180296200901481, + "grad_norm": 0.28958284854888916, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.5108192920684814, + "memory(GiB)": 72.72, + "step": 420, + 
"token_acc": 0.8385648117441578, + "train_speed(iter/s)": 0.089819 + }, + { + "epoch": 0.188881734277742, + "grad_norm": 0.34760820865631104, + "learning_rate": 3.928571428571429e-05, + "loss": 0.5059587478637695, + "memory(GiB)": 72.72, + "step": 440, + "token_acc": 0.8356636206879049, + "train_speed(iter/s)": 0.089855 + }, + { + "epoch": 0.197467267654003, + "grad_norm": 0.41139236092567444, + "learning_rate": 4.107142857142857e-05, + "loss": 0.5062141418457031, + "memory(GiB)": 72.72, + "step": 460, + "token_acc": 0.8206318874596391, + "train_speed(iter/s)": 0.089919 + }, + { + "epoch": 0.206052801030264, + "grad_norm": 0.3865952789783478, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.4976132869720459, + "memory(GiB)": 72.72, + "step": 480, + "token_acc": 0.809242185807305, + "train_speed(iter/s)": 0.090029 + }, + { + "epoch": 0.214638334406525, + "grad_norm": 0.34395724534988403, + "learning_rate": 4.464285714285715e-05, + "loss": 0.504787015914917, + "memory(GiB)": 72.72, + "step": 500, + "token_acc": 0.8331918132662932, + "train_speed(iter/s)": 0.090119 + }, + { + "epoch": 0.223223867782786, + "grad_norm": 0.23087145388126373, + "learning_rate": 4.642857142857143e-05, + "loss": 0.49814434051513673, + "memory(GiB)": 72.72, + "step": 520, + "token_acc": 0.8304561567217256, + "train_speed(iter/s)": 0.0902 + }, + { + "epoch": 0.231809401159047, + "grad_norm": 0.3384479582309723, + "learning_rate": 4.8214285714285716e-05, + "loss": 0.49905076026916506, + "memory(GiB)": 72.72, + "step": 540, + "token_acc": 0.8417635120347525, + "train_speed(iter/s)": 0.090248 + }, + { + "epoch": 0.240394934535308, + "grad_norm": 0.40263310074806213, + "learning_rate": 5e-05, + "loss": 0.4956265926361084, + "memory(GiB)": 72.72, + "step": 560, + "token_acc": 0.816079870788593, + "train_speed(iter/s)": 0.090319 + }, + { + "epoch": 0.248980467911569, + "grad_norm": 0.30763953924179077, + "learning_rate": 4.999984903632473e-05, + "loss": 0.4967645168304443, + "memory(GiB)": 
72.72, + "step": 580, + "token_acc": 0.8258691170614004, + "train_speed(iter/s)": 0.0903 + }, + { + "epoch": 0.25756600128783, + "grad_norm": 0.28709837794303894, + "learning_rate": 4.999939614712212e-05, + "loss": 0.49540038108825685, + "memory(GiB)": 72.72, + "step": 600, + "token_acc": 0.8345701058201058, + "train_speed(iter/s)": 0.090345 + }, + { + "epoch": 0.266151534664091, + "grad_norm": 0.27484264969825745, + "learning_rate": 4.999864133786175e-05, + "loss": 0.4913135051727295, + "memory(GiB)": 72.72, + "step": 620, + "token_acc": 0.8408849265417659, + "train_speed(iter/s)": 0.090402 + }, + { + "epoch": 0.27473706804035203, + "grad_norm": 0.275291383266449, + "learning_rate": 4.999758461765953e-05, + "loss": 0.4913851261138916, + "memory(GiB)": 72.72, + "step": 640, + "token_acc": 0.823726404893571, + "train_speed(iter/s)": 0.090443 + }, + { + "epoch": 0.28332260141661303, + "grad_norm": 0.31161361932754517, + "learning_rate": 4.999622599927756e-05, + "loss": 0.48822855949401855, + "memory(GiB)": 72.72, + "step": 660, + "token_acc": 0.8308604661462827, + "train_speed(iter/s)": 0.090487 + }, + { + "epoch": 0.29190813479287403, + "grad_norm": 0.3709673285484314, + "learning_rate": 4.999456549912401e-05, + "loss": 0.486361026763916, + "memory(GiB)": 72.72, + "step": 680, + "token_acc": 0.8271976771900934, + "train_speed(iter/s)": 0.090543 + }, + { + "epoch": 0.30049366816913503, + "grad_norm": 0.2165047973394394, + "learning_rate": 4.99926031372529e-05, + "loss": 0.48601832389831545, + "memory(GiB)": 72.72, + "step": 700, + "token_acc": 0.8387291407835747, + "train_speed(iter/s)": 0.090587 + }, + { + "epoch": 0.30907920154539603, + "grad_norm": 0.24446570873260498, + "learning_rate": 4.999033893736386e-05, + "loss": 0.48243279457092286, + "memory(GiB)": 72.72, + "step": 720, + "token_acc": 0.8372941834434668, + "train_speed(iter/s)": 0.090636 + }, + { + "epoch": 0.31766473492165703, + "grad_norm": 0.24655242264270782, + "learning_rate": 4.998777292680182e-05, 
+ "loss": 0.48319129943847655, + "memory(GiB)": 72.72, + "step": 740, + "token_acc": 0.8441225801781377, + "train_speed(iter/s)": 0.090658 + }, + { + "epoch": 0.32625026829791803, + "grad_norm": 0.2514285445213318, + "learning_rate": 4.998490513655676e-05, + "loss": 0.47730517387390137, + "memory(GiB)": 72.72, + "step": 760, + "token_acc": 0.8397503992980168, + "train_speed(iter/s)": 0.090692 + }, + { + "epoch": 0.33483580167417903, + "grad_norm": 0.2303766906261444, + "learning_rate": 4.998173560126323e-05, + "loss": 0.4783301830291748, + "memory(GiB)": 72.72, + "step": 780, + "token_acc": 0.8443087371876962, + "train_speed(iter/s)": 0.090725 + }, + { + "epoch": 0.34342133505044004, + "grad_norm": 0.2418110966682434, + "learning_rate": 4.997826435920003e-05, + "loss": 0.47623915672302247, + "memory(GiB)": 72.72, + "step": 800, + "token_acc": 0.8400687170332869, + "train_speed(iter/s)": 0.090766 + }, + { + "epoch": 0.35200686842670104, + "grad_norm": 0.24591697752475739, + "learning_rate": 4.9974491452289664e-05, + "loss": 0.47069730758666994, + "memory(GiB)": 72.72, + "step": 820, + "token_acc": 0.833947379545595, + "train_speed(iter/s)": 0.090805 + }, + { + "epoch": 0.360592401802962, + "grad_norm": 0.17342260479927063, + "learning_rate": 4.9970416926097885e-05, + "loss": 0.47403693199157715, + "memory(GiB)": 72.72, + "step": 840, + "token_acc": 0.82827573574307, + "train_speed(iter/s)": 0.090837 + }, + { + "epoch": 0.369177935179223, + "grad_norm": 0.25668865442276, + "learning_rate": 4.9966040829833115e-05, + "loss": 0.4738626003265381, + "memory(GiB)": 72.72, + "step": 860, + "token_acc": 0.8238550967767793, + "train_speed(iter/s)": 0.090859 + }, + { + "epoch": 0.377763468555484, + "grad_norm": 0.23179244995117188, + "learning_rate": 4.99613632163459e-05, + "loss": 0.47292590141296387, + "memory(GiB)": 72.72, + "step": 880, + "token_acc": 0.8182959019634485, + "train_speed(iter/s)": 0.090879 + }, + { + "epoch": 0.386349001931745, + "grad_norm": 
0.220433309674263, + "learning_rate": 4.995638414212821e-05, + "loss": 0.47188587188720704, + "memory(GiB)": 72.72, + "step": 900, + "token_acc": 0.8470956528576601, + "train_speed(iter/s)": 0.090882 + }, + { + "epoch": 0.394934535308006, + "grad_norm": 0.18783436715602875, + "learning_rate": 4.9951103667312795e-05, + "loss": 0.46758122444152833, + "memory(GiB)": 72.72, + "step": 920, + "token_acc": 0.8408150854174393, + "train_speed(iter/s)": 0.090902 + }, + { + "epoch": 0.403520068684267, + "grad_norm": 0.19517077505588531, + "learning_rate": 4.994552185567244e-05, + "loss": 0.4659998893737793, + "memory(GiB)": 72.72, + "step": 940, + "token_acc": 0.8483965614563244, + "train_speed(iter/s)": 0.090926 + }, + { + "epoch": 0.412105602060528, + "grad_norm": 0.21663770079612732, + "learning_rate": 4.9939638774619216e-05, + "loss": 0.46530804634094236, + "memory(GiB)": 72.72, + "step": 960, + "token_acc": 0.8299559114387157, + "train_speed(iter/s)": 0.090953 + }, + { + "epoch": 0.420691135436789, + "grad_norm": 0.2215634137392044, + "learning_rate": 4.993345449520364e-05, + "loss": 0.46740241050720216, + "memory(GiB)": 72.72, + "step": 980, + "token_acc": 0.8444846788598264, + "train_speed(iter/s)": 0.090976 + }, + { + "epoch": 0.42927666881305, + "grad_norm": 0.26028579473495483, + "learning_rate": 4.992696909211384e-05, + "loss": 0.4601090431213379, + "memory(GiB)": 72.72, + "step": 1000, + "token_acc": 0.8435964299778611, + "train_speed(iter/s)": 0.091007 + }, + { + "epoch": 0.42927666881305, + "eval_loss": 0.49964410066604614, + "eval_runtime": 68.8659, + "eval_samples_per_second": 54.657, + "eval_steps_per_second": 0.697, + "eval_token_acc": 0.8252659243990504, + "step": 1000 + }, + { + "epoch": 0.437862202189311, + "grad_norm": 0.1770976185798645, + "learning_rate": 4.992018264367464e-05, + "loss": 0.4663649082183838, + "memory(GiB)": 72.72, + "step": 1020, + "token_acc": 0.8298894735758832, + "train_speed(iter/s)": 0.090155 + }, + { + "epoch": 0.446447735565572, 
+ "grad_norm": 0.19963237643241882, + "learning_rate": 4.991309523184661e-05, + "loss": 0.45961837768554686, + "memory(GiB)": 72.72, + "step": 1040, + "token_acc": 0.8395059398856838, + "train_speed(iter/s)": 0.090109 + }, + { + "epoch": 0.455033268941833, + "grad_norm": 0.16753822565078735, + "learning_rate": 4.9905706942225094e-05, + "loss": 0.4637617111206055, + "memory(GiB)": 72.72, + "step": 1060, + "token_acc": 0.8323250084598088, + "train_speed(iter/s)": 0.090086 + }, + { + "epoch": 0.463618802318094, + "grad_norm": 0.17514312267303467, + "learning_rate": 4.989801786403916e-05, + "loss": 0.45838212966918945, + "memory(GiB)": 72.72, + "step": 1080, + "token_acc": 0.8644555660931506, + "train_speed(iter/s)": 0.090071 + }, + { + "epoch": 0.472204335694355, + "grad_norm": 0.18766745924949646, + "learning_rate": 4.989002809015052e-05, + "loss": 0.46158289909362793, + "memory(GiB)": 72.72, + "step": 1100, + "token_acc": 0.8342647441453608, + "train_speed(iter/s)": 0.090073 + }, + { + "epoch": 0.480789869070616, + "grad_norm": 0.16132639348506927, + "learning_rate": 4.9881737717052436e-05, + "loss": 0.4612901210784912, + "memory(GiB)": 72.72, + "step": 1120, + "token_acc": 0.8357742084275915, + "train_speed(iter/s)": 0.090059 + }, + { + "epoch": 0.489375402446877, + "grad_norm": 0.2307191789150238, + "learning_rate": 4.987314684486852e-05, + "loss": 0.4583921432495117, + "memory(GiB)": 72.72, + "step": 1140, + "token_acc": 0.8285798810251781, + "train_speed(iter/s)": 0.090074 + }, + { + "epoch": 0.497960935823138, + "grad_norm": 0.18384596705436707, + "learning_rate": 4.9864255577351534e-05, + "loss": 0.4601446151733398, + "memory(GiB)": 72.72, + "step": 1160, + "token_acc": 0.8331488125236877, + "train_speed(iter/s)": 0.090082 + }, + { + "epoch": 0.506546469199399, + "grad_norm": 0.16498738527297974, + "learning_rate": 4.985506402188217e-05, + "loss": 0.46405863761901855, + "memory(GiB)": 72.72, + "step": 1180, + "token_acc": 0.8514261702005886, + 
"train_speed(iter/s)": 0.090094 + }, + { + "epoch": 0.51513200257566, + "grad_norm": 0.20875471830368042, + "learning_rate": 4.98455722894677e-05, + "loss": 0.4559325695037842, + "memory(GiB)": 72.72, + "step": 1200, + "token_acc": 0.8449181040663494, + "train_speed(iter/s)": 0.090127 + }, + { + "epoch": 0.523717535951921, + "grad_norm": 0.20588186383247375, + "learning_rate": 4.9835780494740655e-05, + "loss": 0.4588587760925293, + "memory(GiB)": 72.72, + "step": 1220, + "token_acc": 0.8452262520285315, + "train_speed(iter/s)": 0.090144 + }, + { + "epoch": 0.532303069328182, + "grad_norm": 0.1740783005952835, + "learning_rate": 4.982568875595748e-05, + "loss": 0.4509147644042969, + "memory(GiB)": 72.72, + "step": 1240, + "token_acc": 0.8587345890329355, + "train_speed(iter/s)": 0.090167 + }, + { + "epoch": 0.5408886027044431, + "grad_norm": 0.16246297955513, + "learning_rate": 4.981529719499704e-05, + "loss": 0.45482635498046875, + "memory(GiB)": 72.72, + "step": 1260, + "token_acc": 0.8503446562311433, + "train_speed(iter/s)": 0.09019 + }, + { + "epoch": 0.5494741360807041, + "grad_norm": 0.16924946010112762, + "learning_rate": 4.98046059373592e-05, + "loss": 0.45041213035583494, + "memory(GiB)": 72.72, + "step": 1280, + "token_acc": 0.8577976623734301, + "train_speed(iter/s)": 0.090182 + }, + { + "epoch": 0.5580596694569651, + "grad_norm": 0.1474551260471344, + "learning_rate": 4.979361511216328e-05, + "loss": 0.4552830696105957, + "memory(GiB)": 72.72, + "step": 1300, + "token_acc": 0.8588476242043739, + "train_speed(iter/s)": 0.090192 + }, + { + "epoch": 0.5666452028332261, + "grad_norm": 0.18833380937576294, + "learning_rate": 4.978232485214652e-05, + "loss": 0.45859723091125487, + "memory(GiB)": 72.72, + "step": 1320, + "token_acc": 0.8404036444064411, + "train_speed(iter/s)": 0.090203 + }, + { + "epoch": 0.5752307362094871, + "grad_norm": 0.15271180868148804, + "learning_rate": 4.977073529366244e-05, + "loss": 0.45444612503051757, + "memory(GiB)": 72.72, + 
"step": 1340, + "token_acc": 0.8709423088586175, + "train_speed(iter/s)": 0.090204 + }, + { + "epoch": 0.5838162695857481, + "grad_norm": 0.15060073137283325, + "learning_rate": 4.975884657667922e-05, + "loss": 0.44826583862304686, + "memory(GiB)": 72.72, + "step": 1360, + "token_acc": 0.8445347567633144, + "train_speed(iter/s)": 0.090219 + }, + { + "epoch": 0.5924018029620091, + "grad_norm": 0.20435456931591034, + "learning_rate": 4.974665884477803e-05, + "loss": 0.4500474452972412, + "memory(GiB)": 72.72, + "step": 1380, + "token_acc": 0.8436003043631144, + "train_speed(iter/s)": 0.090231 + }, + { + "epoch": 0.6009873363382701, + "grad_norm": 0.14778906106948853, + "learning_rate": 4.9734172245151256e-05, + "loss": 0.45103793144226073, + "memory(GiB)": 72.72, + "step": 1400, + "token_acc": 0.83692786963815, + "train_speed(iter/s)": 0.090242 + }, + { + "epoch": 0.6095728697145311, + "grad_norm": 0.14574205875396729, + "learning_rate": 4.972138692860072e-05, + "loss": 0.445733642578125, + "memory(GiB)": 72.72, + "step": 1420, + "token_acc": 0.8457673279623152, + "train_speed(iter/s)": 0.090256 + }, + { + "epoch": 0.6181584030907921, + "grad_norm": 0.16354091465473175, + "learning_rate": 4.97083030495359e-05, + "loss": 0.44748697280883787, + "memory(GiB)": 72.72, + "step": 1440, + "token_acc": 0.8472576057582539, + "train_speed(iter/s)": 0.09027 + }, + { + "epoch": 0.6267439364670531, + "grad_norm": 0.14656536281108856, + "learning_rate": 4.969492076597203e-05, + "loss": 0.44432525634765624, + "memory(GiB)": 72.72, + "step": 1460, + "token_acc": 0.8350947008237803, + "train_speed(iter/s)": 0.090292 + }, + { + "epoch": 0.6353294698433141, + "grad_norm": 0.16932909190654755, + "learning_rate": 4.9681240239528216e-05, + "loss": 0.44748797416687014, + "memory(GiB)": 72.72, + "step": 1480, + "token_acc": 0.8489820684323982, + "train_speed(iter/s)": 0.090307 + }, + { + "epoch": 0.6439150032195751, + "grad_norm": 0.16873878240585327, + "learning_rate": 
4.9667261635425446e-05, + "loss": 0.4508372783660889, + "memory(GiB)": 72.72, + "step": 1500, + "token_acc": 0.8508125264242287, + "train_speed(iter/s)": 0.090325 + }, + { + "epoch": 0.6525005365958361, + "grad_norm": 0.1554819792509079, + "learning_rate": 4.965298512248466e-05, + "loss": 0.4475415706634521, + "memory(GiB)": 72.72, + "step": 1520, + "token_acc": 0.8513087716943568, + "train_speed(iter/s)": 0.090345 + }, + { + "epoch": 0.6610860699720971, + "grad_norm": 0.15099839866161346, + "learning_rate": 4.963841087312462e-05, + "loss": 0.44126238822937014, + "memory(GiB)": 72.72, + "step": 1540, + "token_acc": 0.8473235774968391, + "train_speed(iter/s)": 0.090357 + }, + { + "epoch": 0.6696716033483581, + "grad_norm": 0.16528978943824768, + "learning_rate": 4.9623539063359925e-05, + "loss": 0.44157891273498534, + "memory(GiB)": 72.72, + "step": 1560, + "token_acc": 0.8506024455489073, + "train_speed(iter/s)": 0.090379 + }, + { + "epoch": 0.6782571367246191, + "grad_norm": 0.1654183566570282, + "learning_rate": 4.9608369872798815e-05, + "loss": 0.4443850517272949, + "memory(GiB)": 72.72, + "step": 1580, + "token_acc": 0.8580666295200214, + "train_speed(iter/s)": 0.090387 + }, + { + "epoch": 0.6868426701008801, + "grad_norm": 0.17317461967468262, + "learning_rate": 4.9592903484641026e-05, + "loss": 0.44514150619506837, + "memory(GiB)": 72.72, + "step": 1600, + "token_acc": 0.8373144994303555, + "train_speed(iter/s)": 0.090402 + }, + { + "epoch": 0.6954282034771411, + "grad_norm": 0.14516599476337433, + "learning_rate": 4.9577140085675586e-05, + "loss": 0.4465588092803955, + "memory(GiB)": 72.72, + "step": 1620, + "token_acc": 0.8457774631145212, + "train_speed(iter/s)": 0.090411 + }, + { + "epoch": 0.7040137368534021, + "grad_norm": 0.19526512920856476, + "learning_rate": 4.956107986627855e-05, + "loss": 0.44002666473388674, + "memory(GiB)": 72.72, + "step": 1640, + "token_acc": 0.8571882184288229, + "train_speed(iter/s)": 0.090425 + }, + { + "epoch": 
0.7125992702296631, + "grad_norm": 0.15198639035224915, + "learning_rate": 4.954472302041069e-05, + "loss": 0.4411801815032959, + "memory(GiB)": 72.72, + "step": 1660, + "token_acc": 0.8389074986086463, + "train_speed(iter/s)": 0.090436 + }, + { + "epoch": 0.721184803605924, + "grad_norm": 0.14642658829689026, + "learning_rate": 4.952806974561518e-05, + "loss": 0.4408212184906006, + "memory(GiB)": 72.72, + "step": 1680, + "token_acc": 0.8505627783277739, + "train_speed(iter/s)": 0.090445 + }, + { + "epoch": 0.729770336982185, + "grad_norm": 0.15425585210323334, + "learning_rate": 4.951112024301517e-05, + "loss": 0.4436194896697998, + "memory(GiB)": 72.72, + "step": 1700, + "token_acc": 0.8396897524541256, + "train_speed(iter/s)": 0.090457 + }, + { + "epoch": 0.738355870358446, + "grad_norm": 0.1366390883922577, + "learning_rate": 4.9493874717311416e-05, + "loss": 0.4426912307739258, + "memory(GiB)": 72.72, + "step": 1720, + "token_acc": 0.8376902006111753, + "train_speed(iter/s)": 0.090475 + }, + { + "epoch": 0.746941403734707, + "grad_norm": 0.18443486094474792, + "learning_rate": 4.9476333376779746e-05, + "loss": 0.4428090572357178, + "memory(GiB)": 72.72, + "step": 1740, + "token_acc": 0.8405056707361122, + "train_speed(iter/s)": 0.090496 + }, + { + "epoch": 0.755526937110968, + "grad_norm": 0.16430367529392242, + "learning_rate": 4.945849643326857e-05, + "loss": 0.4388707637786865, + "memory(GiB)": 72.72, + "step": 1760, + "token_acc": 0.8453185251787173, + "train_speed(iter/s)": 0.090513 + }, + { + "epoch": 0.764112470487229, + "grad_norm": 0.16152745485305786, + "learning_rate": 4.9440364102196345e-05, + "loss": 0.43615312576293946, + "memory(GiB)": 72.72, + "step": 1780, + "token_acc": 0.855409006002105, + "train_speed(iter/s)": 0.090532 + }, + { + "epoch": 0.77269800386349, + "grad_norm": 0.18139781057834625, + "learning_rate": 4.942193660254892e-05, + "loss": 0.440519380569458, + "memory(GiB)": 72.72, + "step": 1800, + "token_acc": 0.8458556213090118, + 
"train_speed(iter/s)": 0.09055 + }, + { + "epoch": 0.781283537239751, + "grad_norm": 0.1560135781764984, + "learning_rate": 4.9403214156876966e-05, + "loss": 0.4351651191711426, + "memory(GiB)": 72.72, + "step": 1820, + "token_acc": 0.844846138018734, + "train_speed(iter/s)": 0.090564 + }, + { + "epoch": 0.789869070616012, + "grad_norm": 0.18113134801387787, + "learning_rate": 4.9384196991293205e-05, + "loss": 0.4427495002746582, + "memory(GiB)": 72.72, + "step": 1840, + "token_acc": 0.8444957533319758, + "train_speed(iter/s)": 0.090581 + }, + { + "epoch": 0.798454603992273, + "grad_norm": 0.16674058139324188, + "learning_rate": 4.9364885335469734e-05, + "loss": 0.4387219429016113, + "memory(GiB)": 72.72, + "step": 1860, + "token_acc": 0.862598161076389, + "train_speed(iter/s)": 0.090598 + }, + { + "epoch": 0.807040137368534, + "grad_norm": 0.1326039731502533, + "learning_rate": 4.934527942263523e-05, + "loss": 0.4364177703857422, + "memory(GiB)": 72.72, + "step": 1880, + "token_acc": 0.8337704981881752, + "train_speed(iter/s)": 0.090612 + }, + { + "epoch": 0.815625670744795, + "grad_norm": 0.15598100423812866, + "learning_rate": 4.9325379489572165e-05, + "loss": 0.4394540309906006, + "memory(GiB)": 72.72, + "step": 1900, + "token_acc": 0.8388467949805115, + "train_speed(iter/s)": 0.090628 + }, + { + "epoch": 0.824211204121056, + "grad_norm": 0.19666017591953278, + "learning_rate": 4.930518577661388e-05, + "loss": 0.4369682788848877, + "memory(GiB)": 72.72, + "step": 1920, + "token_acc": 0.8537222609570074, + "train_speed(iter/s)": 0.090641 + }, + { + "epoch": 0.832796737497317, + "grad_norm": 0.14630870521068573, + "learning_rate": 4.928469852764176e-05, + "loss": 0.43962607383728025, + "memory(GiB)": 72.72, + "step": 1940, + "token_acc": 0.8393774787079826, + "train_speed(iter/s)": 0.090657 + }, + { + "epoch": 0.841382270873578, + "grad_norm": 0.1797455996274948, + "learning_rate": 4.926391799008223e-05, + "loss": 0.4379319190979004, + "memory(GiB)": 72.72, + 
"step": 1960, + "token_acc": 0.843222227690404, + "train_speed(iter/s)": 0.090674 + }, + { + "epoch": 0.849967804249839, + "grad_norm": 0.12199361622333527, + "learning_rate": 4.92428444149038e-05, + "loss": 0.4334880352020264, + "memory(GiB)": 72.72, + "step": 1980, + "token_acc": 0.8431945161599516, + "train_speed(iter/s)": 0.090689 + }, + { + "epoch": 0.8585533376261, + "grad_norm": 0.14421170949935913, + "learning_rate": 4.922147805661402e-05, + "loss": 0.43396615982055664, + "memory(GiB)": 72.72, + "step": 2000, + "token_acc": 0.8505196095201227, + "train_speed(iter/s)": 0.0907 + }, + { + "epoch": 0.8585533376261, + "eval_loss": 0.470032662153244, + "eval_runtime": 68.4365, + "eval_samples_per_second": 55.0, + "eval_steps_per_second": 0.701, + "eval_token_acc": 0.8330788522432155, + "step": 2000 + }, + { + "epoch": 0.867138871002361, + "grad_norm": 0.14598308503627777, + "learning_rate": 4.91998191732564e-05, + "loss": 0.4354074001312256, + "memory(GiB)": 72.72, + "step": 2020, + "token_acc": 0.8444459301633199, + "train_speed(iter/s)": 0.090271 + }, + { + "epoch": 0.875724404378622, + "grad_norm": 0.14193296432495117, + "learning_rate": 4.917786802640732e-05, + "loss": 0.4282365322113037, + "memory(GiB)": 72.72, + "step": 2040, + "token_acc": 0.851482400022546, + "train_speed(iter/s)": 0.090242 + }, + { + "epoch": 0.884309937754883, + "grad_norm": 0.1344188153743744, + "learning_rate": 4.9155624881172834e-05, + "loss": 0.4284001350402832, + "memory(GiB)": 72.72, + "step": 2060, + "token_acc": 0.8423048427291708, + "train_speed(iter/s)": 0.090228 + }, + { + "epoch": 0.892895471131144, + "grad_norm": 0.19214758276939392, + "learning_rate": 4.91330900061855e-05, + "loss": 0.4374197483062744, + "memory(GiB)": 72.72, + "step": 2080, + "token_acc": 0.838007610676071, + "train_speed(iter/s)": 0.09022 + }, + { + "epoch": 0.901481004507405, + "grad_norm": 0.14042872190475464, + "learning_rate": 4.911026367360114e-05, + "loss": 0.4368441104888916, + "memory(GiB)": 
72.72, + "step": 2100, + "token_acc": 0.8546391628505924, + "train_speed(iter/s)": 0.09023 + }, + { + "epoch": 0.910066537883666, + "grad_norm": 0.12134739011526108, + "learning_rate": 4.90871461590955e-05, + "loss": 0.4329835414886475, + "memory(GiB)": 72.72, + "step": 2120, + "token_acc": 0.8415444091274719, + "train_speed(iter/s)": 0.090231 + }, + { + "epoch": 0.918652071259927, + "grad_norm": 0.13989004492759705, + "learning_rate": 4.906373774186097e-05, + "loss": 0.4377878665924072, + "memory(GiB)": 72.72, + "step": 2140, + "token_acc": 0.848478083434529, + "train_speed(iter/s)": 0.090235 + }, + { + "epoch": 0.927237604636188, + "grad_norm": 0.13958944380283356, + "learning_rate": 4.904003870460323e-05, + "loss": 0.4368983268737793, + "memory(GiB)": 72.72, + "step": 2160, + "token_acc": 0.8715589150065507, + "train_speed(iter/s)": 0.090238 + }, + { + "epoch": 0.935823138012449, + "grad_norm": 0.12047629058361053, + "learning_rate": 4.901604933353776e-05, + "loss": 0.432587194442749, + "memory(GiB)": 72.72, + "step": 2180, + "token_acc": 0.8463879291216281, + "train_speed(iter/s)": 0.090247 + }, + { + "epoch": 0.94440867138871, + "grad_norm": 0.19937904179096222, + "learning_rate": 4.899176991838646e-05, + "loss": 0.42923874855041505, + "memory(GiB)": 72.72, + "step": 2200, + "token_acc": 0.8560462814584306, + "train_speed(iter/s)": 0.090255 + }, + { + "epoch": 0.952994204764971, + "grad_norm": 0.13658791780471802, + "learning_rate": 4.896720075237411e-05, + "loss": 0.43826861381530763, + "memory(GiB)": 72.72, + "step": 2220, + "token_acc": 0.8582506049536265, + "train_speed(iter/s)": 0.090257 + }, + { + "epoch": 0.961579738141232, + "grad_norm": 0.1443174183368683, + "learning_rate": 4.894234213222484e-05, + "loss": 0.4363288879394531, + "memory(GiB)": 72.72, + "step": 2240, + "token_acc": 0.8583811494758153, + "train_speed(iter/s)": 0.090269 + }, + { + "epoch": 0.970165271517493, + "grad_norm": 0.1416754275560379, + "learning_rate": 4.8917194358158534e-05, + 
"loss": 0.43085694313049316, + "memory(GiB)": 72.72, + "step": 2260, + "token_acc": 0.8524799246312664, + "train_speed(iter/s)": 0.090285 + }, + { + "epoch": 0.978750804893754, + "grad_norm": 0.15419602394104004, + "learning_rate": 4.889175773388722e-05, + "loss": 0.42989211082458495, + "memory(GiB)": 72.72, + "step": 2280, + "token_acc": 0.8570728938425664, + "train_speed(iter/s)": 0.090292 + }, + { + "epoch": 0.987336338270015, + "grad_norm": 0.15600045025348663, + "learning_rate": 4.886603256661142e-05, + "loss": 0.43334760665893557, + "memory(GiB)": 72.72, + "step": 2300, + "token_acc": 0.844059695609059, + "train_speed(iter/s)": 0.090301 + }, + { + "epoch": 0.995921871646276, + "grad_norm": 0.1368878036737442, + "learning_rate": 4.884001916701639e-05, + "loss": 0.4333777904510498, + "memory(GiB)": 72.72, + "step": 2320, + "token_acc": 0.841434785356969, + "train_speed(iter/s)": 0.090297 + }, + { + "epoch": 1.0042927666881305, + "grad_norm": 0.17282716929912567, + "learning_rate": 4.881371784926839e-05, + "loss": 0.42626185417175294, + "memory(GiB)": 72.72, + "step": 2340, + "token_acc": 0.8547437072110268, + "train_speed(iter/s)": 0.090271 + }, + { + "epoch": 1.0128783000643915, + "grad_norm": 0.21046976745128632, + "learning_rate": 4.878712893101092e-05, + "loss": 0.40583181381225586, + "memory(GiB)": 72.72, + "step": 2360, + "token_acc": 0.8494737944090475, + "train_speed(iter/s)": 0.09027 + }, + { + "epoch": 1.0214638334406525, + "grad_norm": 0.1504330039024353, + "learning_rate": 4.8760252733360845e-05, + "loss": 0.40615053176879884, + "memory(GiB)": 72.72, + "step": 2380, + "token_acc": 0.861498977359772, + "train_speed(iter/s)": 0.090271 + }, + { + "epoch": 1.0300493668169135, + "grad_norm": 0.13325518369674683, + "learning_rate": 4.8733089580904525e-05, + "loss": 0.4108716011047363, + "memory(GiB)": 72.72, + "step": 2400, + "token_acc": 0.8607458709259072, + "train_speed(iter/s)": 0.090273 + }, + { + "epoch": 1.0386349001931745, + "grad_norm": 
0.14907221496105194, + "learning_rate": 4.870563980169391e-05, + "loss": 0.4110468864440918, + "memory(GiB)": 72.72, + "step": 2420, + "token_acc": 0.8597078066556821, + "train_speed(iter/s)": 0.090268 + }, + { + "epoch": 1.0472204335694355, + "grad_norm": 0.13383924961090088, + "learning_rate": 4.867790372724257e-05, + "loss": 0.4098019599914551, + "memory(GiB)": 72.72, + "step": 2440, + "token_acc": 0.8552879722635879, + "train_speed(iter/s)": 0.090259 + }, + { + "epoch": 1.0558059669456965, + "grad_norm": 0.1269863396883011, + "learning_rate": 4.864988169252168e-05, + "loss": 0.40692687034606934, + "memory(GiB)": 72.72, + "step": 2460, + "token_acc": 0.8569142548291154, + "train_speed(iter/s)": 0.090254 + }, + { + "epoch": 1.0643915003219575, + "grad_norm": 0.1471211463212967, + "learning_rate": 4.862157403595598e-05, + "loss": 0.4115363597869873, + "memory(GiB)": 72.72, + "step": 2480, + "token_acc": 0.8509032023648785, + "train_speed(iter/s)": 0.090253 + }, + { + "epoch": 1.0729770336982185, + "grad_norm": 0.1170874610543251, + "learning_rate": 4.859298109941971e-05, + "loss": 0.40721793174743653, + "memory(GiB)": 72.72, + "step": 2500, + "token_acc": 0.8535656636728612, + "train_speed(iter/s)": 0.09024 + }, + { + "epoch": 1.0815625670744795, + "grad_norm": 0.15042538940906525, + "learning_rate": 4.8564103228232445e-05, + "loss": 0.4073436737060547, + "memory(GiB)": 72.72, + "step": 2520, + "token_acc": 0.8541391331235382, + "train_speed(iter/s)": 0.090233 + }, + { + "epoch": 1.0901481004507405, + "grad_norm": 0.13396978378295898, + "learning_rate": 4.8534940771154954e-05, + "loss": 0.40180039405822754, + "memory(GiB)": 72.72, + "step": 2540, + "token_acc": 0.8529722329553782, + "train_speed(iter/s)": 0.09023 + }, + { + "epoch": 1.0987336338270015, + "grad_norm": 0.1457211673259735, + "learning_rate": 4.850549408038498e-05, + "loss": 0.4088040828704834, + "memory(GiB)": 72.72, + "step": 2560, + "token_acc": 0.8557055478261985, + "train_speed(iter/s)": 0.090233 
+ }, + { + "epoch": 1.1073191672032625, + "grad_norm": 0.1382468044757843, + "learning_rate": 4.8475763511552965e-05, + "loss": 0.4087985515594482, + "memory(GiB)": 72.72, + "step": 2580, + "token_acc": 0.8476997133289814, + "train_speed(iter/s)": 0.090235 + }, + { + "epoch": 1.1159047005795235, + "grad_norm": 0.13849055767059326, + "learning_rate": 4.844574942371779e-05, + "loss": 0.4051491737365723, + "memory(GiB)": 72.72, + "step": 2600, + "token_acc": 0.8530841075229781, + "train_speed(iter/s)": 0.09023 + }, + { + "epoch": 1.1244902339557845, + "grad_norm": 0.10844399780035019, + "learning_rate": 4.841545217936241e-05, + "loss": 0.40656099319458006, + "memory(GiB)": 72.72, + "step": 2620, + "token_acc": 0.8659740741451311, + "train_speed(iter/s)": 0.090231 + }, + { + "epoch": 1.1330757673320455, + "grad_norm": 0.14399060606956482, + "learning_rate": 4.838487214438951e-05, + "loss": 0.40219764709472655, + "memory(GiB)": 72.72, + "step": 2640, + "token_acc": 0.8656865378871902, + "train_speed(iter/s)": 0.090232 + }, + { + "epoch": 1.1416613007083065, + "grad_norm": 0.15220606327056885, + "learning_rate": 4.8354009688117026e-05, + "loss": 0.409071159362793, + "memory(GiB)": 72.72, + "step": 2660, + "token_acc": 0.8480521276805948, + "train_speed(iter/s)": 0.090228 + }, + { + "epoch": 1.1502468340845675, + "grad_norm": 0.13173505663871765, + "learning_rate": 4.832286518327376e-05, + "loss": 0.40669097900390627, + "memory(GiB)": 72.72, + "step": 2680, + "token_acc": 0.8510178845290564, + "train_speed(iter/s)": 0.090198 + }, + { + "epoch": 1.1588323674608285, + "grad_norm": 0.13876375555992126, + "learning_rate": 4.829143900599481e-05, + "loss": 0.40750818252563475, + "memory(GiB)": 72.72, + "step": 2700, + "token_acc": 0.8563185312128616, + "train_speed(iter/s)": 0.090196 + }, + { + "epoch": 1.1674179008370895, + "grad_norm": 0.1286059468984604, + "learning_rate": 4.825973153581709e-05, + "loss": 0.4104398250579834, + "memory(GiB)": 72.72, + "step": 2720, + 
"token_acc": 0.8429334625658422, + "train_speed(iter/s)": 0.090194 + }, + { + "epoch": 1.1760034342133505, + "grad_norm": 0.11903152614831924, + "learning_rate": 4.8227743155674684e-05, + "loss": 0.405780553817749, + "memory(GiB)": 72.72, + "step": 2740, + "token_acc": 0.8503315207488469, + "train_speed(iter/s)": 0.090196 + }, + { + "epoch": 1.1845889675896115, + "grad_norm": 0.13296058773994446, + "learning_rate": 4.819547425189429e-05, + "loss": 0.406817626953125, + "memory(GiB)": 72.72, + "step": 2760, + "token_acc": 0.8561766559029692, + "train_speed(iter/s)": 0.090196 + }, + { + "epoch": 1.1931745009658725, + "grad_norm": 0.1934213489294052, + "learning_rate": 4.816292521419046e-05, + "loss": 0.40883073806762693, + "memory(GiB)": 72.72, + "step": 2780, + "token_acc": 0.844781303243432, + "train_speed(iter/s)": 0.090191 + }, + { + "epoch": 1.2017600343421335, + "grad_norm": 0.14654423296451569, + "learning_rate": 4.813009643566101e-05, + "loss": 0.40619373321533203, + "memory(GiB)": 72.72, + "step": 2800, + "token_acc": 0.8772289089291062, + "train_speed(iter/s)": 0.090194 + }, + { + "epoch": 1.2103455677183945, + "grad_norm": 0.15193308889865875, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.41390376091003417, + "memory(GiB)": 72.72, + "step": 2820, + "token_acc": 0.8615587932421312, + "train_speed(iter/s)": 0.089981 + }, + { + "epoch": 1.2189311010946555, + "grad_norm": 0.31674590706825256, + "learning_rate": 4.8063601245403864e-05, + "loss": 0.40833268165588377, + "memory(GiB)": 72.72, + "step": 2840, + "token_acc": 0.8733467856737243, + "train_speed(iter/s)": 0.089985 + }, + { + "epoch": 1.2275166344709165, + "grad_norm": 0.14927241206169128, + "learning_rate": 4.802993563674483e-05, + "loss": 0.4076714038848877, + "memory(GiB)": 72.72, + "step": 2860, + "token_acc": 0.854028153160118, + "train_speed(iter/s)": 0.089983 + }, + { + "epoch": 1.2361021678471775, + "grad_norm": 0.12387314438819885, + "learning_rate": 4.7995991893387796e-05, + "loss": 
0.4103559970855713, + "memory(GiB)": 72.72, + "step": 2880, + "token_acc": 0.8473229063574101, + "train_speed(iter/s)": 0.089987 + }, + { + "epoch": 1.2446877012234385, + "grad_norm": 0.12055594474077225, + "learning_rate": 4.7961770425274545e-05, + "loss": 0.4068136215209961, + "memory(GiB)": 72.72, + "step": 2900, + "token_acc": 0.8558365116304547, + "train_speed(iter/s)": 0.089986 + }, + { + "epoch": 1.2532732345996995, + "grad_norm": 0.15471091866493225, + "learning_rate": 4.7927271645700966e-05, + "loss": 0.40784463882446287, + "memory(GiB)": 72.72, + "step": 2920, + "token_acc": 0.8654847024471946, + "train_speed(iter/s)": 0.089986 + }, + { + "epoch": 1.2618587679759605, + "grad_norm": 0.14402052760124207, + "learning_rate": 4.789249597131205e-05, + "loss": 0.416036319732666, + "memory(GiB)": 72.72, + "step": 2940, + "token_acc": 0.8460941475007567, + "train_speed(iter/s)": 0.089991 + }, + { + "epoch": 1.2704443013522215, + "grad_norm": 0.12818260490894318, + "learning_rate": 4.7857443822096905e-05, + "loss": 0.4087369441986084, + "memory(GiB)": 72.72, + "step": 2960, + "token_acc": 0.8485137361496985, + "train_speed(iter/s)": 0.089995 + }, + { + "epoch": 1.2790298347284825, + "grad_norm": 0.1295406073331833, + "learning_rate": 4.7822115621383626e-05, + "loss": 0.406325101852417, + "memory(GiB)": 72.72, + "step": 2980, + "token_acc": 0.844768784514136, + "train_speed(iter/s)": 0.089994 + }, + { + "epoch": 1.2876153681047435, + "grad_norm": 0.12578845024108887, + "learning_rate": 4.77865117958342e-05, + "loss": 0.4075514316558838, + "memory(GiB)": 72.72, + "step": 3000, + "token_acc": 0.8509595377960147, + "train_speed(iter/s)": 0.089997 + }, + { + "epoch": 1.2876153681047435, + "eval_loss": 0.458536833524704, + "eval_runtime": 73.3656, + "eval_samples_per_second": 51.305, + "eval_steps_per_second": 0.654, + "eval_token_acc": 0.836097728930092, + "step": 3000 + }, + { + "epoch": 1.2962009014810045, + "grad_norm": 0.14806412160396576, + "learning_rate": 
4.7750632775439396e-05, + "loss": 0.4144165515899658, + "memory(GiB)": 72.72, + "step": 3020, + "token_acc": 0.8443864646089783, + "train_speed(iter/s)": 0.089698 + }, + { + "epoch": 1.3047864348572655, + "grad_norm": 0.12434946000576019, + "learning_rate": 4.771447899351351e-05, + "loss": 0.4105505466461182, + "memory(GiB)": 72.72, + "step": 3040, + "token_acc": 0.8428811902693311, + "train_speed(iter/s)": 0.089674 + }, + { + "epoch": 1.3133719682335265, + "grad_norm": 0.13531598448753357, + "learning_rate": 4.767805088668916e-05, + "loss": 0.40719943046569823, + "memory(GiB)": 72.72, + "step": 3060, + "token_acc": 0.8596681679947646, + "train_speed(iter/s)": 0.089654 + }, + { + "epoch": 1.3219575016097875, + "grad_norm": 0.1139962449669838, + "learning_rate": 4.764134889491203e-05, + "loss": 0.41121878623962405, + "memory(GiB)": 72.72, + "step": 3080, + "token_acc": 0.8527798587518215, + "train_speed(iter/s)": 0.089646 + }, + { + "epoch": 1.3305430349860485, + "grad_norm": 0.12814205884933472, + "learning_rate": 4.760437346143551e-05, + "loss": 0.409865140914917, + "memory(GiB)": 72.72, + "step": 3100, + "token_acc": 0.8476134603221164, + "train_speed(iter/s)": 0.089635 + }, + { + "epoch": 1.3391285683623095, + "grad_norm": 0.12462539970874786, + "learning_rate": 4.7567125032815394e-05, + "loss": 0.4104144096374512, + "memory(GiB)": 72.72, + "step": 3120, + "token_acc": 0.8563240702901512, + "train_speed(iter/s)": 0.089626 + }, + { + "epoch": 1.3477141017385705, + "grad_norm": 0.14763693511486053, + "learning_rate": 4.752960405890446e-05, + "loss": 0.4084192752838135, + "memory(GiB)": 72.72, + "step": 3140, + "token_acc": 0.8617203660494134, + "train_speed(iter/s)": 0.08962 + }, + { + "epoch": 1.3562996351148315, + "grad_norm": 0.1228506788611412, + "learning_rate": 4.749181099284703e-05, + "loss": 0.4092958927154541, + "memory(GiB)": 72.72, + "step": 3160, + "token_acc": 0.8593913560568706, + "train_speed(iter/s)": 0.089613 + }, + { + "epoch": 
1.3648851684910925, + "grad_norm": 0.13346746563911438, + "learning_rate": 4.745374629107352e-05, + "loss": 0.4028874397277832, + "memory(GiB)": 72.72, + "step": 3180, + "token_acc": 0.8684066693278472, + "train_speed(iter/s)": 0.08961 + }, + { + "epoch": 1.3734707018673535, + "grad_norm": 0.11734752357006073, + "learning_rate": 4.7415410413294914e-05, + "loss": 0.40769195556640625, + "memory(GiB)": 72.72, + "step": 3200, + "token_acc": 0.8596272472768909, + "train_speed(iter/s)": 0.089609 + }, + { + "epoch": 1.3820562352436145, + "grad_norm": 0.12647828459739685, + "learning_rate": 4.737680382249721e-05, + "loss": 0.40363130569458006, + "memory(GiB)": 72.72, + "step": 3220, + "token_acc": 0.8496863902084465, + "train_speed(iter/s)": 0.089608 + }, + { + "epoch": 1.3906417686198755, + "grad_norm": 0.1108260527253151, + "learning_rate": 4.733792698493584e-05, + "loss": 0.40738682746887206, + "memory(GiB)": 72.72, + "step": 3240, + "token_acc": 0.8423361384211572, + "train_speed(iter/s)": 0.089609 + }, + { + "epoch": 1.3992273019961365, + "grad_norm": 0.12982375919818878, + "learning_rate": 4.7298780370130014e-05, + "loss": 0.4081905364990234, + "memory(GiB)": 72.72, + "step": 3260, + "token_acc": 0.856714728114282, + "train_speed(iter/s)": 0.089608 + }, + { + "epoch": 1.4078128353723975, + "grad_norm": 0.11955548077821732, + "learning_rate": 4.7259364450857096e-05, + "loss": 0.405292272567749, + "memory(GiB)": 72.72, + "step": 3280, + "token_acc": 0.8639308100087719, + "train_speed(iter/s)": 0.08961 + }, + { + "epoch": 1.4163983687486585, + "grad_norm": 0.13745689392089844, + "learning_rate": 4.721967970314684e-05, + "loss": 0.40678954124450684, + "memory(GiB)": 72.72, + "step": 3300, + "token_acc": 0.8452769593980903, + "train_speed(iter/s)": 0.089609 + }, + { + "epoch": 1.4249839021249195, + "grad_norm": 0.12158916145563126, + "learning_rate": 4.717972660627567e-05, + "loss": 0.40230860710144045, + "memory(GiB)": 72.72, + "step": 3320, + "token_acc": 
0.8659293308755474, + "train_speed(iter/s)": 0.089614 + }, + { + "epoch": 1.4335694355011805, + "grad_norm": 0.14111177623271942, + "learning_rate": 4.713950564276091e-05, + "loss": 0.4016873359680176, + "memory(GiB)": 72.72, + "step": 3340, + "token_acc": 0.8510812474231116, + "train_speed(iter/s)": 0.089592 + }, + { + "epoch": 1.4421549688774415, + "grad_norm": 0.10712361335754395, + "learning_rate": 4.70990172983549e-05, + "loss": 0.4058821201324463, + "memory(GiB)": 72.72, + "step": 3360, + "token_acc": 0.8550924401373665, + "train_speed(iter/s)": 0.089592 + }, + { + "epoch": 1.4507405022537025, + "grad_norm": 0.11166644841432571, + "learning_rate": 4.705826206203918e-05, + "loss": 0.4066760540008545, + "memory(GiB)": 72.72, + "step": 3380, + "token_acc": 0.8444937034366048, + "train_speed(iter/s)": 0.089586 + }, + { + "epoch": 1.4593260356299635, + "grad_norm": 0.14026156067848206, + "learning_rate": 4.701724042601859e-05, + "loss": 0.40719261169433596, + "memory(GiB)": 72.72, + "step": 3400, + "token_acc": 0.8498371056241426, + "train_speed(iter/s)": 0.08959 + }, + { + "epoch": 1.4679115690062245, + "grad_norm": 0.13125832378864288, + "learning_rate": 4.697595288571528e-05, + "loss": 0.4064974308013916, + "memory(GiB)": 72.72, + "step": 3420, + "token_acc": 0.8575960472975773, + "train_speed(iter/s)": 0.089593 + }, + { + "epoch": 1.4764971023824855, + "grad_norm": 0.12359972298145294, + "learning_rate": 4.6934399939762746e-05, + "loss": 0.4019315242767334, + "memory(GiB)": 72.72, + "step": 3440, + "token_acc": 0.8573588526594907, + "train_speed(iter/s)": 0.089592 + }, + { + "epoch": 1.4850826357587465, + "grad_norm": 0.15697510540485382, + "learning_rate": 4.689258208999983e-05, + "loss": 0.4078845500946045, + "memory(GiB)": 72.72, + "step": 3460, + "token_acc": 0.8560958939786878, + "train_speed(iter/s)": 0.089591 + }, + { + "epoch": 1.4936681691350076, + "grad_norm": 0.11863242089748383, + "learning_rate": 4.685049984146463e-05, + "loss": 
0.4097602844238281, + "memory(GiB)": 72.72, + "step": 3480, + "token_acc": 0.8628702144893777, + "train_speed(iter/s)": 0.08959 + }, + { + "epoch": 1.5022537025112686, + "grad_norm": 0.11114250868558884, + "learning_rate": 4.680815370238843e-05, + "loss": 0.40899147987365725, + "memory(GiB)": 72.72, + "step": 3500, + "token_acc": 0.8451921045701701, + "train_speed(iter/s)": 0.089584 + }, + { + "epoch": 1.5108392358875296, + "grad_norm": 0.1112656220793724, + "learning_rate": 4.676554418418953e-05, + "loss": 0.40816683769226075, + "memory(GiB)": 72.72, + "step": 3520, + "token_acc": 0.8431806288233773, + "train_speed(iter/s)": 0.089584 + }, + { + "epoch": 1.5194247692637906, + "grad_norm": 0.11323296278715134, + "learning_rate": 4.6722671801467074e-05, + "loss": 0.4055006980895996, + "memory(GiB)": 72.72, + "step": 3540, + "token_acc": 0.8815225166268434, + "train_speed(iter/s)": 0.089589 + }, + { + "epoch": 1.5280103026400516, + "grad_norm": 0.12150542438030243, + "learning_rate": 4.6679537071994874e-05, + "loss": 0.4004813194274902, + "memory(GiB)": 72.72, + "step": 3560, + "token_acc": 0.8570034017657414, + "train_speed(iter/s)": 0.089589 + }, + { + "epoch": 1.5365958360163126, + "grad_norm": 0.12244880199432373, + "learning_rate": 4.6636140516715104e-05, + "loss": 0.4029510021209717, + "memory(GiB)": 72.72, + "step": 3580, + "token_acc": 0.8517572914459227, + "train_speed(iter/s)": 0.089593 + }, + { + "epoch": 1.5451813693925736, + "grad_norm": 0.1206183210015297, + "learning_rate": 4.659248265973205e-05, + "loss": 0.40460500717163084, + "memory(GiB)": 72.72, + "step": 3600, + "token_acc": 0.8554049462946347, + "train_speed(iter/s)": 0.089596 + }, + { + "epoch": 1.5537669027688346, + "grad_norm": 0.1283605545759201, + "learning_rate": 4.6548564028305746e-05, + "loss": 0.40555410385131835, + "memory(GiB)": 72.72, + "step": 3620, + "token_acc": 0.8552409152003629, + "train_speed(iter/s)": 0.0896 + }, + { + "epoch": 1.5623524361450956, + "grad_norm": 
0.10448771715164185, + "learning_rate": 4.650438515284564e-05, + "loss": 0.4010280132293701, + "memory(GiB)": 72.72, + "step": 3640, + "token_acc": 0.8516997869926084, + "train_speed(iter/s)": 0.089603 + }, + { + "epoch": 1.5709379695213566, + "grad_norm": 0.14749032258987427, + "learning_rate": 4.645994656690417e-05, + "loss": 0.4050903797149658, + "memory(GiB)": 72.72, + "step": 3660, + "token_acc": 0.8502366458426844, + "train_speed(iter/s)": 0.0896 + }, + { + "epoch": 1.5795235028976176, + "grad_norm": 0.1269512176513672, + "learning_rate": 4.6415248807170296e-05, + "loss": 0.4045454502105713, + "memory(GiB)": 72.72, + "step": 3680, + "token_acc": 0.8799187339606501, + "train_speed(iter/s)": 0.089583 + }, + { + "epoch": 1.5881090362738786, + "grad_norm": 0.11708427965641022, + "learning_rate": 4.637029241346309e-05, + "loss": 0.4028982162475586, + "memory(GiB)": 72.72, + "step": 3700, + "token_acc": 0.8584745030316171, + "train_speed(iter/s)": 0.089582 + }, + { + "epoch": 1.5966945696501396, + "grad_norm": 0.12971659004688263, + "learning_rate": 4.632507792872513e-05, + "loss": 0.4027679920196533, + "memory(GiB)": 72.72, + "step": 3720, + "token_acc": 0.8444115651659281, + "train_speed(iter/s)": 0.089587 + }, + { + "epoch": 1.6052801030264006, + "grad_norm": 0.1406456083059311, + "learning_rate": 4.6279605899016007e-05, + "loss": 0.4045069694519043, + "memory(GiB)": 72.72, + "step": 3740, + "token_acc": 0.8620591654047942, + "train_speed(iter/s)": 0.089588 + }, + { + "epoch": 1.6138656364026616, + "grad_norm": 0.12651792168617249, + "learning_rate": 4.6233876873505694e-05, + "loss": 0.3987946271896362, + "memory(GiB)": 72.72, + "step": 3760, + "token_acc": 0.8604080254900858, + "train_speed(iter/s)": 0.089587 + }, + { + "epoch": 1.6224511697789226, + "grad_norm": 0.1294124722480774, + "learning_rate": 4.618789140446793e-05, + "loss": 0.4040426254272461, + "memory(GiB)": 72.72, + "step": 3780, + "token_acc": 0.8575430560407852, + "train_speed(iter/s)": 0.089588 
+ }, + { + "epoch": 1.6310367031551836, + "grad_norm": 0.13899479806423187, + "learning_rate": 4.614165004727356e-05, + "loss": 0.40485129356384275, + "memory(GiB)": 72.72, + "step": 3800, + "token_acc": 0.8618784194621236, + "train_speed(iter/s)": 0.089589 + }, + { + "epoch": 1.6396222365314446, + "grad_norm": 0.11304246634244919, + "learning_rate": 4.609515336038379e-05, + "loss": 0.39697728157043455, + "memory(GiB)": 72.72, + "step": 3820, + "token_acc": 0.8657167944284284, + "train_speed(iter/s)": 0.089593 + }, + { + "epoch": 1.6482077699077056, + "grad_norm": 0.10555765777826309, + "learning_rate": 4.604840190534349e-05, + "loss": 0.4016863346099854, + "memory(GiB)": 72.72, + "step": 3840, + "token_acc": 0.8618964493040964, + "train_speed(iter/s)": 0.089597 + }, + { + "epoch": 1.6567933032839666, + "grad_norm": 0.10668028146028519, + "learning_rate": 4.600139624677436e-05, + "loss": 0.40195555686950685, + "memory(GiB)": 72.72, + "step": 3860, + "token_acc": 0.8585640908572081, + "train_speed(iter/s)": 0.089599 + }, + { + "epoch": 1.6653788366602276, + "grad_norm": 0.11972223222255707, + "learning_rate": 4.5954136952368175e-05, + "loss": 0.404964542388916, + "memory(GiB)": 72.72, + "step": 3880, + "token_acc": 0.8751289644195156, + "train_speed(iter/s)": 0.089603 + }, + { + "epoch": 1.6739643700364886, + "grad_norm": 0.1090841144323349, + "learning_rate": 4.590662459287987e-05, + "loss": 0.4025224208831787, + "memory(GiB)": 72.72, + "step": 3900, + "token_acc": 0.8712011406091705, + "train_speed(iter/s)": 0.089608 + }, + { + "epoch": 1.6825499034127496, + "grad_norm": 0.09250445663928986, + "learning_rate": 4.585885974212068e-05, + "loss": 0.39822845458984374, + "memory(GiB)": 72.72, + "step": 3920, + "token_acc": 0.8478179395649417, + "train_speed(iter/s)": 0.089608 + }, + { + "epoch": 1.6911354367890106, + "grad_norm": 0.12228672951459885, + "learning_rate": 4.58108429769512e-05, + "loss": 0.4002052307128906, + "memory(GiB)": 72.72, + "step": 3940, + 
"token_acc": 0.8550839992606666, + "train_speed(iter/s)": 0.089611 + }, + { + "epoch": 1.6997209701652716, + "grad_norm": 0.11608360707759857, + "learning_rate": 4.576257487727442e-05, + "loss": 0.40276689529418946, + "memory(GiB)": 72.72, + "step": 3960, + "token_acc": 0.8589090178774137, + "train_speed(iter/s)": 0.089614 + }, + { + "epoch": 1.7083065035415326, + "grad_norm": 0.10027152299880981, + "learning_rate": 4.571405602602871e-05, + "loss": 0.39651687145233155, + "memory(GiB)": 72.72, + "step": 3980, + "token_acc": 0.8630956830570248, + "train_speed(iter/s)": 0.089614 + }, + { + "epoch": 1.7168920369177934, + "grad_norm": 0.13469679653644562, + "learning_rate": 4.5665287009180796e-05, + "loss": 0.404406213760376, + "memory(GiB)": 72.72, + "step": 4000, + "token_acc": 0.8562729568578561, + "train_speed(iter/s)": 0.089618 + }, + { + "epoch": 1.7168920369177934, + "eval_loss": 0.45004475116729736, + "eval_runtime": 69.5068, + "eval_samples_per_second": 54.153, + "eval_steps_per_second": 0.691, + "eval_token_acc": 0.838482459020931, + "step": 4000 + }, + { + "epoch": 1.7254775702940544, + "grad_norm": 0.11884400248527527, + "learning_rate": 4.5616268415718686e-05, + "loss": 0.4039021968841553, + "memory(GiB)": 72.72, + "step": 4020, + "token_acc": 0.8519779575146431, + "train_speed(iter/s)": 0.089391 + }, + { + "epoch": 1.7340631036703154, + "grad_norm": 0.11766815185546875, + "learning_rate": 4.5567000837644555e-05, + "loss": 0.40551328659057617, + "memory(GiB)": 72.72, + "step": 4040, + "token_acc": 0.8603655792648116, + "train_speed(iter/s)": 0.089374 + }, + { + "epoch": 1.7426486370465764, + "grad_norm": 0.1035754606127739, + "learning_rate": 4.551748486996755e-05, + "loss": 0.3972191333770752, + "memory(GiB)": 72.72, + "step": 4060, + "token_acc": 0.8441598716065328, + "train_speed(iter/s)": 0.08936 + }, + { + "epoch": 1.7512341704228374, + "grad_norm": 0.11534030735492706, + "learning_rate": 4.5467721110696685e-05, + "loss": 0.39623782634735105, + 
"memory(GiB)": 72.72, + "step": 4080, + "token_acc": 0.8508078067985404, + "train_speed(iter/s)": 0.089346 + }, + { + "epoch": 1.7598197037990984, + "grad_norm": 0.11770807206630707, + "learning_rate": 4.541771016083356e-05, + "loss": 0.4031228542327881, + "memory(GiB)": 72.72, + "step": 4100, + "token_acc": 0.8575402257628572, + "train_speed(iter/s)": 0.089337 + }, + { + "epoch": 1.7684052371753594, + "grad_norm": 0.11031018942594528, + "learning_rate": 4.5367452624365107e-05, + "loss": 0.39590916633605955, + "memory(GiB)": 72.72, + "step": 4120, + "token_acc": 0.8493938383274198, + "train_speed(iter/s)": 0.089333 + }, + { + "epoch": 1.7769907705516204, + "grad_norm": 0.12101167440414429, + "learning_rate": 4.531694910825632e-05, + "loss": 0.4022487163543701, + "memory(GiB)": 72.72, + "step": 4140, + "token_acc": 0.8616033848286162, + "train_speed(iter/s)": 0.08933 + }, + { + "epoch": 1.7855763039278814, + "grad_norm": 0.12361987680196762, + "learning_rate": 4.526620022244293e-05, + "loss": 0.3952162265777588, + "memory(GiB)": 72.72, + "step": 4160, + "token_acc": 0.8546911728976807, + "train_speed(iter/s)": 0.089329 + }, + { + "epoch": 1.7941618373041424, + "grad_norm": 0.11886027455329895, + "learning_rate": 4.521520657982399e-05, + "loss": 0.3967653751373291, + "memory(GiB)": 72.72, + "step": 4180, + "token_acc": 0.850109229842917, + "train_speed(iter/s)": 0.089327 + }, + { + "epoch": 1.8027473706804034, + "grad_norm": 0.10228098928928375, + "learning_rate": 4.516396879625451e-05, + "loss": 0.3982940435409546, + "memory(GiB)": 72.72, + "step": 4200, + "token_acc": 0.8674571957241461, + "train_speed(iter/s)": 0.089324 + }, + { + "epoch": 1.8113329040566644, + "grad_norm": 0.13192002475261688, + "learning_rate": 4.5112487490538033e-05, + "loss": 0.4016000747680664, + "memory(GiB)": 72.72, + "step": 4220, + "token_acc": 0.8583699143774935, + "train_speed(iter/s)": 0.089324 + }, + { + "epoch": 1.8199184374329254, + "grad_norm": 0.13863115012645721, + 
"learning_rate": 4.5060763284419114e-05, + "loss": 0.3993339538574219, + "memory(GiB)": 72.72, + "step": 4240, + "token_acc": 0.8529649884386873, + "train_speed(iter/s)": 0.089327 + }, + { + "epoch": 1.8285039708091864, + "grad_norm": 0.1052585169672966, + "learning_rate": 4.500879680257587e-05, + "loss": 0.39501266479492186, + "memory(GiB)": 72.72, + "step": 4260, + "token_acc": 0.8490466163025552, + "train_speed(iter/s)": 0.089326 + }, + { + "epoch": 1.8370895041854474, + "grad_norm": 0.11824264377355576, + "learning_rate": 4.495658867261237e-05, + "loss": 0.3999388933181763, + "memory(GiB)": 72.72, + "step": 4280, + "token_acc": 0.8604835011176714, + "train_speed(iter/s)": 0.08933 + }, + { + "epoch": 1.8456750375617084, + "grad_norm": 0.10404901951551437, + "learning_rate": 4.490413952505113e-05, + "loss": 0.399350905418396, + "memory(GiB)": 72.72, + "step": 4300, + "token_acc": 0.8754184479751959, + "train_speed(iter/s)": 0.089333 + }, + { + "epoch": 1.8542605709379694, + "grad_norm": 0.11935856193304062, + "learning_rate": 4.485144999332541e-05, + "loss": 0.3988263845443726, + "memory(GiB)": 72.72, + "step": 4320, + "token_acc": 0.8642416058331645, + "train_speed(iter/s)": 0.089334 + }, + { + "epoch": 1.8628461043142304, + "grad_norm": 0.12025253474712372, + "learning_rate": 4.4798520713771655e-05, + "loss": 0.3969618320465088, + "memory(GiB)": 72.72, + "step": 4340, + "token_acc": 0.8592759073410623, + "train_speed(iter/s)": 0.089324 + }, + { + "epoch": 1.8714316376904914, + "grad_norm": 0.10460798442363739, + "learning_rate": 4.474535232562176e-05, + "loss": 0.4043170928955078, + "memory(GiB)": 72.72, + "step": 4360, + "token_acc": 0.852819602922532, + "train_speed(iter/s)": 0.089327 + }, + { + "epoch": 1.8800171710667524, + "grad_norm": 0.10020267218351364, + "learning_rate": 4.469194547099532e-05, + "loss": 0.3999593734741211, + "memory(GiB)": 72.72, + "step": 4380, + "token_acc": 0.8611075959033526, + "train_speed(iter/s)": 0.089328 + }, + { + "epoch": 
1.8886027044430134, + "grad_norm": 0.12959228456020355, + "learning_rate": 4.463830079489196e-05, + "loss": 0.39733612537384033, + "memory(GiB)": 72.72, + "step": 4400, + "token_acc": 0.8531978711946401, + "train_speed(iter/s)": 0.089335 + }, + { + "epoch": 1.8971882378192744, + "grad_norm": 0.11115922778844833, + "learning_rate": 4.458441894518348e-05, + "loss": 0.4049359321594238, + "memory(GiB)": 72.72, + "step": 4420, + "token_acc": 0.8702030459301568, + "train_speed(iter/s)": 0.089338 + }, + { + "epoch": 1.9057737711955354, + "grad_norm": 0.10734923928976059, + "learning_rate": 4.453030057260604e-05, + "loss": 0.40124940872192383, + "memory(GiB)": 72.72, + "step": 4440, + "token_acc": 0.8526137694097369, + "train_speed(iter/s)": 0.089343 + }, + { + "epoch": 1.9143593045717964, + "grad_norm": 0.10538238286972046, + "learning_rate": 4.44759463307523e-05, + "loss": 0.3986711263656616, + "memory(GiB)": 72.72, + "step": 4460, + "token_acc": 0.8580899206582427, + "train_speed(iter/s)": 0.089347 + }, + { + "epoch": 1.9229448379480574, + "grad_norm": 0.11792416125535965, + "learning_rate": 4.4421356876063566e-05, + "loss": 0.4009650707244873, + "memory(GiB)": 72.72, + "step": 4480, + "token_acc": 0.8415756258347672, + "train_speed(iter/s)": 0.089351 + }, + { + "epoch": 1.9315303713243184, + "grad_norm": 0.10540692508220673, + "learning_rate": 4.4366532867821816e-05, + "loss": 0.40032110214233396, + "memory(GiB)": 72.72, + "step": 4500, + "token_acc": 0.8645283673549553, + "train_speed(iter/s)": 0.089356 + }, + { + "epoch": 1.9401159047005794, + "grad_norm": 0.10806146264076233, + "learning_rate": 4.4311474968141745e-05, + "loss": 0.4047665596008301, + "memory(GiB)": 72.72, + "step": 4520, + "token_acc": 0.8665738751278136, + "train_speed(iter/s)": 0.089358 + }, + { + "epoch": 1.9487014380768404, + "grad_norm": 0.0982556939125061, + "learning_rate": 4.4256183841962776e-05, + "loss": 0.39951965808868406, + "memory(GiB)": 72.72, + "step": 4540, + "token_acc": 
0.8557438649716252, + "train_speed(iter/s)": 0.08936 + }, + { + "epoch": 1.9572869714531014, + "grad_norm": 0.11462666094303131, + "learning_rate": 4.420066015704105e-05, + "loss": 0.39820613861083987, + "memory(GiB)": 72.72, + "step": 4560, + "token_acc": 0.851616577376715, + "train_speed(iter/s)": 0.089365 + }, + { + "epoch": 1.9658725048293624, + "grad_norm": 0.12274167686700821, + "learning_rate": 4.414490458394134e-05, + "loss": 0.39962952136993407, + "memory(GiB)": 72.72, + "step": 4580, + "token_acc": 0.8450544293089454, + "train_speed(iter/s)": 0.089369 + }, + { + "epoch": 1.9744580382056234, + "grad_norm": 0.11052652448415756, + "learning_rate": 4.408891779602892e-05, + "loss": 0.40143113136291503, + "memory(GiB)": 72.72, + "step": 4600, + "token_acc": 0.8466183479919549, + "train_speed(iter/s)": 0.089369 + }, + { + "epoch": 1.9830435715818844, + "grad_norm": 0.11736435443162918, + "learning_rate": 4.403270046946151e-05, + "loss": 0.39746062755584716, + "memory(GiB)": 72.72, + "step": 4620, + "token_acc": 0.8545920867275066, + "train_speed(iter/s)": 0.08937 + }, + { + "epoch": 1.9916291049581454, + "grad_norm": 0.09831462055444717, + "learning_rate": 4.397625328318104e-05, + "loss": 0.40285186767578124, + "memory(GiB)": 72.72, + "step": 4640, + "token_acc": 0.8588040292883812, + "train_speed(iter/s)": 0.089375 + }, + { + "epoch": 2.0, + "grad_norm": 0.1868225783109665, + "learning_rate": 4.3919576918905495e-05, + "loss": 0.40441222190856935, + "memory(GiB)": 72.72, + "step": 4660, + "token_acc": 0.8483147592149679, + "train_speed(iter/s)": 0.089388 + }, + { + "epoch": 2.008585533376261, + "grad_norm": 0.1071023941040039, + "learning_rate": 4.3862672061120637e-05, + "loss": 0.3615531921386719, + "memory(GiB)": 72.72, + "step": 4680, + "token_acc": 0.8768156740901892, + "train_speed(iter/s)": 0.089352 + }, + { + "epoch": 2.017171066752522, + "grad_norm": 0.10470844805240631, + "learning_rate": 4.3805539397071806e-05, + "loss": 0.36854674816131594, + 
"memory(GiB)": 72.72, + "step": 4700, + "token_acc": 0.8658165567867299, + "train_speed(iter/s)": 0.089356 + }, + { + "epoch": 2.025756600128783, + "grad_norm": 0.11344057321548462, + "learning_rate": 4.374817961675553e-05, + "loss": 0.36517815589904784, + "memory(GiB)": 72.72, + "step": 4720, + "token_acc": 0.8573345434699361, + "train_speed(iter/s)": 0.08936 + }, + { + "epoch": 2.034342133505044, + "grad_norm": 0.12088248133659363, + "learning_rate": 4.369059341291131e-05, + "loss": 0.3732161045074463, + "memory(GiB)": 72.72, + "step": 4740, + "token_acc": 0.8643586935864834, + "train_speed(iter/s)": 0.089232 + }, + { + "epoch": 2.042927666881305, + "grad_norm": 0.1079382449388504, + "learning_rate": 4.3632781481013105e-05, + "loss": 0.3706186294555664, + "memory(GiB)": 72.72, + "step": 4760, + "token_acc": 0.8583230735096428, + "train_speed(iter/s)": 0.089236 + }, + { + "epoch": 2.051513200257566, + "grad_norm": 0.10861940681934357, + "learning_rate": 4.357474451926107e-05, + "loss": 0.36578049659729006, + "memory(GiB)": 72.72, + "step": 4780, + "token_acc": 0.8634094633238114, + "train_speed(iter/s)": 0.089242 + }, + { + "epoch": 2.060098733633827, + "grad_norm": 0.10976995527744293, + "learning_rate": 4.351648322857304e-05, + "loss": 0.3717454671859741, + "memory(GiB)": 72.72, + "step": 4800, + "token_acc": 0.8754387101732538, + "train_speed(iter/s)": 0.089245 + }, + { + "epoch": 2.068684267010088, + "grad_norm": 0.11381576955318451, + "learning_rate": 4.345799831257612e-05, + "loss": 0.3690098524093628, + "memory(GiB)": 72.72, + "step": 4820, + "token_acc": 0.8754805492942517, + "train_speed(iter/s)": 0.089251 + }, + { + "epoch": 2.077269800386349, + "grad_norm": 0.11746755242347717, + "learning_rate": 4.339929047759812e-05, + "loss": 0.3719310760498047, + "memory(GiB)": 72.72, + "step": 4840, + "token_acc": 0.8569852569604883, + "train_speed(iter/s)": 0.089254 + }, + { + "epoch": 2.08585533376261, + "grad_norm": 0.1201663538813591, + "learning_rate": 
4.334036043265909e-05, + "loss": 0.366811728477478, + "memory(GiB)": 72.72, + "step": 4860, + "token_acc": 0.868435326772985, + "train_speed(iter/s)": 0.089256 + }, + { + "epoch": 2.094440867138871, + "grad_norm": 0.10783121734857559, + "learning_rate": 4.3281208889462715e-05, + "loss": 0.3673741102218628, + "memory(GiB)": 72.72, + "step": 4880, + "token_acc": 0.8597632948845746, + "train_speed(iter/s)": 0.089257 + }, + { + "epoch": 2.103026400515132, + "grad_norm": 0.10495728254318237, + "learning_rate": 4.3221836562387754e-05, + "loss": 0.371392560005188, + "memory(GiB)": 72.72, + "step": 4900, + "token_acc": 0.8613381730879158, + "train_speed(iter/s)": 0.08926 + }, + { + "epoch": 2.111611933891393, + "grad_norm": 0.12534630298614502, + "learning_rate": 4.3162244168479385e-05, + "loss": 0.37217743396759034, + "memory(GiB)": 72.72, + "step": 4920, + "token_acc": 0.8550305751583707, + "train_speed(iter/s)": 0.089265 + }, + { + "epoch": 2.120197467267654, + "grad_norm": 0.11463397741317749, + "learning_rate": 4.310243242744055e-05, + "loss": 0.37210404872894287, + "memory(GiB)": 72.72, + "step": 4940, + "token_acc": 0.8669802804648793, + "train_speed(iter/s)": 0.089267 + }, + { + "epoch": 2.128783000643915, + "grad_norm": 0.0987405851483345, + "learning_rate": 4.304240206162326e-05, + "loss": 0.36531455516815187, + "memory(GiB)": 72.72, + "step": 4960, + "token_acc": 0.8615956192835081, + "train_speed(iter/s)": 0.089271 + }, + { + "epoch": 2.137368534020176, + "grad_norm": 0.10236576199531555, + "learning_rate": 4.2982153796019895e-05, + "loss": 0.3683722734451294, + "memory(GiB)": 72.72, + "step": 4980, + "token_acc": 0.8691021414446882, + "train_speed(iter/s)": 0.089273 + }, + { + "epoch": 2.145954067396437, + "grad_norm": 0.11913823336362839, + "learning_rate": 4.292168835825442e-05, + "loss": 0.36794998645782473, + "memory(GiB)": 72.72, + "step": 5000, + "token_acc": 0.8603812367895441, + "train_speed(iter/s)": 0.089274 + }, + { + "epoch": 2.145954067396437, + 
"eval_loss": 0.44772276282310486, + "eval_runtime": 74.9501, + "eval_samples_per_second": 50.22, + "eval_steps_per_second": 0.64, + "eval_token_acc": 0.8396186479726367, + "step": 5000 + }, + { + "epoch": 2.154539600772698, + "grad_norm": 0.10815497487783432, + "learning_rate": 4.286100647857362e-05, + "loss": 0.3666555881500244, + "memory(GiB)": 72.72, + "step": 5020, + "token_acc": 0.8487304373111233, + "train_speed(iter/s)": 0.089094 + }, + { + "epoch": 2.163125134148959, + "grad_norm": 0.10708407312631607, + "learning_rate": 4.2800108889838244e-05, + "loss": 0.3680349111557007, + "memory(GiB)": 72.72, + "step": 5040, + "token_acc": 0.8607205605794815, + "train_speed(iter/s)": 0.089078 + }, + { + "epoch": 2.17171066752522, + "grad_norm": 0.10280643403530121, + "learning_rate": 4.273899632751422e-05, + "loss": 0.3690458297729492, + "memory(GiB)": 72.72, + "step": 5060, + "token_acc": 0.8681282741623693, + "train_speed(iter/s)": 0.089068 + }, + { + "epoch": 2.180296200901481, + "grad_norm": 0.11067724972963333, + "learning_rate": 4.267766952966369e-05, + "loss": 0.37246291637420653, + "memory(GiB)": 72.72, + "step": 5080, + "token_acc": 0.8648300486787626, + "train_speed(iter/s)": 0.089062 + }, + { + "epoch": 2.188881734277742, + "grad_norm": 0.10517250746488571, + "learning_rate": 4.261612923693617e-05, + "loss": 0.37222487926483155, + "memory(GiB)": 72.72, + "step": 5100, + "token_acc": 0.8561770562371953, + "train_speed(iter/s)": 0.089058 + }, + { + "epoch": 2.197467267654003, + "grad_norm": 0.11643174290657043, + "learning_rate": 4.255437619255955e-05, + "loss": 0.37151226997375486, + "memory(GiB)": 72.72, + "step": 5120, + "token_acc": 0.856546833515401, + "train_speed(iter/s)": 0.089056 + }, + { + "epoch": 2.206052801030264, + "grad_norm": 0.10725266486406326, + "learning_rate": 4.2492411142331164e-05, + "loss": 0.3672873258590698, + "memory(GiB)": 72.72, + "step": 5140, + "token_acc": 0.8657454419748819, + "train_speed(iter/s)": 0.089055 + }, + { + "epoch": 
2.214638334406525, + "grad_norm": 0.10386510193347931, + "learning_rate": 4.243023483460875e-05, + "loss": 0.3682314395904541, + "memory(GiB)": 72.72, + "step": 5160, + "token_acc": 0.8692801593001643, + "train_speed(iter/s)": 0.089056 + }, + { + "epoch": 2.223223867782786, + "grad_norm": 0.11796915531158447, + "learning_rate": 4.236784802030141e-05, + "loss": 0.3701756000518799, + "memory(GiB)": 72.72, + "step": 5180, + "token_acc": 0.8771635645482831, + "train_speed(iter/s)": 0.089057 + }, + { + "epoch": 2.231809401159047, + "grad_norm": 0.10015714913606644, + "learning_rate": 4.230525145286057e-05, + "loss": 0.36999518871307374, + "memory(GiB)": 72.72, + "step": 5200, + "token_acc": 0.8674851697347774, + "train_speed(iter/s)": 0.089057 + }, + { + "epoch": 2.240394934535308, + "grad_norm": 0.1074676439166069, + "learning_rate": 4.224244588827088e-05, + "loss": 0.3750225782394409, + "memory(GiB)": 72.72, + "step": 5220, + "token_acc": 0.8527425346133436, + "train_speed(iter/s)": 0.089057 + }, + { + "epoch": 2.248980467911569, + "grad_norm": 0.10311347991228104, + "learning_rate": 4.2179432085041016e-05, + "loss": 0.3746063232421875, + "memory(GiB)": 72.72, + "step": 5240, + "token_acc": 0.8669185952544043, + "train_speed(iter/s)": 0.089056 + }, + { + "epoch": 2.25756600128783, + "grad_norm": 0.11873036623001099, + "learning_rate": 4.211621080419463e-05, + "loss": 0.37813477516174315, + "memory(GiB)": 72.72, + "step": 5260, + "token_acc": 0.8692531193982356, + "train_speed(iter/s)": 0.089056 + }, + { + "epoch": 2.266151534664091, + "grad_norm": 0.11505374312400818, + "learning_rate": 4.205278280926106e-05, + "loss": 0.37494683265686035, + "memory(GiB)": 72.72, + "step": 5280, + "token_acc": 0.8686222108977568, + "train_speed(iter/s)": 0.089057 + }, + { + "epoch": 2.274737068040352, + "grad_norm": 0.10475321859121323, + "learning_rate": 4.198914886626617e-05, + "loss": 0.37322399616241453, + "memory(GiB)": 72.72, + "step": 5300, + "token_acc": 0.8642545858709445, + 
"train_speed(iter/s)": 0.089058 + }, + { + "epoch": 2.283322601416613, + "grad_norm": 0.10895238816738129, + "learning_rate": 4.192530974372307e-05, + "loss": 0.37212719917297366, + "memory(GiB)": 72.72, + "step": 5320, + "token_acc": 0.8592036985069942, + "train_speed(iter/s)": 0.089059 + }, + { + "epoch": 2.291908134792874, + "grad_norm": 0.13440454006195068, + "learning_rate": 4.186126621262286e-05, + "loss": 0.3748520612716675, + "memory(GiB)": 72.72, + "step": 5340, + "token_acc": 0.8694009430316147, + "train_speed(iter/s)": 0.089059 + }, + { + "epoch": 2.300493668169135, + "grad_norm": 0.10428149253129959, + "learning_rate": 4.1797019046425264e-05, + "loss": 0.3729527711868286, + "memory(GiB)": 72.72, + "step": 5360, + "token_acc": 0.8606326299971436, + "train_speed(iter/s)": 0.089059 + }, + { + "epoch": 2.309079201545396, + "grad_norm": 0.10109774023294449, + "learning_rate": 4.173256902104937e-05, + "loss": 0.3786268949508667, + "memory(GiB)": 72.72, + "step": 5380, + "token_acc": 0.8546159979614149, + "train_speed(iter/s)": 0.089063 + }, + { + "epoch": 2.317664734921657, + "grad_norm": 0.1086476594209671, + "learning_rate": 4.166791691486417e-05, + "loss": 0.37719101905822755, + "memory(GiB)": 72.72, + "step": 5400, + "token_acc": 0.8614693814596865, + "train_speed(iter/s)": 0.089065 + }, + { + "epoch": 2.326250268297918, + "grad_norm": 0.0986161157488823, + "learning_rate": 4.1603063508679254e-05, + "loss": 0.3716520071029663, + "memory(GiB)": 72.72, + "step": 5420, + "token_acc": 0.8700038391325128, + "train_speed(iter/s)": 0.089068 + }, + { + "epoch": 2.334835801674179, + "grad_norm": 0.10710026323795319, + "learning_rate": 4.1538009585735296e-05, + "loss": 0.37460925579071047, + "memory(GiB)": 72.72, + "step": 5440, + "token_acc": 0.864236101862486, + "train_speed(iter/s)": 0.089068 + }, + { + "epoch": 2.34342133505044, + "grad_norm": 0.1084044948220253, + "learning_rate": 4.1472755931694626e-05, + "loss": 0.37008664608001707, + "memory(GiB)": 72.72, + 
"step": 5460, + "token_acc": 0.884960342611309, + "train_speed(iter/s)": 0.08907 + }, + { + "epoch": 2.352006868426701, + "grad_norm": 0.1017412543296814, + "learning_rate": 4.1407303334631784e-05, + "loss": 0.37591137886047366, + "memory(GiB)": 72.72, + "step": 5480, + "token_acc": 0.8690706806478822, + "train_speed(iter/s)": 0.089076 + }, + { + "epoch": 2.360592401802962, + "grad_norm": 0.09642521291971207, + "learning_rate": 4.134165258502392e-05, + "loss": 0.3724454641342163, + "memory(GiB)": 72.72, + "step": 5500, + "token_acc": 0.8665798727743621, + "train_speed(iter/s)": 0.089078 + }, + { + "epoch": 2.369177935179223, + "grad_norm": 0.10195600241422653, + "learning_rate": 4.127580447574131e-05, + "loss": 0.37389321327209474, + "memory(GiB)": 72.72, + "step": 5520, + "token_acc": 0.8659414758069467, + "train_speed(iter/s)": 0.089082 + }, + { + "epoch": 2.377763468555484, + "grad_norm": 0.10220296680927277, + "learning_rate": 4.120975980203778e-05, + "loss": 0.37123832702636717, + "memory(GiB)": 72.72, + "step": 5540, + "token_acc": 0.8599722163416741, + "train_speed(iter/s)": 0.089087 + }, + { + "epoch": 2.386349001931745, + "grad_norm": 0.09796139597892761, + "learning_rate": 4.114351936154105e-05, + "loss": 0.37191407680511473, + "memory(GiB)": 72.72, + "step": 5560, + "token_acc": 0.8607424388032349, + "train_speed(iter/s)": 0.089089 + }, + { + "epoch": 2.394934535308006, + "grad_norm": 0.1000937819480896, + "learning_rate": 4.1077083954243134e-05, + "loss": 0.3728537082672119, + "memory(GiB)": 72.72, + "step": 5580, + "token_acc": 0.8599175456576579, + "train_speed(iter/s)": 0.089092 + }, + { + "epoch": 2.403520068684267, + "grad_norm": 0.10470744967460632, + "learning_rate": 4.101045438249072e-05, + "loss": 0.3749739170074463, + "memory(GiB)": 72.72, + "step": 5600, + "token_acc": 0.8662434580620245, + "train_speed(iter/s)": 0.089094 + }, + { + "epoch": 2.412105602060528, + "grad_norm": 0.10006117075681686, + "learning_rate": 4.0943631450975395e-05, + 
"loss": 0.3695227146148682, + "memory(GiB)": 72.72, + "step": 5620, + "token_acc": 0.8567862235957147, + "train_speed(iter/s)": 0.089097 + }, + { + "epoch": 2.420691135436789, + "grad_norm": 0.11233365535736084, + "learning_rate": 4.0876615966723983e-05, + "loss": 0.37129299640655516, + "memory(GiB)": 72.72, + "step": 5640, + "token_acc": 0.8725153838730988, + "train_speed(iter/s)": 0.089101 + }, + { + "epoch": 2.42927666881305, + "grad_norm": 0.09630627185106277, + "learning_rate": 4.080940873908881e-05, + "loss": 0.3767483472824097, + "memory(GiB)": 72.72, + "step": 5660, + "token_acc": 0.8623814759151552, + "train_speed(iter/s)": 0.089105 + }, + { + "epoch": 2.437862202189311, + "grad_norm": 0.11699684709310532, + "learning_rate": 4.0742010579737855e-05, + "loss": 0.37203705310821533, + "memory(GiB)": 72.72, + "step": 5680, + "token_acc": 0.8617447464487988, + "train_speed(iter/s)": 0.089104 + }, + { + "epoch": 2.446447735565572, + "grad_norm": 0.10771006345748901, + "learning_rate": 4.067442230264503e-05, + "loss": 0.3795736312866211, + "memory(GiB)": 72.72, + "step": 5700, + "token_acc": 0.8621945679332835, + "train_speed(iter/s)": 0.089107 + }, + { + "epoch": 2.455033268941833, + "grad_norm": 0.10978804528713226, + "learning_rate": 4.0606644724080334e-05, + "loss": 0.37045629024505616, + "memory(GiB)": 72.72, + "step": 5720, + "token_acc": 0.8683952247812166, + "train_speed(iter/s)": 0.089111 + }, + { + "epoch": 2.463618802318094, + "grad_norm": 0.11052682995796204, + "learning_rate": 4.053867866259994e-05, + "loss": 0.37306039333343505, + "memory(GiB)": 72.72, + "step": 5740, + "token_acc": 0.8691631145068139, + "train_speed(iter/s)": 0.089115 + }, + { + "epoch": 2.472204335694355, + "grad_norm": 0.09953057020902634, + "learning_rate": 4.0470524939036355e-05, + "loss": 0.37361931800842285, + "memory(GiB)": 72.72, + "step": 5760, + "token_acc": 0.8691507758784558, + "train_speed(iter/s)": 0.089118 + }, + { + "epoch": 2.480789869070616, + "grad_norm": 
0.11204252392053604, + "learning_rate": 4.0402184376488514e-05, + "loss": 0.37611095905303954, + "memory(GiB)": 72.72, + "step": 5780, + "token_acc": 0.8522185815081345, + "train_speed(iter/s)": 0.089118 + }, + { + "epoch": 2.489375402446877, + "grad_norm": 0.09915061295032501, + "learning_rate": 4.033365780031183e-05, + "loss": 0.37398972511291506, + "memory(GiB)": 72.72, + "step": 5800, + "token_acc": 0.8830448305013368, + "train_speed(iter/s)": 0.089119 + }, + { + "epoch": 2.497960935823138, + "grad_norm": 0.10546642541885376, + "learning_rate": 4.026494603810819e-05, + "loss": 0.3730853796005249, + "memory(GiB)": 72.72, + "step": 5820, + "token_acc": 0.8712887390375224, + "train_speed(iter/s)": 0.089122 + }, + { + "epoch": 2.506546469199399, + "grad_norm": 0.10121016949415207, + "learning_rate": 4.0196049919716004e-05, + "loss": 0.3762380361557007, + "memory(GiB)": 72.72, + "step": 5840, + "token_acc": 0.8579485282281408, + "train_speed(iter/s)": 0.089126 + }, + { + "epoch": 2.51513200257566, + "grad_norm": 0.103721484541893, + "learning_rate": 4.012697027720018e-05, + "loss": 0.36703407764434814, + "memory(GiB)": 72.72, + "step": 5860, + "token_acc": 0.8760437267344359, + "train_speed(iter/s)": 0.089129 + }, + { + "epoch": 2.523717535951921, + "grad_norm": 0.10886813700199127, + "learning_rate": 4.005770794484206e-05, + "loss": 0.3760274648666382, + "memory(GiB)": 72.72, + "step": 5880, + "token_acc": 0.86771377124094, + "train_speed(iter/s)": 0.089132 + }, + { + "epoch": 2.532303069328182, + "grad_norm": 0.10048224776983261, + "learning_rate": 3.998826375912934e-05, + "loss": 0.3727203369140625, + "memory(GiB)": 72.72, + "step": 5900, + "token_acc": 0.8678732978111068, + "train_speed(iter/s)": 0.089136 + }, + { + "epoch": 2.540888602704443, + "grad_norm": 0.11523660272359848, + "learning_rate": 3.9918638558745966e-05, + "loss": 0.3741061449050903, + "memory(GiB)": 72.72, + "step": 5920, + "token_acc": 0.8660318303612676, + "train_speed(iter/s)": 0.089136 + }, 
+ { + "epoch": 2.549474136080704, + "grad_norm": 0.11144141107797623, + "learning_rate": 3.9848833184562056e-05, + "loss": 0.3695514440536499, + "memory(GiB)": 72.72, + "step": 5940, + "token_acc": 0.8587312382845311, + "train_speed(iter/s)": 0.089141 + }, + { + "epoch": 2.558059669456965, + "grad_norm": 0.10469717532396317, + "learning_rate": 3.9778848479623656e-05, + "loss": 0.3754448413848877, + "memory(GiB)": 72.72, + "step": 5960, + "token_acc": 0.856428029145263, + "train_speed(iter/s)": 0.089145 + }, + { + "epoch": 2.566645202833226, + "grad_norm": 0.09304027259349823, + "learning_rate": 3.970868528914264e-05, + "loss": 0.3713753938674927, + "memory(GiB)": 72.72, + "step": 5980, + "token_acc": 0.8559786330639638, + "train_speed(iter/s)": 0.089145 + }, + { + "epoch": 2.575230736209487, + "grad_norm": 0.10925323516130447, + "learning_rate": 3.963834446048644e-05, + "loss": 0.3693029165267944, + "memory(GiB)": 72.72, + "step": 6000, + "token_acc": 0.8629965592743197, + "train_speed(iter/s)": 0.089148 + }, + { + "epoch": 2.575230736209487, + "eval_loss": 0.4436081647872925, + "eval_runtime": 70.1538, + "eval_samples_per_second": 53.654, + "eval_steps_per_second": 0.684, + "eval_token_acc": 0.84051665623243, + "step": 6000 + }, + { + "epoch": 2.583816269585748, + "grad_norm": 0.1098393052816391, + "learning_rate": 3.956782684316788e-05, + "loss": 0.37103126049041746, + "memory(GiB)": 72.72, + "step": 6020, + "token_acc": 0.8471303763965553, + "train_speed(iter/s)": 0.089007 + }, + { + "epoch": 2.592401802962009, + "grad_norm": 0.10387935489416122, + "learning_rate": 3.949713328883483e-05, + "loss": 0.36882970333099363, + "memory(GiB)": 72.72, + "step": 6040, + "token_acc": 0.8459705942755437, + "train_speed(iter/s)": 0.088998 + }, + { + "epoch": 2.60098733633827, + "grad_norm": 0.10209009051322937, + "learning_rate": 3.942626465126001e-05, + "loss": 0.36882977485656737, + "memory(GiB)": 72.72, + "step": 6060, + "token_acc": 0.8655902503061222, + 
"train_speed(iter/s)": 0.088991 + }, + { + "epoch": 2.609572869714531, + "grad_norm": 0.10415869951248169, + "learning_rate": 3.935522178633062e-05, + "loss": 0.3759881258010864, + "memory(GiB)": 72.72, + "step": 6080, + "token_acc": 0.8581398082906834, + "train_speed(iter/s)": 0.08899 + }, + { + "epoch": 2.618158403090792, + "grad_norm": 0.11114171892404556, + "learning_rate": 3.928400555203801e-05, + "loss": 0.37210090160369874, + "memory(GiB)": 72.72, + "step": 6100, + "token_acc": 0.8736639992402806, + "train_speed(iter/s)": 0.088988 + }, + { + "epoch": 2.626743936467053, + "grad_norm": 0.10994569212198257, + "learning_rate": 3.921261680846734e-05, + "loss": 0.3746177673339844, + "memory(GiB)": 72.72, + "step": 6120, + "token_acc": 0.8693309992064365, + "train_speed(iter/s)": 0.088985 + }, + { + "epoch": 2.635329469843314, + "grad_norm": 0.096384197473526, + "learning_rate": 3.914105641778718e-05, + "loss": 0.3694021701812744, + "memory(GiB)": 72.72, + "step": 6140, + "token_acc": 0.8684860314899538, + "train_speed(iter/s)": 0.088985 + }, + { + "epoch": 2.643915003219575, + "grad_norm": 0.10146961361169815, + "learning_rate": 3.9069325244239095e-05, + "loss": 0.36874828338623045, + "memory(GiB)": 72.72, + "step": 6160, + "token_acc": 0.8668048776320361, + "train_speed(iter/s)": 0.08898 + }, + { + "epoch": 2.652500536595836, + "grad_norm": 0.0965966135263443, + "learning_rate": 3.899742415412722e-05, + "loss": 0.36864802837371824, + "memory(GiB)": 72.72, + "step": 6180, + "token_acc": 0.8755646100841697, + "train_speed(iter/s)": 0.088979 + }, + { + "epoch": 2.661086069972097, + "grad_norm": 0.09827576577663422, + "learning_rate": 3.892535401580776e-05, + "loss": 0.36760308742523196, + "memory(GiB)": 72.72, + "step": 6200, + "token_acc": 0.8648272017837235, + "train_speed(iter/s)": 0.088982 + }, + { + "epoch": 2.669671603348358, + "grad_norm": 0.09901771694421768, + "learning_rate": 3.885311569967858e-05, + "loss": 0.37281830310821534, + "memory(GiB)": 72.72, + 
"step": 6220, + "token_acc": 0.8820075603884335, + "train_speed(iter/s)": 0.088983 + }, + { + "epoch": 2.678257136724619, + "grad_norm": 0.1107199490070343, + "learning_rate": 3.878071007816859e-05, + "loss": 0.37139651775360105, + "memory(GiB)": 72.72, + "step": 6240, + "token_acc": 0.8544989601044564, + "train_speed(iter/s)": 0.088985 + }, + { + "epoch": 2.68684267010088, + "grad_norm": 0.10703787952661514, + "learning_rate": 3.87081380257273e-05, + "loss": 0.3727452039718628, + "memory(GiB)": 72.72, + "step": 6260, + "token_acc": 0.8577885712183096, + "train_speed(iter/s)": 0.088987 + }, + { + "epoch": 2.695428203477141, + "grad_norm": 0.10672149062156677, + "learning_rate": 3.8635400418814214e-05, + "loss": 0.36861019134521483, + "memory(GiB)": 72.72, + "step": 6280, + "token_acc": 0.8560803640097792, + "train_speed(iter/s)": 0.088987 + }, + { + "epoch": 2.704013736853402, + "grad_norm": 0.09396067261695862, + "learning_rate": 3.856249813588824e-05, + "loss": 0.36811778545379636, + "memory(GiB)": 72.72, + "step": 6300, + "token_acc": 0.868624502647731, + "train_speed(iter/s)": 0.088988 + }, + { + "epoch": 2.712599270229663, + "grad_norm": 0.1063656210899353, + "learning_rate": 3.848943205739711e-05, + "loss": 0.369048547744751, + "memory(GiB)": 72.72, + "step": 6320, + "token_acc": 0.8519738843659109, + "train_speed(iter/s)": 0.088991 + }, + { + "epoch": 2.721184803605924, + "grad_norm": 0.10474120825529099, + "learning_rate": 3.841620306576673e-05, + "loss": 0.3731086730957031, + "memory(GiB)": 72.72, + "step": 6340, + "token_acc": 0.8653390159502418, + "train_speed(iter/s)": 0.088991 + }, + { + "epoch": 2.729770336982185, + "grad_norm": 0.10354544967412949, + "learning_rate": 3.834281204539051e-05, + "loss": 0.37295677661895754, + "memory(GiB)": 72.72, + "step": 6360, + "token_acc": 0.8547169188263978, + "train_speed(iter/s)": 0.088993 + }, + { + "epoch": 2.738355870358446, + "grad_norm": 0.10440333932638168, + "learning_rate": 3.82692598826187e-05, + "loss": 
0.3712725877761841, + "memory(GiB)": 72.72, + "step": 6380, + "token_acc": 0.8820346020559459, + "train_speed(iter/s)": 0.088996 + }, + { + "epoch": 2.746941403734707, + "grad_norm": 0.09520816057920456, + "learning_rate": 3.8195547465747685e-05, + "loss": 0.3697003602981567, + "memory(GiB)": 72.72, + "step": 6400, + "token_acc": 0.8595229803723984, + "train_speed(iter/s)": 0.088997 + }, + { + "epoch": 2.755526937110968, + "grad_norm": 0.09628502279520035, + "learning_rate": 3.812167568500927e-05, + "loss": 0.3673550128936768, + "memory(GiB)": 72.72, + "step": 6420, + "token_acc": 0.8689877572158957, + "train_speed(iter/s)": 0.089 + }, + { + "epoch": 2.764112470487229, + "grad_norm": 0.09505701065063477, + "learning_rate": 3.804764543255987e-05, + "loss": 0.36903977394104004, + "memory(GiB)": 72.72, + "step": 6440, + "token_acc": 0.8750163024104382, + "train_speed(iter/s)": 0.089003 + }, + { + "epoch": 2.77269800386349, + "grad_norm": 0.092622309923172, + "learning_rate": 3.797345760246982e-05, + "loss": 0.3679107666015625, + "memory(GiB)": 72.72, + "step": 6460, + "token_acc": 0.8469855548827956, + "train_speed(iter/s)": 0.089006 + }, + { + "epoch": 2.781283537239751, + "grad_norm": 0.10573814809322357, + "learning_rate": 3.7899113090712526e-05, + "loss": 0.3690340042114258, + "memory(GiB)": 72.72, + "step": 6480, + "token_acc": 0.8704448664825046, + "train_speed(iter/s)": 0.089008 + }, + { + "epoch": 2.789869070616012, + "grad_norm": 0.1018662378191948, + "learning_rate": 3.782461279515363e-05, + "loss": 0.3682270050048828, + "memory(GiB)": 72.72, + "step": 6500, + "token_acc": 0.8687270373931054, + "train_speed(iter/s)": 0.089011 + }, + { + "epoch": 2.798454603992273, + "grad_norm": 0.09893783926963806, + "learning_rate": 3.7749957615540224e-05, + "loss": 0.371025824546814, + "memory(GiB)": 72.72, + "step": 6520, + "token_acc": 0.8599820821280371, + "train_speed(iter/s)": 0.089014 + }, + { + "epoch": 2.807040137368534, + "grad_norm": 0.1044677123427391, + 
"learning_rate": 3.767514845348992e-05, + "loss": 0.37092270851135256, + "memory(GiB)": 72.72, + "step": 6540, + "token_acc": 0.860977069485444, + "train_speed(iter/s)": 0.089016 + }, + { + "epoch": 2.815625670744795, + "grad_norm": 0.10815873742103577, + "learning_rate": 3.760018621248e-05, + "loss": 0.36874244213104246, + "memory(GiB)": 72.72, + "step": 6560, + "token_acc": 0.8586234130381737, + "train_speed(iter/s)": 0.089019 + }, + { + "epoch": 2.824211204121056, + "grad_norm": 0.08873378485441208, + "learning_rate": 3.75250717978365e-05, + "loss": 0.36833083629608154, + "memory(GiB)": 72.72, + "step": 6580, + "token_acc": 0.8633419814445173, + "train_speed(iter/s)": 0.089022 + }, + { + "epoch": 2.832796737497317, + "grad_norm": 0.09121917188167572, + "learning_rate": 3.7449806116723266e-05, + "loss": 0.3694983720779419, + "memory(GiB)": 72.72, + "step": 6600, + "token_acc": 0.8697197272952701, + "train_speed(iter/s)": 0.089026 + }, + { + "epoch": 2.841382270873578, + "grad_norm": 0.09253229945898056, + "learning_rate": 3.7374390078131015e-05, + "loss": 0.37108821868896485, + "memory(GiB)": 72.72, + "step": 6620, + "token_acc": 0.8706145844516814, + "train_speed(iter/s)": 0.089031 + }, + { + "epoch": 2.849967804249839, + "grad_norm": 0.09768302738666534, + "learning_rate": 3.729882459286632e-05, + "loss": 0.3706928253173828, + "memory(GiB)": 72.72, + "step": 6640, + "token_acc": 0.8605562350922205, + "train_speed(iter/s)": 0.089033 + }, + { + "epoch": 2.8585533376261, + "grad_norm": 0.09809901565313339, + "learning_rate": 3.722311057354067e-05, + "loss": 0.3715434312820435, + "memory(GiB)": 72.72, + "step": 6660, + "token_acc": 0.8687115200037456, + "train_speed(iter/s)": 0.089037 + }, + { + "epoch": 2.867138871002361, + "grad_norm": 0.10311082750558853, + "learning_rate": 3.714724893455938e-05, + "loss": 0.3686758756637573, + "memory(GiB)": 72.72, + "step": 6680, + "token_acc": 0.8536554098061117, + "train_speed(iter/s)": 0.089035 + }, + { + "epoch": 
2.875724404378622, + "grad_norm": 0.0951702892780304, + "learning_rate": 3.7071240592110604e-05, + "loss": 0.37487409114837644, + "memory(GiB)": 72.72, + "step": 6700, + "token_acc": 0.8619494831493575, + "train_speed(iter/s)": 0.089039 + }, + { + "epoch": 2.884309937754883, + "grad_norm": 0.10398156195878983, + "learning_rate": 3.699508646415424e-05, + "loss": 0.3755856275558472, + "memory(GiB)": 72.72, + "step": 6720, + "token_acc": 0.8719096505699291, + "train_speed(iter/s)": 0.089043 + }, + { + "epoch": 2.892895471131144, + "grad_norm": 0.09801426529884338, + "learning_rate": 3.691878747041084e-05, + "loss": 0.36969609260559083, + "memory(GiB)": 72.72, + "step": 6740, + "token_acc": 0.8539101926900138, + "train_speed(iter/s)": 0.089046 + }, + { + "epoch": 2.901481004507405, + "grad_norm": 0.10008656978607178, + "learning_rate": 3.684234453235054e-05, + "loss": 0.3719330310821533, + "memory(GiB)": 72.72, + "step": 6760, + "token_acc": 0.8648975749697432, + "train_speed(iter/s)": 0.08905 + }, + { + "epoch": 2.910066537883666, + "grad_norm": 0.12179595977067947, + "learning_rate": 3.676575857318189e-05, + "loss": 0.37140851020812987, + "memory(GiB)": 72.72, + "step": 6780, + "token_acc": 0.8577137651213464, + "train_speed(iter/s)": 0.089052 + }, + { + "epoch": 2.918652071259927, + "grad_norm": 0.09753546863794327, + "learning_rate": 3.66890305178407e-05, + "loss": 0.3708536624908447, + "memory(GiB)": 72.72, + "step": 6800, + "token_acc": 0.8671116019269858, + "train_speed(iter/s)": 0.089056 + }, + { + "epoch": 2.927237604636188, + "grad_norm": 0.09348613768815994, + "learning_rate": 3.661216129297894e-05, + "loss": 0.3709095001220703, + "memory(GiB)": 72.72, + "step": 6820, + "token_acc": 0.8573097173193669, + "train_speed(iter/s)": 0.089061 + }, + { + "epoch": 2.935823138012449, + "grad_norm": 0.0905463695526123, + "learning_rate": 3.653515182695344e-05, + "loss": 0.3767134189605713, + "memory(GiB)": 72.72, + "step": 6840, + "token_acc": 0.8626287415238181, + 
"train_speed(iter/s)": 0.089065 + }, + { + "epoch": 2.94440867138871, + "grad_norm": 0.10822242498397827, + "learning_rate": 3.645800304981477e-05, + "loss": 0.3709308385848999, + "memory(GiB)": 72.72, + "step": 6860, + "token_acc": 0.8577446782413705, + "train_speed(iter/s)": 0.089069 + }, + { + "epoch": 2.952994204764971, + "grad_norm": 0.1089344173669815, + "learning_rate": 3.638071589329597e-05, + "loss": 0.3755086660385132, + "memory(GiB)": 72.72, + "step": 6880, + "token_acc": 0.8570769973171867, + "train_speed(iter/s)": 0.089072 + }, + { + "epoch": 2.961579738141232, + "grad_norm": 0.10646732896566391, + "learning_rate": 3.630329129080129e-05, + "loss": 0.36852853298187255, + "memory(GiB)": 72.72, + "step": 6900, + "token_acc": 0.8610651132070156, + "train_speed(iter/s)": 0.089077 + }, + { + "epoch": 2.970165271517493, + "grad_norm": 0.10016820579767227, + "learning_rate": 3.622573017739495e-05, + "loss": 0.37330124378204343, + "memory(GiB)": 72.72, + "step": 6920, + "token_acc": 0.8775841748626209, + "train_speed(iter/s)": 0.08908 + }, + { + "epoch": 2.978750804893754, + "grad_norm": 0.1020449697971344, + "learning_rate": 3.6148033489789765e-05, + "loss": 0.3684419631958008, + "memory(GiB)": 72.72, + "step": 6940, + "token_acc": 0.8642162515149019, + "train_speed(iter/s)": 0.089084 + }, + { + "epoch": 2.987336338270015, + "grad_norm": 0.0974557027220726, + "learning_rate": 3.607020216633599e-05, + "loss": 0.37378945350646975, + "memory(GiB)": 72.72, + "step": 6960, + "token_acc": 0.858156359329171, + "train_speed(iter/s)": 0.089087 + }, + { + "epoch": 2.995921871646276, + "grad_norm": 0.09330358356237411, + "learning_rate": 3.59922371470098e-05, + "loss": 0.36865170001983644, + "memory(GiB)": 72.72, + "step": 6980, + "token_acc": 0.8638886721914512, + "train_speed(iter/s)": 0.089091 + }, + { + "epoch": 3.0042927666881303, + "grad_norm": 0.1193256601691246, + "learning_rate": 3.591413937340208e-05, + "loss": 0.3534395694732666, + "memory(GiB)": 72.72, + 
"step": 7000, + "token_acc": 0.8802663670407237, + "train_speed(iter/s)": 0.089092 + }, + { + "epoch": 3.0042927666881303, + "eval_loss": 0.4485101103782654, + "eval_runtime": 74.3969, + "eval_samples_per_second": 50.593, + "eval_steps_per_second": 0.645, + "eval_token_acc": 0.8402603254517357, + "step": 7000 + }, + { + "epoch": 3.0128783000643913, + "grad_norm": 0.1156892329454422, + "learning_rate": 3.583590978870699e-05, + "loss": 0.3319342851638794, + "memory(GiB)": 72.72, + "step": 7020, + "token_acc": 0.8532470204427854, + "train_speed(iter/s)": 0.088961 + }, + { + "epoch": 3.0214638334406523, + "grad_norm": 0.10194379091262817, + "learning_rate": 3.5757549337710564e-05, + "loss": 0.33723247051239014, + "memory(GiB)": 72.72, + "step": 7040, + "token_acc": 0.8831583445244781, + "train_speed(iter/s)": 0.088954 + }, + { + "epoch": 3.0300493668169133, + "grad_norm": 0.10132017731666565, + "learning_rate": 3.5679058966779344e-05, + "loss": 0.336438250541687, + "memory(GiB)": 72.72, + "step": 7060, + "token_acc": 0.8769756994854098, + "train_speed(iter/s)": 0.08895 + }, + { + "epoch": 3.0386349001931743, + "grad_norm": 0.1068112775683403, + "learning_rate": 3.560043962384891e-05, + "loss": 0.3355576753616333, + "memory(GiB)": 72.72, + "step": 7080, + "token_acc": 0.8759380793584041, + "train_speed(iter/s)": 0.088949 + }, + { + "epoch": 3.0472204335694353, + "grad_norm": 0.10327329486608505, + "learning_rate": 3.552169225841248e-05, + "loss": 0.3344245195388794, + "memory(GiB)": 72.72, + "step": 7100, + "token_acc": 0.8749370992739901, + "train_speed(iter/s)": 0.088945 + }, + { + "epoch": 3.0558059669456963, + "grad_norm": 0.10621868073940277, + "learning_rate": 3.544281782150936e-05, + "loss": 0.33667793273925783, + "memory(GiB)": 72.72, + "step": 7120, + "token_acc": 0.8698413495330857, + "train_speed(iter/s)": 0.088946 + }, + { + "epoch": 3.0643915003219577, + "grad_norm": 0.09647602587938309, + "learning_rate": 3.536381726571358e-05, + "loss": 
0.33697144985198973, + "memory(GiB)": 72.72, + "step": 7140, + "token_acc": 0.879177233267265, + "train_speed(iter/s)": 0.088946 + }, + { + "epoch": 3.0729770336982183, + "grad_norm": 0.1008361279964447, + "learning_rate": 3.528469154512224e-05, + "loss": 0.3379324674606323, + "memory(GiB)": 72.72, + "step": 7160, + "token_acc": 0.881303225060136, + "train_speed(iter/s)": 0.088946 + }, + { + "epoch": 3.0815625670744797, + "grad_norm": 0.09905105084180832, + "learning_rate": 3.520544161534413e-05, + "loss": 0.33641412258148196, + "memory(GiB)": 72.72, + "step": 7180, + "token_acc": 0.8765279938577173, + "train_speed(iter/s)": 0.088947 + }, + { + "epoch": 3.0901481004507403, + "grad_norm": 0.09547468274831772, + "learning_rate": 3.51260684334881e-05, + "loss": 0.33444535732269287, + "memory(GiB)": 72.72, + "step": 7200, + "token_acc": 0.8740168402536957, + "train_speed(iter/s)": 0.088949 + }, + { + "epoch": 3.0987336338270013, + "grad_norm": 0.091608926653862, + "learning_rate": 3.504657295815153e-05, + "loss": 0.33458809852600097, + "memory(GiB)": 72.72, + "step": 7220, + "token_acc": 0.8822041996574748, + "train_speed(iter/s)": 0.088951 + }, + { + "epoch": 3.1073191672032623, + "grad_norm": 0.095795176923275, + "learning_rate": 3.496695614940875e-05, + "loss": 0.3341191053390503, + "memory(GiB)": 72.72, + "step": 7240, + "token_acc": 0.8863122055178043, + "train_speed(iter/s)": 0.088952 + }, + { + "epoch": 3.1159047005795233, + "grad_norm": 0.11027920246124268, + "learning_rate": 3.488721896879943e-05, + "loss": 0.3351098299026489, + "memory(GiB)": 72.72, + "step": 7260, + "token_acc": 0.8802774242498409, + "train_speed(iter/s)": 0.088955 + }, + { + "epoch": 3.1244902339557843, + "grad_norm": 0.09548976272344589, + "learning_rate": 3.4807362379317025e-05, + "loss": 0.3381031513214111, + "memory(GiB)": 72.72, + "step": 7280, + "token_acc": 0.8777537505068252, + "train_speed(iter/s)": 0.088954 + }, + { + "epoch": 3.1330757673320457, + "grad_norm": 0.1054491475224495, 
+ "learning_rate": 3.472738734539706e-05, + "loss": 0.33547115325927734, + "memory(GiB)": 72.72, + "step": 7300, + "token_acc": 0.8795795912347567, + "train_speed(iter/s)": 0.088956 + }, + { + "epoch": 3.1416613007083063, + "grad_norm": 0.09988971799612045, + "learning_rate": 3.464729483290553e-05, + "loss": 0.3418281555175781, + "memory(GiB)": 72.72, + "step": 7320, + "token_acc": 0.8629133179032032, + "train_speed(iter/s)": 0.088958 + }, + { + "epoch": 3.1502468340845677, + "grad_norm": 0.09766259044408798, + "learning_rate": 3.456708580912725e-05, + "loss": 0.3392175674438477, + "memory(GiB)": 72.72, + "step": 7340, + "token_acc": 0.8706737594562531, + "train_speed(iter/s)": 0.088951 + }, + { + "epoch": 3.1588323674608283, + "grad_norm": 0.09341710805892944, + "learning_rate": 3.448676124275414e-05, + "loss": 0.3362084150314331, + "memory(GiB)": 72.72, + "step": 7360, + "token_acc": 0.8706982003587074, + "train_speed(iter/s)": 0.088954 + }, + { + "epoch": 3.1674179008370897, + "grad_norm": 0.0969720259308815, + "learning_rate": 3.440632210387354e-05, + "loss": 0.3380004644393921, + "memory(GiB)": 72.72, + "step": 7380, + "token_acc": 0.8738021476597112, + "train_speed(iter/s)": 0.088957 + }, + { + "epoch": 3.1760034342133503, + "grad_norm": 0.09787522256374359, + "learning_rate": 3.432576936395648e-05, + "loss": 0.3357203245162964, + "memory(GiB)": 72.72, + "step": 7400, + "token_acc": 0.8912336656741101, + "train_speed(iter/s)": 0.088961 + }, + { + "epoch": 3.1845889675896113, + "grad_norm": 0.10224709659814835, + "learning_rate": 3.424510399584601e-05, + "loss": 0.33477025032043456, + "memory(GiB)": 72.72, + "step": 7420, + "token_acc": 0.8561189105937783, + "train_speed(iter/s)": 0.088965 + }, + { + "epoch": 3.1931745009658723, + "grad_norm": 0.10669636726379395, + "learning_rate": 3.416432697374533e-05, + "loss": 0.33573341369628906, + "memory(GiB)": 72.72, + "step": 7440, + "token_acc": 0.874112458982316, + "train_speed(iter/s)": 0.088968 + }, + { + 
"epoch": 3.2017600343421333, + "grad_norm": 0.1014070212841034, + "learning_rate": 3.408343927320613e-05, + "loss": 0.3380695343017578, + "memory(GiB)": 72.72, + "step": 7460, + "token_acc": 0.8848022091860703, + "train_speed(iter/s)": 0.088972 + }, + { + "epoch": 3.2103455677183943, + "grad_norm": 0.09528549015522003, + "learning_rate": 3.40024418711168e-05, + "loss": 0.33952438831329346, + "memory(GiB)": 72.72, + "step": 7480, + "token_acc": 0.8705726760778868, + "train_speed(iter/s)": 0.088975 + }, + { + "epoch": 3.2189311010946553, + "grad_norm": 0.10318120568990707, + "learning_rate": 3.392133574569057e-05, + "loss": 0.3406086444854736, + "memory(GiB)": 72.72, + "step": 7500, + "token_acc": 0.8733639567077774, + "train_speed(iter/s)": 0.088978 + }, + { + "epoch": 3.2275166344709163, + "grad_norm": 0.11275230348110199, + "learning_rate": 3.3840121876453734e-05, + "loss": 0.33986356258392336, + "memory(GiB)": 72.72, + "step": 7520, + "token_acc": 0.8619126202517206, + "train_speed(iter/s)": 0.088978 + }, + { + "epoch": 3.2361021678471773, + "grad_norm": 0.10118957608938217, + "learning_rate": 3.375880124423383e-05, + "loss": 0.3386232852935791, + "memory(GiB)": 72.72, + "step": 7540, + "token_acc": 0.8710604646623604, + "train_speed(iter/s)": 0.088981 + }, + { + "epoch": 3.2446877012234383, + "grad_norm": 0.10550114512443542, + "learning_rate": 3.367737483114779e-05, + "loss": 0.3421770572662354, + "memory(GiB)": 72.72, + "step": 7560, + "token_acc": 0.8851797047121107, + "train_speed(iter/s)": 0.088985 + }, + { + "epoch": 3.2532732345996997, + "grad_norm": 0.1023048609495163, + "learning_rate": 3.359584362059004e-05, + "loss": 0.33796124458312987, + "memory(GiB)": 72.72, + "step": 7580, + "token_acc": 0.8739776940178287, + "train_speed(iter/s)": 0.088985 + }, + { + "epoch": 3.2618587679759603, + "grad_norm": 0.09559116512537003, + "learning_rate": 3.3514208597220705e-05, + "loss": 0.3409781217575073, + "memory(GiB)": 72.72, + "step": 7600, + "token_acc": 
0.874609344576846, + "train_speed(iter/s)": 0.088989 + }, + { + "epoch": 3.2704443013522217, + "grad_norm": 0.09580449014902115, + "learning_rate": 3.3432470746953606e-05, + "loss": 0.33773849010467527, + "memory(GiB)": 72.72, + "step": 7620, + "token_acc": 0.8727284510693454, + "train_speed(iter/s)": 0.088993 + }, + { + "epoch": 3.2790298347284823, + "grad_norm": 0.10818155109882355, + "learning_rate": 3.335063105694447e-05, + "loss": 0.3401022434234619, + "memory(GiB)": 72.72, + "step": 7640, + "token_acc": 0.8764802837026362, + "train_speed(iter/s)": 0.088996 + }, + { + "epoch": 3.2876153681047438, + "grad_norm": 0.10184460878372192, + "learning_rate": 3.326869051557891e-05, + "loss": 0.3434968709945679, + "memory(GiB)": 72.72, + "step": 7660, + "token_acc": 0.8761705077978165, + "train_speed(iter/s)": 0.088919 + }, + { + "epoch": 3.2962009014810043, + "grad_norm": 0.09505783021450043, + "learning_rate": 3.318665011246056e-05, + "loss": 0.3408296346664429, + "memory(GiB)": 72.72, + "step": 7680, + "token_acc": 0.8661087384073535, + "train_speed(iter/s)": 0.088905 + }, + { + "epoch": 3.3047864348572658, + "grad_norm": 0.10040104389190674, + "learning_rate": 3.310451083839908e-05, + "loss": 0.3423358678817749, + "memory(GiB)": 72.72, + "step": 7700, + "token_acc": 0.861539109557306, + "train_speed(iter/s)": 0.088907 + }, + { + "epoch": 3.3133719682335263, + "grad_norm": 0.10616692155599594, + "learning_rate": 3.30222736853982e-05, + "loss": 0.34503300189971925, + "memory(GiB)": 72.72, + "step": 7720, + "token_acc": 0.8724093642360908, + "train_speed(iter/s)": 0.08891 + }, + { + "epoch": 3.3219575016097878, + "grad_norm": 0.10949140787124634, + "learning_rate": 3.293993964664376e-05, + "loss": 0.3432727098464966, + "memory(GiB)": 72.72, + "step": 7740, + "token_acc": 0.8669582519497799, + "train_speed(iter/s)": 0.088914 + }, + { + "epoch": 3.3305430349860483, + "grad_norm": 0.09881085902452469, + "learning_rate": 3.285750971649167e-05, + "loss": 0.3427408695220947, 
+ "memory(GiB)": 72.72, + "step": 7760, + "token_acc": 0.8689034982030741, + "train_speed(iter/s)": 0.088917 + }, + { + "epoch": 3.3391285683623098, + "grad_norm": 0.09140335768461227, + "learning_rate": 3.2774984890455976e-05, + "loss": 0.3475862979888916, + "memory(GiB)": 72.72, + "step": 7780, + "token_acc": 0.8685826593182928, + "train_speed(iter/s)": 0.088921 + }, + { + "epoch": 3.3477141017385703, + "grad_norm": 0.1024077907204628, + "learning_rate": 3.2692366165196727e-05, + "loss": 0.3404365539550781, + "memory(GiB)": 72.72, + "step": 7800, + "token_acc": 0.8840015739822477, + "train_speed(iter/s)": 0.088925 + }, + { + "epoch": 3.3562996351148318, + "grad_norm": 0.09467454254627228, + "learning_rate": 3.260965453850806e-05, + "loss": 0.34421525001525877, + "memory(GiB)": 72.72, + "step": 7820, + "token_acc": 0.8758503166590742, + "train_speed(iter/s)": 0.088929 + }, + { + "epoch": 3.3648851684910923, + "grad_norm": 0.10136840492486954, + "learning_rate": 3.252685100930605e-05, + "loss": 0.3386892795562744, + "memory(GiB)": 72.72, + "step": 7840, + "token_acc": 0.85672288931185, + "train_speed(iter/s)": 0.088932 + }, + { + "epoch": 3.3734707018673533, + "grad_norm": 0.09780098497867584, + "learning_rate": 3.244395657761671e-05, + "loss": 0.3428237199783325, + "memory(GiB)": 72.72, + "step": 7860, + "token_acc": 0.868161995980711, + "train_speed(iter/s)": 0.088935 + }, + { + "epoch": 3.3820562352436143, + "grad_norm": 0.1032358855009079, + "learning_rate": 3.23609722445639e-05, + "loss": 0.3407264709472656, + "memory(GiB)": 72.72, + "step": 7880, + "token_acc": 0.8630356105896284, + "train_speed(iter/s)": 0.088936 + }, + { + "epoch": 3.3906417686198753, + "grad_norm": 0.09920444339513779, + "learning_rate": 3.2277899012357196e-05, + "loss": 0.34147114753723146, + "memory(GiB)": 72.72, + "step": 7900, + "token_acc": 0.8645183518911774, + "train_speed(iter/s)": 0.088941 + }, + { + "epoch": 3.3992273019961363, + "grad_norm": 0.1050969585776329, + 
"learning_rate": 3.219473788427984e-05, + "loss": 0.3448856115341187, + "memory(GiB)": 72.72, + "step": 7920, + "token_acc": 0.8714814655549509, + "train_speed(iter/s)": 0.088944 + }, + { + "epoch": 3.4078128353723973, + "grad_norm": 0.10028455406427383, + "learning_rate": 3.211148986467659e-05, + "loss": 0.3422698974609375, + "memory(GiB)": 72.72, + "step": 7940, + "token_acc": 0.8711220342714154, + "train_speed(iter/s)": 0.088948 + }, + { + "epoch": 3.4163983687486583, + "grad_norm": 0.09475808590650558, + "learning_rate": 3.2028155958941615e-05, + "loss": 0.3451426029205322, + "memory(GiB)": 72.72, + "step": 7960, + "token_acc": 0.8738779982122461, + "train_speed(iter/s)": 0.088952 + }, + { + "epoch": 3.4249839021249193, + "grad_norm": 0.09882804751396179, + "learning_rate": 3.1944737173506324e-05, + "loss": 0.3444493532180786, + "memory(GiB)": 72.72, + "step": 7980, + "token_acc": 0.8827410911702268, + "train_speed(iter/s)": 0.088955 + }, + { + "epoch": 3.4335694355011803, + "grad_norm": 0.10227163881063461, + "learning_rate": 3.186123451582723e-05, + "loss": 0.339670729637146, + "memory(GiB)": 72.72, + "step": 8000, + "token_acc": 0.8807350762593477, + "train_speed(iter/s)": 0.08896 + }, + { + "epoch": 3.4335694355011803, + "eval_loss": 0.44718989729881287, + "eval_runtime": 69.1311, + "eval_samples_per_second": 54.447, + "eval_steps_per_second": 0.694, + "eval_token_acc": 0.8404557155339724, + "step": 8000 + }, + { + "epoch": 3.4421549688774413, + "grad_norm": 0.0968368649482727, + "learning_rate": 3.177764899437378e-05, + "loss": 0.34265289306640623, + "memory(GiB)": 72.72, + "step": 8020, + "token_acc": 0.854713276154318, + "train_speed(iter/s)": 0.088852 + }, + { + "epoch": 3.4507405022537023, + "grad_norm": 0.09359851479530334, + "learning_rate": 3.169398161861618e-05, + "loss": 0.33971107006073, + "memory(GiB)": 72.72, + "step": 8040, + "token_acc": 0.8740548416277094, + "train_speed(iter/s)": 0.08884 + }, + { + "epoch": 3.4593260356299633, + 
"grad_norm": 0.09218861162662506, + "learning_rate": 3.1610233399013194e-05, + "loss": 0.34025261402130125, + "memory(GiB)": 72.72, + "step": 8060, + "token_acc": 0.8837196272437907, + "train_speed(iter/s)": 0.088833 + }, + { + "epoch": 3.4679115690062243, + "grad_norm": 0.09785692393779755, + "learning_rate": 3.1526405346999946e-05, + "loss": 0.34408791065216066, + "memory(GiB)": 72.72, + "step": 8080, + "token_acc": 0.8632519203232839, + "train_speed(iter/s)": 0.088829 + }, + { + "epoch": 3.4764971023824853, + "grad_norm": 0.0918072834610939, + "learning_rate": 3.1442498474975694e-05, + "loss": 0.3405976057052612, + "memory(GiB)": 72.72, + "step": 8100, + "token_acc": 0.8723113057185948, + "train_speed(iter/s)": 0.088832 + }, + { + "epoch": 3.4850826357587463, + "grad_norm": 0.10397264361381531, + "learning_rate": 3.1358513796291625e-05, + "loss": 0.3404028654098511, + "memory(GiB)": 72.72, + "step": 8120, + "token_acc": 0.8617087474123225, + "train_speed(iter/s)": 0.088834 + }, + { + "epoch": 3.4936681691350073, + "grad_norm": 0.10147637873888016, + "learning_rate": 3.1274452325238604e-05, + "loss": 0.3449804067611694, + "memory(GiB)": 72.72, + "step": 8140, + "token_acc": 0.881801972466236, + "train_speed(iter/s)": 0.088832 + }, + { + "epoch": 3.5022537025112683, + "grad_norm": 0.10313740372657776, + "learning_rate": 3.119031507703491e-05, + "loss": 0.34123189449310304, + "memory(GiB)": 72.72, + "step": 8160, + "token_acc": 0.8796580674904387, + "train_speed(iter/s)": 0.088831 + }, + { + "epoch": 3.5108392358875298, + "grad_norm": 0.10292479395866394, + "learning_rate": 3.1106103067814005e-05, + "loss": 0.342661452293396, + "memory(GiB)": 72.72, + "step": 8180, + "token_acc": 0.8706844817024822, + "train_speed(iter/s)": 0.088834 + }, + { + "epoch": 3.5194247692637903, + "grad_norm": 0.10231835395097733, + "learning_rate": 3.102181731461225e-05, + "loss": 0.3427009344100952, + "memory(GiB)": 72.72, + "step": 8200, + "token_acc": 0.8746011467506197, + 
"train_speed(iter/s)": 0.088833 + }, + { + "epoch": 3.5280103026400518, + "grad_norm": 0.09958157688379288, + "learning_rate": 3.09374588353566e-05, + "loss": 0.34229106903076173, + "memory(GiB)": 72.72, + "step": 8220, + "token_acc": 0.8835202199767901, + "train_speed(iter/s)": 0.088833 + }, + { + "epoch": 3.5365958360163123, + "grad_norm": 0.10157457739114761, + "learning_rate": 3.085302864885235e-05, + "loss": 0.3417761564254761, + "memory(GiB)": 72.72, + "step": 8240, + "token_acc": 0.8649101475499108, + "train_speed(iter/s)": 0.088834 + }, + { + "epoch": 3.545181369392574, + "grad_norm": 0.0995817556977272, + "learning_rate": 3.076852777477079e-05, + "loss": 0.34410881996154785, + "memory(GiB)": 72.72, + "step": 8260, + "token_acc": 0.8783496646486048, + "train_speed(iter/s)": 0.088836 + }, + { + "epoch": 3.5537669027688343, + "grad_norm": 0.09822899103164673, + "learning_rate": 3.068395723363694e-05, + "loss": 0.34146294593811033, + "memory(GiB)": 72.72, + "step": 8280, + "token_acc": 0.8781757426389024, + "train_speed(iter/s)": 0.088836 + }, + { + "epoch": 3.562352436145096, + "grad_norm": 0.10480652749538422, + "learning_rate": 3.0599318046817144e-05, + "loss": 0.34048995971679685, + "memory(GiB)": 72.72, + "step": 8300, + "token_acc": 0.8748031260669741, + "train_speed(iter/s)": 0.088838 + }, + { + "epoch": 3.5709379695213563, + "grad_norm": 0.09434372186660767, + "learning_rate": 3.051461123650685e-05, + "loss": 0.33703758716583254, + "memory(GiB)": 72.72, + "step": 8320, + "token_acc": 0.8765604747936422, + "train_speed(iter/s)": 0.088842 + }, + { + "epoch": 3.579523502897618, + "grad_norm": 0.09659520536661148, + "learning_rate": 3.0429837825718162e-05, + "loss": 0.3348528385162354, + "memory(GiB)": 72.72, + "step": 8340, + "token_acc": 0.8765401382308406, + "train_speed(iter/s)": 0.088834 + }, + { + "epoch": 3.5881090362738783, + "grad_norm": 0.09309985488653183, + "learning_rate": 3.0344998838267525e-05, + "loss": 0.3402057647705078, + "memory(GiB)": 
72.72, + "step": 8360, + "token_acc": 0.8651010368553427, + "train_speed(iter/s)": 0.088836 + }, + { + "epoch": 3.59669456965014, + "grad_norm": 0.0929030105471611, + "learning_rate": 3.0260095298763376e-05, + "loss": 0.34411866664886476, + "memory(GiB)": 72.72, + "step": 8380, + "token_acc": 0.8811669848458061, + "train_speed(iter/s)": 0.088838 + }, + { + "epoch": 3.6052801030264003, + "grad_norm": 0.0983252078294754, + "learning_rate": 3.017512823259373e-05, + "loss": 0.34260566234588624, + "memory(GiB)": 72.72, + "step": 8400, + "token_acc": 0.8748058346767034, + "train_speed(iter/s)": 0.088837 + }, + { + "epoch": 3.613865636402662, + "grad_norm": 0.10412958264350891, + "learning_rate": 3.0090098665913857e-05, + "loss": 0.3410640716552734, + "memory(GiB)": 72.72, + "step": 8420, + "token_acc": 0.8833422403080311, + "train_speed(iter/s)": 0.088839 + }, + { + "epoch": 3.6224511697789223, + "grad_norm": 0.1032663881778717, + "learning_rate": 3.0005007625633806e-05, + "loss": 0.3369549512863159, + "memory(GiB)": 72.72, + "step": 8440, + "token_acc": 0.8744007729088683, + "train_speed(iter/s)": 0.088841 + }, + { + "epoch": 3.631036703155184, + "grad_norm": 0.09928712248802185, + "learning_rate": 2.9919856139406093e-05, + "loss": 0.3410694122314453, + "memory(GiB)": 72.72, + "step": 8460, + "token_acc": 0.868446777131458, + "train_speed(iter/s)": 0.088844 + }, + { + "epoch": 3.6396222365314443, + "grad_norm": 0.10240930318832397, + "learning_rate": 2.9834645235613202e-05, + "loss": 0.34042160511016845, + "memory(GiB)": 72.72, + "step": 8480, + "token_acc": 0.8746132434983096, + "train_speed(iter/s)": 0.088845 + }, + { + "epoch": 3.648207769907706, + "grad_norm": 0.09757622331380844, + "learning_rate": 2.9749375943355245e-05, + "loss": 0.3391597032546997, + "memory(GiB)": 72.72, + "step": 8500, + "token_acc": 0.8870871533336139, + "train_speed(iter/s)": 0.088847 + }, + { + "epoch": 3.6567933032839663, + "grad_norm": 0.10708373039960861, + "learning_rate": 
2.966404929243746e-05, + "loss": 0.3418737888336182, + "memory(GiB)": 72.72, + "step": 8520, + "token_acc": 0.8803486188795007, + "train_speed(iter/s)": 0.08885 + }, + { + "epoch": 3.665378836660228, + "grad_norm": 0.09238722175359726, + "learning_rate": 2.9578666313357866e-05, + "loss": 0.3395582675933838, + "memory(GiB)": 72.72, + "step": 8540, + "token_acc": 0.8617492297025544, + "train_speed(iter/s)": 0.088851 + }, + { + "epoch": 3.6739643700364883, + "grad_norm": 0.0982414111495018, + "learning_rate": 2.9493228037294702e-05, + "loss": 0.339850926399231, + "memory(GiB)": 72.72, + "step": 8560, + "token_acc": 0.872913510605142, + "train_speed(iter/s)": 0.088854 + }, + { + "epoch": 3.68254990341275, + "grad_norm": 0.09378170222043991, + "learning_rate": 2.9407735496094074e-05, + "loss": 0.3445668935775757, + "memory(GiB)": 72.72, + "step": 8580, + "token_acc": 0.8608412452277943, + "train_speed(iter/s)": 0.088857 + }, + { + "epoch": 3.6911354367890103, + "grad_norm": 0.10139860957860947, + "learning_rate": 2.9322189722257437e-05, + "loss": 0.33951511383056643, + "memory(GiB)": 72.72, + "step": 8600, + "token_acc": 0.8813381599903551, + "train_speed(iter/s)": 0.088858 + }, + { + "epoch": 3.699720970165272, + "grad_norm": 0.10095764696598053, + "learning_rate": 2.9236591748929143e-05, + "loss": 0.3414825201034546, + "memory(GiB)": 72.72, + "step": 8620, + "token_acc": 0.8747491060455407, + "train_speed(iter/s)": 0.088861 + }, + { + "epoch": 3.7083065035415324, + "grad_norm": 0.09368202835321426, + "learning_rate": 2.915094260988397e-05, + "loss": 0.3400054216384888, + "memory(GiB)": 72.72, + "step": 8640, + "token_acc": 0.8603559177014007, + "train_speed(iter/s)": 0.088863 + }, + { + "epoch": 3.7168920369177934, + "grad_norm": 0.09599091857671738, + "learning_rate": 2.906524333951461e-05, + "loss": 0.33973557949066163, + "memory(GiB)": 72.72, + "step": 8660, + "token_acc": 0.8864862275305668, + "train_speed(iter/s)": 0.088865 + }, + { + "epoch": 3.7254775702940544, 
+ "grad_norm": 0.0969940647482872, + "learning_rate": 2.8979494972819227e-05, + "loss": 0.3434182405471802, + "memory(GiB)": 72.72, + "step": 8680, + "token_acc": 0.8716537070538549, + "train_speed(iter/s)": 0.088858 + }, + { + "epoch": 3.7340631036703154, + "grad_norm": 0.10267031192779541, + "learning_rate": 2.8893698545388887e-05, + "loss": 0.3440374851226807, + "memory(GiB)": 72.72, + "step": 8700, + "token_acc": 0.8709150326797386, + "train_speed(iter/s)": 0.088861 + }, + { + "epoch": 3.7426486370465764, + "grad_norm": 0.09835559874773026, + "learning_rate": 2.8807855093395126e-05, + "loss": 0.34554252624511717, + "memory(GiB)": 72.72, + "step": 8720, + "token_acc": 0.8670599046959998, + "train_speed(iter/s)": 0.088863 + }, + { + "epoch": 3.7512341704228374, + "grad_norm": 0.0885239914059639, + "learning_rate": 2.8721965653577386e-05, + "loss": 0.3446002244949341, + "memory(GiB)": 72.72, + "step": 8740, + "token_acc": 0.8721490695849959, + "train_speed(iter/s)": 0.088867 + }, + { + "epoch": 3.7598197037990984, + "grad_norm": 0.09081339836120605, + "learning_rate": 2.86360312632305e-05, + "loss": 0.33843419551849363, + "memory(GiB)": 72.72, + "step": 8760, + "token_acc": 0.8680939478458125, + "train_speed(iter/s)": 0.088869 + }, + { + "epoch": 3.7684052371753594, + "grad_norm": 0.09640111774206161, + "learning_rate": 2.855005296019218e-05, + "loss": 0.340420126914978, + "memory(GiB)": 72.72, + "step": 8780, + "token_acc": 0.8749122556452666, + "train_speed(iter/s)": 0.088872 + }, + { + "epoch": 3.7769907705516204, + "grad_norm": 0.0949261263012886, + "learning_rate": 2.8464031782830474e-05, + "loss": 0.3449671983718872, + "memory(GiB)": 72.72, + "step": 8800, + "token_acc": 0.8710775436891774, + "train_speed(iter/s)": 0.088876 + }, + { + "epoch": 3.7855763039278814, + "grad_norm": 0.09448053687810898, + "learning_rate": 2.837796877003124e-05, + "loss": 0.3435060977935791, + "memory(GiB)": 72.72, + "step": 8820, + "token_acc": 0.884149136577708, + 
"train_speed(iter/s)": 0.08888 + }, + { + "epoch": 3.7941618373041424, + "grad_norm": 0.09816328436136246, + "learning_rate": 2.8291864961185566e-05, + "loss": 0.34175992012023926, + "memory(GiB)": 72.72, + "step": 8840, + "token_acc": 0.8704215639701488, + "train_speed(iter/s)": 0.088882 + }, + { + "epoch": 3.8027473706804034, + "grad_norm": 0.09840340167284012, + "learning_rate": 2.820572139617725e-05, + "loss": 0.3442914247512817, + "memory(GiB)": 72.72, + "step": 8860, + "token_acc": 0.8852563932460973, + "train_speed(iter/s)": 0.088885 + }, + { + "epoch": 3.8113329040566644, + "grad_norm": 0.09052480757236481, + "learning_rate": 2.8119539115370218e-05, + "loss": 0.3354163408279419, + "memory(GiB)": 72.72, + "step": 8880, + "token_acc": 0.8710054027589692, + "train_speed(iter/s)": 0.088887 + }, + { + "epoch": 3.8199184374329254, + "grad_norm": 0.09055832773447037, + "learning_rate": 2.803331915959599e-05, + "loss": 0.341020393371582, + "memory(GiB)": 72.72, + "step": 8900, + "token_acc": 0.8775211583840608, + "train_speed(iter/s)": 0.088889 + }, + { + "epoch": 3.8285039708091864, + "grad_norm": 0.09606460481882095, + "learning_rate": 2.7947062570141073e-05, + "loss": 0.34467277526855467, + "memory(GiB)": 72.72, + "step": 8920, + "token_acc": 0.8684845089446742, + "train_speed(iter/s)": 0.088892 + }, + { + "epoch": 3.8370895041854474, + "grad_norm": 0.0941082313656807, + "learning_rate": 2.7860770388734408e-05, + "loss": 0.34154183864593507, + "memory(GiB)": 72.72, + "step": 8940, + "token_acc": 0.8651064878551884, + "train_speed(iter/s)": 0.088895 + }, + { + "epoch": 3.8456750375617084, + "grad_norm": 0.08800920099020004, + "learning_rate": 2.7774443657534788e-05, + "loss": 0.34454681873321535, + "memory(GiB)": 72.72, + "step": 8960, + "token_acc": 0.884229596704054, + "train_speed(iter/s)": 0.088899 + }, + { + "epoch": 3.8542605709379694, + "grad_norm": 0.0993284210562706, + "learning_rate": 2.7688083419118255e-05, + "loss": 0.3417619466781616, + 
"memory(GiB)": 72.72, + "step": 8980, + "token_acc": 0.8696293253324922, + "train_speed(iter/s)": 0.088902 + }, + { + "epoch": 3.8628461043142304, + "grad_norm": 0.10383660346269608, + "learning_rate": 2.760169071646553e-05, + "loss": 0.34536774158477784, + "memory(GiB)": 72.72, + "step": 9000, + "token_acc": 0.8775136024730062, + "train_speed(iter/s)": 0.088905 + }, + { + "epoch": 3.8628461043142304, + "eval_loss": 0.4432525634765625, + "eval_runtime": 69.6489, + "eval_samples_per_second": 54.042, + "eval_steps_per_second": 0.689, + "eval_token_acc": 0.8414275958111643, + "step": 9000 + }, + { + "epoch": 3.8714316376904914, + "grad_norm": 0.0947885811328888, + "learning_rate": 2.7515266592949407e-05, + "loss": 0.3397974491119385, + "memory(GiB)": 72.72, + "step": 9020, + "token_acc": 0.8571858554733831, + "train_speed(iter/s)": 0.08881 + }, + { + "epoch": 3.8800171710667524, + "grad_norm": 0.09524156153202057, + "learning_rate": 2.742881209232215e-05, + "loss": 0.3427132129669189, + "memory(GiB)": 72.72, + "step": 9040, + "token_acc": 0.868957431040566, + "train_speed(iter/s)": 0.088802 + }, + { + "epoch": 3.8886027044430134, + "grad_norm": 0.08956858515739441, + "learning_rate": 2.7342328258702894e-05, + "loss": 0.34703960418701174, + "memory(GiB)": 72.72, + "step": 9060, + "token_acc": 0.8717364607638463, + "train_speed(iter/s)": 0.088797 + }, + { + "epoch": 3.8971882378192744, + "grad_norm": 0.09309873729944229, + "learning_rate": 2.7255816136565026e-05, + "loss": 0.34093830585479734, + "memory(GiB)": 72.72, + "step": 9080, + "token_acc": 0.8860340449246085, + "train_speed(iter/s)": 0.088797 + }, + { + "epoch": 3.9057737711955354, + "grad_norm": 0.09236317873001099, + "learning_rate": 2.7169276770723585e-05, + "loss": 0.3432276248931885, + "memory(GiB)": 72.72, + "step": 9100, + "token_acc": 0.8692972431017865, + "train_speed(iter/s)": 0.088797 + }, + { + "epoch": 3.9143593045717964, + "grad_norm": 0.09957270324230194, + "learning_rate": 2.708271120632262e-05, 
+ "loss": 0.34100799560546874, + "memory(GiB)": 72.72, + "step": 9120, + "token_acc": 0.8780453295762229, + "train_speed(iter/s)": 0.088796 + }, + { + "epoch": 3.9229448379480574, + "grad_norm": 0.09253112971782684, + "learning_rate": 2.69961204888226e-05, + "loss": 0.344201922416687, + "memory(GiB)": 72.72, + "step": 9140, + "token_acc": 0.893788044699683, + "train_speed(iter/s)": 0.088799 + }, + { + "epoch": 3.9315303713243184, + "grad_norm": 0.09970075637102127, + "learning_rate": 2.6909505663987756e-05, + "loss": 0.34385430812835693, + "memory(GiB)": 72.72, + "step": 9160, + "token_acc": 0.884597342165496, + "train_speed(iter/s)": 0.088801 + }, + { + "epoch": 3.9401159047005794, + "grad_norm": 0.0891101136803627, + "learning_rate": 2.682286777787348e-05, + "loss": 0.3451590299606323, + "memory(GiB)": 72.72, + "step": 9180, + "token_acc": 0.8716154630632927, + "train_speed(iter/s)": 0.0888 + }, + { + "epoch": 3.9487014380768404, + "grad_norm": 0.09408137947320938, + "learning_rate": 2.6736207876813646e-05, + "loss": 0.34462172985076905, + "memory(GiB)": 72.72, + "step": 9200, + "token_acc": 0.8778122218028758, + "train_speed(iter/s)": 0.088802 + }, + { + "epoch": 3.9572869714531014, + "grad_norm": 0.09145346283912659, + "learning_rate": 2.664952700740806e-05, + "loss": 0.34248254299163816, + "memory(GiB)": 72.72, + "step": 9220, + "token_acc": 0.872891004579533, + "train_speed(iter/s)": 0.088803 + }, + { + "epoch": 3.9658725048293624, + "grad_norm": 0.09725998342037201, + "learning_rate": 2.6562826216509696e-05, + "loss": 0.34380669593811036, + "memory(GiB)": 72.72, + "step": 9240, + "token_acc": 0.8909276331759067, + "train_speed(iter/s)": 0.088804 + }, + { + "epoch": 3.9744580382056234, + "grad_norm": 0.10166844725608826, + "learning_rate": 2.6476106551212188e-05, + "loss": 0.34403514862060547, + "memory(GiB)": 72.72, + "step": 9260, + "token_acc": 0.8776962289782687, + "train_speed(iter/s)": 0.088806 + }, + { + "epoch": 3.9830435715818844, + "grad_norm": 
0.09070953726768494, + "learning_rate": 2.6389369058837077e-05, + "loss": 0.341811990737915, + "memory(GiB)": 72.72, + "step": 9280, + "token_acc": 0.8719749437415167, + "train_speed(iter/s)": 0.088808 + }, + { + "epoch": 3.9916291049581454, + "grad_norm": 0.09677760303020477, + "learning_rate": 2.6302614786921204e-05, + "loss": 0.3442156553268433, + "memory(GiB)": 72.72, + "step": 9300, + "token_acc": 0.882238909204825, + "train_speed(iter/s)": 0.088808 + }, + { + "epoch": 4.0, + "grad_norm": 0.15741688013076782, + "learning_rate": 2.621584478320408e-05, + "loss": 0.3397855758666992, + "memory(GiB)": 72.72, + "step": 9320, + "token_acc": 0.8889204303051386, + "train_speed(iter/s)": 0.088814 + }, + { + "epoch": 4.008585533376261, + "grad_norm": 0.10205920785665512, + "learning_rate": 2.6129060095615187e-05, + "loss": 0.29747543334960935, + "memory(GiB)": 72.72, + "step": 9340, + "token_acc": 0.8900451968067217, + "train_speed(iter/s)": 0.0888 + }, + { + "epoch": 4.017171066752522, + "grad_norm": 0.10247659683227539, + "learning_rate": 2.604226177226137e-05, + "loss": 0.30353684425354005, + "memory(GiB)": 72.72, + "step": 9360, + "token_acc": 0.886528226098631, + "train_speed(iter/s)": 0.088801 + }, + { + "epoch": 4.025756600128783, + "grad_norm": 0.10435572266578674, + "learning_rate": 2.5955450861414126e-05, + "loss": 0.30368824005126954, + "memory(GiB)": 72.72, + "step": 9380, + "token_acc": 0.8827944824311919, + "train_speed(iter/s)": 0.088803 + }, + { + "epoch": 4.034342133505044, + "grad_norm": 0.1014116182923317, + "learning_rate": 2.586862841149701e-05, + "loss": 0.3020852327346802, + "memory(GiB)": 72.72, + "step": 9400, + "token_acc": 0.8891262896776423, + "train_speed(iter/s)": 0.088805 + }, + { + "epoch": 4.042927666881305, + "grad_norm": 0.10401485115289688, + "learning_rate": 2.5781795471072885e-05, + "loss": 0.3056429386138916, + "memory(GiB)": 72.72, + "step": 9420, + "token_acc": 0.8829484753143999, + "train_speed(iter/s)": 0.088807 + }, + { + 
"epoch": 4.051513200257566, + "grad_norm": 0.10134406387805939, + "learning_rate": 2.5694953088831352e-05, + "loss": 0.30531723499298097, + "memory(GiB)": 72.72, + "step": 9440, + "token_acc": 0.8840279216629264, + "train_speed(iter/s)": 0.088808 + }, + { + "epoch": 4.060098733633827, + "grad_norm": 0.10662077367305756, + "learning_rate": 2.5608102313576027e-05, + "loss": 0.3047459363937378, + "memory(GiB)": 72.72, + "step": 9460, + "token_acc": 0.9002080243657248, + "train_speed(iter/s)": 0.088811 + }, + { + "epoch": 4.068684267010088, + "grad_norm": 0.10326355695724487, + "learning_rate": 2.5521244194211884e-05, + "loss": 0.30735197067260744, + "memory(GiB)": 72.72, + "step": 9480, + "token_acc": 0.8819828054997908, + "train_speed(iter/s)": 0.088814 + }, + { + "epoch": 4.077269800386349, + "grad_norm": 0.10981076210737228, + "learning_rate": 2.5434379779732603e-05, + "loss": 0.30461032390594484, + "memory(GiB)": 72.72, + "step": 9500, + "token_acc": 0.882671980207554, + "train_speed(iter/s)": 0.088816 + }, + { + "epoch": 4.08585533376261, + "grad_norm": 0.09967193752527237, + "learning_rate": 2.5347510119207878e-05, + "loss": 0.3016824722290039, + "memory(GiB)": 72.72, + "step": 9520, + "token_acc": 0.8960580499977037, + "train_speed(iter/s)": 0.088818 + }, + { + "epoch": 4.094440867138871, + "grad_norm": 0.10693041980266571, + "learning_rate": 2.5260636261770777e-05, + "loss": 0.3073539972305298, + "memory(GiB)": 72.72, + "step": 9540, + "token_acc": 0.890892156523979, + "train_speed(iter/s)": 0.08882 + }, + { + "epoch": 4.103026400515132, + "grad_norm": 0.10553585737943649, + "learning_rate": 2.5173759256605027e-05, + "loss": 0.30216293334960936, + "memory(GiB)": 72.72, + "step": 9560, + "token_acc": 0.8891749049597542, + "train_speed(iter/s)": 0.088822 + }, + { + "epoch": 4.111611933891393, + "grad_norm": 0.10220309346914291, + "learning_rate": 2.5086880152932402e-05, + "loss": 0.3027711153030396, + "memory(GiB)": 72.72, + "step": 9580, + "token_acc": 
0.8892161871654268, + "train_speed(iter/s)": 0.088824 + }, + { + "epoch": 4.120197467267654, + "grad_norm": 0.10086795687675476, + "learning_rate": 2.5e-05, + "loss": 0.30600886344909667, + "memory(GiB)": 72.72, + "step": 9600, + "token_acc": 0.8824066390041494, + "train_speed(iter/s)": 0.088825 + }, + { + "epoch": 4.1287830006439155, + "grad_norm": 0.10636570304632187, + "learning_rate": 2.4913119847067603e-05, + "loss": 0.30425918102264404, + "memory(GiB)": 72.72, + "step": 9620, + "token_acc": 0.8838457920573797, + "train_speed(iter/s)": 0.088829 + }, + { + "epoch": 4.137368534020176, + "grad_norm": 0.10464228689670563, + "learning_rate": 2.4826240743394982e-05, + "loss": 0.3025052070617676, + "memory(GiB)": 72.72, + "step": 9640, + "token_acc": 0.8769574601853707, + "train_speed(iter/s)": 0.088832 + }, + { + "epoch": 4.145954067396437, + "grad_norm": 0.1083202064037323, + "learning_rate": 2.4739363738229232e-05, + "loss": 0.30380189418792725, + "memory(GiB)": 72.72, + "step": 9660, + "token_acc": 0.8893545408707838, + "train_speed(iter/s)": 0.088834 + }, + { + "epoch": 4.154539600772698, + "grad_norm": 0.10492519289255142, + "learning_rate": 2.4652489880792128e-05, + "loss": 0.30443031787872316, + "memory(GiB)": 72.72, + "step": 9680, + "token_acc": 0.8797012712026356, + "train_speed(iter/s)": 0.088831 + }, + { + "epoch": 4.1631251341489595, + "grad_norm": 0.09974920004606247, + "learning_rate": 2.4565620220267396e-05, + "loss": 0.3066636800765991, + "memory(GiB)": 72.72, + "step": 9700, + "token_acc": 0.8844553871840214, + "train_speed(iter/s)": 0.088833 + }, + { + "epoch": 4.17171066752522, + "grad_norm": 0.0984271839261055, + "learning_rate": 2.447875580578812e-05, + "loss": 0.3007610082626343, + "memory(GiB)": 72.72, + "step": 9720, + "token_acc": 0.8761438976087101, + "train_speed(iter/s)": 0.088836 + }, + { + "epoch": 4.180296200901481, + "grad_norm": 0.10344758629798889, + "learning_rate": 2.439189768642398e-05, + "loss": 0.3055333375930786, + 
"memory(GiB)": 72.72, + "step": 9740, + "token_acc": 0.8798825324153172, + "train_speed(iter/s)": 0.088839 + }, + { + "epoch": 4.188881734277742, + "grad_norm": 0.10062626749277115, + "learning_rate": 2.4305046911168653e-05, + "loss": 0.30226128101348876, + "memory(GiB)": 72.72, + "step": 9760, + "token_acc": 0.877849069049261, + "train_speed(iter/s)": 0.088842 + }, + { + "epoch": 4.197467267654003, + "grad_norm": 0.1044364646077156, + "learning_rate": 2.4218204528927117e-05, + "loss": 0.3027973175048828, + "memory(GiB)": 72.72, + "step": 9780, + "token_acc": 0.8901094903786694, + "train_speed(iter/s)": 0.088844 + }, + { + "epoch": 4.206052801030264, + "grad_norm": 0.09792552888393402, + "learning_rate": 2.4131371588503003e-05, + "loss": 0.30410778522491455, + "memory(GiB)": 72.72, + "step": 9800, + "token_acc": 0.8904304675100755, + "train_speed(iter/s)": 0.088846 + }, + { + "epoch": 4.214638334406525, + "grad_norm": 0.11304216086864471, + "learning_rate": 2.4044549138585877e-05, + "loss": 0.3036644697189331, + "memory(GiB)": 72.72, + "step": 9820, + "token_acc": 0.8800798395927938, + "train_speed(iter/s)": 0.088849 + }, + { + "epoch": 4.223223867782786, + "grad_norm": 0.10015735030174255, + "learning_rate": 2.395773822773863e-05, + "loss": 0.30791220664978025, + "memory(GiB)": 72.72, + "step": 9840, + "token_acc": 0.8848758135171705, + "train_speed(iter/s)": 0.088852 + }, + { + "epoch": 4.231809401159047, + "grad_norm": 0.09757008403539658, + "learning_rate": 2.3870939904384815e-05, + "loss": 0.30361478328704833, + "memory(GiB)": 72.72, + "step": 9860, + "token_acc": 0.8940831985400854, + "train_speed(iter/s)": 0.088855 + }, + { + "epoch": 4.240394934535308, + "grad_norm": 0.09704037755727768, + "learning_rate": 2.378415521679593e-05, + "loss": 0.3088146924972534, + "memory(GiB)": 72.72, + "step": 9880, + "token_acc": 0.887872541700794, + "train_speed(iter/s)": 0.088857 + }, + { + "epoch": 4.248980467911569, + "grad_norm": 0.10431049019098282, + "learning_rate": 
2.3697385213078805e-05, + "loss": 0.30578904151916503, + "memory(GiB)": 72.72, + "step": 9900, + "token_acc": 0.8816228300017872, + "train_speed(iter/s)": 0.088859 + }, + { + "epoch": 4.25756600128783, + "grad_norm": 0.09818245470523834, + "learning_rate": 2.361063094116293e-05, + "loss": 0.3096456527709961, + "memory(GiB)": 72.72, + "step": 9920, + "token_acc": 0.8843864415701027, + "train_speed(iter/s)": 0.088863 + }, + { + "epoch": 4.2661515346640915, + "grad_norm": 0.09961646795272827, + "learning_rate": 2.3523893448787818e-05, + "loss": 0.30978071689605713, + "memory(GiB)": 72.72, + "step": 9940, + "token_acc": 0.8849045058887656, + "train_speed(iter/s)": 0.088865 + }, + { + "epoch": 4.274737068040352, + "grad_norm": 0.10661664605140686, + "learning_rate": 2.3437173783490307e-05, + "loss": 0.30757110118865966, + "memory(GiB)": 72.72, + "step": 9960, + "token_acc": 0.8822289688850337, + "train_speed(iter/s)": 0.088868 + }, + { + "epoch": 4.283322601416613, + "grad_norm": 0.1005556732416153, + "learning_rate": 2.3350472992591947e-05, + "loss": 0.30759055614471437, + "memory(GiB)": 72.72, + "step": 9980, + "token_acc": 0.8835728408590111, + "train_speed(iter/s)": 0.088871 + }, + { + "epoch": 4.291908134792874, + "grad_norm": 0.09660108387470245, + "learning_rate": 2.3263792123186353e-05, + "loss": 0.30487823486328125, + "memory(GiB)": 72.72, + "step": 10000, + "token_acc": 0.8812157065140277, + "train_speed(iter/s)": 0.088874 + }, + { + "epoch": 4.291908134792874, + "eval_loss": 0.45848873257637024, + "eval_runtime": 74.5961, + "eval_samples_per_second": 50.458, + "eval_steps_per_second": 0.643, + "eval_token_acc": 0.8390728406167212, + "step": 10000 + }, + { + "epoch": 4.3004936681691355, + "grad_norm": 0.10211784392595291, + "learning_rate": 2.3177132222126536e-05, + "loss": 0.3054050922393799, + "memory(GiB)": 72.72, + "step": 10020, + "token_acc": 0.861507260950951, + "train_speed(iter/s)": 0.088783 + }, + { + "epoch": 4.309079201545396, + "grad_norm": 
0.1039443388581276, + "learning_rate": 2.3090494336012253e-05, + "loss": 0.3065175533294678, + "memory(GiB)": 72.72, + "step": 10040, + "token_acc": 0.8864880616836895, + "train_speed(iter/s)": 0.088777 + }, + { + "epoch": 4.317664734921657, + "grad_norm": 0.1060820147395134, + "learning_rate": 2.3003879511177405e-05, + "loss": 0.31085891723632814, + "memory(GiB)": 72.72, + "step": 10060, + "token_acc": 0.8897265286253574, + "train_speed(iter/s)": 0.088773 + }, + { + "epoch": 4.326250268297918, + "grad_norm": 0.10298410803079605, + "learning_rate": 2.2917288793677382e-05, + "loss": 0.31043663024902346, + "memory(GiB)": 72.72, + "step": 10080, + "token_acc": 0.8748683362897243, + "train_speed(iter/s)": 0.088769 + }, + { + "epoch": 4.3348358016741795, + "grad_norm": 0.1114133968949318, + "learning_rate": 2.2830723229276424e-05, + "loss": 0.31448495388031006, + "memory(GiB)": 72.72, + "step": 10100, + "token_acc": 0.8866603970434808, + "train_speed(iter/s)": 0.088766 + }, + { + "epoch": 4.34342133505044, + "grad_norm": 0.10426465421915054, + "learning_rate": 2.2744183863434976e-05, + "loss": 0.31032671928405764, + "memory(GiB)": 72.72, + "step": 10120, + "token_acc": 0.8818581792950851, + "train_speed(iter/s)": 0.088765 + }, + { + "epoch": 4.352006868426701, + "grad_norm": 0.10287055373191833, + "learning_rate": 2.265767174129711e-05, + "loss": 0.3112910747528076, + "memory(GiB)": 72.72, + "step": 10140, + "token_acc": 0.8739920728492123, + "train_speed(iter/s)": 0.088763 + }, + { + "epoch": 4.360592401802962, + "grad_norm": 0.10366437584161758, + "learning_rate": 2.2571187907677853e-05, + "loss": 0.31062612533569334, + "memory(GiB)": 72.72, + "step": 10160, + "token_acc": 0.8771409538302638, + "train_speed(iter/s)": 0.088761 + }, + { + "epoch": 4.369177935179223, + "grad_norm": 0.10374686121940613, + "learning_rate": 2.2484733407050602e-05, + "loss": 0.31010420322418214, + "memory(GiB)": 72.72, + "step": 10180, + "token_acc": 0.8837488220680202, + 
"train_speed(iter/s)": 0.088762 + }, + { + "epoch": 4.377763468555484, + "grad_norm": 0.10094033926725388, + "learning_rate": 2.2398309283534477e-05, + "loss": 0.3080222845077515, + "memory(GiB)": 72.72, + "step": 10200, + "token_acc": 0.8891878281040166, + "train_speed(iter/s)": 0.088764 + }, + { + "epoch": 4.386349001931745, + "grad_norm": 0.10435180366039276, + "learning_rate": 2.2311916580881754e-05, + "loss": 0.30961949825286866, + "memory(GiB)": 72.72, + "step": 10220, + "token_acc": 0.8952461985350648, + "train_speed(iter/s)": 0.088764 + }, + { + "epoch": 4.394934535308006, + "grad_norm": 0.0953126922249794, + "learning_rate": 2.222555634246521e-05, + "loss": 0.3070392608642578, + "memory(GiB)": 72.72, + "step": 10240, + "token_acc": 0.8863533099042126, + "train_speed(iter/s)": 0.088766 + }, + { + "epoch": 4.403520068684267, + "grad_norm": 0.10288111865520477, + "learning_rate": 2.2139229611265594e-05, + "loss": 0.30999772548675536, + "memory(GiB)": 72.72, + "step": 10260, + "token_acc": 0.8752241865231873, + "train_speed(iter/s)": 0.088766 + }, + { + "epoch": 4.412105602060528, + "grad_norm": 0.10298358649015427, + "learning_rate": 2.205293742985893e-05, + "loss": 0.310498046875, + "memory(GiB)": 72.72, + "step": 10280, + "token_acc": 0.895062097103973, + "train_speed(iter/s)": 0.088768 + }, + { + "epoch": 4.420691135436789, + "grad_norm": 0.10269106179475784, + "learning_rate": 2.1966680840404013e-05, + "loss": 0.31382122039794924, + "memory(GiB)": 72.72, + "step": 10300, + "token_acc": 0.8826629491356146, + "train_speed(iter/s)": 0.088769 + }, + { + "epoch": 4.42927666881305, + "grad_norm": 0.09890419244766235, + "learning_rate": 2.188046088462979e-05, + "loss": 0.31236202716827394, + "memory(GiB)": 72.72, + "step": 10320, + "token_acc": 0.877004450607206, + "train_speed(iter/s)": 0.08877 + }, + { + "epoch": 4.437862202189311, + "grad_norm": 0.1035868227481842, + "learning_rate": 2.179427860382276e-05, + "loss": 0.31030888557434083, + "memory(GiB)": 
72.72, + "step": 10340, + "token_acc": 0.88265658710238, + "train_speed(iter/s)": 0.08877 + }, + { + "epoch": 4.446447735565572, + "grad_norm": 0.10644908994436264, + "learning_rate": 2.170813503881444e-05, + "loss": 0.31080482006072996, + "memory(GiB)": 72.72, + "step": 10360, + "token_acc": 0.8680257223302367, + "train_speed(iter/s)": 0.088772 + }, + { + "epoch": 4.455033268941833, + "grad_norm": 0.10393664985895157, + "learning_rate": 2.162203122996876e-05, + "loss": 0.3072603702545166, + "memory(GiB)": 72.72, + "step": 10380, + "token_acc": 0.8879988357215192, + "train_speed(iter/s)": 0.088774 + }, + { + "epoch": 4.463618802318094, + "grad_norm": 0.09875033795833588, + "learning_rate": 2.1535968217169535e-05, + "loss": 0.308307147026062, + "memory(GiB)": 72.72, + "step": 10400, + "token_acc": 0.8760545062481376, + "train_speed(iter/s)": 0.088777 + }, + { + "epoch": 4.472204335694355, + "grad_norm": 0.10074667632579803, + "learning_rate": 2.1449947039807826e-05, + "loss": 0.3109966039657593, + "memory(GiB)": 72.72, + "step": 10420, + "token_acc": 0.8947892374351213, + "train_speed(iter/s)": 0.088778 + }, + { + "epoch": 4.480789869070616, + "grad_norm": 0.09881151467561722, + "learning_rate": 2.1363968736769508e-05, + "loss": 0.3046985626220703, + "memory(GiB)": 72.72, + "step": 10440, + "token_acc": 0.8952631152568657, + "train_speed(iter/s)": 0.08878 + }, + { + "epoch": 4.489375402446877, + "grad_norm": 0.09804583340883255, + "learning_rate": 2.1278034346422616e-05, + "loss": 0.31377933025360105, + "memory(GiB)": 72.72, + "step": 10460, + "token_acc": 0.8788372867424049, + "train_speed(iter/s)": 0.088782 + }, + { + "epoch": 4.497960935823138, + "grad_norm": 0.10384197533130646, + "learning_rate": 2.1192144906604876e-05, + "loss": 0.3103285312652588, + "memory(GiB)": 72.72, + "step": 10480, + "token_acc": 0.891363222526985, + "train_speed(iter/s)": 0.088783 + }, + { + "epoch": 4.5065464691993995, + "grad_norm": 0.10672769695520401, + "learning_rate": 
2.110630145461112e-05, + "loss": 0.3111438512802124, + "memory(GiB)": 72.72, + "step": 10500, + "token_acc": 0.88412093531313, + "train_speed(iter/s)": 0.088785 + }, + { + "epoch": 4.51513200257566, + "grad_norm": 0.10372064262628555, + "learning_rate": 2.102050502718078e-05, + "loss": 0.3104998111724854, + "memory(GiB)": 72.72, + "step": 10520, + "token_acc": 0.8813348577961984, + "train_speed(iter/s)": 0.088787 + }, + { + "epoch": 4.523717535951921, + "grad_norm": 0.1009448915719986, + "learning_rate": 2.093475666048539e-05, + "loss": 0.30964412689208987, + "memory(GiB)": 72.72, + "step": 10540, + "token_acc": 0.8954398710496272, + "train_speed(iter/s)": 0.088788 + }, + { + "epoch": 4.532303069328182, + "grad_norm": 0.10434540361166, + "learning_rate": 2.0849057390116042e-05, + "loss": 0.30902681350708006, + "memory(GiB)": 72.72, + "step": 10560, + "token_acc": 0.8803101400044141, + "train_speed(iter/s)": 0.088789 + }, + { + "epoch": 4.5408886027044435, + "grad_norm": 0.10229279845952988, + "learning_rate": 2.0763408251070866e-05, + "loss": 0.3061969757080078, + "memory(GiB)": 72.72, + "step": 10580, + "token_acc": 0.8930533404217614, + "train_speed(iter/s)": 0.088791 + }, + { + "epoch": 4.549474136080704, + "grad_norm": 0.09319902211427689, + "learning_rate": 2.0677810277742565e-05, + "loss": 0.3094120740890503, + "memory(GiB)": 72.72, + "step": 10600, + "token_acc": 0.8876524522036789, + "train_speed(iter/s)": 0.088793 + }, + { + "epoch": 4.558059669456965, + "grad_norm": 0.09506496042013168, + "learning_rate": 2.0592264503905932e-05, + "loss": 0.3105063199996948, + "memory(GiB)": 72.72, + "step": 10620, + "token_acc": 0.8743828338452405, + "train_speed(iter/s)": 0.088795 + }, + { + "epoch": 4.566645202833226, + "grad_norm": 0.09979739040136337, + "learning_rate": 2.0506771962705304e-05, + "loss": 0.30733799934387207, + "memory(GiB)": 72.72, + "step": 10640, + "token_acc": 0.889678967341867, + "train_speed(iter/s)": 0.088798 + }, + { + "epoch": 
4.5752307362094875, + "grad_norm": 0.0996963307261467, + "learning_rate": 2.0421333686642137e-05, + "loss": 0.30787818431854247, + "memory(GiB)": 72.72, + "step": 10660, + "token_acc": 0.8724791602710936, + "train_speed(iter/s)": 0.0888 + }, + { + "epoch": 4.583816269585748, + "grad_norm": 0.10467605292797089, + "learning_rate": 2.0335950707562535e-05, + "loss": 0.30961976051330564, + "memory(GiB)": 72.72, + "step": 10680, + "token_acc": 0.8865601551069852, + "train_speed(iter/s)": 0.088797 + }, + { + "epoch": 4.592401802962009, + "grad_norm": 0.10287564992904663, + "learning_rate": 2.0250624056644767e-05, + "loss": 0.30673904418945314, + "memory(GiB)": 72.72, + "step": 10700, + "token_acc": 0.8888166591838771, + "train_speed(iter/s)": 0.088799 + }, + { + "epoch": 4.60098733633827, + "grad_norm": 0.10342861711978912, + "learning_rate": 2.0165354764386807e-05, + "loss": 0.3080348253250122, + "memory(GiB)": 72.72, + "step": 10720, + "token_acc": 0.8935362282980741, + "train_speed(iter/s)": 0.088801 + }, + { + "epoch": 4.6095728697145315, + "grad_norm": 0.09834201633930206, + "learning_rate": 2.0080143860593913e-05, + "loss": 0.30832786560058595, + "memory(GiB)": 72.72, + "step": 10740, + "token_acc": 0.8824297207331616, + "train_speed(iter/s)": 0.088803 + }, + { + "epoch": 4.618158403090792, + "grad_norm": 0.10289661586284637, + "learning_rate": 1.9994992374366193e-05, + "loss": 0.3109771251678467, + "memory(GiB)": 72.72, + "step": 10760, + "token_acc": 0.8895210650649608, + "train_speed(iter/s)": 0.088805 + }, + { + "epoch": 4.626743936467053, + "grad_norm": 0.09662512689828873, + "learning_rate": 1.9909901334086152e-05, + "loss": 0.31307733058929443, + "memory(GiB)": 72.72, + "step": 10780, + "token_acc": 0.8865438146287556, + "train_speed(iter/s)": 0.088807 + }, + { + "epoch": 4.635329469843314, + "grad_norm": 0.10243885219097137, + "learning_rate": 1.982487176740627e-05, + "loss": 0.31298274993896485, + "memory(GiB)": 72.72, + "step": 10800, + "token_acc": 
0.8782184863693918, + "train_speed(iter/s)": 0.088808 + }, + { + "epoch": 4.6439150032195755, + "grad_norm": 0.10350590944290161, + "learning_rate": 1.973990470123663e-05, + "loss": 0.309729266166687, + "memory(GiB)": 72.72, + "step": 10820, + "token_acc": 0.8839905751216937, + "train_speed(iter/s)": 0.088809 + }, + { + "epoch": 4.652500536595836, + "grad_norm": 0.10676155984401703, + "learning_rate": 1.9655001161732478e-05, + "loss": 0.3093304395675659, + "memory(GiB)": 72.72, + "step": 10840, + "token_acc": 0.8909490610287415, + "train_speed(iter/s)": 0.088812 + }, + { + "epoch": 4.661086069972097, + "grad_norm": 0.09464031457901001, + "learning_rate": 1.9570162174281847e-05, + "loss": 0.3070455312728882, + "memory(GiB)": 72.72, + "step": 10860, + "token_acc": 0.8747045411759784, + "train_speed(iter/s)": 0.088813 + }, + { + "epoch": 4.669671603348358, + "grad_norm": 0.09355127811431885, + "learning_rate": 1.9485388763493153e-05, + "loss": 0.30823278427124023, + "memory(GiB)": 72.72, + "step": 10880, + "token_acc": 0.9008269805356058, + "train_speed(iter/s)": 0.088815 + }, + { + "epoch": 4.6782571367246195, + "grad_norm": 0.0956326350569725, + "learning_rate": 1.9400681953182855e-05, + "loss": 0.30865190029144285, + "memory(GiB)": 72.72, + "step": 10900, + "token_acc": 0.8869463759204074, + "train_speed(iter/s)": 0.088817 + }, + { + "epoch": 4.68684267010088, + "grad_norm": 0.10339660942554474, + "learning_rate": 1.9316042766363075e-05, + "loss": 0.3091820955276489, + "memory(GiB)": 72.72, + "step": 10920, + "token_acc": 0.8778233411535858, + "train_speed(iter/s)": 0.088819 + }, + { + "epoch": 4.695428203477141, + "grad_norm": 0.0986744612455368, + "learning_rate": 1.9231472225229216e-05, + "loss": 0.31184089183807373, + "memory(GiB)": 72.72, + "step": 10940, + "token_acc": 0.8970116747089019, + "train_speed(iter/s)": 0.088821 + }, + { + "epoch": 4.704013736853402, + "grad_norm": 0.10175996273756027, + "learning_rate": 1.9146971351147655e-05, + "loss": 
0.3101097583770752, + "memory(GiB)": 72.72, + "step": 10960, + "token_acc": 0.8852196976340255, + "train_speed(iter/s)": 0.088823 + }, + { + "epoch": 4.7125992702296635, + "grad_norm": 0.7812356948852539, + "learning_rate": 1.9062541164643403e-05, + "loss": 0.3123283863067627, + "memory(GiB)": 72.72, + "step": 10980, + "token_acc": 0.8854708801840979, + "train_speed(iter/s)": 0.088825 + }, + { + "epoch": 4.721184803605924, + "grad_norm": 0.10162019729614258, + "learning_rate": 1.897818268538776e-05, + "loss": 0.31052777767181394, + "memory(GiB)": 72.72, + "step": 11000, + "token_acc": 0.8801016226848057, + "train_speed(iter/s)": 0.088825 + }, + { + "epoch": 4.721184803605924, + "eval_loss": 0.45501717925071716, + "eval_runtime": 70.0969, + "eval_samples_per_second": 53.697, + "eval_steps_per_second": 0.685, + "eval_token_acc": 0.8396253696160709, + "step": 11000 + }, + { + "epoch": 4.729770336982185, + "grad_norm": 0.10113983601331711, + "learning_rate": 1.8893896932185994e-05, + "loss": 0.30813672542572024, + "memory(GiB)": 72.72, + "step": 11020, + "token_acc": 0.8546351539786743, + "train_speed(iter/s)": 0.088747 + }, + { + "epoch": 4.738355870358446, + "grad_norm": 0.103078193962574, + "learning_rate": 1.8809684922965097e-05, + "loss": 0.30388219356536866, + "memory(GiB)": 72.72, + "step": 11040, + "token_acc": 0.8948337756570212, + "train_speed(iter/s)": 0.088741 + }, + { + "epoch": 4.746941403734707, + "grad_norm": 0.09970963001251221, + "learning_rate": 1.87255476747614e-05, + "loss": 0.3133774042129517, + "memory(GiB)": 72.72, + "step": 11060, + "token_acc": 0.8968223367439061, + "train_speed(iter/s)": 0.088738 + }, + { + "epoch": 4.755526937110968, + "grad_norm": 0.10380697250366211, + "learning_rate": 1.8641486203708387e-05, + "loss": 0.30957233905792236, + "memory(GiB)": 72.72, + "step": 11080, + "token_acc": 0.8824741415108899, + "train_speed(iter/s)": 0.088734 + }, + { + "epoch": 4.764112470487229, + "grad_norm": 0.1037619411945343, + "learning_rate": 
1.855750152502431e-05, + "loss": 0.3057359457015991, + "memory(GiB)": 72.72, + "step": 11100, + "token_acc": 0.8958725033279122, + "train_speed(iter/s)": 0.088733 + }, + { + "epoch": 4.77269800386349, + "grad_norm": 0.10157765448093414, + "learning_rate": 1.847359465300006e-05, + "loss": 0.30702900886535645, + "memory(GiB)": 72.72, + "step": 11120, + "token_acc": 0.876122716238661, + "train_speed(iter/s)": 0.088732 + }, + { + "epoch": 4.781283537239751, + "grad_norm": 0.09982700645923615, + "learning_rate": 1.83897666009868e-05, + "loss": 0.3116676092147827, + "memory(GiB)": 72.72, + "step": 11140, + "token_acc": 0.8806798775281173, + "train_speed(iter/s)": 0.088731 + }, + { + "epoch": 4.789869070616012, + "grad_norm": 0.10041019320487976, + "learning_rate": 1.830601838138382e-05, + "loss": 0.30963037014007566, + "memory(GiB)": 72.72, + "step": 11160, + "token_acc": 0.8754533556507809, + "train_speed(iter/s)": 0.088732 + }, + { + "epoch": 4.798454603992273, + "grad_norm": 0.09908230602741241, + "learning_rate": 1.8222351005626226e-05, + "loss": 0.31059741973876953, + "memory(GiB)": 72.72, + "step": 11180, + "token_acc": 0.8883735287189193, + "train_speed(iter/s)": 0.088733 + }, + { + "epoch": 4.807040137368534, + "grad_norm": 0.10159313678741455, + "learning_rate": 1.8138765484172775e-05, + "loss": 0.3082897186279297, + "memory(GiB)": 72.72, + "step": 11200, + "token_acc": 0.8837576612751032, + "train_speed(iter/s)": 0.088733 + }, + { + "epoch": 4.815625670744795, + "grad_norm": 0.09573191404342651, + "learning_rate": 1.805526282649369e-05, + "loss": 0.31205048561096194, + "memory(GiB)": 72.72, + "step": 11220, + "token_acc": 0.8822307222234796, + "train_speed(iter/s)": 0.088734 + }, + { + "epoch": 4.824211204121056, + "grad_norm": 0.10527610033750534, + "learning_rate": 1.797184404105839e-05, + "loss": 0.3125370264053345, + "memory(GiB)": 72.72, + "step": 11240, + "token_acc": 0.877393258829162, + "train_speed(iter/s)": 0.088733 + }, + { + "epoch": 
4.832796737497317, + "grad_norm": 0.09318065643310547, + "learning_rate": 1.7888510135323414e-05, + "loss": 0.30796611309051514, + "memory(GiB)": 72.72, + "step": 11260, + "token_acc": 0.8781843195222971, + "train_speed(iter/s)": 0.088735 + }, + { + "epoch": 4.841382270873578, + "grad_norm": 0.09891670942306519, + "learning_rate": 1.780526211572016e-05, + "loss": 0.31104702949523927, + "memory(GiB)": 72.72, + "step": 11280, + "token_acc": 0.8761593749911606, + "train_speed(iter/s)": 0.088737 + }, + { + "epoch": 4.849967804249839, + "grad_norm": 0.09686607122421265, + "learning_rate": 1.772210098764281e-05, + "loss": 0.3131218433380127, + "memory(GiB)": 72.72, + "step": 11300, + "token_acc": 0.8879271267617395, + "train_speed(iter/s)": 0.088736 + }, + { + "epoch": 4.8585533376261, + "grad_norm": 0.09879806637763977, + "learning_rate": 1.7639027755436104e-05, + "loss": 0.30540714263916013, + "memory(GiB)": 72.72, + "step": 11320, + "token_acc": 0.8883117608455857, + "train_speed(iter/s)": 0.088738 + }, + { + "epoch": 4.867138871002361, + "grad_norm": 0.09865374863147736, + "learning_rate": 1.7556043422383293e-05, + "loss": 0.3053091287612915, + "memory(GiB)": 72.72, + "step": 11340, + "token_acc": 0.8942929802909607, + "train_speed(iter/s)": 0.088737 + }, + { + "epoch": 4.875724404378622, + "grad_norm": 0.10021404922008514, + "learning_rate": 1.7473148990693955e-05, + "loss": 0.31073627471923826, + "memory(GiB)": 72.72, + "step": 11360, + "token_acc": 0.8850116031551548, + "train_speed(iter/s)": 0.088738 + }, + { + "epoch": 4.884309937754883, + "grad_norm": 0.10069513320922852, + "learning_rate": 1.7390345461491954e-05, + "loss": 0.3094152927398682, + "memory(GiB)": 72.72, + "step": 11380, + "token_acc": 0.8841858526281734, + "train_speed(iter/s)": 0.088739 + }, + { + "epoch": 4.892895471131144, + "grad_norm": 0.1061035767197609, + "learning_rate": 1.730763383480328e-05, + "loss": 0.30918545722961427, + "memory(GiB)": 72.72, + "step": 11400, + "token_acc": 
0.8904189361026621, + "train_speed(iter/s)": 0.08874 + }, + { + "epoch": 4.901481004507405, + "grad_norm": 0.0995524600148201, + "learning_rate": 1.722501510954403e-05, + "loss": 0.3127927541732788, + "memory(GiB)": 72.72, + "step": 11420, + "token_acc": 0.8850685685523632, + "train_speed(iter/s)": 0.088743 + }, + { + "epoch": 4.910066537883666, + "grad_norm": 0.09735783189535141, + "learning_rate": 1.7142490283508324e-05, + "loss": 0.30820300579071047, + "memory(GiB)": 72.72, + "step": 11440, + "token_acc": 0.8748503235514588, + "train_speed(iter/s)": 0.088745 + }, + { + "epoch": 4.918652071259927, + "grad_norm": 0.10155107080936432, + "learning_rate": 1.706006035335625e-05, + "loss": 0.3070305109024048, + "memory(GiB)": 72.72, + "step": 11460, + "token_acc": 0.886489278720699, + "train_speed(iter/s)": 0.088748 + }, + { + "epoch": 4.927237604636188, + "grad_norm": 0.11103896051645279, + "learning_rate": 1.6977726314601806e-05, + "loss": 0.31273181438446046, + "memory(GiB)": 72.72, + "step": 11480, + "token_acc": 0.8837625376784819, + "train_speed(iter/s)": 0.08875 + }, + { + "epoch": 4.935823138012449, + "grad_norm": 0.09665607661008835, + "learning_rate": 1.6895489161600924e-05, + "loss": 0.30753934383392334, + "memory(GiB)": 72.72, + "step": 11500, + "token_acc": 0.8802015271291028, + "train_speed(iter/s)": 0.088752 + }, + { + "epoch": 4.94440867138871, + "grad_norm": 0.0969487726688385, + "learning_rate": 1.6813349887539443e-05, + "loss": 0.3144726514816284, + "memory(GiB)": 72.72, + "step": 11520, + "token_acc": 0.8802431565821507, + "train_speed(iter/s)": 0.088753 + }, + { + "epoch": 4.952994204764971, + "grad_norm": 0.09839560836553574, + "learning_rate": 1.67313094844211e-05, + "loss": 0.30981805324554446, + "memory(GiB)": 72.72, + "step": 11540, + "token_acc": 0.8963515858448547, + "train_speed(iter/s)": 0.088755 + }, + { + "epoch": 4.961579738141232, + "grad_norm": 0.10357420891523361, + "learning_rate": 1.664936894305554e-05, + "loss": 
0.3088369846343994, + "memory(GiB)": 72.72, + "step": 11560, + "token_acc": 0.8838528141659493, + "train_speed(iter/s)": 0.088757 + }, + { + "epoch": 4.970165271517493, + "grad_norm": 0.09701311588287354, + "learning_rate": 1.65675292530464e-05, + "loss": 0.31214241981506347, + "memory(GiB)": 72.72, + "step": 11580, + "token_acc": 0.8797079209755736, + "train_speed(iter/s)": 0.088759 + }, + { + "epoch": 4.978750804893754, + "grad_norm": 0.09698698669672012, + "learning_rate": 1.648579140277931e-05, + "loss": 0.3103867292404175, + "memory(GiB)": 72.72, + "step": 11600, + "token_acc": 0.874222062607426, + "train_speed(iter/s)": 0.088759 + }, + { + "epoch": 4.987336338270015, + "grad_norm": 0.09471474587917328, + "learning_rate": 1.640415637940996e-05, + "loss": 0.31050570011138917, + "memory(GiB)": 72.72, + "step": 11620, + "token_acc": 0.891306756689066, + "train_speed(iter/s)": 0.088762 + }, + { + "epoch": 4.995921871646276, + "grad_norm": 0.09426256269216537, + "learning_rate": 1.6322625168852217e-05, + "loss": 0.31265413761138916, + "memory(GiB)": 72.72, + "step": 11640, + "token_acc": 0.8955581978003312, + "train_speed(iter/s)": 0.088764 + }, + { + "epoch": 5.00429276668813, + "grad_norm": 0.11454425007104874, + "learning_rate": 1.6241198755766175e-05, + "loss": 0.2891073703765869, + "memory(GiB)": 72.72, + "step": 11660, + "token_acc": 0.8897775721320687, + "train_speed(iter/s)": 0.088768 + }, + { + "epoch": 5.012878300064392, + "grad_norm": 0.10882110148668289, + "learning_rate": 1.6159878123546275e-05, + "loss": 0.2693314790725708, + "memory(GiB)": 72.72, + "step": 11680, + "token_acc": 0.8982194210665359, + "train_speed(iter/s)": 0.088762 + }, + { + "epoch": 5.021463833440652, + "grad_norm": 0.10319822281599045, + "learning_rate": 1.6078664254309436e-05, + "loss": 0.27081449031829835, + "memory(GiB)": 72.72, + "step": 11700, + "token_acc": 0.9001353267268086, + "train_speed(iter/s)": 0.088765 + }, + { + "epoch": 5.030049366816914, + "grad_norm": 
0.11284149438142776, + "learning_rate": 1.59975581288832e-05, + "loss": 0.27441935539245604, + "memory(GiB)": 72.72, + "step": 11720, + "token_acc": 0.8984662917082801, + "train_speed(iter/s)": 0.088767 + }, + { + "epoch": 5.038634900193174, + "grad_norm": 0.10425851494073868, + "learning_rate": 1.591656072679387e-05, + "loss": 0.2715555906295776, + "memory(GiB)": 72.72, + "step": 11740, + "token_acc": 0.890478422247908, + "train_speed(iter/s)": 0.088768 + }, + { + "epoch": 5.047220433569436, + "grad_norm": 0.11284064501523972, + "learning_rate": 1.583567302625469e-05, + "loss": 0.2725609540939331, + "memory(GiB)": 72.72, + "step": 11760, + "token_acc": 0.897063681945232, + "train_speed(iter/s)": 0.088769 + }, + { + "epoch": 5.055805966945696, + "grad_norm": 0.11659186333417892, + "learning_rate": 1.5754896004154e-05, + "loss": 0.2763663291931152, + "memory(GiB)": 72.72, + "step": 11780, + "token_acc": 0.8892781727292928, + "train_speed(iter/s)": 0.088771 + }, + { + "epoch": 5.064391500321958, + "grad_norm": 0.10959072411060333, + "learning_rate": 1.567423063604352e-05, + "loss": 0.27177045345306394, + "memory(GiB)": 72.72, + "step": 11800, + "token_acc": 0.8928949946338838, + "train_speed(iter/s)": 0.088772 + }, + { + "epoch": 5.072977033698218, + "grad_norm": 0.11139950156211853, + "learning_rate": 1.5593677896126462e-05, + "loss": 0.2721517086029053, + "memory(GiB)": 72.72, + "step": 11820, + "token_acc": 0.895870023109786, + "train_speed(iter/s)": 0.088773 + }, + { + "epoch": 5.08156256707448, + "grad_norm": 0.10203303396701813, + "learning_rate": 1.551323875724587e-05, + "loss": 0.27356884479522703, + "memory(GiB)": 72.72, + "step": 11840, + "token_acc": 0.9059859374397118, + "train_speed(iter/s)": 0.088775 + }, + { + "epoch": 5.09014810045074, + "grad_norm": 0.10516630858182907, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.2754658222198486, + "memory(GiB)": 72.72, + "step": 11860, + "token_acc": 0.8908944849786643, + "train_speed(iter/s)": 0.088776 
+ }, + { + "epoch": 5.098733633827002, + "grad_norm": 0.10858064144849777, + "learning_rate": 1.5352705167094477e-05, + "loss": 0.2734870672225952, + "memory(GiB)": 72.72, + "step": 11880, + "token_acc": 0.8974409839317595, + "train_speed(iter/s)": 0.088777 + }, + { + "epoch": 5.107319167203262, + "grad_norm": 0.11306975036859512, + "learning_rate": 1.527261265460296e-05, + "loss": 0.27300803661346434, + "memory(GiB)": 72.72, + "step": 11900, + "token_acc": 0.9109106165341432, + "train_speed(iter/s)": 0.08878 + }, + { + "epoch": 5.115904700579524, + "grad_norm": 0.10477675497531891, + "learning_rate": 1.5192637620682981e-05, + "loss": 0.2717351198196411, + "memory(GiB)": 72.72, + "step": 11920, + "token_acc": 0.8888284413313953, + "train_speed(iter/s)": 0.088782 + }, + { + "epoch": 5.124490233955784, + "grad_norm": 0.11272590607404709, + "learning_rate": 1.5112781031200569e-05, + "loss": 0.2693598508834839, + "memory(GiB)": 72.72, + "step": 11940, + "token_acc": 0.8937451291948688, + "train_speed(iter/s)": 0.088784 + }, + { + "epoch": 5.133075767332046, + "grad_norm": 0.1111418828368187, + "learning_rate": 1.5033043850591256e-05, + "loss": 0.2743582487106323, + "memory(GiB)": 72.72, + "step": 11960, + "token_acc": 0.8922390332455552, + "train_speed(iter/s)": 0.088786 + }, + { + "epoch": 5.141661300708306, + "grad_norm": 0.10565278679132462, + "learning_rate": 1.4953427041848473e-05, + "loss": 0.2750978946685791, + "memory(GiB)": 72.72, + "step": 11980, + "token_acc": 0.8900847655801997, + "train_speed(iter/s)": 0.088787 + }, + { + "epoch": 5.150246834084568, + "grad_norm": 0.10609643161296844, + "learning_rate": 1.4873931566511901e-05, + "loss": 0.27565574645996094, + "memory(GiB)": 72.72, + "step": 12000, + "token_acc": 0.9002406831246359, + "train_speed(iter/s)": 0.088788 + }, + { + "epoch": 5.150246834084568, + "eval_loss": 0.47488316893577576, + "eval_runtime": 70.9516, + "eval_samples_per_second": 53.05, + "eval_steps_per_second": 0.677, + "eval_token_acc": 
0.8365712662327689, + "step": 12000 + } + ], + "logging_steps": 20, + "max_steps": 18640, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.567507230772429e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..09d7dfc --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f5dea8d8c5f4234be8ff86a50a240bffb4568f8258427de2ba35f9d870b2b26 +size 8248 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833