commit a1d74f9c89c19cb6b1988625ba331a836a08d0ed Author: ModelHub XC Date: Sat May 23 12:48:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: IIGroup/X-Coder-SFT-Qwen3-8B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4964134 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,56 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text + + +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text + +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs 
merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text +merges.txt filter=lfs diff=lfs merge=lfs -text +model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text +training_args.bin filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb02b06 --- /dev/null +++ b/README.md @@ -0,0 +1,100 @@ +--- +license: apache-2.0 +base_model: + - Qwen/Qwen3-8B-Base +datasets: + - IIGroup/X-Coder-SFT-376k +language: + - en +tags: + - code + - sft + - competitive-programming +--- + +# X-Coder-SFT-Qwen3-8B + +X-Coder-SFT-Qwen3-8B is a code generation model fine-tuned on fully synthetic instruction data, designed for competitive programming tasks. It serves as the foundation for subsequent RLVR training. + +## Model Description + +- **Base Model**: [Qwen/Qwen3-8B-Base](https://huggingface.co/Qwen/Qwen3-8B-Base) +- **Training Method**: Supervised Fine-Tuning (SFT) +- **Training Data**: [IIGroup/X-Coder-SFT-376k](https://huggingface.co/datasets/IIGroup/X-Coder-SFT-376k) +- **Parameters**: 8B + +## Training + +This model was trained using [ms-swift](https://github.com/modelscope/ms-swift). For training details and code, please refer to the [X-Coder GitHub repository](https://github.com/JieWu02/X-Coder). 
+ +### Training Hyperparameters + +| Parameter | Value | +|-----------|-------| +| Base Model | Qwen/Qwen3-8B-Base | +| Training Type | Full Parameter | +| Epochs | 8 | +| Global Batch Size | 128 | +| Learning Rate | 5e-5 | +| Max Grad Norm | 1.0 | +| Max Length | 32768 | +| Torch Dtype | bfloat16 | +| DeepSpeed | ZeRO-3 Offload (80GB VRAM) / ZeRO-2 (142GB VRAM) | +| Packing | True (2x faster training, slightly worse performance) | + +## Performance on LiveCodeBench v5 + +![Results](results.png) + +## Recommended Inference Parameters + +| Parameter | Value | +|-----------|-------| +| temperature | 0.6 | +| top_p | 0.95 | +| top_k | 20 (or -1 to disable) | +| max_new_tokens | 32768 | + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "IIGroup/X-Coder-SFT-Qwen3-8B" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") + +prompt = "Write a Python function to solve the two sum problem."
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +outputs = model.generate( + **inputs, + max_new_tokens=32768, + temperature=0.6, + top_p=0.95, + top_k=20, + do_sample=True +) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +## Related Models + +- **RL Model**: [IIGroup/X-Coder-RL-Qwen3-8B](https://huggingface.co/IIGroup/X-Coder-RL-Qwen3-8B) - RLVR trained version achieving 64.0 on LiveCodeBench + +## Citation + +```bibtex +@misc{wu2026xcoderadvancingcompetitiveprogramming, + title={X-Coder: Advancing Competitive Programming with Fully Synthetic Tasks, Solutions, and Tests}, + author={Jie Wu and Haoling Li and Xin Zhang and Jiani Guo and Jane Luo and Steven Liu and Yangyu Huang and Ruihang Chu and Scarlett Li and Yujiu Yang}, + year={2026}, + eprint={2601.06953}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2601.06953}, +} +``` + +## License + +This project is licensed under the Apache License 2.0. diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "</think>": 151668, + "</tool_call>": 151658, + "</tool_response>": 151666, + "<think>": 151667, + "<tool_call>": 151657, + "<tool_response>": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/args.json b/args.json new file mode 100644 index 0000000..65e2e57 --- /dev/null +++ b/args.json @@ -0,0 +1,366 @@ +{ + "model": "Qwen3-8B-Base", + "model_type": "qwen3", + "model_revision": null, + "task_type": 
"causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "init_strategy": null, + "template": "qwen3", + "system": null, + "max_length": 24000, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "default", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "376k_sft.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "load_from_cache_file": true, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": true, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "full", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ddp_timeout": 1800, + "ddp_backend": "nccl", + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": 
"/openpai_config/sft/Long_Cot_data/r1-SFT-380k-24k-length-Qwen3-8B-Base-bs176-7p-5e-5/v0-20250824-012153", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 2, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 5e-05, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 7.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.03, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/openpai_config/sft/Long_Cot_data/r1-SFT-380k-24k-length-Qwen3-8B-Base-bs176-7p-5e-5/v0-20250824-012153/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 400.0, + "save_total_limit": 40, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": true, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 400.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": 
"/openpai_config/sft/Long_Cot_data/r1-SFT-380k-24k-length-Qwen3-8B-Base-bs176-7p-5e-5/v0-20250824-012153", + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": false, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + 
"hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "max_epochs": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "eval_use_evalscope": false, + "eval_datasets": [], + "eval_limit": null, + "eval_datasets_args": null, + "eval_generation_config": null, + "freeze_parameters": [], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "trainable_parameters_regex": null, + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + 
"init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": true, + "loss_type": null, + "optimizer": null, + "metric": null, + "zero_hpz_partition_size": null, + "rank": 0, + "global_world_size": 88, + "local_world_size": 8, + "model_suffix": "Qwen3-8B-Base", + "model_info": "ModelInfo(model_type='qwen3', model_dir='/openpai_config/sft/Long_Cot_data/Qwen3-8B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": 
"ModelMeta(model_type='qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B-Base', hf_model_id='Qwen/Qwen3-0.6B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-Base', hf_model_id='Qwen/Qwen3-1.7B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Base', hf_model_id='Qwen/Qwen3-4B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-Base', hf_model_id='Qwen/Qwen3-8B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-Base', hf_model_id='Qwen/Qwen3-14B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-Base', hf_model_id='Qwen/Qwen3-32B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B', hf_model_id='Qwen/Qwen3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B', hf_model_id='Qwen/Qwen3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B', hf_model_id='Qwen/Qwen3-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B-FP8', hf_model_id='Qwen/Qwen3-0.6B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-FP8', hf_model_id='Qwen/Qwen3-1.7B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-FP8', hf_model_id='Qwen/Qwen3-4B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-FP8', 
hf_model_id='Qwen/Qwen3-8B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-FP8', hf_model_id='Qwen/Qwen3-14B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-FP8', hf_model_id='Qwen/Qwen3-32B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-AWQ', hf_model_id='Qwen/Qwen3-4B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-AWQ', hf_model_id='Qwen/Qwen3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-AWQ', hf_model_id='Qwen/Qwen3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-AWQ', hf_model_id='Qwen/Qwen3-32B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-32B-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3', get_function=, model_arch='llama', architectures=['Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])", + "model_dir": "/openpai_config/sft/Long_Cot_data/Qwen3-8B-Base", + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/openpai_config/sft/Long_Cot_data/r1-SFT-380k-24k-length-Qwen3-8B-Base-bs176-7p-5e-5/v0-20250824-012153', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, 
max_grad_norm=1.0, num_train_epochs=7.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.03, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/openpai_config/sft/Long_Cot_data/r1-SFT-380k-24k-length-Qwen3-8B-Base-bs176-7p-5e-5/v0-20250824-012153/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=400, save_total_limit=40, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend='nccl', tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=400, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/openpai_config/sft/Long_Cot_data/r1-SFT-380k-24k-length-Qwen3-8B-Base-bs176-7p-5e-5/v0-20250824-012153', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 
'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', 
train_dataloader_shuffle=True, max_epochs=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', optimizer=None, local_repo_path=None, galore_config=None)" +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..699ff8d --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} 
+ {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e0f5a76 --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + 
"attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.52.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..159097f --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "others", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..5deb124 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "bos_token_id": 151643, + "eos_token_id": 151643, + "max_new_tokens": 2048, + "transformers_version": "4.52.3" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..b350b37 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba405bd4483ddb136956bf44b21015df196e38917d0de9ffc97092ddda36fa4 +size 4902257696 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..56107b2 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:461e07415ac26eafee1aef7cb1ae1ba3e0d8687b7e97b2af5e91dae842297b29 +size 4915960368 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..53a8f33 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7881b3054e5034cfb606b5d251506f3bd8bef0ec8935a61519d543ab1295f928 +size 4983068496 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..8c109ab --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91ba0ad9442ba46a7aa49d7ec8da9aaedf113f80f47a4cffdcb4041af0f8c21f +size 1580230264 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..10c7a17 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,406 @@ +{ + "metadata": { + "total_size": 16381470720 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", 
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_norm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_norm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/results.png b/results.png new file mode 100644 index 0000000..7126e89 Binary files /dev/null and b/results.png differ diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..17305b3 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,31 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "eos_token": { + 
"content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..cd71f61 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4 +size 11422654 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..eb89444 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,239 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": 
"<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": {}, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..5299f6a --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1793 @@ +{ + "best_global_step": 1200, + "best_metric": 0.43056953, + "best_model_checkpoint": "/openpai_config/sft/Long_Cot_data/Stage1-380k-24k-length-Qwen3-8B-Base-resume-iter4600-4p-3e-5/v0-20250826-235423/checkpoint-1200", + "epoch": 2.2665292804396358, + "eval_steps": 300, + "global_step": 3300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006869311351537008, + "grad_norm": 0.07274238765239716, + "learning_rate": 2.9999997817680842e-05, + "loss": 
0.2985985279083252, + "memory(GiB)": 56.61, + "step": 1, + "token_acc": 0.8812937590030485, + "train_speed(iter/s)": 0.013692 + }, + { + "epoch": 0.013738622703074016, + "grad_norm": 0.10364966094493866, + "learning_rate": 2.9999127080781484e-05, + "loss": 0.32278080990439967, + "memory(GiB)": 70.37, + "step": 20, + "token_acc": 0.8852460572314409, + "train_speed(iter/s)": 0.033578 + }, + { + "epoch": 0.027477245406148033, + "grad_norm": 0.0828627347946167, + "learning_rate": 2.999650842472434e-05, + "loss": 0.3287534713745117, + "memory(GiB)": 70.37, + "step": 40, + "token_acc": 0.8729899276605303, + "train_speed(iter/s)": 0.035269 + }, + { + "epoch": 0.04121586810922205, + "grad_norm": 0.075782909989357, + "learning_rate": 2.9992144336611927e-05, + "loss": 0.32648520469665526, + "memory(GiB)": 70.37, + "step": 60, + "token_acc": 0.871392951813677, + "train_speed(iter/s)": 0.03605 + }, + { + "epoch": 0.054954490812296065, + "grad_norm": 0.09333578497171402, + "learning_rate": 2.998603532437709e-05, + "loss": 0.3258840799331665, + "memory(GiB)": 70.37, + "step": 80, + "token_acc": 0.8820731351865331, + "train_speed(iter/s)": 0.036483 + }, + { + "epoch": 0.06869311351537008, + "grad_norm": 0.08618636429309845, + "learning_rate": 2.9978182099043062e-05, + "loss": 0.3262542724609375, + "memory(GiB)": 70.37, + "step": 100, + "token_acc": 0.8849732393640833, + "train_speed(iter/s)": 0.036776 + }, + { + "epoch": 0.0824317362184441, + "grad_norm": 0.08525851368904114, + "learning_rate": 2.9968585574640675e-05, + "loss": 0.32371375560760496, + "memory(GiB)": 70.37, + "step": 120, + "token_acc": 0.8871000703215756, + "train_speed(iter/s)": 0.036983 + }, + { + "epoch": 0.09617035892151812, + "grad_norm": 0.08820851147174835, + "learning_rate": 2.995724686810202e-05, + "loss": 0.3224189281463623, + "memory(GiB)": 70.37, + "step": 140, + "token_acc": 0.8856544898235427, + "train_speed(iter/s)": 0.03715 + }, + { + "epoch": 0.10990898162459213, + "grad_norm": 0.0837075412273407, 
+ "learning_rate": 2.9944167299130397e-05, + "loss": 0.32613368034362794, + "memory(GiB)": 70.4, + "step": 160, + "token_acc": 0.8848279812123834, + "train_speed(iter/s)": 0.037079 + }, + { + "epoch": 0.12364760432766615, + "grad_norm": 0.09300094842910767, + "learning_rate": 2.9929348390046766e-05, + "loss": 0.32998530864715575, + "memory(GiB)": 70.4, + "step": 180, + "token_acc": 0.8633912245766957, + "train_speed(iter/s)": 0.037181 + }, + { + "epoch": 0.13738622703074016, + "grad_norm": 0.08325231075286865, + "learning_rate": 2.9912791865612525e-05, + "loss": 0.3349442958831787, + "memory(GiB)": 70.4, + "step": 200, + "token_acc": 0.8835910835510771, + "train_speed(iter/s)": 0.037269 + }, + { + "epoch": 0.15112484973381418, + "grad_norm": 0.08610066026449203, + "learning_rate": 2.9894499652828798e-05, + "loss": 0.33656883239746094, + "memory(GiB)": 70.4, + "step": 220, + "token_acc": 0.8754293623532549, + "train_speed(iter/s)": 0.037356 + }, + { + "epoch": 0.1648634724368882, + "grad_norm": 0.09172473102807999, + "learning_rate": 2.9874473880712125e-05, + "loss": 0.3357390403747559, + "memory(GiB)": 70.4, + "step": 240, + "token_acc": 0.8816886144038251, + "train_speed(iter/s)": 0.037427 + }, + { + "epoch": 0.17860209513996222, + "grad_norm": 0.08784262090921402, + "learning_rate": 2.9852716880046687e-05, + "loss": 0.33715412616729734, + "memory(GiB)": 70.4, + "step": 260, + "token_acc": 0.8768648698038725, + "train_speed(iter/s)": 0.037172 + }, + { + "epoch": 0.19234071784303625, + "grad_norm": 0.08109364658594131, + "learning_rate": 2.9829231183113013e-05, + "loss": 0.3330291509628296, + "memory(GiB)": 70.4, + "step": 280, + "token_acc": 0.884993071444176, + "train_speed(iter/s)": 0.037251 + }, + { + "epoch": 0.20607934054611024, + "grad_norm": 0.08930086344480515, + "learning_rate": 2.980401952339328e-05, + "loss": 0.32976431846618653, + "memory(GiB)": 70.4, + "step": 300, + "token_acc": 0.8701908549811445, + "train_speed(iter/s)": 0.037312 + }, + { + 
"epoch": 0.20607934054611024, + "eval_loss": 0.4335128664970398, + "eval_runtime": 97.7946, + "eval_samples_per_second": 38.489, + "eval_steps_per_second": 0.603, + "eval_token_acc": 0.8434016162256411, + "step": 300 + }, + { + "epoch": 0.21981796324918426, + "grad_norm": 0.08906491845846176, + "learning_rate": 2.9777084835253107e-05, + "loss": 0.3341225624084473, + "memory(GiB)": 72.41, + "step": 320, + "token_acc": 0.8633943248975853, + "train_speed(iter/s)": 0.036714 + }, + { + "epoch": 0.23355658595225828, + "grad_norm": 0.09569748491048813, + "learning_rate": 2.9748430253600103e-05, + "loss": 0.3317814826965332, + "memory(GiB)": 72.41, + "step": 340, + "token_acc": 0.8690857078605088, + "train_speed(iter/s)": 0.036742 + }, + { + "epoch": 0.2472952086553323, + "grad_norm": 0.09385869652032852, + "learning_rate": 2.9718059113518926e-05, + "loss": 0.33147258758544923, + "memory(GiB)": 72.41, + "step": 360, + "token_acc": 0.8744091464585358, + "train_speed(iter/s)": 0.036789 + }, + { + "epoch": 0.2610338313584063, + "grad_norm": 0.09747012704610825, + "learning_rate": 2.9685974949883163e-05, + "loss": 0.3316455841064453, + "memory(GiB)": 72.41, + "step": 380, + "token_acc": 0.8744771091993391, + "train_speed(iter/s)": 0.036834 + }, + { + "epoch": 0.2747724540614803, + "grad_norm": 0.0942377969622612, + "learning_rate": 2.9652181496943888e-05, + "loss": 0.33130803108215334, + "memory(GiB)": 72.41, + "step": 400, + "token_acc": 0.8743932496768547, + "train_speed(iter/s)": 0.036889 + }, + { + "epoch": 0.28851107676455434, + "grad_norm": 0.08894450962543488, + "learning_rate": 2.9616682687895038e-05, + "loss": 0.3286163806915283, + "memory(GiB)": 72.41, + "step": 420, + "token_acc": 0.8871290646641795, + "train_speed(iter/s)": 0.036939 + }, + { + "epoch": 0.30224969946762836, + "grad_norm": 0.09808226674795151, + "learning_rate": 2.9579482654415627e-05, + "loss": 0.3300768375396729, + "memory(GiB)": 72.41, + "step": 440, + "token_acc": 0.8833857380634584, + 
"train_speed(iter/s)": 0.036991 + }, + { + "epoch": 0.3159883221707024, + "grad_norm": 0.09648197889328003, + "learning_rate": 2.9540585726188883e-05, + "loss": 0.3316764831542969, + "memory(GiB)": 72.41, + "step": 460, + "token_acc": 0.8862556346113715, + "train_speed(iter/s)": 0.037032 + }, + { + "epoch": 0.3297269448737764, + "grad_norm": 0.08601760119199753, + "learning_rate": 2.9499996430398296e-05, + "loss": 0.3263013124465942, + "memory(GiB)": 72.41, + "step": 480, + "token_acc": 0.8804854791092939, + "train_speed(iter/s)": 0.037074 + }, + { + "epoch": 0.34346556757685043, + "grad_norm": 0.08579235523939133, + "learning_rate": 2.945771949120071e-05, + "loss": 0.33027398586273193, + "memory(GiB)": 72.41, + "step": 500, + "token_acc": 0.8894077316408038, + "train_speed(iter/s)": 0.037103 + }, + { + "epoch": 0.35720419027992445, + "grad_norm": 0.08523233234882355, + "learning_rate": 2.9413759829176497e-05, + "loss": 0.32302305698394773, + "memory(GiB)": 72.41, + "step": 520, + "token_acc": 0.8851045135756845, + "train_speed(iter/s)": 0.037131 + }, + { + "epoch": 0.37094281298299847, + "grad_norm": 0.08190900087356567, + "learning_rate": 2.9368122560756822e-05, + "loss": 0.3292397975921631, + "memory(GiB)": 72.41, + "step": 540, + "token_acc": 0.8731282020961862, + "train_speed(iter/s)": 0.037164 + }, + { + "epoch": 0.3846814356860725, + "grad_norm": 0.0842556357383728, + "learning_rate": 2.9320812997628184e-05, + "loss": 0.3262279748916626, + "memory(GiB)": 72.41, + "step": 560, + "token_acc": 0.8791384953802229, + "train_speed(iter/s)": 0.037189 + }, + { + "epoch": 0.3984200583891465, + "grad_norm": 0.09352608770132065, + "learning_rate": 2.9271836646114166e-05, + "loss": 0.3300283908843994, + "memory(GiB)": 72.41, + "step": 580, + "token_acc": 0.8848605133294101, + "train_speed(iter/s)": 0.037218 + }, + { + "epoch": 0.4121586810922205, + "grad_norm": 0.08545698970556259, + "learning_rate": 2.922119920653457e-05, + "loss": 0.33294997215270994, + "memory(GiB)": 
72.41, + "step": 600, + "token_acc": 0.8852764804825887, + "train_speed(iter/s)": 0.037244 + }, + { + "epoch": 0.4121586810922205, + "eval_loss": 0.43278947472572327, + "eval_runtime": 98.2411, + "eval_samples_per_second": 38.314, + "eval_steps_per_second": 0.601, + "eval_token_acc": 0.8435347324817475, + "step": 600 + }, + { + "epoch": 0.4258973037952945, + "grad_norm": 0.09264256060123444, + "learning_rate": 2.916890657254194e-05, + "loss": 0.32485041618347166, + "memory(GiB)": 72.41, + "step": 620, + "token_acc": 0.8645305113582833, + "train_speed(iter/s)": 0.036936 + }, + { + "epoch": 0.4396359264983685, + "grad_norm": 0.08746380358934402, + "learning_rate": 2.9114964830435648e-05, + "loss": 0.32976808547973635, + "memory(GiB)": 72.41, + "step": 640, + "token_acc": 0.8707312036033282, + "train_speed(iter/s)": 0.036935 + }, + { + "epoch": 0.45337454920144254, + "grad_norm": 0.08795125037431717, + "learning_rate": 2.9059380258453473e-05, + "loss": 0.3318798303604126, + "memory(GiB)": 72.41, + "step": 660, + "token_acc": 0.8750859732168144, + "train_speed(iter/s)": 0.03695 + }, + { + "epoch": 0.46711317190451657, + "grad_norm": 0.09344589710235596, + "learning_rate": 2.9002159326040897e-05, + "loss": 0.3324501752853394, + "memory(GiB)": 72.41, + "step": 680, + "token_acc": 0.8820933572199229, + "train_speed(iter/s)": 0.036974 + }, + { + "epoch": 0.4808517946075906, + "grad_norm": 0.08175963908433914, + "learning_rate": 2.894330869309814e-05, + "loss": 0.3342601776123047, + "memory(GiB)": 72.41, + "step": 700, + "token_acc": 0.888566898590454, + "train_speed(iter/s)": 0.037001 + }, + { + "epoch": 0.4945904173106646, + "grad_norm": 0.08012371510267258, + "learning_rate": 2.8882835209205e-05, + "loss": 0.3325735807418823, + "memory(GiB)": 72.41, + "step": 720, + "token_acc": 0.8726016398435306, + "train_speed(iter/s)": 0.03703 + }, + { + "epoch": 0.5083290400137386, + "grad_norm": 0.08838852494955063, + "learning_rate": 2.8820745912823653e-05, + "loss": 
0.3366635799407959, + "memory(GiB)": 72.41, + "step": 740, + "token_acc": 0.8669464276591397, + "train_speed(iter/s)": 0.037058 + }, + { + "epoch": 0.5220676627168126, + "grad_norm": 0.08403000980615616, + "learning_rate": 2.8757048030479438e-05, + "loss": 0.33883938789367674, + "memory(GiB)": 72.41, + "step": 760, + "token_acc": 0.8731299584725523, + "train_speed(iter/s)": 0.037083 + }, + { + "epoch": 0.5358062854198866, + "grad_norm": 0.09687156975269318, + "learning_rate": 2.8691748975919784e-05, + "loss": 0.3338437557220459, + "memory(GiB)": 72.41, + "step": 780, + "token_acc": 0.8848287485135103, + "train_speed(iter/s)": 0.037115 + }, + { + "epoch": 0.5495449081229606, + "grad_norm": 0.08311958611011505, + "learning_rate": 2.862485634925131e-05, + "loss": 0.3358239889144897, + "memory(GiB)": 72.41, + "step": 800, + "token_acc": 0.8782527593369446, + "train_speed(iter/s)": 0.037145 + }, + { + "epoch": 0.5632835308260347, + "grad_norm": 0.08971302956342697, + "learning_rate": 2.855637793605527e-05, + "loss": 0.3317149877548218, + "memory(GiB)": 72.41, + "step": 820, + "token_acc": 0.8807387938176727, + "train_speed(iter/s)": 0.037168 + }, + { + "epoch": 0.5770221535291087, + "grad_norm": 0.08680903911590576, + "learning_rate": 2.848632170648139e-05, + "loss": 0.3325679779052734, + "memory(GiB)": 72.41, + "step": 840, + "token_acc": 0.8763491204567425, + "train_speed(iter/s)": 0.037194 + }, + { + "epoch": 0.5907607762321827, + "grad_norm": 0.08324452489614487, + "learning_rate": 2.8414695814320224e-05, + "loss": 0.3364755868911743, + "memory(GiB)": 72.41, + "step": 860, + "token_acc": 0.8762890432412627, + "train_speed(iter/s)": 0.037217 + }, + { + "epoch": 0.6044993989352567, + "grad_norm": 0.08400746434926987, + "learning_rate": 2.834150859605415e-05, + "loss": 0.3361694812774658, + "memory(GiB)": 72.41, + "step": 880, + "token_acc": 0.8813631606405008, + "train_speed(iter/s)": 0.037241 + }, + { + "epoch": 0.6182380216383307, + "grad_norm": 0.08885052800178528, 
+ "learning_rate": 2.8266768569887078e-05, + "loss": 0.33670692443847655, + "memory(GiB)": 72.41, + "step": 900, + "token_acc": 0.8707416462917685, + "train_speed(iter/s)": 0.037262 + }, + { + "epoch": 0.6182380216383307, + "eval_loss": 0.43130558729171753, + "eval_runtime": 98.6067, + "eval_samples_per_second": 38.172, + "eval_steps_per_second": 0.598, + "eval_token_acc": 0.8438052359165757, + "step": 900 + }, + { + "epoch": 0.6319766443414048, + "grad_norm": 0.08866509050130844, + "learning_rate": 2.8190484434753047e-05, + "loss": 0.3301401615142822, + "memory(GiB)": 72.41, + "step": 920, + "token_acc": 0.8694062120826557, + "train_speed(iter/s)": 0.037051 + }, + { + "epoch": 0.6457152670444788, + "grad_norm": 0.0904315784573555, + "learning_rate": 2.811266506930373e-05, + "loss": 0.3325372219085693, + "memory(GiB)": 72.41, + "step": 940, + "token_acc": 0.8789977772514003, + "train_speed(iter/s)": 0.037059 + }, + { + "epoch": 0.6594538897475528, + "grad_norm": 0.08624011278152466, + "learning_rate": 2.80333195308751e-05, + "loss": 0.33223259449005127, + "memory(GiB)": 72.41, + "step": 960, + "token_acc": 0.8850906146562588, + "train_speed(iter/s)": 0.037068 + }, + { + "epoch": 0.6731925124506268, + "grad_norm": 0.09679839015007019, + "learning_rate": 2.7952457054433193e-05, + "loss": 0.3346273183822632, + "memory(GiB)": 72.41, + "step": 980, + "token_acc": 0.8790110005974765, + "train_speed(iter/s)": 0.037086 + }, + { + "epoch": 0.6869311351537009, + "grad_norm": 0.08788934350013733, + "learning_rate": 2.787008705149932e-05, + "loss": 0.331668758392334, + "memory(GiB)": 72.41, + "step": 1000, + "token_acc": 0.8727144921802968, + "train_speed(iter/s)": 0.037101 + }, + { + "epoch": 0.7006697578567749, + "grad_norm": 0.07837922871112823, + "learning_rate": 2.7786219109054618e-05, + "loss": 0.33264620304107667, + "memory(GiB)": 72.41, + "step": 1020, + "token_acc": 0.8763268597016804, + "train_speed(iter/s)": 0.037116 + }, + { + "epoch": 0.7144083805598489, + 
"grad_norm": 0.08805106580257416, + "learning_rate": 2.770086298842426e-05, + "loss": 0.33027656078338624, + "memory(GiB)": 72.41, + "step": 1040, + "token_acc": 0.8825189704888438, + "train_speed(iter/s)": 0.037133 + }, + { + "epoch": 0.7281470032629229, + "grad_norm": 0.07906992733478546, + "learning_rate": 2.7614028624141333e-05, + "loss": 0.3281256914138794, + "memory(GiB)": 72.41, + "step": 1060, + "token_acc": 0.8848472346715264, + "train_speed(iter/s)": 0.037145 + }, + { + "epoch": 0.7418856259659969, + "grad_norm": 0.08966130018234253, + "learning_rate": 2.7525726122790556e-05, + "loss": 0.33036127090454104, + "memory(GiB)": 72.41, + "step": 1080, + "token_acc": 0.8711485117219462, + "train_speed(iter/s)": 0.037158 + }, + { + "epoch": 0.755624248669071, + "grad_norm": 0.0860779657959938, + "learning_rate": 2.7435965761831987e-05, + "loss": 0.32908782958984373, + "memory(GiB)": 72.41, + "step": 1100, + "token_acc": 0.868969553437273, + "train_speed(iter/s)": 0.037175 + }, + { + "epoch": 0.769362871372145, + "grad_norm": 0.08935214579105377, + "learning_rate": 2.7344757988404845e-05, + "loss": 0.33276095390319826, + "memory(GiB)": 72.41, + "step": 1120, + "token_acc": 0.8682377205407336, + "train_speed(iter/s)": 0.037189 + }, + { + "epoch": 0.783101494075219, + "grad_norm": 0.08778548985719681, + "learning_rate": 2.725211341811158e-05, + "loss": 0.33044397830963135, + "memory(GiB)": 72.41, + "step": 1140, + "token_acc": 0.8793117868359244, + "train_speed(iter/s)": 0.037201 + }, + { + "epoch": 0.796840116778293, + "grad_norm": 0.09090530127286911, + "learning_rate": 2.71580428337823e-05, + "loss": 0.32858192920684814, + "memory(GiB)": 72.41, + "step": 1160, + "token_acc": 0.8733004247503157, + "train_speed(iter/s)": 0.037216 + }, + { + "epoch": 0.8105787394813669, + "grad_norm": 0.08553975820541382, + "learning_rate": 2.7062557184219806e-05, + "loss": 0.3291203498840332, + "memory(GiB)": 72.41, + "step": 1180, + "token_acc": 0.8764492301755195, + 
"train_speed(iter/s)": 0.037231 + }, + { + "epoch": 0.824317362184441, + "grad_norm": 0.08339999616146088, + "learning_rate": 2.6965667582925247e-05, + "loss": 0.3333151817321777, + "memory(GiB)": 72.41, + "step": 1200, + "token_acc": 0.8695492444572112, + "train_speed(iter/s)": 0.037244 + }, + { + "epoch": 0.824317362184441, + "eval_loss": 0.43056952953338623, + "eval_runtime": 99.0376, + "eval_samples_per_second": 38.006, + "eval_steps_per_second": 0.596, + "eval_token_acc": 0.8441900740502716, + "step": 1200 + }, + { + "epoch": 0.838055984887515, + "grad_norm": 0.08720073848962784, + "learning_rate": 2.686738530680462e-05, + "loss": 0.33159494400024414, + "memory(GiB)": 72.41, + "step": 1220, + "token_acc": 0.8612746275278335, + "train_speed(iter/s)": 0.037076 + }, + { + "epoch": 0.851794607590589, + "grad_norm": 0.08212270587682724, + "learning_rate": 2.676772179485629e-05, + "loss": 0.3343451976776123, + "memory(GiB)": 72.41, + "step": 1240, + "token_acc": 0.874983845473253, + "train_speed(iter/s)": 0.037077 + }, + { + "epoch": 0.865533230293663, + "grad_norm": 0.08813036233186722, + "learning_rate": 2.6666688646839574e-05, + "loss": 0.3311768531799316, + "memory(GiB)": 72.41, + "step": 1260, + "token_acc": 0.8867381088510741, + "train_speed(iter/s)": 0.037081 + }, + { + "epoch": 0.879271852996737, + "grad_norm": 0.08220379054546356, + "learning_rate": 2.6564297621924696e-05, + "loss": 0.33231358528137206, + "memory(GiB)": 72.41, + "step": 1280, + "token_acc": 0.8829307086452494, + "train_speed(iter/s)": 0.037092 + }, + { + "epoch": 0.8930104756998111, + "grad_norm": 0.08680058270692825, + "learning_rate": 2.6460560637324113e-05, + "loss": 0.3345161199569702, + "memory(GiB)": 72.41, + "step": 1300, + "token_acc": 0.8796637788480545, + "train_speed(iter/s)": 0.037101 + }, + { + "epoch": 0.9067490984028851, + "grad_norm": 0.09523913264274597, + "learning_rate": 2.6355489766905496e-05, + "loss": 0.33291900157928467, + "memory(GiB)": 72.41, + "step": 1320, + 
"token_acc": 0.8843804465166379, + "train_speed(iter/s)": 0.037112 + }, + { + "epoch": 0.9204877211059591, + "grad_norm": 0.08730066567659378, + "learning_rate": 2.6249097239786456e-05, + "loss": 0.33270628452301027, + "memory(GiB)": 72.41, + "step": 1340, + "token_acc": 0.8798414921651669, + "train_speed(iter/s)": 0.037122 + }, + { + "epoch": 0.9342263438090331, + "grad_norm": 0.0898577868938446, + "learning_rate": 2.6141395438911216e-05, + "loss": 0.3346142530441284, + "memory(GiB)": 72.41, + "step": 1360, + "token_acc": 0.8822741759710485, + "train_speed(iter/s)": 0.037134 + }, + { + "epoch": 0.9479649665121072, + "grad_norm": 0.08267758786678314, + "learning_rate": 2.603239689960935e-05, + "loss": 0.33362205028533937, + "memory(GiB)": 72.41, + "step": 1380, + "token_acc": 0.875556680651538, + "train_speed(iter/s)": 0.037147 + }, + { + "epoch": 0.9617035892151812, + "grad_norm": 0.08812654763460159, + "learning_rate": 2.5922114308136826e-05, + "loss": 0.3352126359939575, + "memory(GiB)": 72.41, + "step": 1400, + "token_acc": 0.8624814158268795, + "train_speed(iter/s)": 0.037157 + }, + { + "epoch": 0.9754422119182552, + "grad_norm": 0.08206140995025635, + "learning_rate": 2.5810560500199454e-05, + "loss": 0.32973828315734866, + "memory(GiB)": 72.41, + "step": 1420, + "token_acc": 0.8715281106649746, + "train_speed(iter/s)": 0.037167 + }, + { + "epoch": 0.9891808346213292, + "grad_norm": 0.08127112686634064, + "learning_rate": 2.5697748459458945e-05, + "loss": 0.33533248901367185, + "memory(GiB)": 72.41, + "step": 1440, + "token_acc": 0.8709140359593514, + "train_speed(iter/s)": 0.037179 + }, + { + "epoch": 1.002747724540615, + "grad_norm": 0.13326233625411987, + "learning_rate": 2.5583691316021758e-05, + "loss": 0.32816076278686523, + "memory(GiB)": 72.41, + "step": 1460, + "token_acc": 0.8743706550630328, + "train_speed(iter/s)": 0.037189 + }, + { + "epoch": 1.016486347243689, + "grad_norm": 0.09180466085672379, + "learning_rate": 2.5468402344910895e-05, + 
"loss": 0.30609779357910155, + "memory(GiB)": 72.41, + "step": 1480, + "token_acc": 0.890758443681086, + "train_speed(iter/s)": 0.037196 + }, + { + "epoch": 1.030224969946763, + "grad_norm": 0.09560491889715195, + "learning_rate": 2.5351894964520832e-05, + "loss": 0.3120020627975464, + "memory(GiB)": 72.41, + "step": 1500, + "token_acc": 0.8784849657881351, + "train_speed(iter/s)": 0.037203 + }, + { + "epoch": 1.030224969946763, + "eval_loss": 0.43459564447402954, + "eval_runtime": 99.9824, + "eval_samples_per_second": 37.647, + "eval_steps_per_second": 0.59, + "eval_token_acc": 0.8433591170448618, + "step": 1500 + }, + { + "epoch": 1.043963592649837, + "grad_norm": 0.08771445602178574, + "learning_rate": 2.523418273505576e-05, + "loss": 0.31691765785217285, + "memory(GiB)": 72.41, + "step": 1520, + "token_acc": 0.8715976442080544, + "train_speed(iter/s)": 0.037064 + }, + { + "epoch": 1.057702215352911, + "grad_norm": 0.07920734584331512, + "learning_rate": 2.511527935695133e-05, + "loss": 0.31117587089538573, + "memory(GiB)": 72.41, + "step": 1540, + "token_acc": 0.891813564000213, + "train_speed(iter/s)": 0.037065 + }, + { + "epoch": 1.071440838055985, + "grad_norm": 0.08091707527637482, + "learning_rate": 2.499519866928006e-05, + "loss": 0.3078420639038086, + "memory(GiB)": 72.41, + "step": 1560, + "token_acc": 0.8856428771072653, + "train_speed(iter/s)": 0.037064 + }, + { + "epoch": 1.0851794607590588, + "grad_norm": 0.08118196576833725, + "learning_rate": 2.487395464814062e-05, + "loss": 0.30695157051086425, + "memory(GiB)": 72.41, + "step": 1580, + "token_acc": 0.896800843691485, + "train_speed(iter/s)": 0.037065 + }, + { + "epoch": 1.0989180834621328, + "grad_norm": 0.08192740380764008, + "learning_rate": 2.475156140503116e-05, + "loss": 0.30917532444000245, + "memory(GiB)": 72.41, + "step": 1600, + "token_acc": 0.8825165654283423, + "train_speed(iter/s)": 0.037072 + }, + { + "epoch": 1.1126567061652068, + "grad_norm": 0.08665605634450912, + "learning_rate": 
2.4628033185206914e-05, + "loss": 0.3106253147125244, + "memory(GiB)": 72.41, + "step": 1620, + "token_acc": 0.885648658540145, + "train_speed(iter/s)": 0.037078 + }, + { + "epoch": 1.1263953288682809, + "grad_norm": 0.08092272281646729, + "learning_rate": 2.4503384366022153e-05, + "loss": 0.3136306285858154, + "memory(GiB)": 72.41, + "step": 1640, + "token_acc": 0.8739959227894275, + "train_speed(iter/s)": 0.037085 + }, + { + "epoch": 1.1401339515713549, + "grad_norm": 0.08443531394004822, + "learning_rate": 2.437762945525686e-05, + "loss": 0.317700719833374, + "memory(GiB)": 72.41, + "step": 1660, + "token_acc": 0.8844603109257694, + "train_speed(iter/s)": 0.037093 + }, + { + "epoch": 1.153872574274429, + "grad_norm": 0.0871417298913002, + "learning_rate": 2.425078308942815e-05, + "loss": 0.3168033123016357, + "memory(GiB)": 72.41, + "step": 1680, + "token_acc": 0.8770761500087126, + "train_speed(iter/s)": 0.0371 + }, + { + "epoch": 1.167611196977503, + "grad_norm": 0.0831030011177063, + "learning_rate": 2.4122860032086763e-05, + "loss": 0.31378917694091796, + "memory(GiB)": 72.41, + "step": 1700, + "token_acc": 0.8724602675981399, + "train_speed(iter/s)": 0.037109 + }, + { + "epoch": 1.181349819680577, + "grad_norm": 0.08853127807378769, + "learning_rate": 2.3993875172098737e-05, + "loss": 0.31873183250427245, + "memory(GiB)": 72.41, + "step": 1720, + "token_acc": 0.887010406125234, + "train_speed(iter/s)": 0.037115 + }, + { + "epoch": 1.195088442383651, + "grad_norm": 0.08741605281829834, + "learning_rate": 2.3863843521912497e-05, + "loss": 0.31804475784301756, + "memory(GiB)": 72.41, + "step": 1740, + "token_acc": 0.8780459946059642, + "train_speed(iter/s)": 0.037121 + }, + { + "epoch": 1.208827065086725, + "grad_norm": 0.0871034637093544, + "learning_rate": 2.3732780215811563e-05, + "loss": 0.31754317283630373, + "memory(GiB)": 72.41, + "step": 1760, + "token_acc": 0.8814304365386241, + "train_speed(iter/s)": 0.03713 + }, + { + "epoch": 1.222565687789799, + 
"grad_norm": 0.08191618323326111, + "learning_rate": 2.3600700508153103e-05, + "loss": 0.31642465591430663, + "memory(GiB)": 72.41, + "step": 1780, + "token_acc": 0.8742795205201467, + "train_speed(iter/s)": 0.037136 + }, + { + "epoch": 1.236304310492873, + "grad_norm": 0.0853760614991188, + "learning_rate": 2.346761977159248e-05, + "loss": 0.31632657051086427, + "memory(GiB)": 72.41, + "step": 1800, + "token_acc": 0.8847685541277756, + "train_speed(iter/s)": 0.037145 + }, + { + "epoch": 1.236304310492873, + "eval_loss": 0.4353775978088379, + "eval_runtime": 98.8783, + "eval_samples_per_second": 38.067, + "eval_steps_per_second": 0.597, + "eval_token_acc": 0.8432962434627534, + "step": 1800 + }, + { + "epoch": 1.250042933195947, + "grad_norm": 0.08532160520553589, + "learning_rate": 2.3333553495294033e-05, + "loss": 0.3085492610931396, + "memory(GiB)": 72.41, + "step": 1820, + "token_acc": 0.8650615076497046, + "train_speed(iter/s)": 0.037034 + }, + { + "epoch": 1.263781555899021, + "grad_norm": 0.09372863918542862, + "learning_rate": 2.3198517283128316e-05, + "loss": 0.314247727394104, + "memory(GiB)": 72.41, + "step": 1840, + "token_acc": 0.8870913422078638, + "train_speed(iter/s)": 0.037033 + }, + { + "epoch": 1.277520178602095, + "grad_norm": 0.0852976143360138, + "learning_rate": 2.3062526851855962e-05, + "loss": 0.31310009956359863, + "memory(GiB)": 72.41, + "step": 1860, + "token_acc": 0.8906191502723332, + "train_speed(iter/s)": 0.037035 + }, + { + "epoch": 1.2912588013051691, + "grad_norm": 0.07643554359674454, + "learning_rate": 2.2925598029298437e-05, + "loss": 0.3103055715560913, + "memory(GiB)": 72.41, + "step": 1880, + "token_acc": 0.887677412229967, + "train_speed(iter/s)": 0.037042 + }, + { + "epoch": 1.3049974240082431, + "grad_norm": 0.08572836965322495, + "learning_rate": 2.278774675249585e-05, + "loss": 0.31417174339294435, + "memory(GiB)": 72.41, + "step": 1900, + "token_acc": 0.8894350828946791, + "train_speed(iter/s)": 0.037046 + }, + { + 
"epoch": 1.3187360467113172, + "grad_norm": 0.08305912464857101, + "learning_rate": 2.264898906585204e-05, + "loss": 0.31093263626098633, + "memory(GiB)": 72.41, + "step": 1920, + "token_acc": 0.8820933517164594, + "train_speed(iter/s)": 0.037052 + }, + { + "epoch": 1.3324746694143912, + "grad_norm": 0.08301204442977905, + "learning_rate": 2.2509341119267193e-05, + "loss": 0.3095247268676758, + "memory(GiB)": 72.41, + "step": 1940, + "token_acc": 0.8671473791714781, + "train_speed(iter/s)": 0.037058 + }, + { + "epoch": 1.3462132921174652, + "grad_norm": 0.0815897062420845, + "learning_rate": 2.236881916625816e-05, + "loss": 0.3098980188369751, + "memory(GiB)": 72.41, + "step": 1960, + "token_acc": 0.8935473891956769, + "train_speed(iter/s)": 0.037064 + }, + { + "epoch": 1.3599519148205392, + "grad_norm": 0.08821182698011398, + "learning_rate": 2.2227439562066734e-05, + "loss": 0.30906736850738525, + "memory(GiB)": 72.41, + "step": 1980, + "token_acc": 0.8789100589878682, + "train_speed(iter/s)": 0.037069 + }, + { + "epoch": 1.3736905375236133, + "grad_norm": 0.0886450707912445, + "learning_rate": 2.2085218761756058e-05, + "loss": 0.3117701768875122, + "memory(GiB)": 72.41, + "step": 2000, + "token_acc": 0.8915136412607484, + "train_speed(iter/s)": 0.037071 + }, + { + "epoch": 1.3874291602266873, + "grad_norm": 0.08608590811491013, + "learning_rate": 2.1942173318295443e-05, + "loss": 0.3138264179229736, + "memory(GiB)": 72.41, + "step": 2020, + "token_acc": 0.8859320703790349, + "train_speed(iter/s)": 0.037077 + }, + { + "epoch": 1.4011677829297613, + "grad_norm": 0.07869933545589447, + "learning_rate": 2.1798319880633795e-05, + "loss": 0.3135652542114258, + "memory(GiB)": 72.41, + "step": 2040, + "token_acc": 0.8910194771797223, + "train_speed(iter/s)": 0.037082 + }, + { + "epoch": 1.4149064056328353, + "grad_norm": 0.07948032766580582, + "learning_rate": 2.165367519176183e-05, + "loss": 0.3114771842956543, + "memory(GiB)": 72.41, + "step": 2060, + "token_acc": 
0.888334672346102, + "train_speed(iter/s)": 0.037086 + }, + { + "epoch": 1.4286450283359093, + "grad_norm": 0.08114151656627655, + "learning_rate": 2.1508256086763372e-05, + "loss": 0.3094203948974609, + "memory(GiB)": 72.41, + "step": 2080, + "token_acc": 0.8877183536236418, + "train_speed(iter/s)": 0.03709 + }, + { + "epoch": 1.4423836510389834, + "grad_norm": 0.0871092826128006, + "learning_rate": 2.1362079490855968e-05, + "loss": 0.3111464738845825, + "memory(GiB)": 72.41, + "step": 2100, + "token_acc": 0.881144622390869, + "train_speed(iter/s)": 0.037093 + }, + { + "epoch": 1.4423836510389834, + "eval_loss": 0.4344118535518646, + "eval_runtime": 99.5043, + "eval_samples_per_second": 37.828, + "eval_steps_per_second": 0.593, + "eval_token_acc": 0.8434381641208087, + "step": 2100 + }, + { + "epoch": 1.4561222737420574, + "grad_norm": 0.08234430849552155, + "learning_rate": 2.1215162417420926e-05, + "loss": 0.3089058637619019, + "memory(GiB)": 72.41, + "step": 2120, + "token_acc": 0.8675401686436827, + "train_speed(iter/s)": 0.036998 + }, + { + "epoch": 1.4698608964451314, + "grad_norm": 0.08042703568935394, + "learning_rate": 2.1067521966023165e-05, + "loss": 0.31057741641998293, + "memory(GiB)": 72.41, + "step": 2140, + "token_acc": 0.8962609916378795, + "train_speed(iter/s)": 0.036997 + }, + { + "epoch": 1.4835995191482054, + "grad_norm": 0.08385493606328964, + "learning_rate": 2.0919175320421023e-05, + "loss": 0.3134245634078979, + "memory(GiB)": 72.41, + "step": 2160, + "token_acc": 0.8868531518893562, + "train_speed(iter/s)": 0.036998 + }, + { + "epoch": 1.4973381418512794, + "grad_norm": 0.09416038542985916, + "learning_rate": 2.0770139746566223e-05, + "loss": 0.31356468200683596, + "memory(GiB)": 72.41, + "step": 2180, + "token_acc": 0.8753653697079176, + "train_speed(iter/s)": 0.037001 + }, + { + "epoch": 1.5110767645543535, + "grad_norm": 0.08807655423879623, + "learning_rate": 2.062043259059432e-05, + "loss": 0.31597309112548827, + "memory(GiB)": 
72.41, + "step": 2200, + "token_acc": 0.8919064810265528, + "train_speed(iter/s)": 0.037007 + }, + { + "epoch": 1.5248153872574275, + "grad_norm": 0.08815551549196243, + "learning_rate": 2.047007127680579e-05, + "loss": 0.3196309804916382, + "memory(GiB)": 72.41, + "step": 2220, + "token_acc": 0.8772186268233043, + "train_speed(iter/s)": 0.037012 + }, + { + "epoch": 1.5385540099605015, + "grad_norm": 0.08227042853832245, + "learning_rate": 2.0319073305638035e-05, + "loss": 0.31729488372802733, + "memory(GiB)": 72.41, + "step": 2240, + "token_acc": 0.8858860714860183, + "train_speed(iter/s)": 0.037014 + }, + { + "epoch": 1.5522926326635753, + "grad_norm": 0.08253244310617447, + "learning_rate": 2.0167456251628524e-05, + "loss": 0.31553847789764405, + "memory(GiB)": 72.41, + "step": 2260, + "token_acc": 0.8908496364853541, + "train_speed(iter/s)": 0.037016 + }, + { + "epoch": 1.5660312553666493, + "grad_norm": 0.08127789944410324, + "learning_rate": 2.00152377613693e-05, + "loss": 0.3174169063568115, + "memory(GiB)": 72.41, + "step": 2280, + "token_acc": 0.8759036896828214, + "train_speed(iter/s)": 0.037021 + }, + { + "epoch": 1.5797698780697234, + "grad_norm": 0.08351726084947586, + "learning_rate": 1.9862435551453103e-05, + "loss": 0.31812009811401365, + "memory(GiB)": 72.41, + "step": 2300, + "token_acc": 0.8801280981073656, + "train_speed(iter/s)": 0.037028 + }, + { + "epoch": 1.5935085007727974, + "grad_norm": 0.08042768388986588, + "learning_rate": 1.9709067406411352e-05, + "loss": 0.3188045024871826, + "memory(GiB)": 72.41, + "step": 2320, + "token_acc": 0.8883485418399553, + "train_speed(iter/s)": 0.037034 + }, + { + "epoch": 1.6072471234758714, + "grad_norm": 0.0847587063908577, + "learning_rate": 1.9555151176644223e-05, + "loss": 0.31552605628967284, + "memory(GiB)": 72.41, + "step": 2340, + "token_acc": 0.8933710959011879, + "train_speed(iter/s)": 0.03704 + }, + { + "epoch": 1.6209857461789454, + "grad_norm": 0.0849500447511673, + "learning_rate": 
1.9400704776343047e-05, + "loss": 0.3190001010894775, + "memory(GiB)": 72.41, + "step": 2360, + "token_acc": 0.8655127619672538, + "train_speed(iter/s)": 0.037046 + }, + { + "epoch": 1.6347243688820194, + "grad_norm": 0.0820319652557373, + "learning_rate": 1.9245746181405306e-05, + "loss": 0.3157363414764404, + "memory(GiB)": 72.41, + "step": 2380, + "token_acc": 0.8931401676158139, + "train_speed(iter/s)": 0.037052 + }, + { + "epoch": 1.6484629915850935, + "grad_norm": 0.07777854800224304, + "learning_rate": 1.9090293427342406e-05, + "loss": 0.30912251472473146, + "memory(GiB)": 72.41, + "step": 2400, + "token_acc": 0.8933260366449716, + "train_speed(iter/s)": 0.037059 + }, + { + "epoch": 1.6484629915850935, + "eval_loss": 0.43367844820022583, + "eval_runtime": 99.7879, + "eval_samples_per_second": 37.72, + "eval_steps_per_second": 0.591, + "eval_token_acc": 0.843613657031226, + "step": 2400 + }, + { + "epoch": 1.6622016142881675, + "grad_norm": 0.08155303448438644, + "learning_rate": 1.893436460718056e-05, + "loss": 0.3163402795791626, + "memory(GiB)": 72.41, + "step": 2420, + "token_acc": 0.8672117073299662, + "train_speed(iter/s)": 0.036975 + }, + { + "epoch": 1.6759402369912415, + "grad_norm": 0.08382421731948853, + "learning_rate": 1.877797786935495e-05, + "loss": 0.3165715217590332, + "memory(GiB)": 72.41, + "step": 2440, + "token_acc": 0.8854491510650321, + "train_speed(iter/s)": 0.03697 + }, + { + "epoch": 1.6896788596943155, + "grad_norm": 0.08178658783435822, + "learning_rate": 1.862115141559744e-05, + "loss": 0.3171123504638672, + "memory(GiB)": 72.41, + "step": 2460, + "token_acc": 0.8836779780841286, + "train_speed(iter/s)": 0.036973 + }, + { + "epoch": 1.7034174823973895, + "grad_norm": 0.07710675150156021, + "learning_rate": 1.8463903498818088e-05, + "loss": 0.31471326351165774, + "memory(GiB)": 72.41, + "step": 2480, + "token_acc": 0.8908230830682876, + "train_speed(iter/s)": 0.036976 + }, + { + "epoch": 1.7171561051004636, + "grad_norm": 
0.08034602552652359, + "learning_rate": 1.8306252420980704e-05, + "loss": 0.31853632926940917, + "memory(GiB)": 72.41, + "step": 2500, + "token_acc": 0.8883582169845952, + "train_speed(iter/s)": 0.036978 + }, + { + "epoch": 1.7308947278035376, + "grad_norm": 0.07948100566864014, + "learning_rate": 1.8148216530972714e-05, + "loss": 0.3109827995300293, + "memory(GiB)": 72.41, + "step": 2520, + "token_acc": 0.8949681174869483, + "train_speed(iter/s)": 0.036981 + }, + { + "epoch": 1.7446333505066116, + "grad_norm": 0.07899336516857147, + "learning_rate": 1.7989814222469538e-05, + "loss": 0.3090771436691284, + "memory(GiB)": 72.41, + "step": 2540, + "token_acc": 0.8906750005261931, + "train_speed(iter/s)": 0.036983 + }, + { + "epoch": 1.7583719732096856, + "grad_norm": 0.07443471997976303, + "learning_rate": 1.783106393179375e-05, + "loss": 0.31173481941223147, + "memory(GiB)": 72.41, + "step": 2560, + "token_acc": 0.8891939493597887, + "train_speed(iter/s)": 0.036988 + }, + { + "epoch": 1.7721105959127597, + "grad_norm": 0.07555528730154037, + "learning_rate": 1.767198413576931e-05, + "loss": 0.30927410125732424, + "memory(GiB)": 72.41, + "step": 2580, + "token_acc": 0.8850328545945815, + "train_speed(iter/s)": 0.036994 + }, + { + "epoch": 1.7858492186158337, + "grad_norm": 0.08017897605895996, + "learning_rate": 1.7512593349571046e-05, + "loss": 0.31209754943847656, + "memory(GiB)": 72.41, + "step": 2600, + "token_acc": 0.8816632260591382, + "train_speed(iter/s)": 0.036998 + }, + { + "epoch": 1.7995878413189077, + "grad_norm": 0.07677578181028366, + "learning_rate": 1.7352910124569695e-05, + "loss": 0.30882983207702636, + "memory(GiB)": 72.41, + "step": 2620, + "token_acc": 0.8925267013383078, + "train_speed(iter/s)": 0.037 + }, + { + "epoch": 1.8133264640219817, + "grad_norm": 0.07692938297986984, + "learning_rate": 1.7192953046172726e-05, + "loss": 0.3074300289154053, + "memory(GiB)": 72.41, + "step": 2640, + "token_acc": 0.8861350676140611, + "train_speed(iter/s)": 
0.037005 + }, + { + "epoch": 1.8270650867250557, + "grad_norm": 0.07619909197092056, + "learning_rate": 1.7032740731661178e-05, + "loss": 0.30927472114562987, + "memory(GiB)": 72.41, + "step": 2660, + "token_acc": 0.8921508449028042, + "train_speed(iter/s)": 0.037009 + }, + { + "epoch": 1.8408037094281298, + "grad_norm": 0.08186180889606476, + "learning_rate": 1.687229182802284e-05, + "loss": 0.3076324939727783, + "memory(GiB)": 72.41, + "step": 2680, + "token_acc": 0.874112111934862, + "train_speed(iter/s)": 0.037013 + }, + { + "epoch": 1.8545423321312038, + "grad_norm": 0.0749615728855133, + "learning_rate": 1.6711625009781926e-05, + "loss": 0.3025542736053467, + "memory(GiB)": 72.41, + "step": 2700, + "token_acc": 0.9005740784776456, + "train_speed(iter/s)": 0.037016 + }, + { + "epoch": 1.8545423321312038, + "eval_loss": 0.43260329961776733, + "eval_runtime": 99.7086, + "eval_samples_per_second": 37.75, + "eval_steps_per_second": 0.592, + "eval_token_acc": 0.8437799954640701, + "step": 2700 + }, + { + "epoch": 1.8682809548342778, + "grad_norm": 0.07678617537021637, + "learning_rate": 1.655075897682555e-05, + "loss": 0.3069960117340088, + "memory(GiB)": 72.41, + "step": 2720, + "token_acc": 0.8656336346071796, + "train_speed(iter/s)": 0.036945 + }, + { + "epoch": 1.8820195775373518, + "grad_norm": 0.08224895596504211, + "learning_rate": 1.6389712452227295e-05, + "loss": 0.31150364875793457, + "memory(GiB)": 72.41, + "step": 2740, + "token_acc": 0.8871026948734946, + "train_speed(iter/s)": 0.036944 + }, + { + "epoch": 1.8957582002404259, + "grad_norm": 0.07674538344144821, + "learning_rate": 1.6228504180068003e-05, + "loss": 0.31361680030822753, + "memory(GiB)": 72.41, + "step": 2760, + "token_acc": 0.8885069679173144, + "train_speed(iter/s)": 0.036944 + }, + { + "epoch": 1.9094968229434999, + "grad_norm": 0.07724355906248093, + "learning_rate": 1.60671529232542e-05, + "loss": 0.31092076301574706, + "memory(GiB)": 72.41, + "step": 2780, + "token_acc": 
0.8775532573683428, + "train_speed(iter/s)": 0.036948 + }, + { + "epoch": 1.923235445646574, + "grad_norm": 0.0748470202088356, + "learning_rate": 1.5905677461334292e-05, + "loss": 0.3125690698623657, + "memory(GiB)": 72.41, + "step": 2800, + "token_acc": 0.8846377126342211, + "train_speed(iter/s)": 0.036916 + }, + { + "epoch": 1.936974068349648, + "grad_norm": 0.08404634892940521, + "learning_rate": 1.574409658831281e-05, + "loss": 0.3153404235839844, + "memory(GiB)": 72.41, + "step": 2820, + "token_acc": 0.8762131944710342, + "train_speed(iter/s)": 0.036918 + }, + { + "epoch": 1.950712691052722, + "grad_norm": 0.07959295809268951, + "learning_rate": 1.558242911046302e-05, + "loss": 0.31249830722808836, + "memory(GiB)": 72.41, + "step": 2840, + "token_acc": 0.8892035392544179, + "train_speed(iter/s)": 0.036921 + }, + { + "epoch": 1.964451313755796, + "grad_norm": 0.08044803887605667, + "learning_rate": 1.5420693844138036e-05, + "loss": 0.3130341053009033, + "memory(GiB)": 72.41, + "step": 2860, + "token_acc": 0.8932265094341124, + "train_speed(iter/s)": 0.036924 + }, + { + "epoch": 1.97818993645887, + "grad_norm": 0.07583785802125931, + "learning_rate": 1.525890961358083e-05, + "loss": 0.3141756772994995, + "memory(GiB)": 72.41, + "step": 2880, + "token_acc": 0.8839660044002189, + "train_speed(iter/s)": 0.036928 + }, + { + "epoch": 1.991928559161944, + "grad_norm": 0.07464556396007538, + "learning_rate": 1.5097095248733284e-05, + "loss": 0.31082568168640134, + "memory(GiB)": 72.41, + "step": 2900, + "token_acc": 0.8775183645838733, + "train_speed(iter/s)": 0.036932 + }, + { + "epoch": 2.00549544908123, + "grad_norm": 0.12503303587436676, + "learning_rate": 1.4935269583044581e-05, + "loss": 0.2993995904922485, + "memory(GiB)": 72.41, + "step": 2920, + "token_acc": 0.8823204490957476, + "train_speed(iter/s)": 0.036937 + }, + { + "epoch": 2.019234071784304, + "grad_norm": 0.08332613110542297, + "learning_rate": 1.4773451451279213e-05, + "loss": 0.29198360443115234, + 
"memory(GiB)": 72.41, + "step": 2940, + "token_acc": 0.8980348203187491, + "train_speed(iter/s)": 0.036941 + }, + { + "epoch": 2.032972694487378, + "grad_norm": 0.08240070939064026, + "learning_rate": 1.461165968732479e-05, + "loss": 0.2935274362564087, + "memory(GiB)": 72.41, + "step": 2960, + "token_acc": 0.8904662128095143, + "train_speed(iter/s)": 0.036946 + }, + { + "epoch": 2.046711317190452, + "grad_norm": 0.08113058656454086, + "learning_rate": 1.4449913122000005e-05, + "loss": 0.29198508262634276, + "memory(GiB)": 72.41, + "step": 2980, + "token_acc": 0.8908448858293387, + "train_speed(iter/s)": 0.036953 + }, + { + "epoch": 2.060449939893526, + "grad_norm": 0.08105536550283432, + "learning_rate": 1.4288230580862905e-05, + "loss": 0.290987491607666, + "memory(GiB)": 72.41, + "step": 3000, + "token_acc": 0.8764201959142056, + "train_speed(iter/s)": 0.036958 + }, + { + "epoch": 2.060449939893526, + "eval_loss": 0.4393101930618286, + "eval_runtime": 100.5668, + "eval_samples_per_second": 37.428, + "eval_steps_per_second": 0.587, + "eval_token_acc": 0.84265782805066, + "step": 3000 + }, + { + "epoch": 2.0741885625966, + "grad_norm": 0.08123844116926193, + "learning_rate": 1.412663088201982e-05, + "loss": 0.29090156555175783, + "memory(GiB)": 72.41, + "step": 3020, + "token_acc": 0.8735638063478551, + "train_speed(iter/s)": 0.036892 + }, + { + "epoch": 2.087927185299674, + "grad_norm": 0.0814339891076088, + "learning_rate": 1.3965132833935126e-05, + "loss": 0.2902204990386963, + "memory(GiB)": 72.41, + "step": 3040, + "token_acc": 0.8839492383548759, + "train_speed(iter/s)": 0.036893 + }, + { + "epoch": 2.101665808002748, + "grad_norm": 0.0760771632194519, + "learning_rate": 1.380375523324215e-05, + "loss": 0.29666552543640134, + "memory(GiB)": 72.41, + "step": 3060, + "token_acc": 0.8921821581883145, + "train_speed(iter/s)": 0.036894 + }, + { + "epoch": 2.115404430705822, + "grad_norm": 0.0808984562754631, + "learning_rate": 1.3642516862555433e-05, + "loss": 
0.28961887359619143, + "memory(GiB)": 72.41, + "step": 3080, + "token_acc": 0.8996385382943057, + "train_speed(iter/s)": 0.036898 + }, + { + "epoch": 2.129143053408896, + "grad_norm": 0.08168598264455795, + "learning_rate": 1.3481436488284648e-05, + "loss": 0.2952747821807861, + "memory(GiB)": 72.41, + "step": 3100, + "token_acc": 0.8932001882680204, + "train_speed(iter/s)": 0.036901 + }, + { + "epoch": 2.14288167611197, + "grad_norm": 0.08384265005588531, + "learning_rate": 1.3320532858450382e-05, + "loss": 0.29767014980316164, + "memory(GiB)": 72.41, + "step": 3120, + "token_acc": 0.8847414688023099, + "train_speed(iter/s)": 0.036904 + }, + { + "epoch": 2.156620298815044, + "grad_norm": 0.08613137155771255, + "learning_rate": 1.3159824700502083e-05, + "loss": 0.2987870693206787, + "memory(GiB)": 72.41, + "step": 3140, + "token_acc": 0.8912630847005318, + "train_speed(iter/s)": 0.036909 + }, + { + "epoch": 2.1703589215181176, + "grad_norm": 0.08044470101594925, + "learning_rate": 1.2999330719138363e-05, + "loss": 0.29793477058410645, + "memory(GiB)": 72.41, + "step": 3160, + "token_acc": 0.8911335210006078, + "train_speed(iter/s)": 0.036914 + }, + { + "epoch": 2.1840975442211916, + "grad_norm": 0.0807594358921051, + "learning_rate": 1.283906959413e-05, + "loss": 0.2947986125946045, + "memory(GiB)": 72.41, + "step": 3180, + "token_acc": 0.8855782459322568, + "train_speed(iter/s)": 0.036917 + }, + { + "epoch": 2.1978361669242656, + "grad_norm": 0.08066173642873764, + "learning_rate": 1.267905997814578e-05, + "loss": 0.2977961778640747, + "memory(GiB)": 72.41, + "step": 3200, + "token_acc": 0.8803478438446615, + "train_speed(iter/s)": 0.036922 + }, + { + "epoch": 2.2115747896273397, + "grad_norm": 0.08045271784067154, + "learning_rate": 1.2519320494581581e-05, + "loss": 0.29424285888671875, + "memory(GiB)": 72.41, + "step": 3220, + "token_acc": 0.8828892872837293, + "train_speed(iter/s)": 0.036927 + }, + { + "epoch": 2.2253134123304137, + "grad_norm": 
0.08237405866384506, + "learning_rate": 1.2359869735392746e-05, + "loss": 0.29676170349121095, + "memory(GiB)": 72.41, + "step": 3240, + "token_acc": 0.8954581030873394, + "train_speed(iter/s)": 0.036931 + }, + { + "epoch": 2.2390520350334877, + "grad_norm": 0.08597618341445923, + "learning_rate": 1.220072625893023e-05, + "loss": 0.296732759475708, + "memory(GiB)": 72.41, + "step": 3260, + "token_acc": 0.8882645330425789, + "train_speed(iter/s)": 0.036937 + }, + { + "epoch": 2.2527906577365617, + "grad_norm": 0.07742371410131454, + "learning_rate": 1.2041908587780571e-05, + "loss": 0.293271803855896, + "memory(GiB)": 72.41, + "step": 3280, + "token_acc": 0.8859188183637006, + "train_speed(iter/s)": 0.036942 + }, + { + "epoch": 2.2665292804396358, + "grad_norm": 0.07885393500328064, + "learning_rate": 1.1883435206610095e-05, + "loss": 0.29781594276428225, + "memory(GiB)": 72.41, + "step": 3300, + "token_acc": 0.8873998820001778, + "train_speed(iter/s)": 0.036945 + }, + { + "epoch": 2.2665292804396358, + "eval_loss": 0.4392697215080261, + "eval_runtime": 99.0852, + "eval_samples_per_second": 37.988, + "eval_steps_per_second": 0.595, + "eval_token_acc": 0.8427081339178593, + "step": 3300 + } + ], + "logging_steps": 20, + "max_steps": 5824, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.596186893058048e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..8032596 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13bbf28e790c5ff417dd7db694a26752dccc43cd8db6cb53667f26f9890b9240 +size 8248 diff --git a/vocab.json b/vocab.json new 
file mode 100644 index 0000000..6c49fc6 --- /dev/null +++ b/vocab.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910 +size 2776833