From 36e96bc2b22299ddf2f7eace41e4d550d32bbb80 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 6 Jun 2026 04:30:15 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: arnav-yadav/jailbreak-attacker-l2 Source: Original Platform --- .gitattributes | 46 + README.md | 70 ++ chat_template.jinja | 54 + checkpoint-100/README.md | 210 ++++ checkpoint-100/adapter_config.json | 49 + checkpoint-100/adapter_model.safetensors | 3 + checkpoint-100/chat_template.jinja | 54 + checkpoint-100/optimizer.pt | 3 + checkpoint-100/rng_state.pth | 3 + checkpoint-100/scaler.pt | 3 + checkpoint-100/scheduler.pt | 3 + checkpoint-100/tokenizer.json | 3 + checkpoint-100/tokenizer_config.json | 201 ++++ checkpoint-100/trainer_state.json | 304 +++++ checkpoint-100/training_args.bin | 3 + checkpoint-150/README.md | 210 ++++ checkpoint-150/adapter_config.json | 49 + checkpoint-150/adapter_model.safetensors | 3 + checkpoint-150/chat_template.jinja | 54 + checkpoint-150/optimizer.pt | 3 + checkpoint-150/rng_state.pth | 3 + checkpoint-150/scaler.pt | 3 + checkpoint-150/scheduler.pt | 3 + checkpoint-150/tokenizer.json | 3 + checkpoint-150/tokenizer_config.json | 201 ++++ checkpoint-150/trainer_state.json | 439 +++++++ checkpoint-150/training_args.bin | 3 + checkpoint-200/README.md | 210 ++++ checkpoint-200/adapter_config.json | 49 + checkpoint-200/adapter_model.safetensors | 3 + checkpoint-200/chat_template.jinja | 54 + checkpoint-200/optimizer.pt | 3 + checkpoint-200/rng_state.pth | 3 + checkpoint-200/scaler.pt | 3 + checkpoint-200/scheduler.pt | 3 + checkpoint-200/tokenizer.json | 3 + checkpoint-200/tokenizer_config.json | 201 ++++ checkpoint-200/trainer_state.json | 574 ++++++++++ checkpoint-200/training_args.bin | 3 + checkpoint-250/README.md | 210 ++++ checkpoint-250/adapter_config.json | 49 + checkpoint-250/adapter_model.safetensors | 3 + checkpoint-250/chat_template.jinja | 54 + checkpoint-250/optimizer.pt | 3 + checkpoint-250/rng_state.pth | 3 + checkpoint-250/scaler.pt | 3 + checkpoint-250/scheduler.pt | 3 + checkpoint-250/tokenizer.json | 3 + checkpoint-250/tokenizer_config.json | 201 ++++ checkpoint-250/trainer_state.json | 709 ++++++++++++ checkpoint-250/training_args.bin | 3 + checkpoint-300/README.md | 210 ++++ checkpoint-300/adapter_config.json | 49 + checkpoint-300/adapter_model.safetensors | 3 + checkpoint-300/chat_template.jinja | 54 + checkpoint-300/optimizer.pt | 3 + checkpoint-300/rng_state.pth | 3 + checkpoint-300/scaler.pt | 3 + checkpoint-300/scheduler.pt | 3 + checkpoint-300/tokenizer.json | 3 + checkpoint-300/tokenizer_config.json | 201 ++++ checkpoint-300/trainer_state.json | 844 ++++++++++++++ checkpoint-300/training_args.bin | 3 + checkpoint-350/README.md | 210 ++++ checkpoint-350/adapter_config.json | 49 + checkpoint-350/adapter_model.safetensors | 3 + checkpoint-350/chat_template.jinja | 54 + checkpoint-350/optimizer.pt | 3 + checkpoint-350/rng_state.pth | 3 + checkpoint-350/scaler.pt | 3 + checkpoint-350/scheduler.pt | 3 + checkpoint-350/tokenizer.json | 3 + checkpoint-350/tokenizer_config.json | 201 ++++ checkpoint-350/trainer_state.json | 979 ++++++++++++++++ checkpoint-350/training_args.bin | 3 + checkpoint-400/README.md | 210 ++++ checkpoint-400/adapter_config.json | 49 + checkpoint-400/adapter_model.safetensors | 3 + checkpoint-400/chat_template.jinja | 54 + checkpoint-400/optimizer.pt | 3 + checkpoint-400/rng_state.pth | 3 + checkpoint-400/scaler.pt | 3 + checkpoint-400/scheduler.pt | 3 + checkpoint-400/tokenizer.json | 3 + checkpoint-400/tokenizer_config.json | 201 ++++ checkpoint-400/trainer_state.json | 1114 ++++++++++++++++++ checkpoint-400/training_args.bin | 3 + checkpoint-450/README.md | 210 ++++ checkpoint-450/adapter_config.json | 49 + checkpoint-450/adapter_model.safetensors | 3 + checkpoint-450/chat_template.jinja | 54 + checkpoint-450/optimizer.pt | 3 + checkpoint-450/rng_state.pth | 3 + checkpoint-450/scaler.pt | 3 + checkpoint-450/scheduler.pt | 3 + checkpoint-450/tokenizer.json | 3 + checkpoint-450/tokenizer_config.json | 201 ++++ checkpoint-450/trainer_state.json | 1249 ++++++++++++++++++++ checkpoint-450/training_args.bin | 3 + checkpoint-480/README.md | 210 ++++ checkpoint-480/adapter_config.json | 49 + checkpoint-480/adapter_model.safetensors | 3 + checkpoint-480/chat_template.jinja | 54 + checkpoint-480/optimizer.pt | 3 + checkpoint-480/rng_state.pth | 3 + checkpoint-480/scaler.pt | 3 + checkpoint-480/scheduler.pt | 3 + checkpoint-480/tokenizer.json | 3 + checkpoint-480/tokenizer_config.json | 201 ++++ checkpoint-480/trainer_state.json | 1330 ++++++++++++++++++++++ checkpoint-480/training_args.bin | 3 + checkpoint-50/README.md | 210 ++++ checkpoint-50/adapter_config.json | 49 + checkpoint-50/adapter_model.safetensors | 3 + checkpoint-50/chat_template.jinja | 54 + checkpoint-50/optimizer.pt | 3 + checkpoint-50/rng_state.pth | 3 + checkpoint-50/scaler.pt | 3 + checkpoint-50/scheduler.pt | 3 + checkpoint-50/tokenizer.json | 3 + checkpoint-50/tokenizer_config.json | 201 ++++ checkpoint-50/trainer_state.json | 169 +++ checkpoint-50/training_args.bin | 3 + config.json | 62 + model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 202 ++++ 127 files changed, 13501 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 checkpoint-100/README.md create mode 100644 checkpoint-100/adapter_config.json create mode 100644 checkpoint-100/adapter_model.safetensors create mode 100644 checkpoint-100/chat_template.jinja create mode 100644 checkpoint-100/optimizer.pt create mode 100644 checkpoint-100/rng_state.pth create mode 100644 checkpoint-100/scaler.pt create mode 100644 checkpoint-100/scheduler.pt create mode 100644 checkpoint-100/tokenizer.json create mode 100644 checkpoint-100/tokenizer_config.json create mode 100644 checkpoint-100/trainer_state.json create mode 100644 checkpoint-100/training_args.bin create mode 100644 checkpoint-150/README.md create mode 100644 checkpoint-150/adapter_config.json create mode 100644 checkpoint-150/adapter_model.safetensors create mode 100644 checkpoint-150/chat_template.jinja create mode 100644 checkpoint-150/optimizer.pt create mode 100644 checkpoint-150/rng_state.pth create mode 100644 checkpoint-150/scaler.pt create mode 100644 checkpoint-150/scheduler.pt create mode 100644 checkpoint-150/tokenizer.json create mode 100644 checkpoint-150/tokenizer_config.json create mode 100644 checkpoint-150/trainer_state.json create mode 100644 checkpoint-150/training_args.bin create mode 100644 checkpoint-200/README.md create mode 100644 checkpoint-200/adapter_config.json create mode 100644 checkpoint-200/adapter_model.safetensors create mode 100644 checkpoint-200/chat_template.jinja create mode 100644 checkpoint-200/optimizer.pt create mode 100644 checkpoint-200/rng_state.pth create mode 100644 checkpoint-200/scaler.pt create mode 100644 checkpoint-200/scheduler.pt create mode 100644 checkpoint-200/tokenizer.json create mode 100644 checkpoint-200/tokenizer_config.json create mode 100644 checkpoint-200/trainer_state.json create mode 100644 checkpoint-200/training_args.bin create mode 100644 checkpoint-250/README.md create mode 100644 checkpoint-250/adapter_config.json create mode 100644 checkpoint-250/adapter_model.safetensors create mode 100644 checkpoint-250/chat_template.jinja create mode 100644 checkpoint-250/optimizer.pt create mode 100644 checkpoint-250/rng_state.pth create mode 100644 checkpoint-250/scaler.pt create mode 100644 checkpoint-250/scheduler.pt create mode 100644 checkpoint-250/tokenizer.json create mode 100644 checkpoint-250/tokenizer_config.json create mode 100644 checkpoint-250/trainer_state.json create mode 100644 checkpoint-250/training_args.bin create mode 100644 checkpoint-300/README.md create mode 100644 checkpoint-300/adapter_config.json create mode 100644 checkpoint-300/adapter_model.safetensors create mode 100644 checkpoint-300/chat_template.jinja create mode 100644 checkpoint-300/optimizer.pt create mode 100644 checkpoint-300/rng_state.pth create mode 100644 checkpoint-300/scaler.pt create mode 100644 checkpoint-300/scheduler.pt create mode 100644 checkpoint-300/tokenizer.json create mode 100644 checkpoint-300/tokenizer_config.json create mode 100644 checkpoint-300/trainer_state.json create mode 100644 checkpoint-300/training_args.bin create mode 100644 checkpoint-350/README.md create mode 100644 checkpoint-350/adapter_config.json create mode 100644 checkpoint-350/adapter_model.safetensors create mode 100644 checkpoint-350/chat_template.jinja create mode 100644 checkpoint-350/optimizer.pt create mode 100644 checkpoint-350/rng_state.pth create mode 100644 checkpoint-350/scaler.pt create mode 100644 checkpoint-350/scheduler.pt create mode 100644 checkpoint-350/tokenizer.json create mode 100644 checkpoint-350/tokenizer_config.json create mode 100644 checkpoint-350/trainer_state.json create mode 100644 checkpoint-350/training_args.bin create mode 100644 checkpoint-400/README.md create mode 100644 checkpoint-400/adapter_config.json create mode 100644 checkpoint-400/adapter_model.safetensors create mode 100644 checkpoint-400/chat_template.jinja create mode 100644 checkpoint-400/optimizer.pt create mode 100644 checkpoint-400/rng_state.pth create mode 100644 checkpoint-400/scaler.pt create mode 100644 checkpoint-400/scheduler.pt create mode 100644 checkpoint-400/tokenizer.json create mode 100644 checkpoint-400/tokenizer_config.json create mode 100644 checkpoint-400/trainer_state.json create mode 100644 checkpoint-400/training_args.bin create mode 100644 checkpoint-450/README.md create mode 100644 checkpoint-450/adapter_config.json create mode 100644 checkpoint-450/adapter_model.safetensors create mode 100644 checkpoint-450/chat_template.jinja create mode 100644 checkpoint-450/optimizer.pt create mode 100644 checkpoint-450/rng_state.pth create mode 100644 checkpoint-450/scaler.pt create mode 100644 checkpoint-450/scheduler.pt create mode 100644 checkpoint-450/tokenizer.json create mode 100644 checkpoint-450/tokenizer_config.json create mode 100644 checkpoint-450/trainer_state.json create mode 100644 checkpoint-450/training_args.bin create mode 100644 checkpoint-480/README.md create mode 100644 checkpoint-480/adapter_config.json create mode 100644 checkpoint-480/adapter_model.safetensors create mode 100644 checkpoint-480/chat_template.jinja create mode 100644 checkpoint-480/optimizer.pt create mode 100644 checkpoint-480/rng_state.pth create mode 100644 checkpoint-480/scaler.pt create mode 100644 checkpoint-480/scheduler.pt create mode 100644 checkpoint-480/tokenizer.json create mode 100644 checkpoint-480/tokenizer_config.json create mode 100644 checkpoint-480/trainer_state.json create mode 100644 checkpoint-480/training_args.bin create mode 100644 checkpoint-50/README.md create mode 100644 checkpoint-50/adapter_config.json create mode 100644 checkpoint-50/adapter_model.safetensors create mode 100644 checkpoint-50/chat_template.jinja create mode 100644 checkpoint-50/optimizer.pt create mode 100644 checkpoint-50/rng_state.pth create mode 100644 checkpoint-50/scaler.pt create mode 100644 checkpoint-50/scheduler.pt create mode 100644 checkpoint-50/tokenizer.json create mode 100644 checkpoint-50/tokenizer_config.json create mode 100644 checkpoint-50/trainer_state.json create mode 100644 checkpoint-50/training_args.bin create mode 100644 config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7d12bce --- /dev/null +++ b/.gitattributes @@ -0,0 +1,46 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-350/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-450/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-480/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..3a7bf84 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: transformers +model_name: jailbreak-attacker-l2 +tags: +- generated_from_trainer +- hf_jobs +- unsloth +- trl +- grpo +licence: license +--- + +# Model Card for jailbreak-attacker-l2 + +This model is a fine-tuned version of [unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit](https://huggingface.co/unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/2024eb02510-/jailbreak-arena/runs/9raw4l66) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.24.0 +- Transformers: 5.5.0 +- Pytorch: 2.10.0 +- Datasets: 4.3.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000..f3a3ecd --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f6fe63390c140f91c3223c6b523a5349c030b6c7c76fbd1d6a8e70d1a63343d +size 17462432 diff --git a/checkpoint-100/chat_template.jinja b/checkpoint-100/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-100/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000..14fb3f9 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60059eb4d438d4f59c8e20c28d7605a0bdb9f10bf70725f3bb0b0320ef7dc43e +size 9117451 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000..59e76a2 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7976309b32038fe3fdad0e785daca676697c5fc7b30760267195ec50f030e8ad +size 14645 diff --git a/checkpoint-100/scaler.pt b/checkpoint-100/scaler.pt new file mode 100644 index 0000000..215c5d2 --- /dev/null +++ b/checkpoint-100/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4393a84a3109995aa1202073b039b12062e3189ed89aa0b94ef0510ba843009 +size 1383 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000..3ea1055 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a830202ee873f29917b7d3a9d1968fea3e276eae47bb088abb41c7230fc9bf78 +size 1465 diff --git a/checkpoint-100/tokenizer.json b/checkpoint-100/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000..1295723 --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,304 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.625, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 578164, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-150/README.md b/checkpoint-150/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-150/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-150/adapter_config.json b/checkpoint-150/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-150/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-150/adapter_model.safetensors b/checkpoint-150/adapter_model.safetensors new file mode 100644 index 0000000..b869317 --- /dev/null +++ b/checkpoint-150/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:266eb8f6b5685d94013a7e3f7915eaf1b745d68327c70dca9737731fd1ffcf5c +size 17462432 diff --git a/checkpoint-150/chat_template.jinja b/checkpoint-150/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-150/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-150/optimizer.pt b/checkpoint-150/optimizer.pt new file mode 100644 index 0000000..d2c20a4 --- /dev/null +++ b/checkpoint-150/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f18a375d42fddfe97539e8c114dfaeb434882c99c0f4d3d54c09b0f8fd3d090 +size 9117451 diff --git a/checkpoint-150/rng_state.pth b/checkpoint-150/rng_state.pth new file mode 100644 index 0000000..1d40574 --- /dev/null +++ b/checkpoint-150/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5611ccab0729462d7748ff6ecab8ec349d7a0ac4719c59e381dc9f57aac776a9 +size 14645 diff --git a/checkpoint-150/scaler.pt b/checkpoint-150/scaler.pt new file mode 100644 index 0000000..b756f05 --- /dev/null +++ b/checkpoint-150/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a646a858b71ab76099f4dca96d8db39ffde2f88258f5a4b082ff62947bcc11 +size 1383 diff --git a/checkpoint-150/scheduler.pt b/checkpoint-150/scheduler.pt new file mode 100644 index 0000000..569c18e --- /dev/null +++ b/checkpoint-150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8206307394a449e70d8cda749820dc5f2c93cd0fc51958acc50e593295c4ad40 +size 1465 diff --git a/checkpoint-150/tokenizer.json b/checkpoint-150/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-150/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-150/tokenizer_config.json b/checkpoint-150/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-150/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-150/trainer_state.json b/checkpoint-150/trainer_state.json new file mode 100644 index 0000000..dba5dec --- /dev/null +++ b/checkpoint-150/trainer_state.json @@ -0,0 +1,439 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9375, + "eval_steps": 500, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 867440, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-150/training_args.bin b/checkpoint-150/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000..f648139 --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03c0191796f357e290a533a9c3e3a1be580080ad31de5371b780ce056ad76b99 +size 17462432 diff --git a/checkpoint-200/chat_template.jinja b/checkpoint-200/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-200/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000..1f734f8 --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9caddcf15fab178e8da1baa78df54bc52e61bfd6c34938ac841e7a5bd92e346 +size 9117451 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000..07ebdb5 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:467668a0ddf90a5ccd34a829114cc9153ff715328ae2b83990514e72e3904d6b +size 14645 diff --git a/checkpoint-200/scaler.pt b/checkpoint-200/scaler.pt new file mode 100644 index 0000000..dce3572 --- /dev/null +++ b/checkpoint-200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124625e167eb28acbfc793cfcb3e8a08b32e7fea06501462bc9e420a5e1beb2a +size 1383 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000..356633d --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54017b9447902bb874203354ab111f2e9dd05c6f8d8a2a295f5b8805945883dd +size 1465 diff --git a/checkpoint-200/tokenizer.json b/checkpoint-200/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000..63c20d6 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,574 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.25, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 1156860, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-250/README.md b/checkpoint-250/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-250/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-250/adapter_config.json b/checkpoint-250/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-250/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-250/adapter_model.safetensors b/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000..fa80957 --- /dev/null +++ b/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60acb92877131d85b8e1d916c74fd05a21432b7d9f6572539fbdb85ad71ac32b +size 17462432 diff --git a/checkpoint-250/chat_template.jinja b/checkpoint-250/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-250/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-250/optimizer.pt b/checkpoint-250/optimizer.pt new file mode 100644 index 0000000..6e278d4 --- /dev/null +++ b/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05d0ea6b5775506f5b2021e1dd495445581fae27b859980999ac8df750925f0a +size 9117451 diff --git a/checkpoint-250/rng_state.pth b/checkpoint-250/rng_state.pth new file mode 100644 index 0000000..a2bc1be --- /dev/null +++ b/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:451e032b799d75fa43efe27f5986060047c2aedd862f9191bc8d539fb98cb8aa +size 14645 diff --git a/checkpoint-250/scaler.pt b/checkpoint-250/scaler.pt new file mode 100644 index 0000000..849c6fa --- /dev/null +++ b/checkpoint-250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6fca631a6bcdfa2416587314d206a68f40e27a07bc674b76e72a93db4e5058 +size 1383 diff --git a/checkpoint-250/scheduler.pt b/checkpoint-250/scheduler.pt new file mode 100644 index 0000000..15f5be7 --- /dev/null +++ b/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f8553206496dc12a1a3f9d88fa46926b7184fd2fb64e8917233eae7491a6ed +size 1465 diff --git a/checkpoint-250/tokenizer.json b/checkpoint-250/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-250/tokenizer_config.json b/checkpoint-250/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-250/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-250/trainer_state.json b/checkpoint-250/trainer_state.json new file mode 100644 index 0000000..406bc08 --- /dev/null +++ b/checkpoint-250/trainer_state.json @@ -0,0 +1,709 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5625, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3125, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0007964351680129766, + "kl": 0.0754379843827337, + "learning_rate": 3.136574074074074e-05, + "loss": 7.543485262431204e-05, + "num_tokens": 1214784.0, + "reward": -0.3150000095367432, + "reward_std": 0.029999998956918718, + "rewards/reward_fn/mean": -0.3150000095367432, + "rewards/reward_fn/std": 0.059999996423721315, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.375, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0004977025673724711, + "kl": 0.05669140527024865, + "learning_rate": 3.0208333333333334e-05, + "loss": 5.669001257047057e-05, + "num_tokens": 1272932.0, + "reward": -0.3187500089406967, + "reward_std": 0.03749999850988388, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008227138314396143, + "kl": 0.0492351729888469, + "learning_rate": 2.9050925925925926e-05, + "loss": 4.923430969938636e-05, + "num_tokens": 1330828.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008481480763293803, + "kl": 0.05081965173594653, + "learning_rate": 2.789351851851852e-05, + "loss": 5.0819659372791647e-05, + "num_tokens": 1388588.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036746280966326594, + "kl": 0.046290718484669924, + "learning_rate": 2.6736111111111112e-05, + "loss": 4.6290719183161855e-05, + "num_tokens": 1446308.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 250 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 1446308, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-250/training_args.bin b/checkpoint-250/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000..66f4886 --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cf11038670d3b4b09b8543452be1d80c5dff907df498d2c93451711d96ffb2e +size 17462432 diff --git a/checkpoint-300/chat_template.jinja b/checkpoint-300/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-300/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000..fb09c8f --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4e41f4b6a5c898850109b2649a204a2c80c2b9e99756ab890f4faab083cda55 +size 9117643 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000..05cbc56 --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b951f5b58add4e59188cefa84ad956e74855d691bbc156b5bd9b39dbb04e375f +size 14645 diff --git a/checkpoint-300/scaler.pt b/checkpoint-300/scaler.pt new file mode 100644 index 0000000..872dde6 --- /dev/null +++ b/checkpoint-300/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30af866df24edce708e1eb20700878b402fa05707fa9bc5f332496baf440dbbb +size 1383 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000..9865272 --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead55837b9c3c028b4e7bf4807cb36ff404cd98b882e7bc2f31c507057e7620e +size 1465 diff --git a/checkpoint-300/tokenizer.json b/checkpoint-300/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000..487c816 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,844 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.875, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3125, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0007964351680129766, + "kl": 0.0754379843827337, + "learning_rate": 3.136574074074074e-05, + "loss": 7.543485262431204e-05, + "num_tokens": 1214784.0, + "reward": -0.3150000095367432, + "reward_std": 0.029999998956918718, + "rewards/reward_fn/mean": -0.3150000095367432, + "rewards/reward_fn/std": 0.059999996423721315, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.375, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0004977025673724711, + "kl": 0.05669140527024865, + "learning_rate": 3.0208333333333334e-05, + "loss": 5.669001257047057e-05, + "num_tokens": 1272932.0, + "reward": -0.3187500089406967, + "reward_std": 0.03749999850988388, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008227138314396143, + "kl": 0.0492351729888469, + "learning_rate": 2.9050925925925926e-05, + "loss": 4.923430969938636e-05, + "num_tokens": 1330828.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008481480763293803, + "kl": 0.05081965173594653, + "learning_rate": 2.789351851851852e-05, + "loss": 5.0819659372791647e-05, + "num_tokens": 1388588.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036746280966326594, + "kl": 0.046290718484669924, + "learning_rate": 2.6736111111111112e-05, + "loss": 4.6290719183161855e-05, + "num_tokens": 1446308.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007307095802389085, + "kl": 0.04613759703934193, + "learning_rate": 2.5578703703703705e-05, + "loss": 4.613760684151203e-05, + "num_tokens": 1503988.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006135129951871932, + "kl": 0.051912948861718176, + "learning_rate": 2.4421296296296298e-05, + "loss": 5.191295058466494e-05, + "num_tokens": 1561968.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008430331945419312, + "kl": 0.05388685767538846, + "learning_rate": 2.326388888888889e-05, + "loss": 5.3886871319264175e-05, + "num_tokens": 1619908.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030109184444881976, + "kl": 0.04647140288725495, + "learning_rate": 2.2106481481481483e-05, + "loss": 4.6471410314552486e-05, + "num_tokens": 1677612.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029493175679817796, + "kl": 0.04179914747364819, + "learning_rate": 2.0949074074074073e-05, + "loss": 4.17991541326046e-05, + "num_tokens": 1735372.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 300 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 1735372, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-350/README.md b/checkpoint-350/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-350/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-350/adapter_config.json b/checkpoint-350/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-350/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-350/adapter_model.safetensors b/checkpoint-350/adapter_model.safetensors new file mode 100644 index 0000000..111636a --- /dev/null +++ b/checkpoint-350/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c15416a03a5cdc4e75b72c792ddd593b6f4691a47e5010ced8d2aca7fc6241c +size 17462432 diff --git a/checkpoint-350/chat_template.jinja b/checkpoint-350/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-350/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-350/optimizer.pt b/checkpoint-350/optimizer.pt new file mode 100644 index 0000000..100e4e2 --- /dev/null +++ b/checkpoint-350/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a8c7b5ce75204386d022d817feec1d6b704c9123e6baf28dc034450d4e2247 +size 9117643 diff --git a/checkpoint-350/rng_state.pth b/checkpoint-350/rng_state.pth new file mode 100644 index 0000000..4bdb02f --- /dev/null +++ b/checkpoint-350/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13498e441e117878ad5ef63f21936541c2618640ab7ca677b078d0de5d93cfb3 +size 14645 diff --git a/checkpoint-350/scaler.pt b/checkpoint-350/scaler.pt new file mode 100644 index 0000000..99cb3c4 --- /dev/null +++ b/checkpoint-350/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a3b7c7f6ae3d4b9e2fcb0774d91cea73c152177e3f6818dfc11db3a9a74275 +size 1383 diff --git a/checkpoint-350/scheduler.pt b/checkpoint-350/scheduler.pt new file mode 100644 index 0000000..5bbab41 --- /dev/null +++ b/checkpoint-350/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de957f913cdcc25511d72448987f4ea6828e6a9818bfc6f84f3b177896760ef +size 1465 diff --git a/checkpoint-350/tokenizer.json b/checkpoint-350/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-350/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-350/tokenizer_config.json b/checkpoint-350/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-350/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-350/trainer_state.json b/checkpoint-350/trainer_state.json new file mode 100644 index 0000000..de28e0a --- /dev/null +++ b/checkpoint-350/trainer_state.json @@ -0,0 +1,979 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1875, + "eval_steps": 500, + "global_step": 350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3125, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0007964351680129766, + "kl": 0.0754379843827337, + "learning_rate": 3.136574074074074e-05, + "loss": 7.543485262431204e-05, + "num_tokens": 1214784.0, + "reward": -0.3150000095367432, + "reward_std": 0.029999998956918718, + "rewards/reward_fn/mean": -0.3150000095367432, + "rewards/reward_fn/std": 0.059999996423721315, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.375, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0004977025673724711, + "kl": 0.05669140527024865, + "learning_rate": 3.0208333333333334e-05, + "loss": 5.669001257047057e-05, + "num_tokens": 1272932.0, + "reward": -0.3187500089406967, + "reward_std": 0.03749999850988388, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008227138314396143, + "kl": 0.0492351729888469, + "learning_rate": 2.9050925925925926e-05, + "loss": 4.923430969938636e-05, + "num_tokens": 1330828.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008481480763293803, + "kl": 0.05081965173594653, + "learning_rate": 2.789351851851852e-05, + "loss": 5.0819659372791647e-05, + "num_tokens": 1388588.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036746280966326594, + "kl": 0.046290718484669924, + "learning_rate": 2.6736111111111112e-05, + "loss": 4.6290719183161855e-05, + "num_tokens": 1446308.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007307095802389085, + "kl": 0.04613759703934193, + "learning_rate": 2.5578703703703705e-05, + "loss": 4.613760684151203e-05, + "num_tokens": 1503988.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006135129951871932, + "kl": 0.051912948861718176, + "learning_rate": 2.4421296296296298e-05, + "loss": 5.191295058466494e-05, + "num_tokens": 1561968.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008430331945419312, + "kl": 0.05388685767538846, + "learning_rate": 2.326388888888889e-05, + "loss": 5.3886871319264175e-05, + "num_tokens": 1619908.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030109184444881976, + "kl": 0.04647140288725495, + "learning_rate": 2.2106481481481483e-05, + "loss": 4.6471410314552486e-05, + "num_tokens": 1677612.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029493175679817796, + "kl": 0.04179914747364819, + "learning_rate": 2.0949074074074073e-05, + "loss": 4.17991541326046e-05, + "num_tokens": 1735372.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018822322599589825, + "kl": 0.04672949458472431, + "learning_rate": 1.9791666666666665e-05, + "loss": 4.672949435189366e-05, + "num_tokens": 1793268.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006305224960669875, + "kl": 0.044585675746202466, + "learning_rate": 1.863425925925926e-05, + "loss": 4.458568291738629e-05, + "num_tokens": 1850880.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003445159236434847, + "kl": 0.04190994901582599, + "learning_rate": 1.747685185185185e-05, + "loss": 4.190995532553643e-05, + "num_tokens": 1908628.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.125, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.00043421241571195424, + "kl": 0.04299232084304094, + "learning_rate": 1.6319444444444444e-05, + "loss": 4.2992207454517484e-05, + "num_tokens": 1966240.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005787091795355082, + "kl": 0.047347473446279764, + "learning_rate": 1.5162037037037038e-05, + "loss": 4.7347473446279766e-05, + "num_tokens": 2024200.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 350 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 2024200, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-350/training_args.bin b/checkpoint-350/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-350/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000..6539850 --- /dev/null +++ b/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3078a233f68ad957c692746c54cc6bbd582b492deb5087e1155b34269f3974e9 +size 17462432 diff --git a/checkpoint-400/chat_template.jinja b/checkpoint-400/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-400/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000..de8659c --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:413e080934f9dcafbf33f19f1ce13e4b9fd243d2e21c3a093652a5e1ede8d266 +size 9117643 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000..a3a14b5 --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23a99a08e278d625ba8c9ae6b9e53c91b0f8a9576fd73d18731ed95653aef28b +size 14645 diff --git a/checkpoint-400/scaler.pt b/checkpoint-400/scaler.pt new file mode 100644 index 0000000..0b0bfee --- /dev/null +++ b/checkpoint-400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30858f23bcb22d0baef45bd4add9d6fa474141308c12653c706077b87d932e49 +size 1383 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000..ebfeee1 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba69e116e6977b6bb0276d93bc56ef121109a918e9715d3a98a00b54ed1bd9b7 +size 1465 diff --git a/checkpoint-400/tokenizer.json b/checkpoint-400/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000..a55689a --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,1114 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3125, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0007964351680129766, + "kl": 0.0754379843827337, + "learning_rate": 3.136574074074074e-05, + "loss": 7.543485262431204e-05, + "num_tokens": 1214784.0, + "reward": -0.3150000095367432, + "reward_std": 0.029999998956918718, + "rewards/reward_fn/mean": -0.3150000095367432, + "rewards/reward_fn/std": 0.059999996423721315, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.375, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0004977025673724711, + "kl": 0.05669140527024865, + "learning_rate": 3.0208333333333334e-05, + "loss": 5.669001257047057e-05, + "num_tokens": 1272932.0, + "reward": -0.3187500089406967, + "reward_std": 0.03749999850988388, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008227138314396143, + "kl": 0.0492351729888469, + "learning_rate": 2.9050925925925926e-05, + "loss": 4.923430969938636e-05, + "num_tokens": 1330828.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008481480763293803, + "kl": 0.05081965173594653, + "learning_rate": 2.789351851851852e-05, + "loss": 5.0819659372791647e-05, + "num_tokens": 1388588.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036746280966326594, + "kl": 0.046290718484669924, + "learning_rate": 2.6736111111111112e-05, + "loss": 4.6290719183161855e-05, + "num_tokens": 1446308.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007307095802389085, + "kl": 0.04613759703934193, + "learning_rate": 2.5578703703703705e-05, + "loss": 4.613760684151203e-05, + "num_tokens": 1503988.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006135129951871932, + "kl": 0.051912948861718176, + "learning_rate": 2.4421296296296298e-05, + "loss": 5.191295058466494e-05, + "num_tokens": 1561968.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008430331945419312, + "kl": 0.05388685767538846, + "learning_rate": 2.326388888888889e-05, + "loss": 5.3886871319264175e-05, + "num_tokens": 1619908.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030109184444881976, + "kl": 0.04647140288725495, + "learning_rate": 2.2106481481481483e-05, + "loss": 4.6471410314552486e-05, + "num_tokens": 1677612.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029493175679817796, + "kl": 0.04179914747364819, + "learning_rate": 2.0949074074074073e-05, + "loss": 4.17991541326046e-05, + "num_tokens": 1735372.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018822322599589825, + "kl": 0.04672949458472431, + "learning_rate": 1.9791666666666665e-05, + "loss": 4.672949435189366e-05, + "num_tokens": 1793268.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006305224960669875, + "kl": 0.044585675746202466, + "learning_rate": 1.863425925925926e-05, + "loss": 4.458568291738629e-05, + "num_tokens": 1850880.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003445159236434847, + "kl": 0.04190994901582599, + "learning_rate": 1.747685185185185e-05, + "loss": 4.190995532553643e-05, + "num_tokens": 1908628.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.125, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.00043421241571195424, + "kl": 0.04299232084304094, + "learning_rate": 1.6319444444444444e-05, + "loss": 4.2992207454517484e-05, + "num_tokens": 1966240.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005787091795355082, + "kl": 0.047347473446279764, + "learning_rate": 1.5162037037037038e-05, + "loss": 4.7347473446279766e-05, + "num_tokens": 2024200.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006160089396871626, + "kl": 0.038707632245495914, + "learning_rate": 1.4004629629629631e-05, + "loss": 3.870762884616852e-05, + "num_tokens": 2081972.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.3125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009722487884573638, + "kl": 0.04313907837495208, + "learning_rate": 1.2847222222222222e-05, + "loss": 4.313907993491739e-05, + "num_tokens": 2139648.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004891545977443457, + "kl": 0.04277103706263006, + "learning_rate": 1.1689814814814815e-05, + "loss": 4.277102998457849e-05, + "num_tokens": 2197280.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.4375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046821596333757043, + "kl": 0.04802478738129139, + "learning_rate": 1.0532407407407408e-05, + "loss": 4.802478651981801e-05, + "num_tokens": 2255032.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.5, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004983897670172155, + "kl": 0.04196312534622848, + "learning_rate": 9.375000000000001e-06, + "loss": 4.196249938104302e-05, + "num_tokens": 2312772.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 400 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 2312772, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-450/README.md b/checkpoint-450/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-450/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-450/adapter_config.json b/checkpoint-450/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-450/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-450/adapter_model.safetensors b/checkpoint-450/adapter_model.safetensors new file mode 100644 index 0000000..e6da281 --- /dev/null +++ b/checkpoint-450/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c83a0339a0837b2cf6265409bb116511a8f0631be3ddb492649ee6bf7f4433f0 +size 17462432 diff --git a/checkpoint-450/chat_template.jinja b/checkpoint-450/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-450/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-450/optimizer.pt b/checkpoint-450/optimizer.pt new file mode 100644 index 0000000..1d5da12 --- /dev/null +++ b/checkpoint-450/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50462d573b33590a0b0ef83f94f26a1a5da6b83d104c4ec27a49e2fd3e804e4a +size 9117643 diff --git a/checkpoint-450/rng_state.pth b/checkpoint-450/rng_state.pth new file mode 100644 index 0000000..b608bd1 --- /dev/null +++ b/checkpoint-450/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ba2e1b45f91493e55324133972c6376516576ba79c59042c098b8b777d207c +size 14645 diff --git a/checkpoint-450/scaler.pt b/checkpoint-450/scaler.pt new file mode 100644 index 0000000..9e3490c --- /dev/null +++ b/checkpoint-450/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:911449e67f3112a5e15b8316033a123bafd40147919c93233393f8ca44c64005 +size 1383 diff --git a/checkpoint-450/scheduler.pt b/checkpoint-450/scheduler.pt new file mode 100644 index 0000000..f690c5c --- /dev/null +++ b/checkpoint-450/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ffc55d83179a15923d34ef58cddba1f5faa68d34e22f7ec9c0289903ac559f +size 1465 diff --git a/checkpoint-450/tokenizer.json b/checkpoint-450/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-450/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-450/tokenizer_config.json b/checkpoint-450/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-450/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-450/trainer_state.json b/checkpoint-450/trainer_state.json new file mode 100644 index 0000000..85895a6 --- /dev/null +++ b/checkpoint-450/trainer_state.json @@ -0,0 +1,1249 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8125, + "eval_steps": 500, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3125, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0007964351680129766, + "kl": 0.0754379843827337, + "learning_rate": 3.136574074074074e-05, + "loss": 7.543485262431204e-05, + "num_tokens": 1214784.0, + "reward": -0.3150000095367432, + "reward_std": 0.029999998956918718, + "rewards/reward_fn/mean": -0.3150000095367432, + "rewards/reward_fn/std": 0.059999996423721315, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.375, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0004977025673724711, + "kl": 0.05669140527024865, + "learning_rate": 3.0208333333333334e-05, + "loss": 5.669001257047057e-05, + "num_tokens": 1272932.0, + "reward": -0.3187500089406967, + "reward_std": 0.03749999850988388, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008227138314396143, + "kl": 0.0492351729888469, + "learning_rate": 2.9050925925925926e-05, + "loss": 4.923430969938636e-05, + "num_tokens": 1330828.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008481480763293803, + "kl": 0.05081965173594653, + "learning_rate": 2.789351851851852e-05, + "loss": 5.0819659372791647e-05, + "num_tokens": 1388588.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036746280966326594, + "kl": 0.046290718484669924, + "learning_rate": 2.6736111111111112e-05, + "loss": 4.6290719183161855e-05, + "num_tokens": 1446308.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007307095802389085, + "kl": 0.04613759703934193, + "learning_rate": 2.5578703703703705e-05, + "loss": 4.613760684151203e-05, + "num_tokens": 1503988.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006135129951871932, + "kl": 0.051912948861718176, + "learning_rate": 2.4421296296296298e-05, + "loss": 5.191295058466494e-05, + "num_tokens": 1561968.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008430331945419312, + "kl": 0.05388685767538846, + "learning_rate": 2.326388888888889e-05, + "loss": 5.3886871319264175e-05, + "num_tokens": 1619908.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030109184444881976, + "kl": 0.04647140288725495, + "learning_rate": 2.2106481481481483e-05, + "loss": 4.6471410314552486e-05, + "num_tokens": 1677612.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029493175679817796, + "kl": 0.04179914747364819, + "learning_rate": 2.0949074074074073e-05, + "loss": 4.17991541326046e-05, + "num_tokens": 1735372.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018822322599589825, + "kl": 0.04672949458472431, + "learning_rate": 1.9791666666666665e-05, + "loss": 4.672949435189366e-05, + "num_tokens": 1793268.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006305224960669875, + "kl": 0.044585675746202466, + "learning_rate": 1.863425925925926e-05, + "loss": 4.458568291738629e-05, + "num_tokens": 1850880.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003445159236434847, + "kl": 0.04190994901582599, + "learning_rate": 1.747685185185185e-05, + "loss": 4.190995532553643e-05, + "num_tokens": 1908628.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.125, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.00043421241571195424, + "kl": 0.04299232084304094, + "learning_rate": 1.6319444444444444e-05, + "loss": 4.2992207454517484e-05, + "num_tokens": 1966240.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005787091795355082, + "kl": 0.047347473446279764, + "learning_rate": 1.5162037037037038e-05, + "loss": 4.7347473446279766e-05, + "num_tokens": 2024200.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006160089396871626, + "kl": 0.038707632245495914, + "learning_rate": 1.4004629629629631e-05, + "loss": 3.870762884616852e-05, + "num_tokens": 2081972.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.3125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009722487884573638, + "kl": 0.04313907837495208, + "learning_rate": 1.2847222222222222e-05, + "loss": 4.313907993491739e-05, + "num_tokens": 2139648.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004891545977443457, + "kl": 0.04277103706263006, + "learning_rate": 1.1689814814814815e-05, + "loss": 4.277102998457849e-05, + "num_tokens": 2197280.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.4375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046821596333757043, + "kl": 0.04802478738129139, + "learning_rate": 1.0532407407407408e-05, + "loss": 4.802478651981801e-05, + "num_tokens": 2255032.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.5, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004983897670172155, + "kl": 0.04196312534622848, + "learning_rate": 9.375000000000001e-06, + "loss": 4.196249938104302e-05, + "num_tokens": 2312772.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.5625, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.00025357346748933196, + "kl": 0.04060801230370999, + "learning_rate": 8.217592592592594e-06, + "loss": 4.060695064254105e-05, + "num_tokens": 2370916.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007254995289258659, + "kl": 0.04095169799402356, + "learning_rate": 7.060185185185185e-06, + "loss": 4.09516942454502e-05, + "num_tokens": 2429052.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008786269463598728, + "kl": 0.04227711800485849, + "learning_rate": 5.902777777777778e-06, + "loss": 4.227712051942945e-05, + "num_tokens": 2486904.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004273704544175416, + "kl": 0.04474450033158064, + "learning_rate": 4.7453703703703705e-06, + "loss": 4.4743606122210625e-05, + "num_tokens": 2544744.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.8125, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004974489565938711, + "kl": 0.0426725294906646, + "learning_rate": 3.5879629629629633e-06, + "loss": 4.2671963456086814e-05, + "num_tokens": 2602668.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 450 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 2602668, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-450/training_args.bin b/checkpoint-450/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-450/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-480/README.md b/checkpoint-480/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-480/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-480/adapter_config.json b/checkpoint-480/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-480/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-480/adapter_model.safetensors b/checkpoint-480/adapter_model.safetensors new file mode 100644 index 0000000..ca6a479 --- /dev/null +++ b/checkpoint-480/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd39a9a5e21e7dd43dbc0b86084b801e5c03318652eec48f7bac321348090156 +size 17462432 diff --git a/checkpoint-480/chat_template.jinja b/checkpoint-480/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-480/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-480/optimizer.pt b/checkpoint-480/optimizer.pt new file mode 100644 index 0000000..46cbcaa --- /dev/null +++ b/checkpoint-480/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eef9e6989652c5b09b05f1f34427fba9124875151b9aca27ff72d1f1fa69e34 +size 9117643 diff --git a/checkpoint-480/rng_state.pth b/checkpoint-480/rng_state.pth new file mode 100644 index 0000000..c2ac03c --- /dev/null +++ b/checkpoint-480/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0158d28d6f1c602d54c45feefaaecb2bc5dd1016effa8b0a053a543d36e8ceac +size 14645 diff --git a/checkpoint-480/scaler.pt b/checkpoint-480/scaler.pt new file mode 100644 index 0000000..97b7fef --- /dev/null +++ b/checkpoint-480/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d48ee9d9909680ca611f0a95c8cefcadb338dd2851b722337f41dd0606fbe3b +size 1383 diff --git a/checkpoint-480/scheduler.pt b/checkpoint-480/scheduler.pt new file mode 100644 index 0000000..3134915 --- /dev/null +++ b/checkpoint-480/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0de742672a2184cab4e634b68946f6d485c80b4c7e7b66ccf929e1df05ec3dad +size 1465 diff --git a/checkpoint-480/tokenizer.json b/checkpoint-480/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-480/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-480/tokenizer_config.json b/checkpoint-480/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-480/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-480/trainer_state.json b/checkpoint-480/trainer_state.json new file mode 100644 index 0000000..325d58a --- /dev/null +++ b/checkpoint-480/trainer_state.json @@ -0,0 +1,1330 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.375, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1545860469341278, + "kl": 0.024985690240282566, + "learning_rate": 4.8726851851851855e-05, + "loss": 2.498328685760498e-05, + "num_tokens": 346964.0, + "reward": -0.6074999988079071, + "reward_std": 0.5110155880451203, + "rewards/reward_fn/mean": -0.6075000107288361, + "rewards/reward_fn/std": 0.6012047231197357, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4375, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.12491069734096527, + "kl": 0.042862199200317266, + "learning_rate": 4.756944444444444e-05, + "loss": 4.285752947907895e-05, + "num_tokens": 404676.0, + "reward": -0.41250000298023226, + "reward_std": 0.19732050001621246, + "rewards/reward_fn/mean": -0.41250000298023226, + "rewards/reward_fn/std": 0.3099328398704529, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.139762744307518, + "kl": 0.05795831000432372, + "learning_rate": 4.6412037037037034e-05, + "loss": 5.795210599899292e-05, + "num_tokens": 462404.0, + "reward": -0.3443750023841858, + "reward_std": 0.11552571803331375, + "rewards/reward_fn/mean": -0.344375005364418, + "rewards/reward_fn/std": 0.1971571922302246, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5625, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0012635246384888887, + "kl": 0.0758059567771852, + "learning_rate": 4.525462962962963e-05, + "loss": 7.580451783724129e-05, + "num_tokens": 520292.0, + "reward": -0.341250005364418, + "reward_std": 0.06749999970197677, + "rewards/reward_fn/mean": -0.34125000834465025, + "rewards/reward_fn/std": 0.14239275455474854, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.28265202045440674, + "kl": 0.10205610543489456, + "learning_rate": 4.4097222222222226e-05, + "loss": 0.00010205432772636414, + "num_tokens": 578164.0, + "reward": -0.31062501221895217, + "reward_std": 0.05374999940395355, + "rewards/reward_fn/mean": -0.31062501221895217, + "rewards/reward_fn/std": 0.0848927527666092, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.15435293316841125, + "kl": 0.11740942746400833, + "learning_rate": 4.293981481481482e-05, + "loss": 0.00011740787886083126, + "num_tokens": 635928.0, + "reward": -0.3262500077486038, + "reward_std": 0.052499999105930326, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0017832271987572312, + "kl": 0.1092877883464098, + "learning_rate": 4.178240740740741e-05, + "loss": 0.0001092874095775187, + "num_tokens": 693776.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8125, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0019980475772172213, + "kl": 0.1017705887556076, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.00010176897048950196, + "num_tokens": 751704.0, + "reward": -0.3262500077486038, + "reward_std": 0.05249999761581421, + "rewards/reward_fn/mean": -0.3262500077486038, + "rewards/reward_fn/std": 0.10499999821186065, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.006963597144931555, + "kl": 0.10655302237719297, + "learning_rate": 3.94675925925926e-05, + "loss": 0.00010655303485691547, + "num_tokens": 809360.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9375, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015019988641142845, + "kl": 0.10568692134693265, + "learning_rate": 3.831018518518519e-05, + "loss": 0.00010568510042503476, + "num_tokens": 867440.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008070049225352705, + "kl": 0.07564885132014751, + "learning_rate": 3.715277777777778e-05, + "loss": 7.564762490801513e-05, + "num_tokens": 925440.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0625, + "frac_reward_zero_std": 0.925, + "grad_norm": 0.0015270161675289273, + "kl": 0.06843139906413853, + "learning_rate": 3.5995370370370376e-05, + "loss": 6.842819275334477e-05, + "num_tokens": 983304.0, + "reward": -0.33375000655651094, + "reward_std": 0.06749999821186066, + "rewards/reward_fn/mean": -0.33375000655651094, + "rewards/reward_fn/std": 0.1349999874830246, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007024089572951198, + "kl": 0.06553028370253741, + "learning_rate": 3.483796296296297e-05, + "loss": 6.553027196787298e-05, + "num_tokens": 1041204.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001360859489068389, + "kl": 0.058797695627436045, + "learning_rate": 3.368055555555556e-05, + "loss": 5.8797706151381136e-05, + "num_tokens": 1099072.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012780348770320415, + "kl": 0.05732685485854745, + "learning_rate": 3.252314814814815e-05, + "loss": 5.732685676775873e-05, + "num_tokens": 1156860.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3125, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0007964351680129766, + "kl": 0.0754379843827337, + "learning_rate": 3.136574074074074e-05, + "loss": 7.543485262431204e-05, + "num_tokens": 1214784.0, + "reward": -0.3150000095367432, + "reward_std": 0.029999998956918718, + "rewards/reward_fn/mean": -0.3150000095367432, + "rewards/reward_fn/std": 0.059999996423721315, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.375, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0004977025673724711, + "kl": 0.05669140527024865, + "learning_rate": 3.0208333333333334e-05, + "loss": 5.669001257047057e-05, + "num_tokens": 1272932.0, + "reward": -0.3187500089406967, + "reward_std": 0.03749999850988388, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0008227138314396143, + "kl": 0.0492351729888469, + "learning_rate": 2.9050925925925926e-05, + "loss": 4.923430969938636e-05, + "num_tokens": 1330828.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008481480763293803, + "kl": 0.05081965173594653, + "learning_rate": 2.789351851851852e-05, + "loss": 5.0819659372791647e-05, + "num_tokens": 1388588.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036746280966326594, + "kl": 0.046290718484669924, + "learning_rate": 2.6736111111111112e-05, + "loss": 4.6290719183161855e-05, + "num_tokens": 1446308.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007307095802389085, + "kl": 0.04613759703934193, + "learning_rate": 2.5578703703703705e-05, + "loss": 4.613760684151203e-05, + "num_tokens": 1503988.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006135129951871932, + "kl": 0.051912948861718176, + "learning_rate": 2.4421296296296298e-05, + "loss": 5.191295058466494e-05, + "num_tokens": 1561968.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008430331945419312, + "kl": 0.05388685767538846, + "learning_rate": 2.326388888888889e-05, + "loss": 5.3886871319264175e-05, + "num_tokens": 1619908.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030109184444881976, + "kl": 0.04647140288725495, + "learning_rate": 2.2106481481481483e-05, + "loss": 4.6471410314552486e-05, + "num_tokens": 1677612.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029493175679817796, + "kl": 0.04179914747364819, + "learning_rate": 2.0949074074074073e-05, + "loss": 4.17991541326046e-05, + "num_tokens": 1735372.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018822322599589825, + "kl": 0.04672949458472431, + "learning_rate": 1.9791666666666665e-05, + "loss": 4.672949435189366e-05, + "num_tokens": 1793268.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006305224960669875, + "kl": 0.044585675746202466, + "learning_rate": 1.863425925925926e-05, + "loss": 4.458568291738629e-05, + "num_tokens": 1850880.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.0625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003445159236434847, + "kl": 0.04190994901582599, + "learning_rate": 1.747685185185185e-05, + "loss": 4.190995532553643e-05, + "num_tokens": 1908628.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.125, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.00043421241571195424, + "kl": 0.04299232084304094, + "learning_rate": 1.6319444444444444e-05, + "loss": 4.2992207454517484e-05, + "num_tokens": 1966240.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005787091795355082, + "kl": 0.047347473446279764, + "learning_rate": 1.5162037037037038e-05, + "loss": 4.7347473446279766e-05, + "num_tokens": 2024200.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006160089396871626, + "kl": 0.038707632245495914, + "learning_rate": 1.4004629629629631e-05, + "loss": 3.870762884616852e-05, + "num_tokens": 2081972.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.3125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009722487884573638, + "kl": 0.04313907837495208, + "learning_rate": 1.2847222222222222e-05, + "loss": 4.313907993491739e-05, + "num_tokens": 2139648.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004891545977443457, + "kl": 0.04277103706263006, + "learning_rate": 1.1689814814814815e-05, + "loss": 4.277102998457849e-05, + "num_tokens": 2197280.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.4375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046821596333757043, + "kl": 0.04802478738129139, + "learning_rate": 1.0532407407407408e-05, + "loss": 4.802478651981801e-05, + "num_tokens": 2255032.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.5, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004983897670172155, + "kl": 0.04196312534622848, + "learning_rate": 9.375000000000001e-06, + "loss": 4.196249938104302e-05, + "num_tokens": 2312772.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.5625, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.00025357346748933196, + "kl": 0.04060801230370999, + "learning_rate": 8.217592592592594e-06, + "loss": 4.060695064254105e-05, + "num_tokens": 2370916.0, + "reward": -0.30750001072883604, + "reward_std": 0.015000000596046448, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007254995289258659, + "kl": 0.04095169799402356, + "learning_rate": 7.060185185185185e-06, + "loss": 4.09516942454502e-05, + "num_tokens": 2429052.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008786269463598728, + "kl": 0.04227711800485849, + "learning_rate": 5.902777777777778e-06, + "loss": 4.227712051942945e-05, + "num_tokens": 2486904.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.75, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004273704544175416, + "kl": 0.04474450033158064, + "learning_rate": 4.7453703703703705e-06, + "loss": 4.4743606122210625e-05, + "num_tokens": 2544744.0, + "reward": -0.30750001072883604, + "reward_std": 0.014999999105930329, + "rewards/reward_fn/mean": -0.30750001072883604, + "rewards/reward_fn/std": 0.030000001192092896, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.8125, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0004974489565938711, + "kl": 0.0426725294906646, + "learning_rate": 3.5879629629629633e-06, + "loss": 4.2671963456086814e-05, + "num_tokens": 2602668.0, + "reward": -0.31125001013278963, + "reward_std": 0.02249999940395355, + "rewards/reward_fn/mean": -0.31125001013278963, + "rewards/reward_fn/std": 0.04499999582767487, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014066597213968635, + "kl": 0.03986191004514694, + "learning_rate": 2.4305555555555557e-06, + "loss": 3.9861915865913033e-05, + "num_tokens": 2660728.0, + "reward": -0.30000001192092896, + "reward_std": 0.0, + "rewards/reward_fn/mean": -0.30000001192092896, + "rewards/reward_fn/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 2.9375, + "frac_reward_zero_std": 0.975, + "grad_norm": 0.0002597096317913383, + "kl": 0.043036780599504706, + "learning_rate": 1.273148148148148e-06, + "loss": 4.3035755516029896e-05, + "num_tokens": 2718520.0, + "reward": -0.3037500113248825, + "reward_std": 0.007499999552965164, + "rewards/reward_fn/mean": -0.3037500113248825, + "rewards/reward_fn/std": 0.015000000596046448, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 3.0, + "frac_reward_zero_std": 0.95, + "grad_norm": 0.0002936372475232929, + "kl": 0.040247406950220464, + "learning_rate": 1.1574074074074074e-07, + "loss": 4.0245510172098874e-05, + "num_tokens": 2776320.0, + "reward": -0.3187500089406967, + "reward_std": 0.0375, + "rewards/reward_fn/mean": -0.3187500089406967, + "rewards/reward_fn/std": 0.07499999701976776, + "step": 480 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 2776320, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-480/training_args.bin b/checkpoint-480/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/checkpoint-50/README.md b/checkpoint-50/README.md new file mode 100644 index 0000000..bf064af --- /dev/null +++ b/checkpoint-50/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit +- grpo +- lora +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-50/adapter_config.json b/checkpoint-50/adapter_config.json new file mode 100644 index 0000000..ae4673f --- /dev/null +++ b/checkpoint-50/adapter_config.json @@ -0,0 +1,49 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Qwen2ForCausalLM", + "parent_library": "transformers.models.qwen2.modeling_qwen2", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-50/adapter_model.safetensors b/checkpoint-50/adapter_model.safetensors new file mode 100644 index 0000000..afa355d --- /dev/null +++ b/checkpoint-50/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8eba8a77750db164f5ecece721d093a47ff6340b89b609ac9f2ef35f41a9337 +size 17462432 diff --git a/checkpoint-50/chat_template.jinja b/checkpoint-50/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-50/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-50/optimizer.pt b/checkpoint-50/optimizer.pt new file mode 100644 index 0000000..8951e30 --- /dev/null +++ b/checkpoint-50/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b1c18c7cd33a42d323673ee8a0bd892920ca7b1e24793dcc2434554641948a +size 9117451 diff --git a/checkpoint-50/rng_state.pth b/checkpoint-50/rng_state.pth new file mode 100644 index 0000000..775e714 --- /dev/null +++ b/checkpoint-50/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e52239b8b8f2e97436242bf74bdeabb95b879e8f659173263a720ce25ef94b3 +size 14645 diff --git a/checkpoint-50/scaler.pt b/checkpoint-50/scaler.pt new file mode 100644 index 0000000..48ade11 --- /dev/null +++ b/checkpoint-50/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cd0e9d505fbc3f97feb166d29026132bdf14eb3e5c7ff77beebc303ee666f96 +size 1383 diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt new file mode 100644 index 0000000..d1a636c --- /dev/null +++ b/checkpoint-50/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9889b752e9a085fba515d7af54f3d53bc889daca04ab26b9d3ae3400ac87c21d +size 1465 diff --git a/checkpoint-50/tokenizer.json b/checkpoint-50/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/checkpoint-50/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/checkpoint-50/tokenizer_config.json b/checkpoint-50/tokenizer_config.json new file mode 100644 index 0000000..d3dac53 --- /dev/null +++ b/checkpoint-50/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json new file mode 100644 index 0000000..6f42195 --- /dev/null +++ b/checkpoint-50/trainer_state.json @@ -0,0 +1,169 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3125, + "eval_steps": 500, + "global_step": 50, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0625, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.12458323687314987, + "kl": 1.9381546007934957e-05, + "learning_rate": 9.375000000000001e-06, + "loss": 3.427267074584961e-08, + "num_tokens": 57908.0, + "reward": -1.4099999368190765, + "reward_std": 0.7305359303951263, + "rewards/reward_fn/mean": -1.409999930858612, + "rewards/reward_fn/std": 0.8336655974388123, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.125, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.13424909114837646, + "kl": 3.758955572266132e-05, + "learning_rate": 1.9791666666666665e-05, + "loss": 2.0861625671386717e-08, + "num_tokens": 115744.0, + "reward": -1.5606249690055847, + "reward_std": 0.6338691949844361, + "rewards/reward_fn/mean": -1.5606249451637269, + "rewards/reward_fn/std": 0.6913031578063965, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1875, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.12381561845541, + "kl": 0.000246869730472099, + "learning_rate": 3.0208333333333334e-05, + "loss": 2.4400651454925537e-07, + "num_tokens": 173524.0, + "reward": -1.4581249356269836, + "reward_std": 0.7430005550384522, + "rewards/reward_fn/mean": -1.4581249117851258, + "rewards/reward_fn/std": 0.7565619319677352, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.14411121606826782, + "kl": 0.0016107790230307727, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.6100704669952392e-06, + "num_tokens": 231416.0, + "reward": -1.3331249535083771, + "reward_std": 0.6972330152988434, + "rewards/reward_fn/mean": -1.3331249415874482, + "rewards/reward_fn/std": 0.766948527097702, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 256.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 256.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.14639103412628174, + "kl": 0.007588638024753891, + "learning_rate": 4.988425925925926e-05, + "loss": 7.59512186050415e-06, + "num_tokens": 289368.0, + "reward": -1.0006249606609345, + "reward_std": 0.6684236854314805, + "rewards/reward_fn/mean": -1.0006249785423278, + "rewards/reward_fn/std": 0.7909055948257446, + "step": 50 + } + ], + "logging_steps": 10, + "max_steps": 480, + "num_input_tokens_seen": 289368, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin new file mode 100644 index 0000000..32a13ee --- /dev/null +++ b/checkpoint-50/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb5eec9fa732effecc39928ed0f7eda205a0e16573a345963766968a1a8e3e2 +size 6737 diff --git a/config.json b/config.json new file mode 100644 index 0000000..c8aa5c7 --- /dev/null +++ b/config.json @@ -0,0 +1,62 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "torch_dtype": "float16", + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151665, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "unsloth_version": "2026.4.8", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..db93746 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e3200b7aca4cc53df643242e4fa7c8111d9066a1a79996d1441494352aa64c +size 3087467144 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..9fa63fd --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446 +size 11422523 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..544df20 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,202 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + }, + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" +} \ No newline at end of file