commit 1ce601e77deba25f137ddee4397c450b8f00c161 Author: ModelHub XC Date: Fri Jun 19 04:53:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: mohd-musheer/qforge-qwen-adapter Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4073e3b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,40 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-658/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs 
merge=lfs -text +qforge-q4.gguf filter=lfs diff=lfs merge=lfs -text +qforge-f16.gguf filter=lfs diff=lfs merge=lfs -text +qforge-q5.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..44fad20 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +--- +base_model: Qwen/Qwen2.5-Coder-3B-Instruct +library_name: peft +model_name: qforge-qwen +tags: +- base_model:adapter:Qwen/Qwen2.5-Coder-3B-Instruct +- lora +- sft +- transformers +- trl +licence: license +pipeline_tag: text-generation +--- + +# Model Card for qforge-qwen + +This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="mohd-musheer/qforge-qwen-adapter", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + + +This model was trained with SFT. 
+ +### Framework versions + +- PEFT 0.18.1 +- TRL: 1.4.0 +- Transformers: 5.0.0 +- Pytorch: 2.10.0+cu128 +- Datasets: 4.8.3 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000..aa35e84 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-Coder-3B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "down_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000..d5a0eed --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64730d3f15a97013c981be20d0609f8a2898bda8713714746daa04299a1c68ba +size 59934640 diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-658/README.md b/checkpoint-658/README.md new file mode 100644 index 0000000..c36ea41 --- /dev/null +++ b/checkpoint-658/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-Coder-3B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-Coder-3B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information 
Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/checkpoint-658/adapter_config.json b/checkpoint-658/adapter_config.json new file mode 100644 index 0000000..aa35e84 --- /dev/null +++ b/checkpoint-658/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-Coder-3B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "down_proj", + "k_proj", + "up_proj", + 
"gate_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-658/adapter_model.safetensors b/checkpoint-658/adapter_model.safetensors new file mode 100644 index 0000000..d5a0eed --- /dev/null +++ b/checkpoint-658/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64730d3f15a97013c981be20d0609f8a2898bda8713714746daa04299a1c68ba +size 59934640 diff --git a/checkpoint-658/chat_template.jinja b/checkpoint-658/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/checkpoint-658/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-658/optimizer.pt b/checkpoint-658/optimizer.pt new file mode 100644 index 0000000..9418884 --- /dev/null +++ b/checkpoint-658/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd41b8b431e454335d0a83a253c37c042418e7691ec194f802dd0f58098b72e7 +size 61409159 diff --git a/checkpoint-658/rng_state.pth b/checkpoint-658/rng_state.pth new file mode 100644 index 0000000..9c0cb34 --- /dev/null +++ b/checkpoint-658/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2db01e3128ed019448fd11a4489d12d7a72bc1e8c99c2821d96a5f723526872e +size 14645 diff --git a/checkpoint-658/scheduler.pt b/checkpoint-658/scheduler.pt new file mode 100644 index 0000000..2980fc1 --- 
/dev/null +++ b/checkpoint-658/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d349d7c839f4e3c54b0d8155948a539be7f248e0ba7954cac85ff1c9edfaafa +size 1465 diff --git a/checkpoint-658/tokenizer.json b/checkpoint-658/tokenizer.json new file mode 100644 index 0000000..34510ff --- /dev/null +++ b/checkpoint-658/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-658/tokenizer_config.json b/checkpoint-658/tokenizer_config.json new file mode 100644 index 0000000..18ddaff --- /dev/null +++ b/checkpoint-658/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|im_end|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-658/trainer_state.json b/checkpoint-658/trainer_state.json new file mode 100644 index 0000000..5252614 --- /dev/null +++ b/checkpoint-658/trainer_state.json @@ -0,0 +1,684 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 658, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.1479004077613353, + "epoch": 0.015197568389057751, + "grad_norm": 0.51171875, + "learning_rate": 1.8e-05, + "loss": 1.5924851417541503, + 
"mean_token_accuracy": 0.6634366158396006, + "num_tokens": 62452.0, + "step": 10 + }, + { + "entropy": 1.2071532987058162, + "epoch": 0.030395136778115502, + "grad_norm": 0.361328125, + "learning_rate": 3.8e-05, + "loss": 1.5587355613708496, + "mean_token_accuracy": 0.6596032619476319, + "num_tokens": 125614.0, + "step": 20 + }, + { + "entropy": 1.1939050488173961, + "epoch": 0.04559270516717325, + "grad_norm": 0.40625, + "learning_rate": 5.8e-05, + "loss": 1.3669007301330567, + "mean_token_accuracy": 0.6933483470231294, + "num_tokens": 184892.0, + "step": 30 + }, + { + "entropy": 1.221787953004241, + "epoch": 0.060790273556231005, + "grad_norm": 0.353515625, + "learning_rate": 7.800000000000001e-05, + "loss": 1.23649320602417, + "mean_token_accuracy": 0.7031858772039413, + "num_tokens": 245953.0, + "step": 40 + }, + { + "entropy": 1.077309934794903, + "epoch": 0.07598784194528875, + "grad_norm": 0.28515625, + "learning_rate": 9.8e-05, + "loss": 1.0901852607727052, + "mean_token_accuracy": 0.7312592580914498, + "num_tokens": 306051.0, + "step": 50 + }, + { + "entropy": 1.1815281737595797, + "epoch": 0.0911854103343465, + "grad_norm": 0.2353515625, + "learning_rate": 0.000118, + "loss": 1.188125991821289, + "mean_token_accuracy": 0.7160288453102112, + "num_tokens": 366518.0, + "step": 60 + }, + { + "entropy": 1.0586148291826247, + "epoch": 0.10638297872340426, + "grad_norm": 0.2392578125, + "learning_rate": 0.000138, + "loss": 1.0688477516174317, + "mean_token_accuracy": 0.7354599207639694, + "num_tokens": 430087.0, + "step": 70 + }, + { + "entropy": 1.050880615785718, + "epoch": 0.12158054711246201, + "grad_norm": 0.3515625, + "learning_rate": 0.00015800000000000002, + "loss": 1.051170825958252, + "mean_token_accuracy": 0.73864571377635, + "num_tokens": 491536.0, + "step": 80 + }, + { + "entropy": 1.0797226771712303, + "epoch": 0.13677811550151975, + "grad_norm": 0.298828125, + "learning_rate": 0.00017800000000000002, + "loss": 1.0682001113891602, + 
"mean_token_accuracy": 0.7340194799005986, + "num_tokens": 552928.0, + "step": 90 + }, + { + "entropy": 1.0816405173391104, + "epoch": 0.1519756838905775, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019800000000000002, + "loss": 1.0748048782348634, + "mean_token_accuracy": 0.7275203809142112, + "num_tokens": 617585.0, + "step": 100 + }, + { + "entropy": 1.1387810289859772, + "epoch": 0.16717325227963525, + "grad_norm": 0.2421875, + "learning_rate": 0.00019987165071710527, + "loss": 1.1564626693725586, + "mean_token_accuracy": 0.7178827051073313, + "num_tokens": 682493.0, + "step": 110 + }, + { + "entropy": 1.0470557224005461, + "epoch": 0.182370820668693, + "grad_norm": 0.263671875, + "learning_rate": 0.00019942839715782445, + "loss": 1.0679837226867677, + "mean_token_accuracy": 0.7393594264984131, + "num_tokens": 741585.0, + "step": 120 + }, + { + "entropy": 1.022692532464862, + "epoch": 0.19756838905775076, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001986700590805916, + "loss": 1.022587490081787, + "mean_token_accuracy": 0.7484504476189613, + "num_tokens": 805276.0, + "step": 130 + }, + { + "entropy": 1.0931762170046568, + "epoch": 0.2127659574468085, + "grad_norm": 0.2890625, + "learning_rate": 0.00019759903962771156, + "loss": 1.0889897346496582, + "mean_token_accuracy": 0.7326500549912452, + "num_tokens": 870605.0, + "step": 140 + }, + { + "entropy": 1.046710380911827, + "epoch": 0.22796352583586627, + "grad_norm": 0.283203125, + "learning_rate": 0.00019621873281596092, + "loss": 1.088566493988037, + "mean_token_accuracy": 0.7342231959104538, + "num_tokens": 932551.0, + "step": 150 + }, + { + "entropy": 1.0759602926671505, + "epoch": 0.24316109422492402, + "grad_norm": 0.2265625, + "learning_rate": 0.00019453351278108806, + "loss": 1.0699708938598633, + "mean_token_accuracy": 0.735610119625926, + "num_tokens": 991401.0, + "step": 160 + }, + { + "entropy": 1.0602354250848294, + "epoch": 0.25835866261398177, + "grad_norm": 0.2255859375, + 
"learning_rate": 0.00019254871991635598, + "loss": 1.0612274169921876, + "mean_token_accuracy": 0.7369228135794401, + "num_tokens": 1053437.0, + "step": 170 + }, + { + "entropy": 1.0833635926246643, + "epoch": 0.2735562310030395, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019027064394905473, + "loss": 1.0936810493469238, + "mean_token_accuracy": 0.7335428461432457, + "num_tokens": 1116692.0, + "step": 180 + }, + { + "entropy": 1.0710609000176192, + "epoch": 0.2887537993920973, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018770650400861357, + "loss": 1.0741844177246094, + "mean_token_accuracy": 0.7365527264773846, + "num_tokens": 1176488.0, + "step": 190 + }, + { + "entropy": 1.0221609488129615, + "epoch": 0.303951367781155, + "grad_norm": 0.302734375, + "learning_rate": 0.00018486442574947511, + "loss": 1.0395893096923827, + "mean_token_accuracy": 0.7463255256414414, + "num_tokens": 1234621.0, + "step": 200 + }, + { + "entropy": 1.028166577219963, + "epoch": 0.3191489361702128, + "grad_norm": 0.263671875, + "learning_rate": 0.0001817534156012295, + "loss": 1.0307375907897949, + "mean_token_accuracy": 0.7422413423657417, + "num_tokens": 1293632.0, + "step": 210 + }, + { + "entropy": 1.0808881603181362, + "epoch": 0.3343465045592705, + "grad_norm": 0.220703125, + "learning_rate": 0.00017838333222760792, + "loss": 1.0678336143493652, + "mean_token_accuracy": 0.7352703791111708, + "num_tokens": 1355301.0, + "step": 220 + }, + { + "entropy": 1.0897805251181125, + "epoch": 0.3495440729483283, + "grad_norm": 0.232421875, + "learning_rate": 0.00017476485528478093, + "loss": 1.0985455513000488, + "mean_token_accuracy": 0.7289470963180065, + "num_tokens": 1416981.0, + "step": 230 + }, + { + "entropy": 1.0927846007049085, + "epoch": 0.364741641337386, + "grad_norm": 0.23828125, + "learning_rate": 0.0001709094515779655, + "loss": 1.0999566078186036, + "mean_token_accuracy": 0.7316457532346249, + "num_tokens": 1477934.0, + "step": 240 + }, + { + "entropy": 
1.0481801606714725, + "epoch": 0.3799392097264438, + "grad_norm": 0.2421875, + "learning_rate": 0.00016682933872358912, + "loss": 1.0316462516784668, + "mean_token_accuracy": 0.7426637083292007, + "num_tokens": 1539241.0, + "step": 250 + }, + { + "entropy": 1.0000269718468189, + "epoch": 0.3951367781155015, + "grad_norm": 0.255859375, + "learning_rate": 0.00016253744643216368, + "loss": 1.0046791076660155, + "mean_token_accuracy": 0.747460788488388, + "num_tokens": 1604048.0, + "step": 260 + }, + { + "entropy": 1.0748730458319187, + "epoch": 0.41033434650455924, + "grad_norm": 0.236328125, + "learning_rate": 0.0001580473755345625, + "loss": 1.0607515335083009, + "mean_token_accuracy": 0.7351726233959198, + "num_tokens": 1666904.0, + "step": 270 + }, + { + "entropy": 1.059519186615944, + "epoch": 0.425531914893617, + "grad_norm": 0.224609375, + "learning_rate": 0.00015337335488154431, + "loss": 1.0550406455993653, + "mean_token_accuracy": 0.7438629917800427, + "num_tokens": 1725233.0, + "step": 280 + }, + { + "entropy": 1.049350445717573, + "epoch": 0.44072948328267475, + "grad_norm": 0.203125, + "learning_rate": 0.00014853019625310813, + "loss": 1.0740507125854493, + "mean_token_accuracy": 0.7419425651431084, + "num_tokens": 1788516.0, + "step": 290 + }, + { + "entropy": 1.0794111423194408, + "epoch": 0.45592705167173253, + "grad_norm": 0.2734375, + "learning_rate": 0.000143533247420569, + "loss": 1.0830384254455567, + "mean_token_accuracy": 0.7354968316853047, + "num_tokens": 1849950.0, + "step": 300 + }, + { + "entropy": 1.0809252394363285, + "epoch": 0.47112462006079026, + "grad_norm": 0.271484375, + "learning_rate": 0.00013839834351009954, + "loss": 1.106314754486084, + "mean_token_accuracy": 0.7333131659775972, + "num_tokens": 1911826.0, + "step": 310 + }, + { + "entropy": 1.0794141918420792, + "epoch": 0.48632218844984804, + "grad_norm": 0.251953125, + "learning_rate": 0.0001331417568218636, + "loss": 1.080414390563965, + "mean_token_accuracy": 
0.7330277658998966, + "num_tokens": 1971255.0, + "step": 320 + }, + { + "entropy": 1.0166626147925855, + "epoch": 0.5015197568389058, + "grad_norm": 0.2412109375, + "learning_rate": 0.00012778014526376353, + "loss": 1.0223579406738281, + "mean_token_accuracy": 0.7451204493641853, + "num_tokens": 2035666.0, + "step": 330 + }, + { + "entropy": 1.0680379424244166, + "epoch": 0.5167173252279635, + "grad_norm": 0.21875, + "learning_rate": 0.0001223304995632124, + "loss": 1.063144588470459, + "mean_token_accuracy": 0.7386665888130665, + "num_tokens": 2097745.0, + "step": 340 + }, + { + "entropy": 1.1010748460888862, + "epoch": 0.5319148936170213, + "grad_norm": 0.265625, + "learning_rate": 0.00011681008942421483, + "loss": 1.1296490669250487, + "mean_token_accuracy": 0.7327280201017856, + "num_tokens": 2154975.0, + "step": 350 + }, + { + "entropy": 1.0673069586977362, + "epoch": 0.547112462006079, + "grad_norm": 0.2392578125, + "learning_rate": 0.00011123640880038233, + "loss": 1.0613948822021484, + "mean_token_accuracy": 0.7358887560665608, + "num_tokens": 2218174.0, + "step": 360 + }, + { + "entropy": 1.120655293017626, + "epoch": 0.5623100303951368, + "grad_norm": 0.2255859375, + "learning_rate": 0.00010562712045731084, + "loss": 1.1228485107421875, + "mean_token_accuracy": 0.7270086117088794, + "num_tokens": 2282763.0, + "step": 370 + }, + { + "entropy": 1.023940760269761, + "epoch": 0.5775075987841946, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001, + "loss": 1.01791353225708, + "mean_token_accuracy": 0.7478193882852793, + "num_tokens": 2342406.0, + "step": 380 + }, + { + "entropy": 1.0468089915812016, + "epoch": 0.5927051671732523, + "grad_norm": 0.296875, + "learning_rate": 9.43728795426892e-05, + "loss": 1.049548053741455, + "mean_token_accuracy": 0.742385634034872, + "num_tokens": 2403644.0, + "step": 390 + }, + { + "entropy": 1.0749794770032168, + "epoch": 0.60790273556231, + "grad_norm": 0.224609375, + "learning_rate": 8.87635911996177e-05, + "loss": 
1.0873553276062011, + "mean_token_accuracy": 0.7330629404634237, + "num_tokens": 2466781.0, + "step": 400 + }, + { + "entropy": 1.0800842259079217, + "epoch": 0.6231003039513677, + "grad_norm": 0.2734375, + "learning_rate": 8.31899105757852e-05, + "loss": 1.0950210571289063, + "mean_token_accuracy": 0.7357834167778492, + "num_tokens": 2529775.0, + "step": 410 + }, + { + "entropy": 1.0835391595959663, + "epoch": 0.6382978723404256, + "grad_norm": 0.2734375, + "learning_rate": 7.766950043678764e-05, + "loss": 1.0691117286682128, + "mean_token_accuracy": 0.7371876426041126, + "num_tokens": 2593744.0, + "step": 420 + }, + { + "entropy": 1.0298739653080702, + "epoch": 0.6534954407294833, + "grad_norm": 0.240234375, + "learning_rate": 7.221985473623654e-05, + "loss": 1.0413244247436524, + "mean_token_accuracy": 0.7439139492809772, + "num_tokens": 2656829.0, + "step": 430 + }, + { + "entropy": 1.1207933265715837, + "epoch": 0.668693009118541, + "grad_norm": 0.244140625, + "learning_rate": 6.685824317813643e-05, + "loss": 1.136990737915039, + "mean_token_accuracy": 0.7278301935642958, + "num_tokens": 2719001.0, + "step": 440 + }, + { + "entropy": 1.0710795857012272, + "epoch": 0.6838905775075987, + "grad_norm": 0.263671875, + "learning_rate": 6.160165648990048e-05, + "loss": 1.0599514961242675, + "mean_token_accuracy": 0.7398943588137626, + "num_tokens": 2783541.0, + "step": 450 + }, + { + "entropy": 1.0919535238295794, + "epoch": 0.6990881458966566, + "grad_norm": 0.609375, + "learning_rate": 5.6466752579431016e-05, + "loss": 1.0944193840026855, + "mean_token_accuracy": 0.7346091568470001, + "num_tokens": 2845061.0, + "step": 460 + }, + { + "entropy": 1.0125693645328284, + "epoch": 0.7142857142857143, + "grad_norm": 0.2470703125, + "learning_rate": 5.146980374689192e-05, + "loss": 1.0110005378723144, + "mean_token_accuracy": 0.743566332012415, + "num_tokens": 2905588.0, + "step": 470 + }, + { + "entropy": 1.0804476391524076, + "epoch": 0.729483282674772, + "grad_norm": 
0.224609375, + "learning_rate": 4.662664511845568e-05, + "loss": 1.065206813812256, + "mean_token_accuracy": 0.7375747956335544, + "num_tokens": 2969406.0, + "step": 480 + }, + { + "entropy": 1.076500639691949, + "epoch": 0.7446808510638298, + "grad_norm": 0.2109375, + "learning_rate": 4.195262446543753e-05, + "loss": 1.0726984977722167, + "mean_token_accuracy": 0.7347712397575379, + "num_tokens": 3033325.0, + "step": 490 + }, + { + "entropy": 1.0519217938184737, + "epoch": 0.7598784194528876, + "grad_norm": 0.2373046875, + "learning_rate": 3.746255356783632e-05, + "loss": 1.0575304985046388, + "mean_token_accuracy": 0.7421793609857559, + "num_tokens": 3096358.0, + "step": 500 + }, + { + "entropy": 1.1361971575766803, + "epoch": 0.7750759878419453, + "grad_norm": 0.26953125, + "learning_rate": 3.317066127641091e-05, + "loss": 1.14003267288208, + "mean_token_accuracy": 0.7234445497393608, + "num_tokens": 3157442.0, + "step": 510 + }, + { + "entropy": 1.0398458503186703, + "epoch": 0.790273556231003, + "grad_norm": 0.263671875, + "learning_rate": 2.9090548422034525e-05, + "loss": 1.0192261695861817, + "mean_token_accuracy": 0.7409338817000389, + "num_tokens": 3217105.0, + "step": 520 + }, + { + "entropy": 1.0723623022437097, + "epoch": 0.8054711246200608, + "grad_norm": 0.208984375, + "learning_rate": 2.523514471521913e-05, + "loss": 1.0806448936462403, + "mean_token_accuracy": 0.739747503399849, + "num_tokens": 3279648.0, + "step": 530 + }, + { + "entropy": 1.0766091130673885, + "epoch": 0.8206686930091185, + "grad_norm": 0.2578125, + "learning_rate": 2.1616667772392074e-05, + "loss": 1.0830445289611816, + "mean_token_accuracy": 0.7354145631194114, + "num_tokens": 3340664.0, + "step": 540 + }, + { + "entropy": 1.0176336735486984, + "epoch": 0.8358662613981763, + "grad_norm": 0.2294921875, + "learning_rate": 1.8246584398770493e-05, + "loss": 1.0089756965637207, + "mean_token_accuracy": 0.7476604901254177, + "num_tokens": 3403315.0, + "step": 550 + }, + { + "entropy": 
1.1855169147253037, + "epoch": 0.851063829787234, + "grad_norm": 0.251953125, + "learning_rate": 1.5135574250524897e-05, + "loss": 1.2196799278259278, + "mean_token_accuracy": 0.711599162966013, + "num_tokens": 3465413.0, + "step": 560 + }, + { + "entropy": 1.0133992433547974, + "epoch": 0.8662613981762918, + "grad_norm": 0.22265625, + "learning_rate": 1.229349599138645e-05, + "loss": 1.0138180732727051, + "mean_token_accuracy": 0.7445756837725639, + "num_tokens": 3527634.0, + "step": 570 + }, + { + "entropy": 1.0065228387713432, + "epoch": 0.8814589665653495, + "grad_norm": 0.2470703125, + "learning_rate": 9.729356050945271e-06, + "loss": 1.0059442520141602, + "mean_token_accuracy": 0.7528380408883095, + "num_tokens": 3589816.0, + "step": 580 + }, + { + "entropy": 1.026823963969946, + "epoch": 0.8966565349544073, + "grad_norm": 0.2265625, + "learning_rate": 7.4512800836440525e-06, + "loss": 1.0102774620056152, + "mean_token_accuracy": 0.7461944825947284, + "num_tokens": 3656572.0, + "step": 590 + }, + { + "entropy": 1.041152635589242, + "epoch": 0.9118541033434651, + "grad_norm": 0.2333984375, + "learning_rate": 5.466487218911942e-06, + "loss": 1.0332704544067384, + "mean_token_accuracy": 0.7453692108392715, + "num_tokens": 3720528.0, + "step": 600 + }, + { + "entropy": 1.0129390254616737, + "epoch": 0.9270516717325228, + "grad_norm": 0.259765625, + "learning_rate": 3.7812671840390835e-06, + "loss": 1.0002843856811523, + "mean_token_accuracy": 0.7472102656960488, + "num_tokens": 3784254.0, + "step": 610 + }, + { + "entropy": 1.0242063857614994, + "epoch": 0.9422492401215805, + "grad_norm": 0.265625, + "learning_rate": 2.4009603722884742e-06, + "loss": 0.9934074401855468, + "mean_token_accuracy": 0.7479867108166218, + "num_tokens": 3845304.0, + "step": 620 + }, + { + "entropy": 1.023862723633647, + "epoch": 0.9574468085106383, + "grad_norm": 0.20703125, + "learning_rate": 1.3299409194084122e-06, + "loss": 1.0128738403320312, + "mean_token_accuracy": 
0.7516457572579384, + "num_tokens": 3910188.0, + "step": 630 + }, + { + "entropy": 1.0282081000506877, + "epoch": 0.9726443768996961, + "grad_norm": 0.2001953125, + "learning_rate": 5.716028421755671e-07, + "loss": 1.0188997268676758, + "mean_token_accuracy": 0.7448701910674572, + "num_tokens": 3973174.0, + "step": 640 + }, + { + "entropy": 1.0912907514721155, + "epoch": 0.9878419452887538, + "grad_norm": 0.275390625, + "learning_rate": 1.2834928289472416e-07, + "loss": 1.1018651962280273, + "mean_token_accuracy": 0.7346004512161016, + "num_tokens": 4038046.0, + "step": 650 + } + ], + "logging_steps": 10, + "max_steps": 658, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.882623888257843e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-658/training_args.bin b/checkpoint-658/training_args.bin new file mode 100644 index 0000000..0b04afc --- /dev/null +++ b/checkpoint-658/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b208f49cf3711ad5619acc29e443e47bf9b2a4776fd6c484fa9d460411c698af +size 5649 diff --git a/qforge-f16.gguf b/qforge-f16.gguf new file mode 100644 index 0000000..c91951b --- /dev/null +++ b/qforge-f16.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f16c7b8df4143ff7b8a0803351983846054febe939086cd5634670c38b19c78 +size 6178316640 diff --git a/qforge-q4.gguf b/qforge-q4.gguf new file mode 100644 index 0000000..df2bde5 --- /dev/null +++ b/qforge-q4.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d0b4f7f6b3c0fde449ffdefee75f3c7a53de019fa4a2986b2475767132ff6d2 +size 1929902432 diff --git a/qforge-q5.gguf b/qforge-q5.gguf new file mode 100644 
index 0000000..4f954bc --- /dev/null +++ b/qforge-q5.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73480ddbc9398460b8dcd890f71a996bdce419afb793d0b550e2db4e6f1f6733 +size 2224814432 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..34510ff --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..18ddaff --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|im_end|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +}