From 3ab2d41b2666b867798979c6d268148cab6cd8d9 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 15 Jun 2026 23:56:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: SantiagoC/palindrome-grpo Source: Original Platform --- .gitattributes | 36 ++++++ README.md | 90 ++++++++++++++ chat_template.jinja | 54 ++++++++ config.json | 57 +++++++++ generation_config.json | 13 ++ model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 32 +++++ train_grpo.py | 272 +++++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 10 files changed, 563 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_grpo.py create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..885edd7 --- /dev/null +++ b/README.md @@ -0,0 +1,90 @@ +--- +base_model: Qwen/Qwen2.5-0.5B-Instruct +library_name: transformers +model_name: palindrome-grpo +tags: +- generated_from_trainer +- hf_jobs +- trl +- trackio:https://SantiagoC-mlintern-palindrm.hf.space?project=palindrome-grpo&runs=palindrome-grpo-v1&sidebar=collapsed +- grpo +- ml-intern +licence: license +--- + +# Model Card for palindrome-grpo + +This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="SantiagoC/palindrome-grpo", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + +[Visualize in Trackio](https://SantiagoC-mlintern-palindrm.hf.space?project=palindrome-grpo&runs=palindrome-grpo-v1&sidebar=collapsed) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 1.3.0 +- Transformers: 5.8.0 +- Pytorch: 2.11.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` + + +## Generated by ML Intern + +This model repository was generated by [ML Intern](https://github.com/huggingface/ml-intern), an agent for machine learning research and development on the Hugging Face Hub. + +- Try ML Intern: https://smolagents-ml-intern.hf.space +- Source code: https://github.com/huggingface/ml-intern + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = 'SantiagoC/palindrome-grpo' +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) +``` + +For non-causal architectures, replace `AutoModelForCausalLM` with the appropriate `AutoModel` class. diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..bdf7919 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/config.json b/config.json new file mode 100644 index 0000000..b227eda --- /dev/null +++ b/config.json @@ -0,0 +1,57 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 896, + "initializer_range": 0.02, + "intermediate_size": 4864, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 14, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.8.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..3d7646a --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.8.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..9c2d2eb --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe93aaae15122a73eef519e5c0809deca51d59a88efd7b10fa41350ce134daa +size 1976163472 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..34510ff --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..4d8760d --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,32 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "truncation_side": "left", + "unk_token": null +} diff --git a/train_grpo.py b/train_grpo.py new file mode 100644 index 0000000..f161bd3 --- /dev/null +++ b/train_grpo.py @@ -0,0 +1,272 @@ +""" +GRPO training script for palindrome generation. + +Trains a small LLM (Qwen2.5-0.5B-Instruct) to generate palindromes +given a theme using Group Relative Policy Optimization. + +Reward function: + 1. Palindrome accuracy (continuous partial-credit: proportion of matching + mirrored character pairs) + 2. Length bonus (longer palindromes = more reward) + 3. Theme relevance (keyword overlap between theme and generated text) + +reward = α * palindrome_accuracy + β * length_bonus + γ * theme_relevance +""" +import re +import os +from datasets import load_dataset, Dataset +from trl import GRPOConfig, GRPOTrainer +from transformers import TrainerCallback +import trackio + + +# ── Reward function ───────────────────────────────────────────────────── + +def _normalize(text: str) -> str: + """Strip non-alphanumeric and lowercase.""" + return re.sub(r'[^a-zA-Z0-9]', '', text).lower() + + +def _palindrome_accuracy(text: str) -> float: + """ + Continuous palindrome score: proportion of matching mirrored pairs. + + Perfect palindrome → 1.0 + "ractar" (close to racecar) → partial credit + Empty / single char → 0.0 or 1.0 respectively + """ + cleaned = _normalize(text) + n = len(cleaned) + if n == 0: + return 0.0 + if n == 1: + return 1.0 + + matches = 0 + pairs = n // 2 + for i in range(pairs): + if cleaned[i] == cleaned[n - 1 - i]: + matches += 1 + return matches / pairs + + +def _length_bonus(text: str, max_len: int = 80) -> float: + """ + Normalized length reward: longer palindromes score higher. + Capped at max_len characters. + """ + cleaned = _normalize(text) + return min(len(cleaned), max_len) / max_len + + +def _theme_relevance(text: str, theme: str) -> float: + """ + Simple keyword overlap: does the palindrome contain the theme word + or substrings of it? Returns 0.0 - 1.0. + """ + text_lower = _normalize(text) + theme_clean = _normalize(theme) + + if len(theme_clean) == 0: + return 0.0 + + # Exact match of the full theme word + if theme_clean in text_lower: + return 1.0 + + # Partial match: theme word split into n-grams + if len(theme_clean) > 3: + bigrams = set(theme_clean[i:i + 2] for i in range(len(theme_clean) - 1)) + match_count = sum(1 for bg in bigrams if bg in text_lower) + return match_count / len(bigrams) + + return 0.0 + + +def palindrome_reward( + prompts, + completions, + completion_ids, + theme=None, + **kwargs, +) -> list[float]: + """ + GRPO reward function for palindrome generation. + + Args: + prompts: list[list[dict]] — conversational prompts + completions: list[list[dict]] — generated completions + completion_ids: list[list[int]] — token IDs + theme: list[str] — theme column from dataset (forwarded via **kwargs) + + Returns: + list[float] — one reward per sample + """ + # Reward weights (tunable) + alpha = 0.6 # palindrome accuracy weight + beta = 0.25 # length bonus weight + gamma = 0.15 # theme relevance weight + + rewards = [] + for i, completion in enumerate(completions): + # Extract text from conversational format + if isinstance(completion, list): + text = completion[0]["content"].strip() + else: + text = str(completion).strip() + + acc = _palindrome_accuracy(text) + length = _length_bonus(text) + + # Theme from dataset column + theme_text = "" + if theme is not None: + # theme list is flattened: [t*n_generations for t in batch_themes] + theme_text = theme[i] if i < len(theme) else "" + + theme_rel = _theme_relevance(text, theme_text) if theme_text else 0.0 + + reward = alpha * acc + beta * length + gamma * theme_rel + rewards.append(reward) + + return rewards + + +# ── Trackio alert callback ────────────────────────────────────────────── + +class PalindromeAlertCallback(TrainerCallback): + """Log Trackio alerts on key training events.""" + + def on_train_begin(self, args, state, control, **kwargs): + trackio.alert( + "Training started", + f"Model: Qwen/Qwen2.5-0.5B-Instruct | " + f"num_generations={args.num_generations} | " + f"temperature={args.temperature} | " + f"lr={args.learning_rate} | " + f"batch_size={args.per_device_train_batch_size}×" + f"{args.gradient_accumulation_steps}", + level="INFO", + ) + + def on_train_end(self, args, state, control, **kwargs): + trackio.alert( + "Training complete", + f"Model pushed to {args.hub_model_id}", + level="INFO", + ) + + def on_log(self, args, state, control, logs=None, **kwargs): + if logs is None: + return + + # Reward monitoring + reward = logs.get("reward") + if reward is not None: + if reward < 0.1 and state.global_step > 50: + trackio.alert( + "Low reward", + f"reward={reward:.4f} at step {state.global_step} — " + "model not producing palindromes yet. Consider increasing " + "temperature for more exploration.", + level="WARN", + ) + elif 0.1 <= reward < 0.5 and state.global_step > 50: + trackio.alert( + "Moderate reward", + f"reward={reward:.4f} at step {state.global_step} — " + "model starting to learn. Continue training.", + level="INFO", + ) + elif reward >= 0.7: + trackio.alert( + "High reward milestone", + f"reward={reward:.4f} at step {state.global_step} — " + "model producing good palindromes!", + level="INFO", + ) + + # KL divergence monitoring + kl = logs.get("kl") + if kl is not None and kl > 0.5: + trackio.alert( + "High KL divergence", + f"kl={kl:.4f} at step {state.global_step} — " + "policy diverging too far. Consider lowering learning rate.", + level="WARN", + ) + + # Loss monitoring + loss = logs.get("loss") + if loss is not None: + if loss > 10: + trackio.alert( + "High loss", + f"loss={loss:.4f} at step {state.global_step} — " + "loss spike detected. Check for instability, " + "consider reducing learning rate by 0.5×.", + level="WARN", + ) + + +# ── Main ──────────────────────────────────────────────────────────────── + +def main(): + # Load dataset + dataset_path = os.getenv("DATASET_PATH", "SantiagoC/palindrome-prompts") + dataset = load_dataset(dataset_path, split="train") + + print(f"Dataset size: {len(dataset)} samples") + print(f"Columns: {dataset.column_names}") + print(f"First prompt: {dataset[0]['prompt']}") + + # Training config + training_args = GRPOConfig( + output_dir="./palindrome-grpo-output", + # ── GRPO specific ── + num_generations=4, + max_completion_length=128, + temperature=0.9, + beta=0.0, # No reference model KL penalty + scale_rewards=False, # Dr.GRPO recommendation + loss_type="dapo", + mask_truncated_completions=True, + # ── Standard training ── + learning_rate=1e-6, + num_train_epochs=6, + per_device_train_batch_size=2, + gradient_accumulation_steps=4, + bf16=True, + gradient_checkpointing=True, + # ── Logging ── + logging_steps=10, + logging_first_step=True, + save_steps=100, + save_total_limit=3, + disable_tqdm=True, + report_to="trackio", + # ── Hub push ── + push_to_hub=True, + hub_model_id=os.getenv("HUB_MODEL_ID", "SantiagoC/palindrome-grpo"), + run_name=os.getenv("RUN_NAME", "palindrome-grpo-v1"), + project="palindrome-grpo", + trackio_space_id=os.getenv("TRACKIO_SPACE_ID", "SantiagoC/mlintern-palindrm"), + ) + + trainer = GRPOTrainer( + model="Qwen/Qwen2.5-0.5B-Instruct", + reward_funcs=palindrome_reward, + args=training_args, + train_dataset=dataset, + callbacks=[PalindromeAlertCallback()], + ) + + trainer.train() + + # Final push + trainer.save_model() + trainer.push_to_hub() + + +if __name__ == "__main__": + main() diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..4efd286 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b6b4e5d0e7fab9407ca337c0d19353e218e56a19a3d70b81e0333423d618ca +size 7313