{
  "model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "repo_root": "/mnt/nlp/scratch/home/kopidaki/ThinkPRM/",
  "hf_dataset": "launch/thinkprm-1K-verification-cots",
  "hf_split": "train",
  "eval_hf_split": null,
  "eval_split_ratio": 0.0,
  "output_dir": "/mnt/nlp/scratch/home/kopidaki/outputs/thinkprm-full-trl",
  "num_train_epochs": 3.0,
  "learning_rate": 6e-05,
  "lr_scheduler_type": "constant",
  "warmup_ratio": 0.0,
  "per_device_train_batch_size": 1,
  "per_device_eval_batch_size": 8,
  "gradient_accumulation_steps": 8,
  "max_length": 4096,
  "dataloader_num_workers": 0,
  "eval_strategy": "no",
  "save_strategy": "steps",
  "save_steps": 500,
  "save_total_limit": null,
  "logging_steps": 10,
  "report_to": [
    "wandb"
  ],
  "seed": 42,
  "gradient_checkpointing": true
}