The following hyperparameters were used during training:
learning_rate: 5e-07
eta: 1000
per_device_train_batch_size: 8
gradient_accumulation_steps: 1
seed: 42
distributed_type: deepspeed_zero3
num_devices: 8
optimizer: RMSProp
lr_scheduler_type: linear
lr_scheduler_warmup_ratio: 0.1
num_train_epochs: 18.0 (configured value; training was stopped at epoch 1.0)
Citation (BibTeX):
@misc{wu2024self,
title={Self-Play Preference Optimization for Language Model Alignment},
author={Wu, Yue and Sun, Zhiqing and Yuan, Huizhuo and Ji, Kaixuan and Yang, Yiming and Gu, Quanquan},
year={2024},
eprint={2405.00675},
archivePrefix={arXiv},
primaryClass={cs.LG}
}