---
# Full-parameter "wspo" preference-training run for the 7B Qwen2 chat model.
# Keys are grouped by purpose; the grouping is cosmetic and does not affect
# the parsed mapping (YAML mappings are unordered).

# --- Models ---------------------------------------------------------------
# The model being trained, plus a weak base / weak aligned pair.
# NOTE(review): presumably the weak pair supplies the alignment signal that
# is transferred to the larger model; confirm against the wspo trainer.
model_name_or_path: wh-zhu/qwen2_7B-ultrachat200k
weak_base_model: wh-zhu/qwen2_1.5B-ultrachat200k
weak_aligned_model: wh-zhu/qwen2_1.5B-ultrachatfeedback-dpo

# --- Method ---------------------------------------------------------------
stage: wspo
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z3_config.json
pref_beta: 0.1
pref_loss: margin_mse_self_wspo
pref_ftx: 0.0
self_wspo_negative_strategy: in_batch_random

# --- Dataset --------------------------------------------------------------
dataset: ultrafeedback
template: qwen
cutoff_len: 2048
max_samples: 268694
overwrite_cache: true
preprocessing_num_workers: 16

# --- Output and logging ---------------------------------------------------
# output_dir and run_name share the same experiment tag; keep them in sync
# when cloning this file for a new run.
output_dir: /outputs/qwen2_7b_wspo_full_train_full_E1
overwrite_output_dir: true
run_name: qwen2_7b_wspo_full_train_full_E1
logging_steps: 10
save_steps: 800
plot_loss: true
report_to:
  - wandb

# --- Optimisation ---------------------------------------------------------
# 2 samples per device x 4 accumulation steps per optimizer update.
per_device_train_batch_size: 2
gradient_accumulation_steps: 4
learning_rate: 1.0e-06
num_train_epochs: 1
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
dataloader_drop_last: true
ddp_timeout: 180000000
ddp_find_unused_parameters: false

# --- Evaluation -----------------------------------------------------------
# 'no' stays quoted: a bare no is read as boolean false by YAML 1.1 loaders.
# NOTE(review): eval_steps and per_device_eval_batch_size look inert while
# do_eval is false, val_size is 0.0 and eval_strategy is 'no'; confirm
# before relying on them.
do_eval: false
val_size: 0.0
eval_strategy: 'no'
eval_steps: 1000
per_device_eval_batch_size: 1

# --- Hugging Face Hub upload ----------------------------------------------
# NOTE(review): the repo name ends in "-adapter" although finetuning_type is
# full; confirm the name is intended before publishing.
push_to_hub: true
hub_model_id: pltops/qwen2_7B-ultrachatfeedback-self-wspo-full_E1-adapter
hub_strategy: end