seed: 1
exp_name: llama3-8B-sft
train_datasets:
- yunjae-won/Qwen3-30B-MagpieLM-SFT-Outputs-v0.1-shard0
test_datasets:
- yunjae-won/Qwen3-30B-MagpieLM-SFT-Outputs-v0.1-shard0
debug: false
wandb:
  enabled: true
  entity: null
  project: KD
cache_dir: .cache/
local_run_dir: .cache//llama3-8B-sft
do_first_eval: true
minimum_log_interval_secs: 1.0
intermediate_checkpoints: false
trainer: BasicTrainer
template_tokens: []
lr: 5.0e-06
n_epochs: 1
n_examples: null
n_eval_examples: 512
eval_every: 19968
save_every: 5120
step_scheduler_with_optimizer: false
optimizer: RMSprop
weight_decay: 0
beta1: 0.9
beta2: 0.999
eps: 1.0e-05
warmup: 0.1
cache_reference_logprobs: false
load_reference_logprobs: null
humanline: false
log_epsilon_P: -1.0
log_epsilon_R: 1.5
online: false
frac_unique_desirable: 1.0
frac_unique_undesirable: 1.0
model:
  name_or_path: meta-llama/Llama-3.1-8B
  tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
  load_from: null
  from_checkpoint: null
  block_name: LlamaDecoderLayer
  policy_dtype: bfloat16
  reference_dtype: bfloat16
  max_grad_norm: 10.0
  v_head_max_grad_norm: 0.1
  max_length: 4096
  max_prompt_length: 2048
  activation_checkpointing: false
  batch_size: 256
  microbatch_size: 2.0
  gradient_accumulation_steps: 32
  eval_batch_size: 256
  eval_microbatch_size: 2.0
  attn_implementation: flash_attention_2
  use_peft: false
  load_lora_from: null
  peft:
    lora_r: 64
    lora_alpha: 256
    lora_dropout: 0.05
    target_modules: all-linear
reward_model:
  path: null
  model_class: AutoModelForBradleyTerry
  dtype: float32
  attn_implementation: flash_attention_2
loss:
  trainer: SFTTrainer
  dataloader: SFTDataLoader
  sync_reference: false
  num_epochs: 1
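# Sanity check on the batch settings above, assuming the usual decomposition
# of the global batch into per-device microbatches (the exact relation used by
# the trainer is an assumption here, not spelled out in this file):
#   batch_size = microbatch_size * gradient_accumulation_steps * num_data_parallel_workers
#   256        = 2.0             * 32                           * 4
# Under that assumption, this config is sized for a 4-GPU run; on a different
# world size, gradient_accumulation_steps would need rescaling to keep the
# global batch_size of 256 fixed.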