---
# Training configuration for the run named by `run_name`.
# One key per line: a YAML plain scalar may not contain ": ", so the mapping
# cannot be written as a single line of space-separated pairs.

# --- Model ---
model_name: google/gemma-3-270m-it
attn_implementation: flash_attention_2
trust_remote_code: false
use_liger_kernel: true
max_seq_length: 2048

# --- Data ---
train_data: /data/vibe_exp/dia-guard/dataset/dia_splits/train.jsonl
eval_data: /data/vibe_exp/dia-guard/dataset/dia_splits/val.jsonl
dataloader_num_workers: 0
dataloader_pin_memory: true

# --- Output and experiment tracking ---
output_dir: /data/vibe_exp/dia-guard/models/group3_student_ft_baseline/full_ft/gemma_3_270m_it
run_name: gemma-3-270m-ce-ft
report_to: wandb
logging_steps: 10

# --- Loss hyperparameters ---
# NOTE(review): how alpha / margin / temperature enter the loss is defined by
# the training script, which is not visible from this file — confirm there.
alpha: 0.7
margin: 0.3
temperature: 0.05

# --- Optimisation ---
num_epochs: 3
per_device_train_batch_size: 256
per_device_eval_batch_size: 256
gradient_accumulation_steps: 1
gradient_checkpointing: true
# Keep the decimal point and the signed exponent: some YAML 1.1 loaders
# (e.g. PyYAML) resolve a bare `5e-5` as a string rather than a float.
learning_rate: 5.0e-05
lr_scheduler_type: cosine
warmup_steps: 4218
weight_decay: 0.01
max_grad_norm: 1.0

# --- Numeric precision ---
bf16: true
tf32: true

# --- Evaluation ---
eval_strategy: steps
eval_steps: 200
metric_for_best_model: eval_loss

# --- Early stopping ---
# NOTE(review): early stopping is enabled while load_best_model_at_end (below)
# is false. If these keys are forwarded to Hugging Face's Trainer and
# EarlyStoppingCallback, that callback expects load_best_model_at_end: true
# (it asserts or warns depending on the transformers version) — verify how
# the training script handles this combination.
early_stopping: true
early_stopping_patience: 3
early_stopping_threshold: 0.0

# --- Checkpointing ---
load_best_model_at_end: false
save_strategy: steps
# NOTE(review): 500 is not a multiple of eval_steps (200). Hugging Face
# TrainingArguments rejects that pairing when load_best_model_at_end is true,
# so align the two values if that flag is ever switched on.
save_steps: 500
save_total_limit: 3