---
# Flat key/value settings for a single model-training run; run_name and
# model_name below identify the run and the base checkpoint.
# Keys are kept in alphabetical order, one scalar value per key.
# NOTE(review): many key names match Hugging Face TrainingArguments, while
# alpha, margin, temperature and early_stopping* look script-specific —
# confirm against the training script that consumes this file.
alpha: 0.7
attn_implementation: flash_attention_2
bf16: true
dataloader_num_workers: 0
dataloader_pin_memory: true
# NOTE(review): early stopping is enabled while load_best_model_at_end is
# false further down; confirm that keeping the last (not the best) weights
# at the end of training is intended.
early_stopping: true
early_stopping_patience: 3
early_stopping_threshold: 0.0
# Absolute, machine-specific path.
eval_data: /data/vibe_exp/dia-guard/dataset/dia_splits/val.jsonl
eval_steps: 200
eval_strategy: steps
gradient_accumulation_steps: 1
gradient_checkpointing: true
learning_rate: 5.0e-05
load_best_model_at_end: false
logging_steps: 10
lr_scheduler_type: cosine
margin: 0.3
max_grad_norm: 1.0
max_seq_length: 2048
metric_for_best_model: eval_loss
model_name: google/gemma-3-270m-it
num_epochs: 3
# Absolute, machine-specific path.
output_dir: /data/vibe_exp/dia-guard/models/group3_student_ft_baseline/full_ft/gemma_3_270m_it
per_device_eval_batch_size: 256
per_device_train_batch_size: 256
report_to: wandb
run_name: gemma-3-270m-ce-ft
# NOTE(review): save_steps (500) is not a multiple of eval_steps (200). If
# load_best_model_at_end is ever switched on under Hugging Face Trainer, the
# two intervals presumably need to line up — verify before enabling.
save_steps: 500
save_strategy: steps
save_total_limit: 3
temperature: 0.05
tf32: true
# Absolute, machine-specific path.
train_data: /data/vibe_exp/dia-guard/dataset/dia_splits/train.jsonl
trust_remote_code: false
use_liger_kernel: true
# NOTE(review): 4218 warmup steps is large for a per-device batch of 256 over
# 3 epochs; confirm it does not exceed the total number of optimizer steps.
warmup_steps: 4218
weight_decay: 0.01