# Qwen Fine-tuning Configuration with Unsloth
# Optimized for 20GB VRAM — based on config.yaml with readability improvements

# Model Configuration
model:
  # Base checkpoint to fine-tune (a Qwen2.5 1.5B instruct build).
  # Larger alternatives noted by the original author: Qwen3-4B, Qwen3-8B,
  # Qwen3-14B, Qwen3-32B. These are short names from a different model
  # family — look up the full repo id before swapping one in.
  model_name: "unsloth/Qwen2.5-1.5B-Instruct"
  # Maximum sequence length in tokens
  # (avg input ~2054 tokens; a 2048 limit truncated 38% of samples)
  max_seq_length: 4096
  dtype: null  # null = let the loader auto-detect the best dtype
  load_in_4bit: true  # QLoRA — 4-bit base model required for 20GB VRAM
  device_map: "auto"  # Automatic device mapping

  # LoRA Configuration
  # Rank raised from 16 — a wider subspace helps the model learn synthesis
  # over verbatim copying
  lora_r: 64
  lora_alpha: 64  # Kept equal to r because RSLoRA is enabled below
  # No dropout — only 461 samples, i.e. ~87 optimizer steps at the current
  # effective batch of 16 over 3 epochs (single GPU assumed); every gradient counts
  lora_dropout: 0.0
  lora_targets:  # Modules that receive LoRA adapters (attention + MLP projections)
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
  # Rank-stabilized LoRA: scales the adapter by alpha/sqrt(r) instead of alpha/r,
  # which keeps update magnitudes stable at higher ranks such as r=64
  use_rslora: true
  random_state: 42  # Random seed for adapter initialization / reproducibility
  loftq_config: null  # null = no LoftQ configuration supplied

# Dataset Configuration
dataset:
  type: "local"  # Options: "huggingface", "local"
  name: "mixed_dataset"  # Hugging Face dataset name (only read if type is "huggingface")
  path: "train_dataset.json"  # Local training file (only read if type is "local")
  # Held-out evaluation file — needed so load_best_model_at_end can compare checkpoints
  eval_path: "eval_dataset.json"
  split: "train"  # Dataset split to use
  text_column: "messages"  # Column name containing conversations
  use_chat_template: true  # Apply chat template formatting
  # Preference-pair datasets; presumably read only when training.mode is
  # "dpo" or "orpo" — confirm against the loader
  dpo_train_path: "dpo_train_dataset.json"
  dpo_eval_path: "dpo_eval_dataset.json"

# Training Configuration
training:
  mode: "sft"  # Options: "sft", "dpo", "orpo"

  # Batch size and accumulation
  # Batch of 1 avoids OOM with the configured 4096-token sequences
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 16  # effective batch size = 1 x 16 = 16

  # Learning rate and schedule
  # 2e-5 — original rationale: RSLoRA stability allows a higher LR than the r=16 config.
  # NOTE(review): dpo.dpo_learning_rate (5e-5) is higher than this value even though
  # it is described as "lower than SFT" — confirm which of the two is intended.
  learning_rate: 0.00002
  lr_scheduler_type: "cosine"  # Learning rate scheduler
  # ~23% of the ~87 optimizer steps implied by 461 samples, effective batch 16 and
  # 3 epochs (single GPU assumed).
  # NOTE(review): the earlier note said "11% of ~174 steps", which matches an
  # accumulation of 8 — lower this to ~10 if an ~11% warmup is still wanted.
  warmup_steps: 20
  weight_decay: 0.01  # Weight decay

  # Training duration
  num_train_epochs: 3  # Number of training epochs
  max_steps: -1  # Maximum training steps (-1 for full epochs)

  # Precision and optimization
  fp16: false  # Use BF16 instead (Ampere GPU assumed)
  bf16: true   # BF16 — larger dynamic range than FP16, no overflow risk
  optimizer: "adamw_8bit"  # 8-bit optimizer — keeps optimizer states within 20GB budget

  # Logging and saving
  logging_steps: 1  # Log every optimizer step
  # Checkpoint every 50 steps.
  # NOTE(review): with ~87 total steps this yields a single intermediate checkpoint
  # (step 50), so best-checkpoint selection has little to choose from — confirm.
  save_steps: 50
  save_total_limit: 2  # Maximum number of saved checkpoints

  # Output directory
  output_dir: "./qwen_finetuned_4096_20gb"  # Output directory for model and checkpoints

  # Data processing
  dataset_num_proc: 2  # Number of processes for dataset processing
  dataloader_num_workers: 0  # Number of dataloader workers (0 = load in the main process)
  packing: false  # Must be false when using train_on_responses_only

  # Reporting
  report_to: []  # Reporting services (wandb, tensorboard, etc.); empty list = none

  # Model saving format
  save_method: "merged_16bit"  # Options: "lora", "merged_16bit", "merged_4bit"
  # Also export GGUF for Ollama. Options: q4_k_m, q5_k_m, q8_0, f16. Set to null to skip.
  gguf_quantization: "q5_k_m"

  # Reproducibility
  seed: 42  # Random seed

# DPO / ORPO Configuration
dpo:
  beta: 0.1             # KL penalty coefficient for DPO (standard starting point)
  orpo_lambda: 0.1      # ORPO odds-ratio weight (same scale as DPO beta)
  # Learning rate for the preference-tuning stage (5e-5); DPO is sensitive to overshooting.
  # NOTE(review): originally described as "lower LR than SFT", but 5e-5 is higher
  # than training.learning_rate (2e-5) — confirm which value is intended.
  dpo_learning_rate: 0.00005

# Weights & Biases Configuration
use_wandb: false  # Master switch for W&B logging (off)
wandb:
  # Project the run is filed under
  project: "qwen-finetuning"
  # Display name of the run.
  # NOTE(review): the name says "dpo-stage2" while training.mode is "sft" — confirm.
  run_name: "qwen-dpo-stage2"
  # Labels attached to the run
  tags:
    - "qwen"
    - "unsloth"
    - "lora"

# Hardware-specific configurations
hardware:
  # Per-GPU-memory profiles. Each repeats keys from `model` / `training`;
  # presumably they override those sections when a profile is selected —
  # confirm against the loader.
  #
  # The model id was "unsloth/Qwen1.5-3B", but Qwen1.5 was never released in a
  # 3B size. It now points at the Qwen2.5 3B instruct checkpoint, the same
  # family and variant as model.model_name.
  #
  # Every profile has an effective batch size of 8 (batch x accumulation).
  gpu_16gb:
    model_name: "unsloth/Qwen2.5-3B-Instruct"
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 4
    max_seq_length: 2048

  gpu_24gb:
    model_name: "unsloth/Qwen2.5-3B-Instruct"
    per_device_train_batch_size: 4
    gradient_accumulation_steps: 2
    max_seq_length: 4096

  # NOTE(review): this profile uses a smaller per-device batch than gpu_24gb
  # at the same sequence length — confirm the two were not swapped.
  gpu_40gb:
    model_name: "unsloth/Qwen2.5-3B-Instruct"
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 4
    max_seq_length: 4096

# Evaluation Configuration (optional)
evaluation:
  # Run evaluation every 50 optimizer steps. Matches training.save_steps so each
  # checkpoint has an eval result; if these are forwarded to Hugging Face
  # TrainingArguments, load_best_model_at_end requires save_steps to be a
  # multiple of eval_steps.
  eval_steps: 50
  # Metric used to rank checkpoints; presumably resolved to the evaluation loss
  # (lower is better) by the trainer — confirm
  metric_for_best_model: "loss"
  load_best_model_at_end: true  # Reload the best-scoring checkpoint when training ends
  # Keep only 2 checkpoints to save disk.
  # NOTE(review): duplicates training.save_total_limit — verify which one the loader reads.
  save_total_limit: 2