# qwen2.5-1.5b-slips-immune-risk/config.yaml

# Qwen Fine-tuning Configuration with Unsloth
# Risk analysis (cause + risk combined adapter) — Qwen2.5-1.5B-Instruct, 4096 seq_len, 20GB VRAM

# Model Configuration
# Base checkpoint, load-time options, and LoRA adapter hyperparameters.
model:
  # Target deployment model (RPi5).
  model_name: 'unsloth/Qwen2.5-1.5B-Instruct'
  # Token budget: 3500 DAG tokens + 183 prompt overhead + ~400 response tokens.
  max_seq_length: 4096
  # null = auto-detect the best dtype.
  dtype: null
  # QLoRA: 4-bit base model, required for the 20GB VRAM budget.
  load_in_4bit: true
  # Automatic device mapping.
  device_map: 'auto'

  # LoRA Configuration
  # Rank increased from 16; a wider subspace helps the model learn synthesis
  # over verbatim copying.
  lora_r: 64
  # Kept equal to lora_r because RSLoRA is enabled.
  lora_alpha: 64
  # No dropout: with few samples and optimizer steps, every gradient counts.
  # NOTE(review): the earlier note cited 461 samples / ~174 optimizer steps,
  # while dataset.path is annotated with 1328 records — confirm which is current.
  lora_dropout: 0.0
  # Target modules for LoRA.
  lora_targets:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'o_proj'
    - 'gate_proj'
    - 'up_proj'
    - 'down_proj'
  # Mandatory at r=64 to normalize gradient scaling.
  use_rslora: true
  # Random seed for reproducibility.
  random_state: 42
  # LoftQ configuration.
  loftq_config: null

# Dataset Configuration
# Locations of the SFT and DPO train/eval files and how records are read.
dataset:
  # Options: "huggingface", "local".
  type: 'local'
  # Hugging Face dataset name (if type is huggingface).
  name: 'mixed_dataset'
  # Combined interleaved cause+risk train split (1328 records).
  path: 'risk_combined_train_dataset.json'
  # Combined interleaved cause+risk eval split (148 records).
  eval_path: 'risk_combined_eval_dataset.json'
  # Dataset split to use.
  split: 'train'
  # Column name containing conversations.
  text_column: 'messages'
  # Apply chat template formatting.
  use_chat_template: true
  # DPO train/eval files; presumably read only when training.mode is
  # "dpo" or "orpo" — TODO confirm against the training script.
  dpo_train_path: 'dpo_train_dataset.json'
  dpo_eval_path: 'dpo_eval_dataset.json'

# Training Configuration
# Trainer hyperparameters, checkpointing, and export settings.
training:
  # Options: "sft", "dpo", "orpo".
  mode: 'sft'

  # Batch size and accumulation
  # batch=1 to avoid OOM at long sequence lengths.
  # NOTE(review): the earlier note cited 8192 seq len, but
  # model.max_seq_length is 4096 in this file — confirm.
  per_device_train_batch_size: 1
  # Effective batch size = 1 x 16 = 16.
  gradient_accumulation_steps: 16

  # Learning rate and schedule
  # 2e-5; RSLoRA stability allows a higher LR than the r=16 config.
  learning_rate: 0.00002
  # Learning rate scheduler.
  lr_scheduler_type: 'cosine'
  # Earlier note: 11% of ~174 steps.
  # NOTE(review): 1328 train records x 3 epochs / effective batch 16 is
  # ~249 steps, which would make 20 warmup steps ~8% — confirm.
  warmup_steps: 20
  # Weight decay.
  weight_decay: 0.01

  # Training duration
  # Number of training epochs.
  num_train_epochs: 3
  # Maximum training steps (-1 for full epochs).
  max_steps: -1

  # Precision and optimization
  # FP16 off; BF16 is used instead (Ampere GPU assumed).
  fp16: false
  # BF16: larger dynamic range than FP16, no overflow risk.
  bf16: true
  # 8-bit optimizer keeps optimizer states within the 20GB budget.
  optimizer: 'adamw_8bit'

  # Logging and saving
  # Logging frequency.
  logging_steps: 1
  # Model save frequency.
  save_steps: 50
  # Maximum number of saved checkpoints.
  save_total_limit: 2

  # Output directory for model and checkpoints.
  output_dir: './qwen_risk_finetuned'

  # Data processing
  # Number of processes for dataset processing.
  dataset_num_proc: 2
  # Number of dataloader workers.
  dataloader_num_workers: 0
  # Must be false when using train_on_responses_only.
  packing: false

  # Reporting services (wandb, tensorboard, etc.); empty list = none.
  report_to: []

  # Model saving format
  # Options: "lora", "merged_16bit", "merged_4bit".
  save_method: 'merged_16bit'
  # Also export GGUF for Ollama.
  # Options: q4_k_m, q5_k_m, q8_0, f16. Set to null to skip.
  gguf_quantization: 'q5_k_m'

  # Reproducibility: random seed.
  seed: 42

# DPO / ORPO Configuration
# Hyperparameters for the "dpo" and "orpo" training modes.
dpo:
  # KL penalty coefficient for DPO (standard starting point).
  beta: 0.1
  # ORPO odds-ratio weight (same scale as DPO beta).
  orpo_lambda: 0.1
  # Intended to be lower than the SFT LR because DPO is sensitive to overshooting.
  # NOTE(review): 0.00005 (5e-5) is HIGHER than training.learning_rate
  # 0.00002 (2e-5) — confirm which of the two values is intended.
  dpo_learning_rate: 0.00005

# Weights & Biases Configuration
# Enable W&B logging.
use_wandb: false
wandb:
  # W&B project name.
  project: 'qwen-finetuning'
  # W&B run name.
  # NOTE(review): the name mentions DPO stage 2 while training.mode is
  # "sft" — confirm before enabling W&B.
  run_name: 'qwen-dpo-stage2'
  # W&B tags.
  tags:
    - 'qwen'
    - 'unsloth'
    - 'lora'

# Hardware-specific configurations
# Presets for different GPU memory sizes. Every preset yields an effective
# batch size of 8 (per_device_train_batch_size x gradient_accumulation_steps).
# How a preset is selected is not visible in this file.
#
# model_name was "unsloth/Qwen1.5-3B" in all three presets; the Qwen1.5
# family has no 3B release, so it is corrected to the Qwen2.5 3B checkpoint,
# the same family as model.model_name.
hardware:
  gpu_16gb:
    model_name: "unsloth/Qwen2.5-3B"
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 4
    max_seq_length: 2048

  gpu_24gb:
    model_name: "unsloth/Qwen2.5-3B"
    per_device_train_batch_size: 4
    gradient_accumulation_steps: 2
    max_seq_length: 4096

  gpu_40gb:
    model_name: "unsloth/Qwen2.5-3B"
    # NOTE(review): smaller per-device batch than the 24GB preset despite
    # more memory — confirm this is intentional.
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 4
    max_seq_length: 4096

# Evaluation Configuration (optional)
evaluation:
  # Evaluation frequency (reduced for small dataset); same interval as
  # training.save_steps.
  eval_steps: 50
  # Metric to track for best model.
  metric_for_best_model: 'loss'
  # Load best model at end of training.
  load_best_model_at_end: true
  # Keep only 2 checkpoints to save disk; same value as
  # training.save_total_limit.
  save_total_limit: 2
初始化项目，由ModelHub XC社区提供模型 Model: stratosphere/qwen2.5-1.5b-slips-immune-risk Source: Original Platform 2026-04-27 04:10:46 +08:00
			`# Qwen Fine-tuning Configuration with Unsloth`
			`# Risk analysis (cause + risk combined adapter) — Qwen2.5-1.5B-Instruct, 4096 seq_len, 20GB VRAM`

			`# Model Configuration`
			`model:`
			`model_name: "unsloth/Qwen2.5-1.5B-Instruct" # Target deployment model (RPi5)`
			`max_seq_length: 4096 # 3500 DAG tokens + 183 prompt overhead + ~400 response tokens`
			`dtype: null # Auto-detect best dtype`
			`load_in_4bit: true # QLoRA — 4-bit base model required for 20GB VRAM`
			`device_map: "auto" # Automatic device mapping`

			`# LoRA Configuration`
			`lora_r: 64 # Increased from 16 — wider subspace helps model learn synthesis over verbatim copying`
			`lora_alpha: 64 # Keep equal to r with RSLoRA`
			`lora_dropout: 0.0 # No dropout — 461 samples, ~174 optimizer steps; every gradient counts`
			`lora_targets: # Target modules for LoRA`
			`- "q_proj"`
			`- "k_proj"`
			`- "v_proj"`
			`- "o_proj"`
			`- "gate_proj"`
			`- "up_proj"`
			`- "down_proj"`
			`use_rslora: true # Mandatory at r=64 to normalize gradient scaling`
			`random_state: 42 # Random seed for reproducibility`
			`loftq_config: null # LoftQ configuration`

			`# Dataset Configuration`
			`dataset:`
			`type: "local" # Options: "huggingface", "local"`
			`name: "mixed_dataset" # Hugging Face dataset name (if type is huggingface)`
			`path: "risk_combined_train_dataset.json" # Combined interleaved cause+risk train split (1328 records)`
			`eval_path: "risk_combined_eval_dataset.json" # Combined interleaved cause+risk eval split (148 records)`
			`split: "train" # Dataset split to use`
			`text_column: "messages" # Column name containing conversations`
			`use_chat_template: true # Apply chat template formatting`
			`dpo_train_path: "dpo_train_dataset.json"`
			`dpo_eval_path: "dpo_eval_dataset.json"`

			`# Training Configuration`
			`training:`
			`mode: "sft" # Options: "sft", "dpo", "orpo"`

			`# Batch size and accumulation`
			`per_device_train_batch_size: 1 # 4096 seq len requires batch=1 to avoid OOM`
			`gradient_accumulation_steps: 16 # effective batch size = 16`

			`# Learning rate and schedule`
			`learning_rate: 0.00002 # 2e-5 — RSLoRA stability allows higher LR than r=16 config`
			`lr_scheduler_type: "cosine" # Learning rate scheduler`
			`warmup_steps: 20 # 11% of ~174 steps`
			`weight_decay: 0.01 # Weight decay`

			`# Training duration`
			`num_train_epochs: 3 # Number of training epochs`
			`max_steps: -1 # Maximum training steps (-1 for full epochs)`

			`# Precision and optimization`
			`fp16: false # Use BF16 instead (Ampere GPU assumed)`
			`bf16: true # BF16 — larger dynamic range than FP16, no overflow risk`
			`optimizer: "adamw_8bit" # 8-bit optimizer — keeps optimizer states within 20GB budget`

			`# Logging and saving`
			`logging_steps: 1 # Logging frequency`
			`save_steps: 50 # Model save frequency`
			`save_total_limit: 2 # Maximum number of saved checkpoints`

			`# Output directory`
			`output_dir: "./qwen_risk_finetuned" # Output directory for model and checkpoints`

			`# Data processing`
			`dataset_num_proc: 2 # Number of processes for dataset processing`
			`dataloader_num_workers: 0 # Number of dataloader workers`
			`packing: false # Must be false when using train_on_responses_only`

			`# Reporting`
			`report_to: [] # Reporting services (wandb, tensorboard, etc.)`

			`# Model saving format`
			`save_method: "merged_16bit" # Options: "lora", "merged_16bit", "merged_4bit"`
			`gguf_quantization: "q5_k_m" # Also export GGUF for Ollama. Options: q4_k_m, q5_k_m, q8_0, f16. Set to null to skip.`

			`# Reproducibility`
			`seed: 42 # Random seed`

			`# DPO / ORPO Configuration`
			`dpo:`
			`beta: 0.1 # KL penalty coefficient for DPO (standard starting point)`
			`orpo_lambda: 0.1 # ORPO odds-ratio weight (same scale as DPO beta)`
			`dpo_learning_rate: 0.00005 # Lower LR than SFT — DPO is sensitive to overshooting`

			`# Weights & Biases Configuration`
			`use_wandb: false # Enable W&B logging`
			`wandb:`
			`project: "qwen-finetuning" # W&B project name`
			`run_name: "qwen-dpo-stage2" # W&B run name`
			`tags: ["qwen", "unsloth", "lora"] # W&B tags`

			`# Hardware-specific configurations`
			`hardware:`
			`# For different GPU memory configurations`
			`gpu_16gb:`
			`model_name: "unsloth/Qwen1.5-3B"`
			`per_device_train_batch_size: 2`
			`gradient_accumulation_steps: 4`
			`max_seq_length: 2048`

			`gpu_24gb:`
			`model_name: "unsloth/Qwen1.5-3B"`
			`per_device_train_batch_size: 4`
			`gradient_accumulation_steps: 2`
			`max_seq_length: 4096`

			`gpu_40gb:`
			`model_name: "unsloth/Qwen1.5-3B"`
			`per_device_train_batch_size: 2`
			`gradient_accumulation_steps: 4`
			`max_seq_length: 4096`

			`# Evaluation Configuration (optional)`
			`evaluation:`
			`eval_steps: 50 # Evaluation frequency (reduced for small dataset)`
			`metric_for_best_model: "loss" # Metric to track for best model`
			`load_best_model_at_end: true # Load best model at end of training`
			`save_total_limit: 2 # Keep only 2 checkpoints to save disk`