---
# Training configuration for checkpoint-7000 (synth-only-v2-fixed-run2-removedlabelaug)
# This is the config used to train the published model.
#
# NOTE(review): this file had been collapsed onto a single line, which made the
# whole document parse as one comment (i.e. an empty config). The nesting below
# was reconstructed from key order and naming; confirm against the config
# loader before relying on it.

# Base model and the special tokens used for action prediction.
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  max_length: 1024
  predict_token: "<|predict|>"
  action_tokens:
    - "<|continue_listening|>"
    - "<|start_speaking|>"
    - "<|start_listening|>"
    - "<|continue_speaking|>"
  # Same four tokens as action_tokens, listed in the same order.
  prediction_tokens:
    - "<|continue_listening|>"
    - "<|start_speaking|>"
    - "<|start_listening|>"
    - "<|continue_speaking|>"

# Dataset CSV paths.
data:
  train:
    - "data/context_action/synthetic/train.csv"  # ~140K
    - "data/context_action/synthetic/supplementary_start_listening.csv"  # ~14K
  val: "data/context_action/synthetic/val.csv"
  test: "data/context_action/synthetic/test.csv"

# Optimizer and schedule hyperparameters.
training:
  batch_size: 8
  gradient_accumulation_steps: 4  # effective batch size = 32
  learning_rate: 5.0e-5
  num_epochs: 50
  warmup_steps: 100
  weight_decay: 0.01
  max_grad_norm: 1.0
  bf16: true
  lr_scheduler: "cosine"

# NOTE(review): assumed top-level; may belong under `training` - confirm.
loss:
  ntp_weight: 0.1  # 90% action CE + 10% NTP auxiliary

# NOTE(review): assumed top-level; may belong under `training` - confirm.
early_stopping:
  enabled: true
  patience: 10
  metric: "eval_f1_macro"

# Training-time data augmentation switches.
augmentation:
  enabled: true
  context_truncation:
    enabled: true
    probability: 0.2
    min_turns: 1
  # NOTE(review): assumed to be a direct child of `augmentation`; confirm it is
  # not meant to sit under `context_truncation`.
  asr_styles:
    - "pure"
    - "punctuated"
    - "mixed"
  filler_injection:
    enabled: true
    probability: 0.2
    max_fillers: 3
  disfluency:
    enabled: true
    probability: 0.2
  # Label-changing augmentations DISABLED (broke balanced sampling)
  streaming_crop:
    enabled: false
  backchannel_inject:
    enabled: false

# Output directory and save/eval/logging step intervals.
output:
  dir: "models_longer_training"
  save_steps: 1000
  eval_steps: 1000
  logging_steps: 100
  save_total_limit: 5

evaluation:
  batch_size: 8