# Supervised fine-tuning recipe for Llama-3.2-1B-Instruct on the `record` dataset,
# using "freeze" fine-tuning with every decoder block trainable and the (tied)
# embeddings kept frozen. One key per line: a ` #` inside a line starts a YAML
# comment, so section headers and notes must sit on their own lines.
seed: 42

### model
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
trust_remote_code: true
flash_attn: auto
use_cache: false

### method
# Full fine-tune of every decoder block, but with the (tied) embeddings frozen.
# `finetuning_type: freeze` only trains modules whose name matches a trainable layer;
# embed_tokens / lm_head / final model.norm are "extra" modules and stay frozen unless
# listed in freeze_extra_modules. Setting freeze_trainable_layers = num_hidden_layers
# (16 for Llama-3.2-1B) makes ALL decoder blocks trainable, so this == "full FT minus
# embeddings". Because tie_word_embeddings=true, freezing embed_tokens also freezes
# lm_head.
# This is lever B of the embedding-amplification fix
# (see figures/amplification/README.md).
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 16
freeze_trainable_modules: all
# freeze_extra_modules: left unset -> embed_tokens, lm_head (tied), final norm stay frozen

### dataset
dataset: record
template: llama3
cutoff_len: 2048
overwrite_cache: true
preprocessing_num_workers: 4
dataloader_num_workers: 4
packing: false

### output
output_dir: saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_record_42_1779354540
logging_steps: 5
# save_steps is kept equal to eval_steps (both 0.05).
# NOTE(review): load_best_model_at_end presumably needs checkpoints to line up with
# evaluations; confirm against the pinned transformers version before changing either.
save_steps: 0.05
overwrite_output_dir: true
save_only_model: false
plot_loss: true
include_num_input_tokens_seen: true
push_to_hub: true
# NOTE(review): push_to_hub_organization looks deprecated in recent transformers
# releases in favour of hub_model_id; verify before upgrading.
push_to_hub_organization: rbelanec
load_best_model_at_end: true
save_total_limit: 1

### train
per_device_train_batch_size: 8
learning_rate: 2.0e-6
num_train_epochs: 1
weight_decay: 1.0e-2
lr_scheduler_type: cosine
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null
warmup_ratio: 0.1
optim: adamw_torch
report_to:
  - wandb
run_name: freeze_llama-3.2-1b-instruct_train_record_42_1779354540

### eval
per_device_eval_batch_size: 8
eval_strategy: steps
eval_steps: 0.05
val_size: 0.1