mode: train
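# Experiment-level data settings. ${oc.env:VAR,default} is OmegaConf
# environment-variable interpolation, so hf_token resolves to null
# when HF_TOKEN is unset.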
experiment:
  dataset_size: 6000
  dataset_seed: 1234
  test_size: 0.1
  hf_token: ${oc.env:HF_TOKEN,null}
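# Output paths. run_name is assembled from other config nodes via
# OmegaConf interpolation; ${ckpt2short:...} looks like a custom
# resolver (presumably registered by the training code) that shortens
# the resume-checkpoint path for the name.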
output:
  root_path: ${oc.env:ROOT_PATH}
  run_name: ${model.trim}_${task.name}_${algorithm.name}_${algorithm.training.curriculum_schedule}_${algorithm.training.scheduler_params.mu_exp}_${algorithm.training.scheduler_params.sigma}_SEC${algorithm.training.scheduler_params.vrex_adds.sec}DRO${algorithm.training.scheduler_params.vrex_adds.groupdro}G${algorithm.training.scheduler_params.vrex_adds.gaussian}_minp${algorithm.training.scheduler_params.min_prob}${ckpt2short:${algorithm.training.resume_from_checkpoint}}_${algorithm.training.max_steps}
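# PEFT-style LoRA adapter settings: rank 32, scaling alpha 64,
# adapters only on the attention q/v projections.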
lora:
  r: 32
  alpha: 64
  dropout: 0.1
  target_modules:
  - q_proj
  - v_proj
  task_type: CAUSAL_LM
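# Optional GPU memory pre-allocation (disabled here); presumably
# reserves ~50 GiB on cuda:0 when enabled.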
occupy_gpu_memory: false
occupy_gpu_memory_gb: 50
gpu_device: cuda:0
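# Base model: Qwen/Qwen2.5-1.5B-Instruct in bfloat16 with
# FlashAttention-2 (the Transformers attn_implementation option).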
model:
  family: Qwen
  trim: Qwen2.5-1.5B-Instruct
  name: ${model.family}/${model.trim}
  trust_remote_code: true
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
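# Countdown task data. The n2-n5 datasets form the training mix
# (presumably 2-5 operands, hence the task name countdown2345);
# the held-out n6 set tests generalization to harder instances.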
task:
  name: countdown2345
  data_files:
  - citrinegui/countdown_n2t100_1-100
  - citrinegui/countdown_n3t100_1-100
  - citrinegui/countdown_n4t100_1-100
  - citrinegui/countdown_n5t100_1-100
  test_file: citrinegui/countdown_n6t100_1-100
  force_redownload: false
  train_size: 327680
  test_size: 1024
  training:
    max_prompt_length: 1000
    max_completion_length: 512
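# Evaluation settings: greedy decoding (temperature 0.0) with a
# single completion per prompt (sc_num presumably counts
# self-consistency samples; pass_at_k is the k for pass@k).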
inference:
  checkpoint: outputs/Qwen2.5-1.5B-Instruct_countdown2345_grpo_balanced_0.5_0.5_SEC0.3DRO1.0G0.0_minpTrue_1600/checkpoint-1600/
  temperature: 0.0
  sc_num: 1
  pass_at_k: 1
  resume: 0
  max_new_tokens: 512
  batch_size: 32
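# GRPO training. The field names below largely mirror TRL's
# GRPOConfig (num_generations, beta, use_vllm, vllm_mode, ...).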
algorithm:
  name: grpo
  training:
    resume_from_checkpoint: null
    learning_rate: 1.0e-06
    lr_scheduler_type: cosine
    logging_steps: 10
    max_steps: 1600
    per_device_train_batch_size: 16
    generation_batch_size: null
    steps_per_generation: 1
    gradient_accumulation_steps: 4
    gradient_checkpointing: true
    bf16: true
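    # Effective batch per optimizer step: 16 per device x 4
    # gradient-accumulation steps = 64 sequences per device.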
    report_to:
    - wandb
    push_to_hub: true
    save_strategy: steps
    save_steps: ${algorithm.training.max_steps}
    tf32: true
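    # GRPO sampling: 8 completions per prompt, KL coefficient
    # beta = 0.001; generation runs on a colocated vLLM engine capped
    # at 30% of GPU memory. save_steps above resolves to max_steps,
    # so only the final checkpoint (step 1600) is written.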
    num_generations: 8
    beta: 0.001
    use_vllm: true
    vllm_mode: colocate
    vllm_gpu_memory_utilization: 0.3
    vllm_server_port: 8000
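    # Curriculum is disabled (curriculum: false), but the gaussian
    # schedule and the scheduler_params below are still interpolated
    # into output.run_name.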
    curriculum: false
    curriculum_schedule: gaussian
    scheduler_params:
      mu_exp: 0.5
      sigma: 0.5
      vrex_adds:
        groupdro: 1.0
        gaussian: 0.0
        sec: 0.3
        beta: 1.0
      min_prob: true
      td_alpha: 0.5
      sec_temperature: 0.3
    max_dapo_iter: 2
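# Note: a config like this is typically loaded with OmegaConf/Hydra;
# the custom ckpt2short resolver must be registered before run_name
# is resolved (an assumption about the accompanying training code).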