---
# Hydra/OmegaConf-style configuration for a GRPO training run.
# Values of the form ${a.b.c} are interpolations that point at other keys in
# this file; ${oc.env:NAME} reads an environment variable.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed copy.
# Nesting confirmed by the interpolation paths in `output.run_name` is
# reliable; placements marked NOTE(review) below are best guesses and must be
# checked against the code that consumes this config.

mode: train

experiment:
  dataset_size: 6000
  dataset_seed: 1234
  test_size: 0.1
  # Falls back to null when HF_TOKEN is not set in the environment.
  # NOTE(review): may belong at the top level instead of under `experiment`.
  hf_token: ${oc.env:HF_TOKEN,null}

output:
  # No default is given, so ROOT_PATH has to be set in the environment.
  root_path: ${oc.env:ROOT_PATH}
  # Run identifier assembled from model, task, algorithm and scheduler
  # settings. `ckpt2short` is a custom resolver that is not defined in this
  # file. Kept on one line: folding it would insert spaces into the name.
  run_name: ${model.trim}_${task.name}_${algorithm.name}_${algorithm.training.curriculum_schedule}_${algorithm.training.scheduler_params.mu_exp}_${algorithm.training.scheduler_params.sigma}_SEC${algorithm.training.scheduler_params.vrex_adds.sec}DRO${algorithm.training.scheduler_params.vrex_adds.groupdro}G${algorithm.training.scheduler_params.vrex_adds.gaussian}_minp${algorithm.training.scheduler_params.min_prob}${ckpt2short:${algorithm.training.resume_from_checkpoint}}_${algorithm.training.max_steps}

lora:
  r: 32
  alpha: 64
  dropout: 0.1
  target_modules:
    - q_proj
    - v_proj
  task_type: CAUSAL_LM

# NOTE(review): assumed to be top-level keys rather than children of `lora`.
occupy_gpu_memory: false
occupy_gpu_memory_gb: 50
gpu_device: cuda:0

model:
  family: Qwen
  trim: Qwen2.5-1.5B-Instruct
  # Resolves to "<family>/<trim>".
  name: ${model.family}/${model.trim}
  trust_remote_code: true
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2

task:
  name: math
  # NOTE(review): these paths use "level_N" while `task.inference.data_files`
  # uses "levelN" - confirm which spelling matches the directories on disk.
  data_files:
    - data/math/level_1
    - data/math/level_2
    - data/math/level_3
    - data/math/level_4
  training:
    max_prompt_length: 1600
    max_completion_length: 1600
  inference:
    data_files:
      - data/math/level1
      - data/math/level2
      - data/math/level3
      - data/math/level4
      - data/math/level5
    max_prompt_length: 1600
    max_completion_length: 1600
    temperature: 0.0
    # Quoted on purpose: a bare `n` key is read as a boolean by YAML 1.1
    # parsers.
    'n': 1

algorithm:
  name: grpo
  training:
    resume_from_checkpoint: null
    learning_rate: 1.0e-06
    lr_scheduler_type: cosine
    logging_steps: 10
    max_steps: 1600
    per_device_train_batch_size: 16
    generation_batch_size: null
    steps_per_generation: 1
    gradient_accumulation_steps: 4
    gradient_checkpointing: true
    bf16: true
    report_to:
      - wandb
    # NOTE(review): `hf_token` above resolves to null when HF_TOKEN is unset;
    # confirm that pushing to the hub still works in that case.
    push_to_hub: true
    save_strategy: steps
    # Tied to max_steps, so the save interval equals the full run length.
    save_steps: ${algorithm.training.max_steps}
    tf32: true
    num_generations: 8
    beta: 0.001
    use_vllm: true
    vllm_mode: colocate
    vllm_gpu_memory_utilization: 0.25
    vllm_server_port: 8000
    # NOTE(review): curriculum is disabled here, yet the schedule settings
    # below are still populated and still feed `output.run_name`.
    curriculum: false
    curriculum_schedule: gaussian
    scheduler_params:
      mu_exp: 0.5
      sigma: 0.5
      vrex_adds:
        groupdro: 1.0
        gaussian: 0.0
        sec: 0.3
      # NOTE(review): `beta` may belong inside `vrex_adds` instead; the
      # collapsed source does not disambiguate.
      beta: 1.0
      min_prob: true
      # NOTE(review): `td_alpha` and `sec_temperature` are assumed to be
      # scheduler parameters - confirm.
      td_alpha: 0.5
      sec_temperature: 0.3
    # NOTE(review): the nesting level of `max_dapo_iter` is a guess
    # (algorithm.training) - confirm against the consumer.
    max_dapo_iter: 2