---
# PipelineRL resolved run configuration (Qwen3-4B GRPO-with-KL run on
# 4xH100: 2 actor LLMs, 1 preprocessor LLM, 1 finetune worker).
# Reconstructed from a whitespace-mangled dump; nesting recovered from
# OmegaConf interpolation paths (`${..}` / `${...}`). Placement of
# `rewards` and `eval_every_n_versions` could not be proven from
# interpolations — see NOTE(review) comments below.

# Trainer (finetune worker) settings.
finetune:
  data: null
  model_class: causal-language-modeling
  config_name: ${..model_path}
  optim: adamw_torch
  load_as_bf16: true
  fp32_lm_head: ${..fp32_lm_head}
  fp32_layer_prefix: ${..fp32_layer_prefix}
  use_flash_attention: true
  attn_implementation: flash_attention_2
  auto_device_map: false
  lora:
    enabled: false
    task_type: CAUSAL_LM
    base_model_8bit: false
    base_model_4bit: false
    r: 16
    alpha: 16
    dropout: 0.05
    # 'none' is the peft bias-mode string, not YAML null.
    bias: none
    target_modules: []
  force_restart: ${..force_restart}
  resume_dataloader: false
  train_batch_size: 2
  valid_batch_size: 4
  weight_decay: 0.01
  learning_rate: 1.0e-06
  gradient_clipping_threshold: 0.3
  lr_scheduler_type: cosine
  num_warmup_steps: 25
  gradient_accumulation_passes: 128
  gradient_checkpointing: true
  reentrant_checkpointing: false
  max_train_steps: 1500
  # -1 means "no limit" for the two step caps below.
  interrupt_train_steps: -1
  max_eval_steps: -1
  seq_length: 8192
  seq_packing: true
  output_dir: ${..output_dir}/finetune
  seed: ${..seed}
  save_checkpoint_steps: 100
  keep_intermediate_checkpoints: true
  trust_remote_code: false
  cuda_empty_cache: true
  sft_config_name: null
  n_examples: 0
  log_each_n_steps: 1
  also_save_steps: []
  use_safetensors: true
  save_final_training_state: true
  seq_parallel: 1
  objective: rl
  input: training_data
  send_weight_updates: true
  queue_size: 32
  max_lag: null
  weight_update_interval: 1
  pop_old_data: ${..pop_old_data}
  attempts: 8
  eval_callback:
    _target_: pipelinerl.finetune.utils.dummy_eval_callback
    config_name: ''
  # RL objective hyperparameters. `${..rl.kl_coef}` resolves relative to
  # `finetune`, and `${...llm...}` to the root, confirming this nesting.
  rl:
    policy_loss: ppo
    divide_advantage_by_std: false
    kl_coef: 0.001
    final_kl_coef: ${..rl.kl_coef}
    entropy_bonus: 0.0
    reward_minus_kl_coef: 0.0
    epsilon_low: 0.02
    epsilon_high: 0.02
    use_advantages: true
    relu_log_p_weights: false
    clamp_log_ratio_ref_new_value: 5
    temperature: ${...llm.parameters.temperature}
    aggregate_loss: sum
    overlong_filtering: false
    filter_zero_advantage_groups: false

# Scalar reward per rollout outcome.
# NOTE(review): placed at top level; confirm against the project's base
# config that this is not `finetune.rl.rewards`.
rewards:
  correct_answer_finished: 1.0
  correct_answer_not_finished: 1.0
  wrong_answer_finished: 0
  wrong_answer_not_finished: 0
  no_answer_finished: 0
  no_answer_not_finished: 0
  unparsable_finished: 0
  unparsable_not_finished: 0

streams:
  backend: files

seed: 42
fp32_lm_head: false
fp32_layer_prefix: lm_head

# Rollout-generation (actor) worker settings.
actor:
  log_each_n_secs: 0
  llm_max_rollouts: 256
  rollout_workers: 1
  discount_factor: 1
  problem_queue_size: 256
  result_queue_size: 256
  throughput_window_size: 50
  shared_memory_entry_size: 10000000
  rollout_policy: pipelinerl.domains.math.generate_math_rollout
  # Single-quoted so the backslash in \boxed{} stays literal.
  system_prompt: 'Please reason step by step, and put your final answer within \boxed{}.'
  task_template: '{task}'
  task_prompt: ''
  environment: null

preprocess:
  input: actor
  output: training_data
  n_workers: 8
  chunk_n_groups: 2
  raw_queue_size: 8
  input_queue_size: 32
  output_queue_size: 32
  dataset_buffer_size: 0
  ring_buffer_size: 128
  max_ready_samples_per_lead: 64
  pop_old_data: ${..pop_old_data}
  shared_memory_entry_size: 100000000
  log_every_n_samples: 128

# Sampling parameters for training rollouts.
llm:
  parameters:
    max_tokens: 4096
    temperature: 1.0

# Sampling parameters for evaluation rollouts.
test_llm:
  parameters:
    max_tokens: 4096
    temperature: 1.0
    top_p: 0.95
    top_k: 50

vllm_config:
  use_v1: false
  quantization: null
  # Kebab-case keys mirror vLLM server CLI flags; '' marks a
  # value-less (boolean) flag — presumably passed through verbatim,
  # TODO confirm against the launcher.
  vllm_kwargs:
    dtype: bfloat16
    gpu-memory-utilization: 0.92
    max-num-seqs: 64
    max-num-batched-tokens: 16384
    enable-chunked-prefill: ''
    return-tokens-as-token-ids: ''
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    generation-config: vllm
    max_model_len: 8192
    num-scheduler-steps: 8
    disable-log-requests: ''
    disable-frontend-multiprocessing: ''

# Process/GPU placement for this run (one job entry per worker).
world:
  replicas: 1
  actor_fraction: 2
  preprocessor_fraction: 1
  finetune_fraction: 1
  env_replicas: 1
  actor_group_port: 9000
  environment_start_port: 7777
  jobs:
    - kind: actor_llm
      idx: 0
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: 8080
      gpus:
        - 0
      url: http://localhost:8080
      environment_key: null
      environment_index: null
    - kind: actor_llm
      idx: 1
      replica_idx: 1
      local_idx: 1
      node_rank: 0
      hostname: localhost
      port: 8081
      gpus:
        - 1
      url: http://localhost:8081
      environment_key: null
      environment_index: null
    - kind: preprocessor_llm
      idx: 2
      replica_idx: 0
      local_idx: 2
      node_rank: 0
      hostname: localhost
      # NOTE(review): port is null but url says 8182 — value kept as
      # found; verify the mismatch is intentional.
      port: null
      gpus:
        - 2
      url: http://localhost:8182
      environment_key: null
      environment_index: null
    - kind: actor
      idx: 3
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: null
      gpus: []
      url: ''
      environment_key: null
      environment_index: null
    - kind: preprocessor
      idx: 4
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: null
      gpus: []
      url: ''
      environment_key: null
      environment_index: null
    - kind: environment
      idx: 5
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: 7777
      gpus: []
      url: ''
      environment_key: math
      environment_index: 0
    - kind: finetune
      idx: 6
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: null
      gpus:
        - 3
      url: ''
      environment_key: null
      environment_index: null

# NOTE(review): placed at top level (it followed `world.jobs` in the
# dump); confirm it does not belong inside `world`.
eval_every_n_versions: 78000

model_path: Qwen/Qwen3-4B
accelerate_config: null
use_deepspeed: true
deepspeed_config: deepspeed_stage3_bf16
use_fsdp: false
fsdp:
  param_dtype: fp32
  reduce_dtype: fp32
  buffer_dtype: fp32
output_dir: results/qwen3_4b_grpo_with_kl_2a1p1f_4xh100_197342
force_restart: false
pop_old_data: true
max_lag: null
attempts: 16
train_subset: null

debug:
  mode: ''
  streams_from: null
  place_inference_workers: true
  use_existing_llms: false

me:
  job_idx: null

wandb:
  use_wandb: true
  fail_on_init_error: false
  init_timeout: 120
  wandb_id: null
  wandb_name: null
  wandb_entity_name: jaygala24-team
  wandb_project_name: rl-post-training
  wandb_resume: always
  wandb_use_basename: true
  wandb_workspace_root: results
  wandb_group: qwen3_4b_grpo_with_kl_2a1p1f_4xh100_197342
  wandb_dir: null
  tags: []

environments:
  - key: math
    mode: remote
    _target_: pipelinerl.domains.math.MathEnvironment
    environment_key: math
    dataset_loader: pipelinerl.domains.math.load_datasets
    train_dataset_names:
      - gsm8k_train
      - math_train
    test_dataset_names:
      - gsm8k_test
      - math_500