---
# PipelineRL resolved run configuration (Qwen3-4B GRPO-with-KL run on
# 4xH100: 2 actor LLMs, 1 preprocessor LLM, 1 finetune worker).
# Reconstructed from a whitespace-mangled dump; nesting recovered from
# OmegaConf interpolation paths (`${..}` / `${...}`). Placement of
# `rewards` and `eval_every_n_versions` could not be proven from
# interpolations — see NOTE(review) comments below.

# Trainer (finetune worker) settings.
finetune:
  data: null
  model_class: causal-language-modeling
  config_name: ${..model_path}
  optim: adamw_torch
  load_as_bf16: true
  fp32_lm_head: ${..fp32_lm_head}
  fp32_layer_prefix: ${..fp32_layer_prefix}
  use_flash_attention: true
  attn_implementation: flash_attention_2
  auto_device_map: false
  lora:
    enabled: false
    task_type: CAUSAL_LM
    base_model_8bit: false
    base_model_4bit: false
    r: 16
    alpha: 16
    dropout: 0.05
    # 'none' is the peft bias-mode string, not YAML null.
    bias: none
    target_modules: []
  force_restart: ${..force_restart}
  resume_dataloader: false
  train_batch_size: 2
  valid_batch_size: 4
  weight_decay: 0.01
  learning_rate: 1.0e-06
  gradient_clipping_threshold: 0.3
  lr_scheduler_type: cosine
  num_warmup_steps: 25
  gradient_accumulation_passes: 128
  gradient_checkpointing: true
  reentrant_checkpointing: false
  max_train_steps: 1500
  # -1 means "no limit" for the two step caps below.
  interrupt_train_steps: -1
  max_eval_steps: -1
  seq_length: 8192
  seq_packing: true
  output_dir: ${..output_dir}/finetune
  seed: ${..seed}
  save_checkpoint_steps: 100
  keep_intermediate_checkpoints: true
  trust_remote_code: false
  cuda_empty_cache: true
  sft_config_name: null
  n_examples: 0
  log_each_n_steps: 1
  also_save_steps: []
  use_safetensors: true
  save_final_training_state: true
  seq_parallel: 1
  objective: rl
  input: training_data
  send_weight_updates: true
  queue_size: 32
  max_lag: null
  weight_update_interval: 1
  pop_old_data: ${..pop_old_data}
  attempts: 8
  eval_callback:
    _target_: pipelinerl.finetune.utils.dummy_eval_callback
    config_name: ''
  # RL objective hyperparameters. `${..rl.kl_coef}` resolves relative to
  # `finetune`, and `${...llm...}` to the root, confirming this nesting.
  rl:
    policy_loss: ppo
    divide_advantage_by_std: false
    kl_coef: 0.001
    final_kl_coef: ${..rl.kl_coef}
    entropy_bonus: 0.0
    reward_minus_kl_coef: 0.0
    epsilon_low: 0.02
    epsilon_high: 0.02
    use_advantages: true
    relu_log_p_weights: false
    clamp_log_ratio_ref_new_value: 5
    temperature: ${...llm.parameters.temperature}
    aggregate_loss: sum
    overlong_filtering: false
    filter_zero_advantage_groups: false

# Scalar reward per rollout outcome.
# NOTE(review): placed at top level; confirm against the project's base
# config that this is not `finetune.rl.rewards`.
rewards:
  correct_answer_finished: 1.0
  correct_answer_not_finished: 1.0
  wrong_answer_finished: 0
  wrong_answer_not_finished: 0
  no_answer_finished: 0
  no_answer_not_finished: 0
  unparsable_finished: 0
  unparsable_not_finished: 0

streams:
  backend: files

seed: 42
fp32_lm_head: false
fp32_layer_prefix: lm_head

# Rollout-generation (actor) worker settings.
actor:
  log_each_n_secs: 0
  llm_max_rollouts: 256
  rollout_workers: 1
  discount_factor: 1
  problem_queue_size: 256
  result_queue_size: 256
  throughput_window_size: 50
  shared_memory_entry_size: 10000000
  rollout_policy: pipelinerl.domains.math.generate_math_rollout
  # Single-quoted so the backslash in \boxed{} stays literal.
  system_prompt: 'Please reason step by step, and put your final answer within \boxed{}.'
  task_template: '{task}'
  task_prompt: ''
  environment: null

preprocess:
  input: actor
  output: training_data
  n_workers: 8
  chunk_n_groups: 2
  raw_queue_size: 8
  input_queue_size: 32
  output_queue_size: 32
  dataset_buffer_size: 0
  ring_buffer_size: 128
  max_ready_samples_per_lead: 64
  pop_old_data: ${..pop_old_data}
  shared_memory_entry_size: 100000000
  log_every_n_samples: 128

# Sampling parameters for training rollouts.
llm:
  parameters:
    max_tokens: 4096
    temperature: 1.0

# Sampling parameters for evaluation rollouts.
test_llm:
  parameters:
    max_tokens: 4096
    temperature: 1.0
    top_p: 0.95
    top_k: 50

vllm_config:
  use_v1: false
  quantization: null
  # Kebab-case keys mirror vLLM server CLI flags; '' marks a
  # value-less (boolean) flag — presumably passed through verbatim,
  # TODO confirm against the launcher.
  vllm_kwargs:
    dtype: bfloat16
    gpu-memory-utilization: 0.92
    max-num-seqs: 64
    max-num-batched-tokens: 16384
    enable-chunked-prefill: ''
    return-tokens-as-token-ids: ''
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    generation-config: vllm
    max_model_len: 8192
    num-scheduler-steps: 8
    disable-log-requests: ''
    disable-frontend-multiprocessing: ''

# Process/GPU placement for this run (one job entry per worker).
world:
  replicas: 1
  actor_fraction: 2
  preprocessor_fraction: 1
  finetune_fraction: 1
  env_replicas: 1
  actor_group_port: 9000
  environment_start_port: 7777
  jobs:
    - kind: actor_llm
      idx: 0
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: 8080
      gpus:
        - 0
      url: http://localhost:8080
      environment_key: null
      environment_index: null
    - kind: actor_llm
      idx: 1
      replica_idx: 1
      local_idx: 1
      node_rank: 0
      hostname: localhost
      port: 8081
      gpus:
        - 1
      url: http://localhost:8081
      environment_key: null
      environment_index: null
    - kind: preprocessor_llm
      idx: 2
      replica_idx: 0
      local_idx: 2
      node_rank: 0
      hostname: localhost
      # NOTE(review): port is null but url says 8182 — value kept as
      # found; verify the mismatch is intentional.
      port: null
      gpus:
        - 2
      url: http://localhost:8182
      environment_key: null
      environment_index: null
    - kind: actor
      idx: 3
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: null
      gpus: []
      url: ''
      environment_key: null
      environment_index: null
    - kind: preprocessor
      idx: 4
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: null
      gpus: []
      url: ''
      environment_key: null
      environment_index: null
    - kind: environment
      idx: 5
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: 7777
      gpus: []
      url: ''
      environment_key: math
      environment_index: 0
    - kind: finetune
      idx: 6
      replica_idx: 0
      local_idx: 0
      node_rank: 0
      hostname: localhost
      port: null
      gpus:
        - 3
      url: ''
      environment_key: null
      environment_index: null

# NOTE(review): placed at top level (it followed `world.jobs` in the
# dump); confirm it does not belong inside `world`.
eval_every_n_versions: 78000

model_path: Qwen/Qwen3-4B
accelerate_config: null
use_deepspeed: true
deepspeed_config: deepspeed_stage3_bf16
use_fsdp: false
fsdp:
  param_dtype: fp32
  reduce_dtype: fp32
  buffer_dtype: fp32
output_dir: results/qwen3_4b_grpo_with_kl_2a1p1f_4xh100_197342
force_restart: false
pop_old_data: true
max_lag: null
attempts: 16
train_subset: null

debug:
  mode: ''
  streams_from: null
  place_inference_workers: true
  use_existing_llms: false

me:
  job_idx: null

wandb:
  use_wandb: true
  fail_on_init_error: false
  init_timeout: 120
  wandb_id: null
  wandb_name: null
  wandb_entity_name: jaygala24-team
  wandb_project_name: rl-post-training
  wandb_resume: always
  wandb_use_basename: true
  wandb_workspace_root: results
  wandb_group: qwen3_4b_grpo_with_kl_2a1p1f_4xh100_197342
  wandb_dir: null
  tags: []

environments:
  - key: math
    mode: remote
    _target_: pipelinerl.domains.math.MathEnvironment
    environment_key: math
    dataset_loader: pipelinerl.domains.math.load_datasets
    train_dataset_names:
      - gsm8k_train
      - math_train
    test_dataset_names:
      - gsm8k_test
      - math_500