{ "loss": { "final": 0.0, "max": -0.0, "min": -0.0, "mean": 0.0, "n": 8 }, "reward_total": { "final": 0.8438500165939331, "max": 0.9020000100135803, "min": 0.800000011920929, "mean": 0.8473062515258789, "n": 8 }, "reward_market": { "final": 0.0, "max": 0.0, "min": 0.0, "mean": 0.0, "n": 0 }, "reward_warehouse": { "final": 0.0, "max": 0.0, "min": 0.0, "mean": 0.0, "n": 0 }, "reward_showroom": { "final": 0.0, "max": 0.0, "min": 0.0, "mean": 0.0, "n": 0 }, "n_log_rows": 9, "output_dir": "/ws/sm/shopmanager-grpo-smoke-l4-v2", "run_config": { "model": "Qwen/Qwen3-0.6B", "env_url": "https://hard007ik-shopmanagereng.hf.space", "dataset_size": 300, "num_generations": 2, "per_device_batch": 2, "grad_accum": 1, "max_completion_length": 64, "max_turns": 15, "lr": 5e-06, "warmup_steps": 10, "max_steps": 8, "epochs": 1, "vllm_gpu_mem": 0.3, "reward_weights": [ 1.0, 0.0, 0.0, 0.0 ], "precision": { "bf16": true } } }