{ "step": 15, "scenarios_path": "data/scenarios_val.jsonl", "num_generations": 8, "total_scenarios": 50, "total_rollouts": 400, "successes": 18, "accuracy_pct": 4.5, "avg_reward": -1.8850000000000002, "per_scenario": [ { "scenario_index": 0, "task": "Convert 98 kg to lbs.", "mean_reward": 3.125, "max_reward": 4.0, "success_count": 7, "total_attempts": 8 }, { "scenario_index": 1, "task": "What is the speed of light?", "mean_reward": -2.5, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 2, "task": "What is the distance from Earth to the Sun in km in miles?", "mean_reward": 0.5625, "max_reward": 2.5, "success_count": 1, "total_attempts": 8 }, { "scenario_index": 3, "task": "What is 441 plus 23?", "mean_reward": 2.25, "max_reward": 4.0, "success_count": 1, "total_attempts": 8 }, { "scenario_index": 4, "task": "Convert 62 kg to lbs.", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 5, "task": "Which is hotter right now, London or Mumbai?", "mean_reward": -2.875, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 6, "task": "What is 185 plus 89?", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 7, "task": "What's the weather like in Dubai?", "mean_reward": -1.375, "max_reward": 4.0, "success_count": 2, "total_attempts": 8 }, { "scenario_index": 8, "task": "What is the population of Germany divided by its area in km2?", "mean_reward": -2.041666666666667, "max_reward": 1.666666666666666, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 9, "task": "What is the boiling point of water?", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 10, "task": "Which is hotter right now, London or Mumbai?", "mean_reward": -2.0, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 11, "task": "What is the population of India divided by its area in km2?", "mean_reward": -2.125, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 12, "task": "What is India's population density in people per square mile?", "mean_reward": -1.25, "max_reward": 1.333333333333333, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 13, "task": "What is the tallest mountain?", "mean_reward": -2.875, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 14, "task": "What is the distance from Earth to the Sun in km in miles?", "mean_reward": -1.875, "max_reward": 1.5, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 15, "task": "What is the population of Japan divided by its area in km2?", "mean_reward": -2.875, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 16, "task": "What is Germany's population density in people per square mile?", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 17, "task": "Convert 74 kg to lbs.", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 18, "task": "Which is hotter right now, Paris or Cairo?", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 19, "task": "What is India's population density in people per square mile?", "mean_reward": -1.5, "max_reward": 1.333333333333333, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 20, "task": "Which country has a larger population, France or Brazil?", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 21, "task": "Convert 64 kg to lbs.", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 22, "task": "Which country has a larger population, Japan or India?", "mean_reward": -1.5625, "max_reward": 3.0, "success_count": 2, "total_attempts": 8 }, { "scenario_index": 23, "task": "What is the GDP of Japan?", "mean_reward": -2.75, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 24, "task": "What is the population of France divided by its area in km2?", "mean_reward": -2.875, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 25, "task": "What is France's population density in people per square mile?", "mean_reward": -2.5, "max_reward": 1.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 26, "task": "Convert 26 kg to lbs.", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 27, "task": "What is 660 times 87?", "mean_reward": 1.0, "max_reward": 4.0, "success_count": 1, "total_attempts": 8 }, { "scenario_index": 28, "task": "What is the boiling point of water?", "mean_reward": -2.5, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 29, "task": "What is the population of Germany divided by its area in km2?", "mean_reward": -1.125, "max_reward": 2.333333333333333, "success_count": 1, "total_attempts": 8 }, { "scenario_index": 30, "task": "Convert 40 kg to lbs.", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 31, "task": "What is the speed of light?", "mean_reward": -2.375, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 32, "task": "How old was Guido van Rossum in 2024?", "mean_reward": -2.875, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 33, "task": "Which is hotter right now, Paris or Dubai?", "mean_reward": -3.0, "max_reward": -3.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 34, "task": "Which is hotter right now, Tokyo or Dubai?", "mean_reward": -2.875, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 35, "task": "Which is hotter right now, London or Cairo?", "mean_reward": -2.375, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 36, "task": "What is the value of pi?", "mean_reward": -2.125, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 37, "task": "What is the population of Japan divided by its area in km2?", "mean_reward": -1.6666666666666667, "max_reward": 1.666666666666666, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 38, "task": "What is the temperature in London in Fahrenheit?", "mean_reward": -1.375, "max_reward": 1.5, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 39, "task": "What is 464 plus 30?", "mean_reward": -1.75, "max_reward": 2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 40, "task": "Which country has a larger population, France or India?", "mean_reward": -2.1875, "max_reward": 2.5, "success_count": 1, "total_attempts": 8 }, { "scenario_index": 41, "task": "What is the distance from Earth to the Sun in km in miles?", "mean_reward": -1.625, "max_reward": 2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 42, "task": "What is the tallest mountain?", "mean_reward": -1.125, "max_reward": 2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 43, "task": "What is the temperature in London in Fahrenheit?", "mean_reward": 0.6875, "max_reward": 2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 44, "task": "What is 496 minus 24?", "mean_reward": 1.0, "max_reward": 2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 45, "task": "What's the weather like in Cairo?", "mean_reward": -1.25, "max_reward": 4.0, "success_count": 2, "total_attempts": 8 }, { "scenario_index": 46, "task": "What is the tallest mountain?", "mean_reward": -2.5, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 47, "task": "What is India's population density in people per square mile?", "mean_reward": -1.7916666666666667, "max_reward": 1.333333333333333, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 48, "task": "What is the GDP of France?", "mean_reward": -2.625, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 }, { "scenario_index": 49, "task": "How old was Guido van Rossum in 2024?", "mean_reward": -2.75, "max_reward": -2.0, "success_count": 0, "total_attempts": 8 } ] }