412 lines
10 KiB
JSON
412 lines
10 KiB
JSON
{
|
|
"step": 15,
|
|
"scenarios_path": "data/scenarios_val.jsonl",
|
|
"num_generations": 8,
|
|
"total_scenarios": 50,
|
|
"total_rollouts": 400,
|
|
"successes": 18,
|
|
"accuracy_pct": 4.5,
|
|
"avg_reward": -1.8850000000000002,
|
|
"per_scenario": [
|
|
{
|
|
"scenario_index": 0,
|
|
"task": "Convert 98 kg to lbs.",
|
|
"mean_reward": 3.125,
|
|
"max_reward": 4.0,
|
|
"success_count": 7,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 1,
|
|
"task": "What is the speed of light?",
|
|
"mean_reward": -2.5,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 2,
|
|
"task": "What is the distance from Earth to the Sun in km in miles?",
|
|
"mean_reward": 0.5625,
|
|
"max_reward": 2.5,
|
|
"success_count": 1,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 3,
|
|
"task": "What is 441 plus 23?",
|
|
"mean_reward": 2.25,
|
|
"max_reward": 4.0,
|
|
"success_count": 1,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 4,
|
|
"task": "Convert 62 kg to lbs.",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 5,
|
|
"task": "Which is hotter right now, London or Mumbai?",
|
|
"mean_reward": -2.875,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 6,
|
|
"task": "What is 185 plus 89?",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 7,
|
|
"task": "What's the weather like in Dubai?",
|
|
"mean_reward": -1.375,
|
|
"max_reward": 4.0,
|
|
"success_count": 2,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 8,
|
|
"task": "What is the population of Germany divided by its area in km2?",
|
|
"mean_reward": -2.041666666666667,
|
|
"max_reward": 1.666666666666666,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 9,
|
|
"task": "What is the boiling point of water?",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 10,
|
|
"task": "Which is hotter right now, London or Mumbai?",
|
|
"mean_reward": -2.0,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 11,
|
|
"task": "What is the population of India divided by its area in km2?",
|
|
"mean_reward": -2.125,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 12,
|
|
"task": "What is India's population density in people per square mile?",
|
|
"mean_reward": -1.25,
|
|
"max_reward": 1.333333333333333,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 13,
|
|
"task": "What is the tallest mountain?",
|
|
"mean_reward": -2.875,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 14,
|
|
"task": "What is the distance from Earth to the Sun in km in miles?",
|
|
"mean_reward": -1.875,
|
|
"max_reward": 1.5,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 15,
|
|
"task": "What is the population of Japan divided by its area in km2?",
|
|
"mean_reward": -2.875,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 16,
|
|
"task": "What is Germany's population density in people per square mile?",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 17,
|
|
"task": "Convert 74 kg to lbs.",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 18,
|
|
"task": "Which is hotter right now, Paris or Cairo?",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 19,
|
|
"task": "What is India's population density in people per square mile?",
|
|
"mean_reward": -1.5,
|
|
"max_reward": 1.333333333333333,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 20,
|
|
"task": "Which country has a larger population, France or Brazil?",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 21,
|
|
"task": "Convert 64 kg to lbs.",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 22,
|
|
"task": "Which country has a larger population, Japan or India?",
|
|
"mean_reward": -1.5625,
|
|
"max_reward": 3.0,
|
|
"success_count": 2,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 23,
|
|
"task": "What is the GDP of Japan?",
|
|
"mean_reward": -2.75,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 24,
|
|
"task": "What is the population of France divided by its area in km2?",
|
|
"mean_reward": -2.875,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 25,
|
|
"task": "What is France's population density in people per square mile?",
|
|
"mean_reward": -2.5,
|
|
"max_reward": 1.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 26,
|
|
"task": "Convert 26 kg to lbs.",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 27,
|
|
"task": "What is 660 times 87?",
|
|
"mean_reward": 1.0,
|
|
"max_reward": 4.0,
|
|
"success_count": 1,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 28,
|
|
"task": "What is the boiling point of water?",
|
|
"mean_reward": -2.5,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 29,
|
|
"task": "What is the population of Germany divided by its area in km2?",
|
|
"mean_reward": -1.125,
|
|
"max_reward": 2.333333333333333,
|
|
"success_count": 1,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 30,
|
|
"task": "Convert 40 kg to lbs.",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 31,
|
|
"task": "What is the speed of light?",
|
|
"mean_reward": -2.375,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 32,
|
|
"task": "How old was Guido van Rossum in 2024?",
|
|
"mean_reward": -2.875,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 33,
|
|
"task": "Which is hotter right now, Paris or Dubai?",
|
|
"mean_reward": -3.0,
|
|
"max_reward": -3.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 34,
|
|
"task": "Which is hotter right now, Tokyo or Dubai?",
|
|
"mean_reward": -2.875,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 35,
|
|
"task": "Which is hotter right now, London or Cairo?",
|
|
"mean_reward": -2.375,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 36,
|
|
"task": "What is the value of pi?",
|
|
"mean_reward": -2.125,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 37,
|
|
"task": "What is the population of Japan divided by its area in km2?",
|
|
"mean_reward": -1.6666666666666667,
|
|
"max_reward": 1.666666666666666,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 38,
|
|
"task": "What is the temperature in London in Fahrenheit?",
|
|
"mean_reward": -1.375,
|
|
"max_reward": 1.5,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 39,
|
|
"task": "What is 464 plus 30?",
|
|
"mean_reward": -1.75,
|
|
"max_reward": 2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 40,
|
|
"task": "Which country has a larger population, France or India?",
|
|
"mean_reward": -2.1875,
|
|
"max_reward": 2.5,
|
|
"success_count": 1,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 41,
|
|
"task": "What is the distance from Earth to the Sun in km in miles?",
|
|
"mean_reward": -1.625,
|
|
"max_reward": 2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 42,
|
|
"task": "What is the tallest mountain?",
|
|
"mean_reward": -1.125,
|
|
"max_reward": 2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 43,
|
|
"task": "What is the temperature in London in Fahrenheit?",
|
|
"mean_reward": 0.6875,
|
|
"max_reward": 2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 44,
|
|
"task": "What is 496 minus 24?",
|
|
"mean_reward": 1.0,
|
|
"max_reward": 2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 45,
|
|
"task": "What's the weather like in Cairo?",
|
|
"mean_reward": -1.25,
|
|
"max_reward": 4.0,
|
|
"success_count": 2,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 46,
|
|
"task": "What is the tallest mountain?",
|
|
"mean_reward": -2.5,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 47,
|
|
"task": "What is India's population density in people per square mile?",
|
|
"mean_reward": -1.7916666666666667,
|
|
"max_reward": 1.333333333333333,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 48,
|
|
"task": "What is the GDP of France?",
|
|
"mean_reward": -2.625,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
},
|
|
{
|
|
"scenario_index": 49,
|
|
"task": "How old was Guido van Rossum in 2024?",
|
|
"mean_reward": -2.75,
|
|
"max_reward": -2.0,
|
|
"success_count": 0,
|
|
"total_attempts": 8
|
|
}
|
|
]
|
|
} |