Files
qwen-0.5b-tool-agent-grpo/artifacts/eval_results.json
ModelHub XC 2e4a8d6a83 初始化项目,由ModelHub XC社区提供模型
Model: abhid1234/qwen-0.5b-tool-agent-grpo
Source: Original Platform
2026-04-28 07:59:12 +08:00

412 lines
10 KiB
JSON

{
"step": 15,
"scenarios_path": "data/scenarios_val.jsonl",
"num_generations": 8,
"total_scenarios": 50,
"total_rollouts": 400,
"successes": 18,
"accuracy_pct": 4.5,
"avg_reward": -1.8850000000000002,
"per_scenario": [
{
"scenario_index": 0,
"task": "Convert 98 kg to lbs.",
"mean_reward": 3.125,
"max_reward": 4.0,
"success_count": 7,
"total_attempts": 8
},
{
"scenario_index": 1,
"task": "What is the speed of light?",
"mean_reward": -2.5,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 2,
"task": "What is the distance from Earth to the Sun in km in miles?",
"mean_reward": 0.5625,
"max_reward": 2.5,
"success_count": 1,
"total_attempts": 8
},
{
"scenario_index": 3,
"task": "What is 441 plus 23?",
"mean_reward": 2.25,
"max_reward": 4.0,
"success_count": 1,
"total_attempts": 8
},
{
"scenario_index": 4,
"task": "Convert 62 kg to lbs.",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 5,
"task": "Which is hotter right now, London or Mumbai?",
"mean_reward": -2.875,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 6,
"task": "What is 185 plus 89?",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 7,
"task": "What's the weather like in Dubai?",
"mean_reward": -1.375,
"max_reward": 4.0,
"success_count": 2,
"total_attempts": 8
},
{
"scenario_index": 8,
"task": "What is the population of Germany divided by its area in km2?",
"mean_reward": -2.041666666666667,
"max_reward": 1.666666666666666,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 9,
"task": "What is the boiling point of water?",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 10,
"task": "Which is hotter right now, London or Mumbai?",
"mean_reward": -2.0,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 11,
"task": "What is the population of India divided by its area in km2?",
"mean_reward": -2.125,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 12,
"task": "What is India's population density in people per square mile?",
"mean_reward": -1.25,
"max_reward": 1.333333333333333,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 13,
"task": "What is the tallest mountain?",
"mean_reward": -2.875,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 14,
"task": "What is the distance from Earth to the Sun in km in miles?",
"mean_reward": -1.875,
"max_reward": 1.5,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 15,
"task": "What is the population of Japan divided by its area in km2?",
"mean_reward": -2.875,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 16,
"task": "What is Germany's population density in people per square mile?",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 17,
"task": "Convert 74 kg to lbs.",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 18,
"task": "Which is hotter right now, Paris or Cairo?",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 19,
"task": "What is India's population density in people per square mile?",
"mean_reward": -1.5,
"max_reward": 1.333333333333333,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 20,
"task": "Which country has a larger population, France or Brazil?",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 21,
"task": "Convert 64 kg to lbs.",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 22,
"task": "Which country has a larger population, Japan or India?",
"mean_reward": -1.5625,
"max_reward": 3.0,
"success_count": 2,
"total_attempts": 8
},
{
"scenario_index": 23,
"task": "What is the GDP of Japan?",
"mean_reward": -2.75,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 24,
"task": "What is the population of France divided by its area in km2?",
"mean_reward": -2.875,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 25,
"task": "What is France's population density in people per square mile?",
"mean_reward": -2.5,
"max_reward": 1.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 26,
"task": "Convert 26 kg to lbs.",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 27,
"task": "What is 660 times 87?",
"mean_reward": 1.0,
"max_reward": 4.0,
"success_count": 1,
"total_attempts": 8
},
{
"scenario_index": 28,
"task": "What is the boiling point of water?",
"mean_reward": -2.5,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 29,
"task": "What is the population of Germany divided by its area in km2?",
"mean_reward": -1.125,
"max_reward": 2.333333333333333,
"success_count": 1,
"total_attempts": 8
},
{
"scenario_index": 30,
"task": "Convert 40 kg to lbs.",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 31,
"task": "What is the speed of light?",
"mean_reward": -2.375,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 32,
"task": "How old was Guido van Rossum in 2024?",
"mean_reward": -2.875,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 33,
"task": "Which is hotter right now, Paris or Dubai?",
"mean_reward": -3.0,
"max_reward": -3.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 34,
"task": "Which is hotter right now, Tokyo or Dubai?",
"mean_reward": -2.875,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 35,
"task": "Which is hotter right now, London or Cairo?",
"mean_reward": -2.375,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 36,
"task": "What is the value of pi?",
"mean_reward": -2.125,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 37,
"task": "What is the population of Japan divided by its area in km2?",
"mean_reward": -1.6666666666666667,
"max_reward": 1.666666666666666,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 38,
"task": "What is the temperature in London in Fahrenheit?",
"mean_reward": -1.375,
"max_reward": 1.5,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 39,
"task": "What is 464 plus 30?",
"mean_reward": -1.75,
"max_reward": 2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 40,
"task": "Which country has a larger population, France or India?",
"mean_reward": -2.1875,
"max_reward": 2.5,
"success_count": 1,
"total_attempts": 8
},
{
"scenario_index": 41,
"task": "What is the distance from Earth to the Sun in km in miles?",
"mean_reward": -1.625,
"max_reward": 2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 42,
"task": "What is the tallest mountain?",
"mean_reward": -1.125,
"max_reward": 2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 43,
"task": "What is the temperature in London in Fahrenheit?",
"mean_reward": 0.6875,
"max_reward": 2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 44,
"task": "What is 496 minus 24?",
"mean_reward": 1.0,
"max_reward": 2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 45,
"task": "What's the weather like in Cairo?",
"mean_reward": -1.25,
"max_reward": 4.0,
"success_count": 2,
"total_attempts": 8
},
{
"scenario_index": 46,
"task": "What is the tallest mountain?",
"mean_reward": -2.5,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 47,
"task": "What is India's population density in people per square mile?",
"mean_reward": -1.7916666666666667,
"max_reward": 1.333333333333333,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 48,
"task": "What is the GDP of France?",
"mean_reward": -2.625,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
},
{
"scenario_index": 49,
"task": "How old was Guido van Rossum in 2024?",
"mean_reward": -2.75,
"max_reward": -2.0,
"success_count": 0,
"total_attempts": 8
}
]
}