Files
qwen3-tiny-v2-finetuned/sweep_results.json
ModelHub XC 2feb2eab28 初始化项目,由ModelHub XC社区提供模型
Model: g023/qwen3-tiny-v2-finetuned
Source: Original Platform
2026-05-06 13:43:43 +08:00

1272 lines
35 KiB
JSON

{
"model": "qwen3-best-p2-finetuned-grpo-q8",
"best": {
"params": {
"label": "balanced_01",
"temperature": 0.65,
"top_p": 0.9,
"top_k": 20,
"min_p": 0.0,
"repeat_penalty": 1.05,
"presence_penalty": 0.1,
"frequency_penalty": 0.1
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.5909,
"content_len": 124,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.9286,
"content_len": 173,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants convert light energy into chemical energy, storing it in the form of glucose. This process also releases oxygen as a byproduct.",
"think": false
}
]
},
"all_candidates": [
{
"params": {
"label": "qwen_think_default",
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"min_p": 0.0,
"repeat_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.5909,
"content_len": 124,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.9286,
"content_len": 173,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants convert light energy into chemical energy, storing it in the form of glucose. This process also releases oxygen as a byproduct.",
"think": false
}
]
},
{
"params": {
"label": "qwen_nonthink_default",
"temperature": 0.7,
"top_p": 0.8,
"top_k": 20,
"min_p": 0.0,
"repeat_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0
},
"pass_count": 7,
"total_tests": 8,
"pass_rate": 0.875,
"score_sum": 17.5,
"score_avg": 2.1875,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": false,
"score": 1.0,
"issues": [
"missing expected token 'Tokyo'"
],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Japan",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.6667,
"content_len": 132,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies as well. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 59,
"thinking_len": 0
},
"response_preview": "The three planets in question are Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.9286,
"content_len": 173,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants convert light energy into chemical energy, storing it in the form of glucose. This process also releases oxygen as a byproduct.",
"think": false
}
]
},
{
"params": {
"label": "balanced_01",
"temperature": 0.65,
"top_p": 0.9,
"top_k": 20,
"min_p": 0.0,
"repeat_penalty": 1.05,
"presence_penalty": 0.1,
"frequency_penalty": 0.1
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.5909,
"content_len": 124,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.9286,
"content_len": 173,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants convert light energy into chemical energy, storing it in the form of glucose. This process also releases oxygen as a byproduct.",
"think": false
}
]
},
{
"params": {
"label": "balanced_02",
"temperature": 0.65,
"top_p": 0.9,
"top_k": 30,
"min_p": 0.0,
"repeat_penalty": 1.1,
"presence_penalty": 0.2,
"frequency_penalty": 0.2
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.5909,
"content_len": 124,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 59,
"thinking_len": 0
},
"response_preview": "The three planets in question are Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.6818,
"content_len": 421,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. This process involves the absorption of sunlight, the splitting of water molecules, and",
"think": false
}
]
},
{
"params": {
"label": "anti_repeat_01",
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"min_p": 0.0,
"repeat_penalty": 1.1,
"presence_penalty": 0.4,
"frequency_penalty": 0.4
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.75,
"content_len": 108,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies as well.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.9286,
"content_len": 173,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants convert light energy into chemical energy, storing it in the form of glucose. This process also releases oxygen as a byproduct.",
"think": false
}
]
},
{
"params": {
"label": "anti_repeat_02",
"temperature": 0.7,
"top_p": 0.9,
"top_k": 40,
"min_p": 0.05,
"repeat_penalty": 1.15,
"presence_penalty": 0.5,
"frequency_penalty": 0.4
},
"pass_count": 7,
"total_tests": 8,
"pass_rate": 0.875,
"score_sum": 17.5,
"score_avg": 2.1875,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": false,
"score": 1.0,
"issues": [
"missing expected token 'Tokyo'"
],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Japan",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.5909,
"content_len": 124,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.9286,
"content_len": 173,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants convert light energy into chemical energy, storing it in the form of glucose. This process also releases oxygen as a byproduct.",
"think": false
}
]
},
{
"params": {
"label": "creative",
"temperature": 0.8,
"top_p": 0.92,
"top_k": 40,
"min_p": 0.05,
"repeat_penalty": 1.08,
"presence_penalty": 0.2,
"frequency_penalty": 0.2
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.75,
"content_len": 108,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies as well.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 0.9886,
"unique_ratio": 0.75,
"content_len": 351,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants, including algae and certain fungi, convert light energy into chemical energy through the absorption of sunlight. This process involves the use of carbon dioxide and water to",
"think": false
}
]
},
{
"params": {
"label": "low_temp_precise",
"temperature": 0.55,
"top_p": 0.95,
"top_k": 20,
"min_p": 0.0,
"repeat_penalty": 1.05,
"presence_penalty": 0.0,
"frequency_penalty": 0.0
},
"pass_count": 8,
"total_tests": 8,
"pass_rate": 1.0,
"score_sum": 18.5,
"score_avg": 2.3125,
"tests": [
{
"test": "factual_france",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "The capital of France is Paris.",
"think": true
},
{
"test": "factual_japan",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 5,
"thinking_len": 0
},
"response_preview": "Tokyo",
"think": false
},
{
"test": "math_multiply",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 0.9375,
"unique_ratio": 0.8889,
"content_len": 32,
"thinking_len": 0
},
"response_preview": "17 * 19 = 323 The answer is: 323",
"think": true
},
{
"test": "math_sqrt",
"passed": true,
"score": 2.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 2,
"thinking_len": 0
},
"response_preview": "12",
"think": false
},
{
"test": "reasoning",
"passed": true,
"score": 2.75,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.5909,
"content_len": 124,
"thinking_len": 0
},
"response_preview": "Yes, because if all bloops are razzies and all razzies are lazzies, then all bloops must be lazzies. All bloops are lazzies.",
"think": true
},
{
"test": "coding",
"passed": true,
"score": 2.5,
"issues": [],
"metrics": {
"readable_ratio": 0.9677,
"unique_ratio": 1.0,
"content_len": 31,
"thinking_len": 0
},
"response_preview": "def add(a, b): return a + b",
"think": true
},
{
"test": "short_planets",
"passed": true,
"score": 3.0,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 1.0,
"content_len": 61,
"thinking_len": 0
},
"response_preview": "Three planets in a single sentence: Earth, Mars, and Jupiter.",
"think": false
},
{
"test": "coherent_explanation",
"passed": true,
"score": 2.25,
"issues": [],
"metrics": {
"readable_ratio": 1.0,
"unique_ratio": 0.7273,
"content_len": 414,
"thinking_len": 0
},
"response_preview": "Photosynthesis is the process by which plants, including trees and shrubs, convert light energy into chemical energy stored in glucose. This process involves the absorption of carbon dioxide and the release of oxygen gas",
"think": false
}
]
}
],
"tests": [
{
"name": "factual_france",
"prompt": "What is the capital of France? Answer in one short sentence.",
"think": true,
"max_tokens": 120,
"check_contains": [
"Paris"
]
},
{
"name": "factual_japan",
"prompt": "Capital of Japan? Answer with one word.",
"think": false,
"max_tokens": 64,
"check_contains": [
"Tokyo"
]
},
{
"name": "math_multiply",
"prompt": "What is 17 * 19? Give only the number.",
"think": true,
"max_tokens": 64,
"check_contains": [
"323"
]
},
{
"name": "math_sqrt",
"prompt": "Square root of 144? Give only the number.",
"think": false,
"max_tokens": 64,
"check_contains": [
"12"
]
},
{
"name": "reasoning",
"prompt": "If all bloops are razzies and all razzies are lazzies, are all bloops lazzies? Answer yes or no then one sentence.",
"think": true,
"max_tokens": 180,
"check_contains": [
"yes"
],
"check_min_words": 6
},
{
"name": "coding",
"prompt": "Write a Python function add(a, b) that returns a + b. Output only code.",
"think": true,
"max_tokens": 200,
"check_any_contains": [
"def add",
"return a + b",
"return a+b"
],
"check_no_garbage": true
},
{
"name": "short_planets",
"prompt": "Name three planets in a single sentence.",
"think": false,
"max_tokens": 120,
"check_any_contains": [
"Mercury",
"Venus",
"Earth",
"Mars",
"Jupiter",
"Saturn"
],
"check_min_words": 4,
"check_no_garbage": true
},
{
"name": "coherent_explanation",
"prompt": "Explain photosynthesis in 2 short sentences.",
"think": false,
"max_tokens": 220,
"check_min_words": 12,
"check_no_garbage": true
}
]
}