Files
nives-fg-270m-v1/checkpoint-897/structured_eval.json
ModelHub XC 021c63722e 初始化项目,由ModelHub XC社区提供模型
Model: hoornet/nives-fg-270m-v1
Source: Original Platform
2026-05-22 17:27:16 +08:00

423 lines
14 KiB
JSON

{
"step": 897,
"metrics": {
"n_total": 32,
"n_tool_call": 22,
"n_chat_only": 10,
"emit_rate": 1.0,
"schema_validity": 0.9090909090909091,
"escalation_correct": 1.0,
"argument_accuracy": 0.393939393939394,
"by_category": {
"turn_on_light": {
"n": 3,
"ok": 3,
"pct": 1.0
},
"turn_off_light": {
"n": 3,
"ok": 3,
"pct": 1.0
},
"dim_light": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"set_color": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"get_state": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"get_state_sensor": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"search_devices": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"list_in_area": {
"n": 2,
"ok": 0,
"pct": 0.0
},
"history_sensor": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"complex_reasoning": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"chat_greeting": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"chat_ack": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"chat_out_of_scope": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"chat_ambiguous": {
"n": 2,
"ok": 2,
"pct": 1.0
},
"chat_capability": {
"n": 2,
"ok": 2,
"pct": 1.0
}
}
},
"results": [
{
"id": "tc_turn_on_light_001",
"category": "turn_on_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.6666666666666666,
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.living_room_ceiling_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 1.08
},
{
"id": "tc_turn_on_light_002",
"category": "turn_on_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.6666666666666666,
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.bedroom_ceiling_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 0.97
},
{
"id": "tc_turn_on_light_003",
"category": "turn_on_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.6666666666666666,
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.desk_desk_downlight<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 1.0
},
{
"id": "tc_turn_off_light_001",
"category": "turn_off_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 1.0,
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.kitchen_pendant<escape>,service:<escape>turn_off<escape>}<end_function_call>",
"latency_s": 0.94
},
{
"id": "tc_turn_off_light_002",
"category": "turn_off_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 1.0,
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.bedroom_ceiling_light<escape>,service:<escape>turn_off<escape>}<end_function_call>",
"latency_s": 0.97
},
{
"id": "tc_turn_off_light_003",
"category": "turn_off_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.6666666666666666,
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.kids_night_light<escape>,service:<escape>turn_off<escape>}<end_function_call>",
"latency_s": 0.97
},
{
"id": "tc_dim_light_001",
"category": "dim_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:call_service{data:{brightness_pct:30},domain:<escape>light<escape>,entity_id:<escape>light.bedroom_lamp<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 1.15
},
{
"id": "tc_dim_light_002",
"category": "dim_light",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:call_service{data:{brightness_pct:70},domain:<escape>light<escape>,entity_id:<escape>light.kitchen_island<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 1.15
},
{
"id": "tc_set_color_001",
"category": "set_color",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:call_service{data:{rgb_color:[255,0,0]},domain:<escape>light<escape>,entity_id:<escape>light.living_room_main_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 1.39
},
{
"id": "tc_set_color_002",
"category": "set_color",
"type": "tool_call",
"emitted_call": true,
"called_name": "call_service",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:call_service{data:{rgb_color:[255,0,0]},domain:<escape>light<escape>,entity_id:<escape>light.bedroom_night_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
"latency_s": 1.34
},
{
"id": "tc_get_state_001",
"category": "get_state",
"type": "tool_call",
"emitted_call": true,
"called_name": "get_state",
"schema_valid": true,
"argument_accuracy": 1.0,
"raw": "<start_function_call>call:get_state{entity_id:<escape>switch.bedroom_fan<escape>}<end_function_call>",
"latency_s": 0.54
},
{
"id": "tc_get_state_002",
"category": "get_state",
"type": "tool_call",
"emitted_call": true,
"called_name": "get_state",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:get_state{entity_id:<escape>climate.master_bedroom_mini_split<escape>}<end_function_call>",
"latency_s": 0.65
},
{
"id": "tc_get_state_sensor_001",
"category": "get_state_sensor",
"type": "tool_call",
"emitted_call": true,
"called_name": "get_state",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:get_state{entity_id:<escape>sensor.living_room_temperature_sensor<escape>}<end_function_call>",
"latency_s": 0.65
},
{
"id": "tc_get_state_sensor_002",
"category": "get_state_sensor",
"type": "tool_call",
"emitted_call": true,
"called_name": "get_state",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:get_state{entity_id:<escape>sensor.kitchen_humidity_sensor<escape>}<end_function_call>",
"latency_s": 0.6
},
{
"id": "tc_search_001",
"category": "search_devices",
"type": "tool_call",
"emitted_call": true,
"called_name": "search_entities",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:search_entities{query:<escape>sensors<escape>}<end_function_call>",
"latency_s": 0.38
},
{
"id": "tc_search_002",
"category": "search_devices",
"type": "tool_call",
"emitted_call": true,
"called_name": "search_entities",
"schema_valid": true,
"argument_accuracy": 1.0,
"raw": "<start_function_call>call:search_entities{query:<escape>blinds<escape>}<end_function_call>",
"latency_s": 0.41
},
{
"id": "tc_list_001",
"category": "list_in_area",
"type": "tool_call",
"emitted_call": true,
"called_name": "search_entities",
"schema_valid": false,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:search_entities{query:<escape>lights<escape>}<end_function_call>",
"latency_s": 0.38
},
{
"id": "tc_list_002",
"category": "list_in_area",
"type": "tool_call",
"emitted_call": true,
"called_name": "search_entities",
"schema_valid": false,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:search_entities{query:<escape>climate<escape>}<end_function_call>",
"latency_s": 0.38
},
{
"id": "tc_history_001",
"category": "history_sensor",
"type": "tool_call",
"emitted_call": true,
"called_name": "get_history",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:get_history{end_time:<escape>2026-05-02T00:00:00Z<escape>,entity_id:<escape>sensor.living_room_temperature_sensor<escape>,start_time:<escape>2026-05-01T00:00:00Z<escape>}<end_function_call>",
"latency_s": 2.06
},
{
"id": "tc_history_002",
"category": "history_sensor",
"type": "tool_call",
"emitted_call": true,
"called_name": "get_history",
"schema_valid": true,
"argument_accuracy": 0.0,
"raw": "<start_function_call>call:get_history{end_time:<escape>2026-05-02T00:00:00Z<escape>,entity_id:<escape>sensor.living_room_co2_sensor<escape>,start_time:<escape>2026-05-01T00:00:00Z<escape>}<end_function_call>",
"latency_s": 2.12
},
{
"id": "tc_complex_001",
"category": "complex_reasoning",
"type": "tool_call",
"emitted_call": true,
"called_name": "escalate_to_sonnet",
"schema_valid": true,
"argument_accuracy": 1.0,
"raw": "<start_function_call>call:escalate_to_sonnet{reason:<escape>complex_query<escape>}<end_function_call>",
"latency_s": 0.55
},
{
"id": "tc_complex_002",
"category": "complex_reasoning",
"type": "tool_call",
"emitted_call": true,
"called_name": "escalate_to_sonnet",
"schema_valid": true,
"argument_accuracy": 1.0,
"raw": "<start_function_call>call:escalate_to_sonnet{reason:<escape>complex_query<escape>}<end_function_call>",
"latency_s": 0.54
},
{
"id": "co_greet_001",
"category": "chat_greeting",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "Hi! What can I do for you?<end_of_turn>",
"latency_s": 0.28
},
{
"id": "co_greet_002",
"category": "chat_greeting",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "Hello! How can I help with your smart home today?<end_of_turn>",
"latency_s": 0.36
},
{
"id": "co_ack_001",
"category": "chat_ack",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "Sure thing.<end_of_turn>",
"latency_s": 0.12
},
{
"id": "co_ack_002",
"category": "chat_ack",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "Anytime.<end_of_turn>",
"latency_s": 0.11
},
{
"id": "co_oos_001",
"category": "chat_out_of_scope",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "I can't help with that, but I can turn on lights, set scenes, or check what's going on around the house.<end_of_turn>",
"latency_s": 0.8
},
{
"id": "co_oos_002",
"category": "chat_out_of_scope",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "I'm focused on your smart home \u2014 for that one you'll want a different assistant.<end_of_turn>",
"latency_s": 0.56
},
{
"id": "co_amb_001",
"category": "chat_ambiguous",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "Sure \u2014 could you tell me a bit more about what you'd like? For example, I can turn on lights, set a thermostat, or check a sensor.<end_of_turn>",
"latency_s": 0.93
},
{
"id": "co_amb_002",
"category": "chat_ambiguous",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "I can do plenty around the house \u2014 could you point me at a specific room or device?<end_of_turn>",
"latency_s": 0.54
},
{
"id": "co_cap_001",
"category": "chat_capability",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "I control your smart home \u2014 lights, switches, climate, covers, media players \u2014 and I can read sensors and look up history. Just tell me what you want.<end_of_turn>",
"latency_s": 0.95
},
{
"id": "co_cap_002",
"category": "chat_capability",
"type": "chat_only",
"emitted_call": false,
"escalation_correct": true,
"raw": "I'm a smart home assistant connected to Home Assistant. I can turn devices on and off, dim lights, set thermostats, open and close covers, and read sensor data. Try asking me to turn on a light.<end_of_turn>",
"latency_s": 1.23
}
]
}