423 lines
14 KiB
JSON
423 lines
14 KiB
JSON
{
|
|
"step": 897,
|
|
"metrics": {
|
|
"n_total": 32,
|
|
"n_tool_call": 22,
|
|
"n_chat_only": 10,
|
|
"emit_rate": 1.0,
|
|
"schema_validity": 0.9090909090909091,
|
|
"escalation_correct": 1.0,
|
|
"argument_accuracy": 0.393939393939394,
|
|
"by_category": {
|
|
"turn_on_light": {
|
|
"n": 3,
|
|
"ok": 3,
|
|
"pct": 1.0
|
|
},
|
|
"turn_off_light": {
|
|
"n": 3,
|
|
"ok": 3,
|
|
"pct": 1.0
|
|
},
|
|
"dim_light": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"set_color": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"get_state": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"get_state_sensor": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"search_devices": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"list_in_area": {
|
|
"n": 2,
|
|
"ok": 0,
|
|
"pct": 0.0
|
|
},
|
|
"history_sensor": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"complex_reasoning": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"chat_greeting": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"chat_ack": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"chat_out_of_scope": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"chat_ambiguous": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
},
|
|
"chat_capability": {
|
|
"n": 2,
|
|
"ok": 2,
|
|
"pct": 1.0
|
|
}
|
|
}
|
|
},
|
|
"results": [
|
|
{
|
|
"id": "tc_turn_on_light_001",
|
|
"category": "turn_on_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.6666666666666666,
|
|
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.living_room_ceiling_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 1.08
|
|
},
|
|
{
|
|
"id": "tc_turn_on_light_002",
|
|
"category": "turn_on_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.6666666666666666,
|
|
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.bedroom_ceiling_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 0.97
|
|
},
|
|
{
|
|
"id": "tc_turn_on_light_003",
|
|
"category": "turn_on_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.6666666666666666,
|
|
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.desk_desk_downlight<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 1.0
|
|
},
|
|
{
|
|
"id": "tc_turn_off_light_001",
|
|
"category": "turn_off_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 1.0,
|
|
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.kitchen_pendant<escape>,service:<escape>turn_off<escape>}<end_function_call>",
|
|
"latency_s": 0.94
|
|
},
|
|
{
|
|
"id": "tc_turn_off_light_002",
|
|
"category": "turn_off_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 1.0,
|
|
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.bedroom_ceiling_light<escape>,service:<escape>turn_off<escape>}<end_function_call>",
|
|
"latency_s": 0.97
|
|
},
|
|
{
|
|
"id": "tc_turn_off_light_003",
|
|
"category": "turn_off_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.6666666666666666,
|
|
"raw": "<start_function_call>call:call_service{domain:<escape>light<escape>,entity_id:<escape>light.kids_night_light<escape>,service:<escape>turn_off<escape>}<end_function_call>",
|
|
"latency_s": 0.97
|
|
},
|
|
{
|
|
"id": "tc_dim_light_001",
|
|
"category": "dim_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:call_service{data:{brightness_pct:30},domain:<escape>light<escape>,entity_id:<escape>light.bedroom_lamp<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 1.15
|
|
},
|
|
{
|
|
"id": "tc_dim_light_002",
|
|
"category": "dim_light",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:call_service{data:{brightness_pct:70},domain:<escape>light<escape>,entity_id:<escape>light.kitchen_island<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 1.15
|
|
},
|
|
{
|
|
"id": "tc_set_color_001",
|
|
"category": "set_color",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:call_service{data:{rgb_color:[255,0,0]},domain:<escape>light<escape>,entity_id:<escape>light.living_room_main_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 1.39
|
|
},
|
|
{
|
|
"id": "tc_set_color_002",
|
|
"category": "set_color",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "call_service",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:call_service{data:{rgb_color:[255,0,0]},domain:<escape>light<escape>,entity_id:<escape>light.bedroom_night_light<escape>,service:<escape>turn_on<escape>}<end_function_call>",
|
|
"latency_s": 1.34
|
|
},
|
|
{
|
|
"id": "tc_get_state_001",
|
|
"category": "get_state",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "get_state",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 1.0,
|
|
"raw": "<start_function_call>call:get_state{entity_id:<escape>switch.bedroom_fan<escape>}<end_function_call>",
|
|
"latency_s": 0.54
|
|
},
|
|
{
|
|
"id": "tc_get_state_002",
|
|
"category": "get_state",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "get_state",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:get_state{entity_id:<escape>climate.master_bedroom_mini_split<escape>}<end_function_call>",
|
|
"latency_s": 0.65
|
|
},
|
|
{
|
|
"id": "tc_get_state_sensor_001",
|
|
"category": "get_state_sensor",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "get_state",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:get_state{entity_id:<escape>sensor.living_room_temperature_sensor<escape>}<end_function_call>",
|
|
"latency_s": 0.65
|
|
},
|
|
{
|
|
"id": "tc_get_state_sensor_002",
|
|
"category": "get_state_sensor",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "get_state",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:get_state{entity_id:<escape>sensor.kitchen_humidity_sensor<escape>}<end_function_call>",
|
|
"latency_s": 0.6
|
|
},
|
|
{
|
|
"id": "tc_search_001",
|
|
"category": "search_devices",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "search_entities",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:search_entities{query:<escape>sensors<escape>}<end_function_call>",
|
|
"latency_s": 0.38
|
|
},
|
|
{
|
|
"id": "tc_search_002",
|
|
"category": "search_devices",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "search_entities",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 1.0,
|
|
"raw": "<start_function_call>call:search_entities{query:<escape>blinds<escape>}<end_function_call>",
|
|
"latency_s": 0.41
|
|
},
|
|
{
|
|
"id": "tc_list_001",
|
|
"category": "list_in_area",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "search_entities",
|
|
"schema_valid": false,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:search_entities{query:<escape>lights<escape>}<end_function_call>",
|
|
"latency_s": 0.38
|
|
},
|
|
{
|
|
"id": "tc_list_002",
|
|
"category": "list_in_area",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "search_entities",
|
|
"schema_valid": false,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:search_entities{query:<escape>climate<escape>}<end_function_call>",
|
|
"latency_s": 0.38
|
|
},
|
|
{
|
|
"id": "tc_history_001",
|
|
"category": "history_sensor",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "get_history",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:get_history{end_time:<escape>2026-05-02T00:00:00Z<escape>,entity_id:<escape>sensor.living_room_temperature_sensor<escape>,start_time:<escape>2026-05-01T00:00:00Z<escape>}<end_function_call>",
|
|
"latency_s": 2.06
|
|
},
|
|
{
|
|
"id": "tc_history_002",
|
|
"category": "history_sensor",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "get_history",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 0.0,
|
|
"raw": "<start_function_call>call:get_history{end_time:<escape>2026-05-02T00:00:00Z<escape>,entity_id:<escape>sensor.living_room_co2_sensor<escape>,start_time:<escape>2026-05-01T00:00:00Z<escape>}<end_function_call>",
|
|
"latency_s": 2.12
|
|
},
|
|
{
|
|
"id": "tc_complex_001",
|
|
"category": "complex_reasoning",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "escalate_to_sonnet",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 1.0,
|
|
"raw": "<start_function_call>call:escalate_to_sonnet{reason:<escape>complex_query<escape>}<end_function_call>",
|
|
"latency_s": 0.55
|
|
},
|
|
{
|
|
"id": "tc_complex_002",
|
|
"category": "complex_reasoning",
|
|
"type": "tool_call",
|
|
"emitted_call": true,
|
|
"called_name": "escalate_to_sonnet",
|
|
"schema_valid": true,
|
|
"argument_accuracy": 1.0,
|
|
"raw": "<start_function_call>call:escalate_to_sonnet{reason:<escape>complex_query<escape>}<end_function_call>",
|
|
"latency_s": 0.54
|
|
},
|
|
{
|
|
"id": "co_greet_001",
|
|
"category": "chat_greeting",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "Hi! What can I do for you?<end_of_turn>",
|
|
"latency_s": 0.28
|
|
},
|
|
{
|
|
"id": "co_greet_002",
|
|
"category": "chat_greeting",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "Hello! How can I help with your smart home today?<end_of_turn>",
|
|
"latency_s": 0.36
|
|
},
|
|
{
|
|
"id": "co_ack_001",
|
|
"category": "chat_ack",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "Sure thing.<end_of_turn>",
|
|
"latency_s": 0.12
|
|
},
|
|
{
|
|
"id": "co_ack_002",
|
|
"category": "chat_ack",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "Anytime.<end_of_turn>",
|
|
"latency_s": 0.11
|
|
},
|
|
{
|
|
"id": "co_oos_001",
|
|
"category": "chat_out_of_scope",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "I can't help with that, but I can turn on lights, set scenes, or check what's going on around the house.<end_of_turn>",
|
|
"latency_s": 0.8
|
|
},
|
|
{
|
|
"id": "co_oos_002",
|
|
"category": "chat_out_of_scope",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "I'm focused on your smart home \u2014 for that one you'll want a different assistant.<end_of_turn>",
|
|
"latency_s": 0.56
|
|
},
|
|
{
|
|
"id": "co_amb_001",
|
|
"category": "chat_ambiguous",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "Sure \u2014 could you tell me a bit more about what you'd like? For example, I can turn on lights, set a thermostat, or check a sensor.<end_of_turn>",
|
|
"latency_s": 0.93
|
|
},
|
|
{
|
|
"id": "co_amb_002",
|
|
"category": "chat_ambiguous",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "I can do plenty around the house \u2014 could you point me at a specific room or device?<end_of_turn>",
|
|
"latency_s": 0.54
|
|
},
|
|
{
|
|
"id": "co_cap_001",
|
|
"category": "chat_capability",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "I control your smart home \u2014 lights, switches, climate, covers, media players \u2014 and I can read sensors and look up history. Just tell me what you want.<end_of_turn>",
|
|
"latency_s": 0.95
|
|
},
|
|
{
|
|
"id": "co_cap_002",
|
|
"category": "chat_capability",
|
|
"type": "chat_only",
|
|
"emitted_call": false,
|
|
"escalation_correct": true,
|
|
"raw": "I'm a smart home assistant connected to Home Assistant. I can turn devices on and off, dim lights, set thermostats, open and close covers, and read sensor data. Try asking me to turn on a light.<end_of_turn>",
|
|
"latency_s": 1.23
|
|
}
|
|
]
|
|
} |