{ "step": 897, "metrics": { "n_total": 32, "n_tool_call": 22, "n_chat_only": 10, "emit_rate": 1.0, "schema_validity": 0.9090909090909091, "escalation_correct": 1.0, "argument_accuracy": 0.393939393939394, "by_category": { "turn_on_light": { "n": 3, "ok": 3, "pct": 1.0 }, "turn_off_light": { "n": 3, "ok": 3, "pct": 1.0 }, "dim_light": { "n": 2, "ok": 2, "pct": 1.0 }, "set_color": { "n": 2, "ok": 2, "pct": 1.0 }, "get_state": { "n": 2, "ok": 2, "pct": 1.0 }, "get_state_sensor": { "n": 2, "ok": 2, "pct": 1.0 }, "search_devices": { "n": 2, "ok": 2, "pct": 1.0 }, "list_in_area": { "n": 2, "ok": 0, "pct": 0.0 }, "history_sensor": { "n": 2, "ok": 2, "pct": 1.0 }, "complex_reasoning": { "n": 2, "ok": 2, "pct": 1.0 }, "chat_greeting": { "n": 2, "ok": 2, "pct": 1.0 }, "chat_ack": { "n": 2, "ok": 2, "pct": 1.0 }, "chat_out_of_scope": { "n": 2, "ok": 2, "pct": 1.0 }, "chat_ambiguous": { "n": 2, "ok": 2, "pct": 1.0 }, "chat_capability": { "n": 2, "ok": 2, "pct": 1.0 } } }, "results": [ { "id": "tc_turn_on_light_001", "category": "turn_on_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.6666666666666666, "raw": "call:call_service{domain:light,entity_id:light.living_room_ceiling_light,service:turn_on}", "latency_s": 1.08 }, { "id": "tc_turn_on_light_002", "category": "turn_on_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.6666666666666666, "raw": "call:call_service{domain:light,entity_id:light.bedroom_ceiling_light,service:turn_on}", "latency_s": 0.97 }, { "id": "tc_turn_on_light_003", "category": "turn_on_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.6666666666666666, "raw": "call:call_service{domain:light,entity_id:light.desk_desk_downlight,service:turn_on}", "latency_s": 1.0 }, { "id": "tc_turn_off_light_001", "category": "turn_off_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 1.0, "raw": "call:call_service{domain:light,entity_id:light.kitchen_pendant,service:turn_off}", "latency_s": 0.94 }, { "id": "tc_turn_off_light_002", "category": "turn_off_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 1.0, "raw": "call:call_service{domain:light,entity_id:light.bedroom_ceiling_light,service:turn_off}", "latency_s": 0.97 }, { "id": "tc_turn_off_light_003", "category": "turn_off_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.6666666666666666, "raw": "call:call_service{domain:light,entity_id:light.kids_night_light,service:turn_off}", "latency_s": 0.97 }, { "id": "tc_dim_light_001", "category": "dim_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:call_service{data:{brightness_pct:30},domain:light,entity_id:light.bedroom_lamp,service:turn_on}", "latency_s": 1.15 }, { "id": "tc_dim_light_002", "category": "dim_light", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:call_service{data:{brightness_pct:70},domain:light,entity_id:light.kitchen_island,service:turn_on}", "latency_s": 1.15 }, { "id": "tc_set_color_001", "category": "set_color", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:call_service{data:{rgb_color:[255,0,0]},domain:light,entity_id:light.living_room_main_light,service:turn_on}", "latency_s": 1.39 }, { "id": "tc_set_color_002", "category": "set_color", "type": "tool_call", "emitted_call": true, "called_name": "call_service", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:call_service{data:{rgb_color:[255,0,0]},domain:light,entity_id:light.bedroom_night_light,service:turn_on}", "latency_s": 1.34 }, { "id": "tc_get_state_001", "category": "get_state", "type": "tool_call", "emitted_call": true, "called_name": "get_state", "schema_valid": true, "argument_accuracy": 1.0, "raw": "call:get_state{entity_id:switch.bedroom_fan}", "latency_s": 0.54 }, { "id": "tc_get_state_002", "category": "get_state", "type": "tool_call", "emitted_call": true, "called_name": "get_state", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:get_state{entity_id:climate.master_bedroom_mini_split}", "latency_s": 0.65 }, { "id": "tc_get_state_sensor_001", "category": "get_state_sensor", "type": "tool_call", "emitted_call": true, "called_name": "get_state", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:get_state{entity_id:sensor.living_room_temperature_sensor}", "latency_s": 0.65 }, { "id": "tc_get_state_sensor_002", "category": "get_state_sensor", "type": "tool_call", "emitted_call": true, "called_name": "get_state", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:get_state{entity_id:sensor.kitchen_humidity_sensor}", "latency_s": 0.6 }, { "id": "tc_search_001", "category": "search_devices", "type": "tool_call", "emitted_call": true, "called_name": "search_entities", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:search_entities{query:sensors}", "latency_s": 0.38 }, { "id": "tc_search_002", "category": "search_devices", "type": "tool_call", "emitted_call": true, "called_name": "search_entities", "schema_valid": true, "argument_accuracy": 1.0, "raw": "call:search_entities{query:blinds}", "latency_s": 0.41 }, { "id": "tc_list_001", "category": "list_in_area", "type": "tool_call", "emitted_call": true, "called_name": "search_entities", "schema_valid": false, "argument_accuracy": 0.0, "raw": "call:search_entities{query:lights}", "latency_s": 0.38 }, { "id": "tc_list_002", "category": "list_in_area", "type": "tool_call", "emitted_call": true, "called_name": "search_entities", "schema_valid": false, "argument_accuracy": 0.0, "raw": "call:search_entities{query:climate}", "latency_s": 0.38 }, { "id": "tc_history_001", "category": "history_sensor", "type": "tool_call", "emitted_call": true, "called_name": "get_history", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:get_history{end_time:2026-05-02T00:00:00Z,entity_id:sensor.living_room_temperature_sensor,start_time:2026-05-01T00:00:00Z}", "latency_s": 2.06 }, { "id": "tc_history_002", "category": "history_sensor", "type": "tool_call", "emitted_call": true, "called_name": "get_history", "schema_valid": true, "argument_accuracy": 0.0, "raw": "call:get_history{end_time:2026-05-02T00:00:00Z,entity_id:sensor.living_room_co2_sensor,start_time:2026-05-01T00:00:00Z}", "latency_s": 2.12 }, { "id": "tc_complex_001", "category": "complex_reasoning", "type": "tool_call", "emitted_call": true, "called_name": "escalate_to_sonnet", "schema_valid": true, "argument_accuracy": 1.0, "raw": "call:escalate_to_sonnet{reason:complex_query}", "latency_s": 0.55 }, { "id": "tc_complex_002", "category": "complex_reasoning", "type": "tool_call", "emitted_call": true, "called_name": "escalate_to_sonnet", "schema_valid": true, "argument_accuracy": 1.0, "raw": "call:escalate_to_sonnet{reason:complex_query}", "latency_s": 0.54 }, { "id": "co_greet_001", "category": "chat_greeting", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "Hi! What can I do for you?", "latency_s": 0.28 }, { "id": "co_greet_002", "category": "chat_greeting", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "Hello! How can I help with your smart home today?", "latency_s": 0.36 }, { "id": "co_ack_001", "category": "chat_ack", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "Sure thing.", "latency_s": 0.12 }, { "id": "co_ack_002", "category": "chat_ack", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "Anytime.", "latency_s": 0.11 }, { "id": "co_oos_001", "category": "chat_out_of_scope", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "I can't help with that, but I can turn on lights, set scenes, or check what's going on around the house.", "latency_s": 0.8 }, { "id": "co_oos_002", "category": "chat_out_of_scope", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "I'm focused on your smart home \u2014 for that one you'll want a different assistant.", "latency_s": 0.56 }, { "id": "co_amb_001", "category": "chat_ambiguous", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "Sure \u2014 could you tell me a bit more about what you'd like? For example, I can turn on lights, set a thermostat, or check a sensor.", "latency_s": 0.93 }, { "id": "co_amb_002", "category": "chat_ambiguous", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "I can do plenty around the house \u2014 could you point me at a specific room or device?", "latency_s": 0.54 }, { "id": "co_cap_001", "category": "chat_capability", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "I control your smart home \u2014 lights, switches, climate, covers, media players \u2014 and I can read sensors and look up history. Just tell me what you want.", "latency_s": 0.95 }, { "id": "co_cap_002", "category": "chat_capability", "type": "chat_only", "emitted_call": false, "escalation_correct": true, "raw": "I'm a smart home assistant connected to Home Assistant. I can turn devices on and off, dim lights, set thermostats, open and close covers, and read sensor data. Try asking me to turn on a light.", "latency_s": 1.23 } ] }