{ "model": "semantic-turn-taking/production-v1/ckpt-7000", "base_model": "Qwen/Qwen2.5-0.5B-Instruct", "checkpoint_step": 7000, "benchmarks": { "ten": { "dataset": "TEN Turn Detection", "examples": 528, "binary_accuracy": 0.869, "binary_f1_macro": 0.868, "finished_recall": 0.88, "unfinished_recall": 0.96, "wait_recall": 0.66 }, "swda_v2": { "dataset": "SwDA (improved heuristic v2)", "examples": 4087, "four_class_accuracy": 0.6337, "four_class_f1_macro": 0.4364, "binary_accuracy": 0.7149, "binary_f1_macro": 0.7122, "per_class_accuracy": { "start_speaking": 0.618, "continue_listening": 0.479, "start_listening": 0.0, "continue_speaking": 0.847 }, "note": "v2 fixes false start_listening from agent backchannels" }, "internal_test_en": { "dataset": "Hand-crafted Internal Test (English)", "examples": 48, "four_class_accuracy": 0.7708, "four_class_f1_macro": 0.7421, "binary_accuracy": 0.9583, "binary_f1_macro": 0.9583, "per_class_accuracy": { "start_speaking": 0.833, "continue_listening": 1.0, "start_listening": 0.25, "continue_speaking": 1.0 } }, "internal_test_es": { "dataset": "Hand-crafted Internal Test (Spanish)", "examples": 48, "four_class_accuracy": 0.6667, "four_class_f1_macro": 0.631, "binary_accuracy": 0.8542, "binary_f1_macro": 0.8536, "per_class_accuracy": { "start_speaking": 0.667, "continue_listening": 1.0, "start_listening": 0.167, "continue_speaking": 0.833 }, "note": "Model was never trained on Spanish — cross-lingual transfer from Qwen2.5" }, "synthetic_eval": { "dataset": "Synthetic validation set", "eval_accuracy": 0.900, "eval_f1_macro": 0.767 } }, "livekit_comparison": { "model": "livekit/turn-detector@v0.4.1-intl", "ten_binary_accuracy": 0.667, "ten_binary_f1_macro": 0.592, "swda_v2_binary_accuracy": 0.367, "swda_v2_binary_f1_macro": 0.289, "internal_en_binary_accuracy": 0.708, "internal_en_binary_f1_macro": 0.681, "internal_es_binary_accuracy": 0.625, "internal_es_binary_f1_macro": 0.578 } }