初始化项目，由ModelHub XC社区提供模型

Model: anyreach-ai/semantic-turn-taking Source: Original Platform
2026-04-29 12:06:38 +08:00
commit a288e309b5
26 changed files with 304062 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
+onnx/tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,186 @@
+---
+license: apache-2.0
+language:
+- en
+library_name: transformers
+tags:
+- turn-taking
+- voice-ai
+- conversational-ai
+- dialogue
+- qwen2
+- onnx
+base_model: Qwen/Qwen2.5-0.5B-Instruct
+pipeline_tag: text-generation
+---
+
+# Semantic Turn-Taking Model
+
+A fine-tuned [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model that predicts turn-taking actions in conversations. Given a conversation context, the model predicts what action a voice AI agent should take next.
+
+Unlike acoustic-based approaches (VAD, silence detection), this model uses the **semantic content** of the conversation to make turn-taking decisions.
+
+## Action Classes
+
+The model predicts one of 4 actions:
+
+| Action | Token | Description |
+|--------|-------|-------------|
+| `start_speaking` | `<\|start_speaking\|>` | User finished their turn, agent should respond |
+| `continue_listening` | `<\|continue_listening\|>` | User is mid-utterance, keep listening |
+| `start_listening` | `<\|start_listening\|>` | User interrupted the agent, stop talking |
+| `continue_speaking` | `<\|continue_speaking\|>` | User gave a backchannel, agent keeps talking |
+
+## Usage
+
+### PyTorch
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+model_name = "anyreach-ai/semantic-turn-taking"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).cuda()
+model.eval()
+
+# Format conversation as ChatML with <|predict|> trigger
+conversation = """<|im_start|>user
+I need help with my bill<|im_end|>
+<|im_start|>assistant
+Sure I can help with that what seems to be the issue<|im_end|>
+<|im_start|>user
+I was charged twice for the same order<|im_end|>
+<|predict|>"""
+
+inputs = tokenizer(conversation, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    logits = model(**inputs).logits[:, -1, :]
+
+# Get action probabilities
+action_tokens = {
+    "start_speaking": tokenizer.convert_tokens_to_ids("<|start_speaking|>"),
+    "continue_listening": tokenizer.convert_tokens_to_ids("<|continue_listening|>"),
+    "start_listening": tokenizer.convert_tokens_to_ids("<|start_listening|>"),
+    "continue_speaking": tokenizer.convert_tokens_to_ids("<|continue_speaking|>"),
+}
+
+action_logits = {name: logits[0, tid].item() for name, tid in action_tokens.items()}
+probs = torch.softmax(torch.tensor(list(action_logits.values())), dim=0)
+for (name, _), p in zip(action_logits.items(), probs):
+    print(f"  {name}: {p:.4f}")
+# → start_speaking: 0.95+ (user is done, agent should respond)
+```
+
+### ONNX (CPU)
+
+```python
+import numpy as np
+import onnxruntime as ort
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("anyreach-ai/semantic-turn-taking")
+
+sess_options = ort.SessionOptions()
+sess_options.intra_op_num_threads = 4
+session = ort.InferenceSession(
+    "onnx/model_q8.onnx",  # download from this repo
+    providers=["CPUExecutionProvider"],
+    sess_options=sess_options,
+)
+
+# Tokenize
+conversation = "..."  # ChatML format as above
+inputs = tokenizer(conversation, return_tensors="np")
+input_ids = inputs["input_ids"].astype("int64")
+seq_len = input_ids.shape[1]
+
+# Build feed (empty KV cache for single forward pass)
+feed = {
+    "input_ids": input_ids,
+    "attention_mask": inputs["attention_mask"].astype("int64"),
+    "position_ids": np.arange(seq_len, dtype="int64").reshape(1, -1),
+}
+for i in range(24):
+    feed[f"past_key_values.{i}.key"] = np.zeros((1, 2, 0, 64), dtype="float32")
+    feed[f"past_key_values.{i}.value"] = np.zeros((1, 2, 0, 64), dtype="float32")
+
+# Run inference
+logits = session.run(None, feed)[0]  # [1, seq_len, vocab_size]
+last_logits = logits[0, -1, :]
+
+# Extract action probabilities
+ACTION_IDS = [151666, 151665, 151667, 151668]  # start_speaking, continue_listening, start_listening, continue_speaking
+action_logits = last_logits[ACTION_IDS]
+probs = np.exp(action_logits) / np.sum(np.exp(action_logits))
+```
+
+## Benchmark Results
+
+Evaluated on [anyreach-ai/semantic-turn-taking-benchmark](https://huggingface.co/datasets/anyreach-ai/semantic-turn-taking-benchmark).
+
+### Binary (EOU vs Not-EOU)
+
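+Evaluated only on examples labeled `start_speaking` (SS) or `continue_listening` (CL). Predictions are mapped to binary labels as follows: SS / `continue_speaking` (CS) → EOU (end of utterance); CL / `start_listening` (SLi) → Not-EOU.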
+
+| Subset | N | Accuracy | F1 (macro) |
+|--------|--:|--:|--:|
+| TEN | 428 | 91.82% | 91.80% |
+| SwDA | 2,688 | 65.96% | 51.46% |
+| Synthetic | 36 | 86.11% | 85.57% |
+
+### Multi-class
+
+| Subset | N | Classes | Accuracy | F1 (macro) |
+|--------|--:|--------:|--:|--:|
+| TEN | 428 | 2 | 91.82% | 91.80% |
+| SwDA | 3,523 | 3 | 68.98% | 46.92% |
+| Synthetic | 60 | 4 | 76.67% | 72.07% |
+
+## Latency
+
+Measured on single examples, CPU (4 threads) and GPU (NVIDIA T4).
+
+| Format | Size | Short (8 tok) | Medium (28 tok) | Long (54 tok) |
+|--------|-----:|--:|--:|--:|
+| PyTorch GPU (fp16) | 942 MB | 26 ms | 30 ms | 34 ms |
+| PyTorch CPU (fp32) | 942 MB | 165 ms | 247 ms | 289 ms |
+| ONNX CPU (q8) | 473 MB | 128 ms | 151 ms | 191 ms |
+
+## Model Details
+
+- **Base model**: [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) (494M parameters)
+- **Training**: Full fine-tuning on ~154K synthetic conversation examples
+- **Input format**: Qwen ChatML with `<|predict|>` trigger token
+- **Max sequence length**: 1024 tokens (left truncation)
+- **Special tokens**: 5 added (`<|predict|>`, 4 action tokens)
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `model.safetensors` | PyTorch model weights (bf16) |
+| `onnx/model_q8.onnx` | ONNX INT8 quantized (dynamic quantization) |
+| `config.json` | Model configuration |
+| `tokenizer.json` | Tokenizer |
+
+## Citation
+
+```bibtex
+@misc{semantic-turn-taking-2026,
+  title={Semantic Turn-Taking Model},
+  author={Shangeth Rajaa},
+  year={2026},
+  publisher={Hugging Face},
+  url={https://huggingface.co/anyreach-ai/semantic-turn-taking}
+}
+```
+
+## Authors
+
+- [**Shangeth Rajaa**](https://github.com/shangeth)
+
+## License
+
+Apache 2.0
--- a/added_tokens.json
+++ b/added_tokens.json
@@ -0,0 +1,29 @@
+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|continue_listening|>": 151665,
+  "<|continue_speaking|>": 151668,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|predict|>": 151669,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|start_listening|>": 151667,
+  "<|start_speaking|>": 151666,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,54 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,55 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151670
+}
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,14 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.6"
+}
--- a/merges.txt
+++ b/merges.txt
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bb1d46d3bfb9b60f62ce3e3ae0f444df4cfa12ed9e709ba713d87719126c21e
+size 987621152
--- a/onnx/added_tokens.json
+++ b/onnx/added_tokens.json
@@ -0,0 +1,29 @@
+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|continue_listening|>": 151665,
+  "<|continue_speaking|>": 151668,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|predict|>": 151669,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|start_listening|>": 151667,
+  "<|start_speaking|>": 151666,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
--- a/onnx/chat_template.jinja
+++ b/onnx/chat_template.jinja
@@ -0,0 +1,54 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
--- a/onnx/config.json
+++ b/onnx/config.json
@@ -0,0 +1,55 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151670
+}
--- a/onnx/generation_config.json
+++ b/onnx/generation_config.json
@@ -0,0 +1,14 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.6"
+}
--- a/onnx/merges.txt
+++ b/onnx/merges.txt
--- a/onnx/model.onnx
+++ b/onnx/model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46a669b8d19c01c7ce6147394c421f2fa2904d0ab948b7275ed10b19148170c2
+size 1125454
--- a/onnx/model.onnx_data
+++ b/onnx/model.onnx_data
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25418b69c1666c540a7117db39689bb7949aaebb1f323e39b5fbddb4f4d0e6fb
+size 1975174272
--- a/onnx/model_q8.onnx
+++ b/onnx/model_q8.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1974c790d2aa557642b0906eafbb593af8ca48fc045e5b9d187d7fa95faf88fe
+size 495715962
--- a/onnx/special_tokens_map.json
+++ b/onnx/special_tokens_map.json
@@ -0,0 +1,53 @@
+{
+  "additional_special_tokens": [
+    {
+      "content": "<|continue_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|start_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|start_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|continue_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|predict|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/onnx/tokenizer.json
+++ b/onnx/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb8544e622fff1dd549fddc8273bd5fd327f9642604868434f594f19ac2a49cc
+size 11422970
--- a/onnx/tokenizer_config.json
+++ b/onnx/tokenizer_config.json
@@ -0,0 +1,239 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|continue_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|start_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|start_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|continue_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|predict|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|continue_listening|>",
+    "<|start_speaking|>",
+    "<|start_listening|>",
+    "<|continue_speaking|>",
+    "<|predict|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
--- a/onnx/vocab.json
+++ b/onnx/vocab.json
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,53 @@
+{
+  "additional_special_tokens": [
+    {
+      "content": "<|continue_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|start_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|start_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|continue_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|predict|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/test_metrics.json
+++ b/test_metrics.json
@@ -0,0 +1,76 @@
+{
+  "model": "semantic-turn-taking/production-v1/ckpt-7000",
+  "base_model": "Qwen/Qwen2.5-0.5B-Instruct",
+  "checkpoint_step": 7000,
+  "benchmarks": {
+    "ten": {
+      "dataset": "TEN Turn Detection",
+      "examples": 528,
+      "binary_accuracy": 0.869,
+      "binary_f1_macro": 0.868,
+      "finished_recall": 0.88,
+      "unfinished_recall": 0.96,
+      "wait_recall": 0.66
+    },
+    "swda_v2": {
+      "dataset": "SwDA (improved heuristic v2)",
+      "examples": 4087,
+      "four_class_accuracy": 0.6337,
+      "four_class_f1_macro": 0.4364,
+      "binary_accuracy": 0.7149,
+      "binary_f1_macro": 0.7122,
+      "per_class_accuracy": {
+        "start_speaking": 0.618,
+        "continue_listening": 0.479,
+        "start_listening": 0.0,
+        "continue_speaking": 0.847
+      },
+      "note": "v2 fixes false start_listening from agent backchannels"
+    },
+    "internal_test_en": {
+      "dataset": "Hand-crafted Internal Test (English)",
+      "examples": 48,
+      "four_class_accuracy": 0.7708,
+      "four_class_f1_macro": 0.7421,
+      "binary_accuracy": 0.9583,
+      "binary_f1_macro": 0.9583,
+      "per_class_accuracy": {
+        "start_speaking": 0.833,
+        "continue_listening": 1.0,
+        "start_listening": 0.25,
+        "continue_speaking": 1.0
+      }
+    },
+    "internal_test_es": {
+      "dataset": "Hand-crafted Internal Test (Spanish)",
+      "examples": 48,
+      "four_class_accuracy": 0.6667,
+      "four_class_f1_macro": 0.631,
+      "binary_accuracy": 0.8542,
+      "binary_f1_macro": 0.8536,
+      "per_class_accuracy": {
+        "start_speaking": 0.667,
+        "continue_listening": 1.0,
+        "start_listening": 0.167,
+        "continue_speaking": 0.833
+      },
+      "note": "Model was never trained on Spanish — cross-lingual transfer from Qwen2.5"
+    },
+    "synthetic_eval": {
+      "dataset": "Synthetic validation set",
+      "eval_accuracy": 0.900,
+      "eval_f1_macro": 0.767
+    }
+  },
+  "livekit_comparison": {
+    "model": "livekit/turn-detector@v0.4.1-intl",
+    "ten_binary_accuracy": 0.667,
+    "ten_binary_f1_macro": 0.592,
+    "swda_v2_binary_accuracy": 0.367,
+    "swda_v2_binary_f1_macro": 0.289,
+    "internal_en_binary_accuracy": 0.708,
+    "internal_en_binary_f1_macro": 0.681,
+    "internal_es_binary_accuracy": 0.625,
+    "internal_es_binary_f1_macro": 0.578
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb8544e622fff1dd549fddc8273bd5fd327f9642604868434f594f19ac2a49cc
+size 11422970
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,239 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|continue_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|start_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|start_listening|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|continue_speaking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|predict|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|continue_listening|>",
+    "<|start_speaking|>",
+    "<|start_listening|>",
+    "<|continue_speaking|>",
+    "<|predict|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
--- a/training_config.yaml
+++ b/training_config.yaml
@@ -0,0 +1,78 @@
+# Training configuration for checkpoint-7000 (synth-only-v2-fixed-run2-removedlabelaug)
+# This is the config used to train the published model.
+
+model:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  max_length: 1024
+  predict_token: "<|predict|>"
+
+action_tokens:
+  - "<|continue_listening|>"
+  - "<|start_speaking|>"
+  - "<|start_listening|>"
+  - "<|continue_speaking|>"
+
+prediction_tokens:
+  - "<|continue_listening|>"
+  - "<|start_speaking|>"
+  - "<|start_listening|>"
+  - "<|continue_speaking|>"
+
+data:
+  train:
+    - "data/context_action/synthetic/train.csv"                              # ~140K
+    - "data/context_action/synthetic/supplementary_start_listening.csv"       # ~14K
+  val: "data/context_action/synthetic/val.csv"
+  test: "data/context_action/synthetic/test.csv"
+
+training:
+  batch_size: 8
+  gradient_accumulation_steps: 4    # effective batch size = 32
+  learning_rate: 5.0e-5
+  num_epochs: 50
+  warmup_steps: 100
+  weight_decay: 0.01
+  max_grad_norm: 1.0
+  bf16: true
+  lr_scheduler: "cosine"
+
+  loss:
+    ntp_weight: 0.1                 # loss = 90% action cross-entropy (CE) + 10% next-token-prediction (NTP) auxiliary
+
+  early_stopping:
+    enabled: true
+    patience: 10
+    metric: "eval_f1_macro"
+
+augmentation:
+  enabled: true
+  context_truncation:
+    enabled: true
+    probability: 0.2
+    min_turns: 1
+  asr_styles:
+    - "pure"
+    - "punctuated"
+    - "mixed"
+  filler_injection:
+    enabled: true
+    probability: 0.2
+    max_fillers: 3
+  disfluency:
+    enabled: true
+    probability: 0.2
+  # Label-changing augmentations below are DISABLED (they broke balanced sampling)
+  streaming_crop:
+    enabled: false
+  backchannel_inject:
+    enabled: false
+
+output:
+  dir: "models_longer_training"
+  save_steps: 1000
+  eval_steps: 1000
+  logging_steps: 100
+  save_total_limit: 5
+
+evaluation:
+  batch_size: 8
--- a/vocab.json
+++ b/vocab.json