Fix displaying streaming speech recognition results for Python. (#2196)
This commit is contained in:
@@ -14,7 +14,7 @@ project(sherpa-onnx)
|
||||
# Remember to update
|
||||
# ./CHANGELOG.md
|
||||
# ./new-release.sh
|
||||
set(SHERPA_ONNX_VERSION "1.11.5")
|
||||
set(SHERPA_ONNX_VERSION "1.11.6")
|
||||
|
||||
# Disable warning about
|
||||
#
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
# to download pre-trained models
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import sherpa_onnx
|
||||
|
||||
|
||||
@@ -202,8 +202,8 @@ def main():
|
||||
|
||||
stream = recognizer.create_stream()
|
||||
|
||||
last_result = ""
|
||||
segment_id = 0
|
||||
display = sherpa_onnx.Display()
|
||||
|
||||
while True:
|
||||
samples = alsa.read(samples_per_read) # a blocking read
|
||||
stream.accept_waveform(sample_rate, samples)
|
||||
@@ -214,13 +214,14 @@ def main():
|
||||
|
||||
result = recognizer.get_result(stream)
|
||||
|
||||
if result and (last_result != result):
|
||||
last_result = result
|
||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
||||
display.update_text(result)
|
||||
display.display()
|
||||
|
||||
if is_endpoint:
|
||||
if result:
|
||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
||||
segment_id += 1
|
||||
display.finalize_current_sentence()
|
||||
display.display()
|
||||
|
||||
recognizer.reset(stream)
|
||||
|
||||
|
||||
|
||||
@@ -192,8 +192,8 @@ def main():
|
||||
|
||||
stream = recognizer.create_stream()
|
||||
|
||||
last_result = ""
|
||||
segment_id = 0
|
||||
display = sherpa_onnx.Display()
|
||||
|
||||
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
||||
while True:
|
||||
samples, _ = s.read(samples_per_read) # a blocking read
|
||||
@@ -206,13 +206,14 @@ def main():
|
||||
|
||||
result = recognizer.get_result(stream)
|
||||
|
||||
if result and (last_result != result):
|
||||
last_result = result
|
||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
||||
display.update_text(result)
|
||||
display.display()
|
||||
|
||||
if is_endpoint:
|
||||
if result:
|
||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
||||
segment_id += 1
|
||||
display.finalize_current_sentence()
|
||||
display.display()
|
||||
|
||||
recognizer.reset(stream)
|
||||
|
||||
|
||||
|
||||
@@ -192,8 +192,7 @@ def main():
|
||||
|
||||
stream = recognizer.create_stream()
|
||||
|
||||
last_result = ""
|
||||
segment_id = 0
|
||||
display = sherpa_onnx.Display()
|
||||
|
||||
print("Started!")
|
||||
while True:
|
||||
@@ -213,13 +212,14 @@ def main():
|
||||
|
||||
result = recognizer.get_result(stream)
|
||||
|
||||
if result and (last_result != result):
|
||||
last_result = result
|
||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
||||
display.update_text(result)
|
||||
display.display()
|
||||
|
||||
if is_endpoint:
|
||||
if result:
|
||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
||||
segment_id += 1
|
||||
display.finalize_current_sentence()
|
||||
display.display()
|
||||
|
||||
recognizer.reset(stream)
|
||||
|
||||
|
||||
|
||||
@@ -74,8 +74,8 @@ def main():
|
||||
|
||||
stream = recognizer.create_stream()
|
||||
|
||||
last_result = ""
|
||||
segment_id = 0
|
||||
display = sherpa_onnx.Display()
|
||||
|
||||
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
||||
while True:
|
||||
samples, _ = s.read(samples_per_read) # a blocking read
|
||||
@@ -88,13 +88,14 @@ def main():
|
||||
|
||||
result = recognizer.get_result(stream)
|
||||
|
||||
if result and (last_result != result):
|
||||
last_result = result
|
||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
||||
display.update_text(result)
|
||||
display.display()
|
||||
|
||||
if is_endpoint:
|
||||
if result:
|
||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
||||
segment_id += 1
|
||||
display.finalize_current_sentence()
|
||||
display.display()
|
||||
|
||||
recognizer.reset(stream)
|
||||
|
||||
|
||||
|
||||
@@ -46,7 +46,6 @@ python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -375,8 +374,7 @@ def main():
|
||||
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
|
||||
stream = first_recognizer.create_stream()
|
||||
|
||||
last_result = ""
|
||||
segment_id = 0
|
||||
display = sherpa_onnx.Display()
|
||||
|
||||
sample_buffers = []
|
||||
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
||||
@@ -395,14 +393,8 @@ def main():
|
||||
result = first_recognizer.get_result(stream)
|
||||
result = result.lower().strip()
|
||||
|
||||
if last_result != result:
|
||||
print(
|
||||
"\r{}:{}".format(segment_id, " " * len(last_result)),
|
||||
end="",
|
||||
flush=True,
|
||||
)
|
||||
last_result = result
|
||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
||||
display.update_text(result)
|
||||
display.display()
|
||||
|
||||
if is_endpoint:
|
||||
if result:
|
||||
@@ -419,14 +411,9 @@ def main():
|
||||
sample_rate=sample_rate,
|
||||
)
|
||||
result = result.lower().strip()
|
||||
|
||||
print(
|
||||
"\r{}:{}".format(segment_id, " " * len(last_result)),
|
||||
end="",
|
||||
flush=True,
|
||||
)
|
||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
||||
segment_id += 1
|
||||
display.update_text(result)
|
||||
display.finalize_current_sentence()
|
||||
display.display()
|
||||
else:
|
||||
sample_buffers = []
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ from _sherpa_onnx import (
|
||||
AudioTaggingModelConfig,
|
||||
CircularBuffer,
|
||||
DenoisedAudio,
|
||||
Display,
|
||||
FastClustering,
|
||||
FastClusteringConfig,
|
||||
OfflinePunctuation,
|
||||
@@ -48,6 +47,7 @@ from _sherpa_onnx import (
|
||||
write_wave,
|
||||
)
|
||||
|
||||
from .display import Display
|
||||
from .keyword_spotter import KeywordSpotter
|
||||
from .offline_recognizer import OfflineRecognizer
|
||||
from .online_recognizer import OnlineRecognizer
|
||||
|
||||
41
sherpa-onnx/python/sherpa_onnx/display.py
Normal file
41
sherpa-onnx/python/sherpa_onnx/display.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# Copyright (c) 2025 Xiaomi Corporation
|
||||
import os
|
||||
from time import gmtime, strftime
|
||||
|
||||
|
||||
def get_current_time():
    """Return the current UTC time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    time_format = "%Y-%m-%d %H:%M:%S"
    return strftime(time_format, gmtime())
|
||||
|
||||
|
||||
def clear_console():
    """Clear the terminal screen (``cls`` on Windows, ``clear`` elsewhere)."""
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)
|
||||
|
||||
|
||||
class Display:
    """Render streaming speech-recognition output on the console.

    ``currentText`` holds the in-progress (partial) hypothesis, while
    ``sentences`` accumulates ``(timestamp, text)`` pairs for every
    utterance that has been finalized at an endpoint.
    """

    def __init__(self):
        # History of finalized utterances as (time string, text) pairs.
        self.sentences = []
        # Partial hypothesis for the utterance currently being recognized.
        self.currentText = ""

    def update_text(self, text):
        """Replace the in-progress hypothesis with *text*."""
        self.currentText = text

    def finalize_current_sentence(self):
        """Append a non-blank current hypothesis to the history, then reset it."""
        if self.currentText.strip():
            self.sentences.append((get_current_time(), self.currentText))

        self.currentText = ""

    def display(self):
        """Redraw the console: header, finalized history, then the live text."""
        clear_console()
        print("=== Speech Recognition with Next-gen Kaldi ===")
        print("Time:", get_current_time())
        print("-" * 30)

        # Show every finalized sentence (numbered from 1) above the live line.
        if self.sentences:
            for idx, (when, text) in enumerate(self.sentences, start=1):
                print(f"[{when}] {idx}. {text}")
            print("-" * 30)

        if self.currentText.strip():
            print("Recognizing:", self.currentText)
|
||||
Reference in New Issue
Block a user