diff --git a/CMakeLists.txt b/CMakeLists.txt
index d897067c..cd954bc1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,7 @@ project(sherpa-onnx)
 # Remember to update
 # ./CHANGELOG.md
 # ./new-release.sh
-set(SHERPA_ONNX_VERSION "1.11.5")
+set(SHERPA_ONNX_VERSION "1.11.6")
 
 # Disable warning about
 #
diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
index 67fb0051..7f6f303a 100755
--- a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
+++ b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
@@ -11,8 +11,8 @@
 # to download pre-trained models
 
 import argparse
-import sys
 from pathlib import Path
+
 import sherpa_onnx
 
 
@@ -202,8 +202,8 @@ def main():
 
     stream = recognizer.create_stream()
 
-    last_result = ""
-    segment_id = 0
+    display = sherpa_onnx.Display()
+
     while True:
         samples = alsa.read(samples_per_read)  # a blocking read
         stream.accept_waveform(sample_rate, samples)
@@ -214,13 +214,14 @@
 
         result = recognizer.get_result(stream)
 
-        if result and (last_result != result):
-            last_result = result
-            print("\r{}:{}".format(segment_id, result), end="", flush=True)
+        display.update_text(result)
+        display.display()
+
         if is_endpoint:
             if result:
-                print("\r{}:{}".format(segment_id, result), flush=True)
-                segment_id += 1
+                display.finalize_current_sentence()
+                display.display()
+
             recognizer.reset(stream)
 
 
diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py
index e47cf496..c60a9e18 100755
--- a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py
+++ b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py
@@ -192,8 +192,8 @@ def main():
 
     stream = recognizer.create_stream()
 
-    last_result = ""
-    segment_id = 0
+    display = sherpa_onnx.Display()
+
     with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
         while True:
             samples, _ = s.read(samples_per_read)  # a blocking read
@@ -206,13 +206,14 @@
 
             result = recognizer.get_result(stream)
 
-            if result and (last_result != result):
-                last_result = result
-                print("\r{}:{}".format(segment_id, result), end="", flush=True)
+            display.update_text(result)
+            display.display()
+
             if is_endpoint:
                 if result:
-                    print("\r{}:{}".format(segment_id, result), flush=True)
-                    segment_id += 1
+                    display.finalize_current_sentence()
+                    display.display()
+
                 recognizer.reset(stream)
 
 
diff --git a/python-api-examples/speech-recognition-from-url.py b/python-api-examples/speech-recognition-from-url.py
index b47f0f9d..fef11eeb 100755
--- a/python-api-examples/speech-recognition-from-url.py
+++ b/python-api-examples/speech-recognition-from-url.py
@@ -192,8 +192,7 @@ def main():
 
     stream = recognizer.create_stream()
 
-    last_result = ""
-    segment_id = 0
+    display = sherpa_onnx.Display()
 
     print("Started!")
     while True:
@@ -213,13 +212,14 @@
 
         result = recognizer.get_result(stream)
 
-        if result and (last_result != result):
-            last_result = result
-            print("\r{}:{}".format(segment_id, result), end="", flush=True)
+        display.update_text(result)
+        display.display()
+
         if is_endpoint:
             if result:
-                print("\r{}:{}".format(segment_id, result), flush=True)
-                segment_id += 1
+                display.finalize_current_sentence()
+                display.display()
+
             recognizer.reset(stream)
 
 
diff --git a/python-api-examples/streaming-paraformer-asr-microphone.py b/python-api-examples/streaming-paraformer-asr-microphone.py
index ad5c8f70..4efc40ca 100755
--- a/python-api-examples/streaming-paraformer-asr-microphone.py
+++ b/python-api-examples/streaming-paraformer-asr-microphone.py
@@ -74,8 +74,8 @@ def main():
 
     stream = recognizer.create_stream()
 
-    last_result = ""
-    segment_id = 0
+    display = sherpa_onnx.Display()
+
     with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
         while True:
             samples, _ = s.read(samples_per_read)  # a blocking read
@@ -88,13 +88,14 @@
 
             result = recognizer.get_result(stream)
 
-            if result and (last_result != result):
-                last_result = result
-                print("\r{}:{}".format(segment_id, result), end="", flush=True)
+            display.update_text(result)
+            display.display()
+
             if is_endpoint:
                 if result:
-                    print("\r{}:{}".format(segment_id, result), flush=True)
-                    segment_id += 1
+                    display.finalize_current_sentence()
+                    display.display()
+
                 recognizer.reset(stream)
 
 
diff --git a/python-api-examples/two-pass-speech-recognition-from-microphone.py b/python-api-examples/two-pass-speech-recognition-from-microphone.py
index aa2245c5..fad4de20 100755
--- a/python-api-examples/two-pass-speech-recognition-from-microphone.py
+++ b/python-api-examples/two-pass-speech-recognition-from-microphone.py
@@ -46,7 +46,6 @@ python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
 import argparse
 import sys
 from pathlib import Path
-from typing import List
 
 import numpy as np
 
@@ -375,8 +374,7 @@ def main():
     samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
 
     stream = first_recognizer.create_stream()
-    last_result = ""
-    segment_id = 0
+    display = sherpa_onnx.Display()
 
     sample_buffers = []
     with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
@@ -395,14 +393,8 @@
 
             result = first_recognizer.get_result(stream)
             result = result.lower().strip()
-            if last_result != result:
-                print(
-                    "\r{}:{}".format(segment_id, " " * len(last_result)),
-                    end="",
-                    flush=True,
-                )
-                last_result = result
-                print("\r{}:{}".format(segment_id, result), end="", flush=True)
+            display.update_text(result)
+            display.display()
 
             if is_endpoint:
                 if result:
@@ -419,14 +411,9 @@
                         sample_rate=sample_rate,
                     )
                     result = result.lower().strip()
-
-                    print(
-                        "\r{}:{}".format(segment_id, " " * len(last_result)),
-                        end="",
-                        flush=True,
-                    )
-                    print("\r{}:{}".format(segment_id, result), flush=True)
-                    segment_id += 1
+                    display.update_text(result)
+                    display.finalize_current_sentence()
+                    display.display()
 
                 else:
                     sample_buffers = []
diff --git a/sherpa-onnx/python/sherpa_onnx/__init__.py b/sherpa-onnx/python/sherpa_onnx/__init__.py
index ff1008b7..529712b3 100644
--- a/sherpa-onnx/python/sherpa_onnx/__init__.py
+++ b/sherpa-onnx/python/sherpa_onnx/__init__.py
@@ -6,7 +6,6 @@ from _sherpa_onnx import (
     AudioTaggingModelConfig,
     CircularBuffer,
     DenoisedAudio,
-    Display,
     FastClustering,
     FastClusteringConfig,
     OfflinePunctuation,
@@ -48,6 +47,7 @@ from _sherpa_onnx import (
     write_wave,
 )
 
+from .display import Display
 from .keyword_spotter import KeywordSpotter
 from .offline_recognizer import OfflineRecognizer
 from .online_recognizer import OnlineRecognizer
diff --git a/sherpa-onnx/python/sherpa_onnx/display.py b/sherpa-onnx/python/sherpa_onnx/display.py
new file mode 100644
index 00000000..f6b2b5b1
--- /dev/null
+++ b/sherpa-onnx/python/sherpa_onnx/display.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2025 Xiaomi Corporation
+import os
+from time import gmtime, strftime
+
+
+def get_current_time():
+    return strftime("%Y-%m-%d %H:%M:%S", gmtime())
+
+
+def clear_console():
+    os.system("cls" if os.name == "nt" else "clear")
+
+
+class Display:
+    def __init__(self):
+        self.sentences = []
+        self.currentText = ""
+
+    def update_text(self, text):
+        self.currentText = text
+
+    def finalize_current_sentence(self):
+        if self.currentText.strip():
+            self.sentences.append((get_current_time(), self.currentText))
+
+        self.currentText = ""
+
+    def display(self):
+        clear_console()
+        print("=== Speech Recognition with Next-gen Kaldi ===")
+        print("Time:", get_current_time())
+        print("-" * 30)
+
+        # display history sentences
+        if self.sentences:
+            for i, (when, text) in enumerate(self.sentences):
+                print(f"[{when}] {i + 1}. {text}")
+            print("-" * 30)
+
+        if self.currentText.strip():
+            print("Recognizing:", self.currentText)