Fix displaying streaming speech recognition results for Python. (#2196)
This commit is contained in:
@@ -14,7 +14,7 @@ project(sherpa-onnx)
|
|||||||
# Remember to update
|
# Remember to update
|
||||||
# ./CHANGELOG.md
|
# ./CHANGELOG.md
|
||||||
# ./new-release.sh
|
# ./new-release.sh
|
||||||
set(SHERPA_ONNX_VERSION "1.11.5")
|
set(SHERPA_ONNX_VERSION "1.11.6")
|
||||||
|
|
||||||
# Disable warning about
|
# Disable warning about
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -11,8 +11,8 @@
|
|||||||
# to download pre-trained models
|
# to download pre-trained models
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import sherpa_onnx
|
import sherpa_onnx
|
||||||
|
|
||||||
|
|
||||||
@@ -202,8 +202,8 @@ def main():
|
|||||||
|
|
||||||
stream = recognizer.create_stream()
|
stream = recognizer.create_stream()
|
||||||
|
|
||||||
last_result = ""
|
display = sherpa_onnx.Display()
|
||||||
segment_id = 0
|
|
||||||
while True:
|
while True:
|
||||||
samples = alsa.read(samples_per_read) # a blocking read
|
samples = alsa.read(samples_per_read) # a blocking read
|
||||||
stream.accept_waveform(sample_rate, samples)
|
stream.accept_waveform(sample_rate, samples)
|
||||||
@@ -214,13 +214,14 @@ def main():
|
|||||||
|
|
||||||
result = recognizer.get_result(stream)
|
result = recognizer.get_result(stream)
|
||||||
|
|
||||||
if result and (last_result != result):
|
display.update_text(result)
|
||||||
last_result = result
|
display.display()
|
||||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
|
||||||
if is_endpoint:
|
if is_endpoint:
|
||||||
if result:
|
if result:
|
||||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
display.finalize_current_sentence()
|
||||||
segment_id += 1
|
display.display()
|
||||||
|
|
||||||
recognizer.reset(stream)
|
recognizer.reset(stream)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -192,8 +192,8 @@ def main():
|
|||||||
|
|
||||||
stream = recognizer.create_stream()
|
stream = recognizer.create_stream()
|
||||||
|
|
||||||
last_result = ""
|
display = sherpa_onnx.Display()
|
||||||
segment_id = 0
|
|
||||||
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
||||||
while True:
|
while True:
|
||||||
samples, _ = s.read(samples_per_read) # a blocking read
|
samples, _ = s.read(samples_per_read) # a blocking read
|
||||||
@@ -206,13 +206,14 @@ def main():
|
|||||||
|
|
||||||
result = recognizer.get_result(stream)
|
result = recognizer.get_result(stream)
|
||||||
|
|
||||||
if result and (last_result != result):
|
display.update_text(result)
|
||||||
last_result = result
|
display.display()
|
||||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
|
||||||
if is_endpoint:
|
if is_endpoint:
|
||||||
if result:
|
if result:
|
||||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
display.finalize_current_sentence()
|
||||||
segment_id += 1
|
display.display()
|
||||||
|
|
||||||
recognizer.reset(stream)
|
recognizer.reset(stream)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -192,8 +192,7 @@ def main():
|
|||||||
|
|
||||||
stream = recognizer.create_stream()
|
stream = recognizer.create_stream()
|
||||||
|
|
||||||
last_result = ""
|
display = sherpa_onnx.Display()
|
||||||
segment_id = 0
|
|
||||||
|
|
||||||
print("Started!")
|
print("Started!")
|
||||||
while True:
|
while True:
|
||||||
@@ -213,13 +212,14 @@ def main():
|
|||||||
|
|
||||||
result = recognizer.get_result(stream)
|
result = recognizer.get_result(stream)
|
||||||
|
|
||||||
if result and (last_result != result):
|
display.update_text(result)
|
||||||
last_result = result
|
display.display()
|
||||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
|
||||||
if is_endpoint:
|
if is_endpoint:
|
||||||
if result:
|
if result:
|
||||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
display.finalize_current_sentence()
|
||||||
segment_id += 1
|
display.display()
|
||||||
|
|
||||||
recognizer.reset(stream)
|
recognizer.reset(stream)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -74,8 +74,8 @@ def main():
|
|||||||
|
|
||||||
stream = recognizer.create_stream()
|
stream = recognizer.create_stream()
|
||||||
|
|
||||||
last_result = ""
|
display = sherpa_onnx.Display()
|
||||||
segment_id = 0
|
|
||||||
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
||||||
while True:
|
while True:
|
||||||
samples, _ = s.read(samples_per_read) # a blocking read
|
samples, _ = s.read(samples_per_read) # a blocking read
|
||||||
@@ -88,13 +88,14 @@ def main():
|
|||||||
|
|
||||||
result = recognizer.get_result(stream)
|
result = recognizer.get_result(stream)
|
||||||
|
|
||||||
if result and (last_result != result):
|
display.update_text(result)
|
||||||
last_result = result
|
display.display()
|
||||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
|
||||||
if is_endpoint:
|
if is_endpoint:
|
||||||
if result:
|
if result:
|
||||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
display.finalize_current_sentence()
|
||||||
segment_id += 1
|
display.display()
|
||||||
|
|
||||||
recognizer.reset(stream)
|
recognizer.reset(stream)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,6 @@ python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
|
|||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -375,8 +374,7 @@ def main():
|
|||||||
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
|
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
|
||||||
stream = first_recognizer.create_stream()
|
stream = first_recognizer.create_stream()
|
||||||
|
|
||||||
last_result = ""
|
display = sherpa_onnx.Display()
|
||||||
segment_id = 0
|
|
||||||
|
|
||||||
sample_buffers = []
|
sample_buffers = []
|
||||||
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
|
||||||
@@ -395,14 +393,8 @@ def main():
|
|||||||
result = first_recognizer.get_result(stream)
|
result = first_recognizer.get_result(stream)
|
||||||
result = result.lower().strip()
|
result = result.lower().strip()
|
||||||
|
|
||||||
if last_result != result:
|
display.update_text(result)
|
||||||
print(
|
display.display()
|
||||||
"\r{}:{}".format(segment_id, " " * len(last_result)),
|
|
||||||
end="",
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
last_result = result
|
|
||||||
print("\r{}:{}".format(segment_id, result), end="", flush=True)
|
|
||||||
|
|
||||||
if is_endpoint:
|
if is_endpoint:
|
||||||
if result:
|
if result:
|
||||||
@@ -419,14 +411,9 @@ def main():
|
|||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
)
|
)
|
||||||
result = result.lower().strip()
|
result = result.lower().strip()
|
||||||
|
display.update_text(result)
|
||||||
print(
|
display.finalize_current_sentence()
|
||||||
"\r{}:{}".format(segment_id, " " * len(last_result)),
|
display.display()
|
||||||
end="",
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
print("\r{}:{}".format(segment_id, result), flush=True)
|
|
||||||
segment_id += 1
|
|
||||||
else:
|
else:
|
||||||
sample_buffers = []
|
sample_buffers = []
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ from _sherpa_onnx import (
|
|||||||
AudioTaggingModelConfig,
|
AudioTaggingModelConfig,
|
||||||
CircularBuffer,
|
CircularBuffer,
|
||||||
DenoisedAudio,
|
DenoisedAudio,
|
||||||
Display,
|
|
||||||
FastClustering,
|
FastClustering,
|
||||||
FastClusteringConfig,
|
FastClusteringConfig,
|
||||||
OfflinePunctuation,
|
OfflinePunctuation,
|
||||||
@@ -48,6 +47,7 @@ from _sherpa_onnx import (
|
|||||||
write_wave,
|
write_wave,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .display import Display
|
||||||
from .keyword_spotter import KeywordSpotter
|
from .keyword_spotter import KeywordSpotter
|
||||||
from .offline_recognizer import OfflineRecognizer
|
from .offline_recognizer import OfflineRecognizer
|
||||||
from .online_recognizer import OnlineRecognizer
|
from .online_recognizer import OnlineRecognizer
|
||||||
|
|||||||
41
sherpa-onnx/python/sherpa_onnx/display.py
Normal file
41
sherpa-onnx/python/sherpa_onnx/display.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
# Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
import os
|
||||||
|
from time import gmtime, strftime
|
||||||
|
|
||||||
|
|
||||||
|
def get_current_time():
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # gmtime() yields the broken-down time in UTC, not the local timezone.
    utc_now = gmtime()
    return strftime("%Y-%m-%d %H:%M:%S", utc_now)
|
||||||
|
|
||||||
|
|
||||||
|
def clear_console():
    """Clear the terminal by running the platform's clear-screen command."""
    # Windows shells use "cls"; every other platform is given "clear".
    if os.name == "nt":
        command = "cls"
    else:
        command = "clear"
    os.system(command)
|
||||||
|
|
||||||
|
|
||||||
|
class Display:
    """Console view of streaming speech recognition results.

    Keeps a history of finalized sentences, each stamped with the time it was
    finalized, together with the partial text of the sentence currently being
    recognized, and redraws the whole view on demand.
    """

    def __init__(self):
        # Finalized sentences as (timestamp, text) tuples, oldest first.
        self.sentences = []
        # Partial result of the sentence that is still being recognized.
        self.currentText = ""

    def update_text(self, text):
        """Replace the in-progress text with ``text``.

        ``None`` is stored as an empty string so that the later ``.strip()``
        calls in this class cannot fail.
        """
        self.currentText = "" if text is None else text

    def finalize_current_sentence(self):
        """Move the in-progress text into the history and clear it.

        Text that is empty or whitespace-only is discarded instead of stored.
        """
        if self.currentText.strip():
            self.sentences.append((get_current_time(), self.currentText))

        self.currentText = ""

    def display(self):
        """Clear the console, then print the header, history and current text."""
        clear_console()

        separator = "-" * 30
        lines = [
            "=== Speech Recognition with Next-gen Kaldi ===",
            f"Time: {get_current_time()}",
            separator,
        ]

        # display history sentences
        if self.sentences:
            lines.extend(
                f"[{when}] {index}. {text}"
                for index, (when, text) in enumerate(self.sentences, start=1)
            )
            lines.append(separator)

        if self.currentText.strip():
            lines.append(f"Recognizing: {self.currentText}")

        # Emit the whole frame in one write rather than one print per line.
        print("\n".join(lines))
|
||||||
Reference in New Issue
Block a user