Support recognition from URLs. (#194)
This commit is contained in:
@@ -40,24 +40,28 @@ def get_args():
|
||||
parser.add_argument(
|
||||
"--tokens",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to tokens.txt",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--encoder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the encoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--decoder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the decoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--joiner",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the joiner model",
|
||||
)
|
||||
|
||||
@@ -105,7 +109,7 @@ def main():
|
||||
# sherpa-onnx will do resampling inside.
|
||||
sample_rate = 48000
|
||||
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
|
||||
last_result = ""
|
||||
|
||||
stream = recognizer.create_stream()
|
||||
|
||||
last_result = ""
|
||||
|
||||
@@ -39,18 +39,21 @@ def get_args():
|
||||
parser.add_argument(
|
||||
"--tokens",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to tokens.txt",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--encoder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the encoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--decoder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the decoder model",
|
||||
)
|
||||
|
||||
|
||||
180
python-api-examples/speech-recognition-from-url.py
Executable file
180
python-api-examples/speech-recognition-from-url.py
Executable file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Real-time speech recognition from a URL with sherpa-onnx Python API
|
||||
#
|
||||
# Supported URLs are those supported by ffmpeg.
|
||||
#
|
||||
# For instance:
|
||||
# (1) RTMP
|
||||
# rtmp://localhost/live/livestream
|
||||
#
|
||||
# (2) A file
|
||||
# https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/wenetspeech/DEV_T0000000000.opus
|
||||
# https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/aishell2/ID0012W0030.wav
|
||||
# file:///Users/fangjun/open-source/sherpa-onnx/a.wav
|
||||
#
|
||||
# Note that it supports all file formats supported by ffmpeg
|
||||
#
|
||||
# Please refer to
|
||||
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
# to download pre-trained models
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import sherpa_onnx
|
||||
|
||||
|
||||
def assert_file_exists(filename: str):
    """Abort with an AssertionError if *filename* is not an existing regular file.

    The message interpolates the offending path (the original string was a
    placeholder-less f-string, so the user could never tell WHICH model
    file was missing) and points at the pre-trained-model download page.
    """
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
|
||||
|
||||
|
||||
def get_args():
    """Parse and return command-line arguments.

    All four model-file options are required because main() calls
    assert_file_exists() on each of them before building the recognizer;
    --joiner previously lacked required=True, which let a missing joiner
    slip past argparse only to crash later on Path(None).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--encoder",
        type=str,
        required=True,
        help="Path to the encoder model",
    )

    parser.add_argument(
        "--decoder",
        type=str,
        required=True,
        help="Path to the decoder model",
    )

    parser.add_argument(
        "--joiner",
        type=str,
        required=True,  # was missing; main() asserts this file exists
        help="Path to the joiner model",
    )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="Valid values are greedy_search and modified_beam_search",
    )

    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="""Example values:
rtmp://localhost/live/livestream
https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/wenetspeech/DEV_T0000000000.opus
https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/aishell2/ID0012W0030.wav
""",
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def create_recognizer(args):
    """Build a streaming (online) transducer recognizer from *args*.

    Model paths come straight from the command line; see
    https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    for download links if you want to swap in different models.
    """
    return sherpa_onnx.OnlineRecognizer(
        tokens=args.tokens,
        encoder=args.encoder,
        decoder=args.decoder,
        joiner=args.joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.decoding_method,
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        # 300 s minimum utterance length is never reached in practice,
        # so rule 3 is effectively disabled.
        rule3_min_utterance_length=300,
    )
|
||||
|
||||
|
||||
def main():
    """Stream audio from ``--url`` through ffmpeg and print live transcripts.

    ffmpeg demuxes/decodes whatever the URL points at and writes 16 kHz,
    mono, 16-bit little-endian PCM to its stdout; we read that pipe in
    0.1 s chunks and feed the samples to the streaming recognizer.
    """
    args = get_args()
    assert_file_exists(args.encoder)
    assert_file_exists(args.decoder)
    assert_file_exists(args.joiner)
    assert_file_exists(args.tokens)

    recognizer = create_recognizer(args)

    ffmpeg_cmd = [
        "ffmpeg",
        "-i",
        args.url,
        "-f",
        "s16le",
        "-acodec",
        "pcm_s16le",
        "-ac",
        "1",
        "-ar",
        "16000",
        "-",
    ]

    # List form (shell=False) — args.url is never interpreted by a shell.
    process = subprocess.Popen(
        ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
    )

    frames_per_read = 1600  # 0.1 second at 16 kHz

    stream = recognizer.create_stream()

    last_result = ""
    segment_id = 0

    print("Started!")
    try:
        while True:
            # *2 because each int16 sample occupies two bytes
            data = process.stdout.read(frames_per_read * 2)
            if not data:
                break

            samples = np.frombuffer(data, dtype=np.int16)
            samples = samples.astype(np.float32) / 32768
            stream.accept_waveform(16000, samples)

            while recognizer.is_ready(stream):
                recognizer.decode_stream(stream)

            is_endpoint = recognizer.is_endpoint(stream)

            result = recognizer.get_result(stream)

            if result and (last_result != result):
                last_result = result
                # Overwrite the current terminal line with the partial result.
                print("\r{}:{}".format(segment_id, result), end="", flush=True)
            if is_endpoint:
                if result:
                    print("\r{}:{}".format(segment_id, result), flush=True)
                    segment_id += 1
                recognizer.reset(stream)
    finally:
        # Fix: the original never closed the pipe or reaped the child,
        # leaving ffmpeg as a zombie process when the stream ended.
        process.stdout.close()
        process.wait()

    if last_result:
        # Fix: terminate the last in-progress (carriage-return) line so
        # the shell prompt is not glued to the final transcript.
        print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # ffmpeg does all the demuxing/decoding/resampling for this example,
    # so bail out early with a clear message when it is not on PATH.
    if not shutil.which("ffmpeg"):
        sys.exit("Please install ffmpeg first!")
    main()
|
||||
Reference in New Issue
Block a user