enginex_bi_series-sherpa-onnx/python-api-examples/two-pass-speech-recognition-from-microphone.py

#!/usr/bin/env python3

# Two-pass real-time speech recognition from a microphone with sherpa-onnx
# Python API.
#
# The first pass uses a streaming model, which has two purposes:
#
#  (1) Display a temporary result to users
#
#  (2) Endpointing
#
# The second pass uses a non-streaming model. It has a higher recognition
# accuracy than the first pass model and its result is used as the final result.
#
# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
# to download pre-trained models

"""
Usage examples:

(1) Chinese: Streaming zipformer (1st pass) + Non-streaming paraformer (2nd pass)

python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
  --first-encoder ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.onnx \
  --first-decoder ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx \
  --first-joiner ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.onnx \
  --first-tokens ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \
  \
  --second-paraformer ./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx \
  --second-tokens ./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt

(2) English: Streaming zipformer (1st pass) + Non-streaming whisper (2nd pass)

python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
  --first-encoder ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \
  --first-decoder ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \
  --first-joiner ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx \
  --first-tokens ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \
  \
  --second-whisper-encoder ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx \
  --second-whisper-decoder ./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx \
  --second-tokens ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt
"""

import argparse
import sys
from pathlib import Path
from typing import List

import numpy as np

try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print("  pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

import sherpa_onnx


def assert_file_exists(filename: str, message: str):
    """Validate that ``filename`` refers to an existing regular file.

    Args:
      filename:
        Path received from the command line. A falsy value (empty string
        or None) is treated as "option not given".
      message:
        Name of the option being validated, e.g. ``--first-tokens``. It is
        embedded in the error text.

    Raises:
      ValueError: If ``filename`` is falsy, or if it does not point to an
        existing file.
    """
    if filename:
        if Path(filename).is_file():
            return
        raise ValueError(f"{message} {filename} does not exist")

    raise ValueError(f"Please specify {message}")


def add_first_pass_streaming_model_args(parser: argparse.ArgumentParser):
    """Register the options describing the first-pass (streaming) model.

    The four model files (tokens, encoder, decoder and joiner) are all
    mandatory. check_first_pass_args() rejects a missing value for any of
    them, so each one is declared ``required=True`` and argparse reports an
    omission as a normal usage error.

    Args:
      parser:
        The parser to which the ``--first-*`` options are added.
    """
    parser.add_argument(
        "--first-tokens",
        type=str,
        required=True,
        help="Path to tokens.txt for the first pass",
    )

    parser.add_argument(
        "--first-encoder",
        type=str,
        required=True,
        help="Path to the encoder model for the first pass",
    )

    parser.add_argument(
        "--first-decoder",
        type=str,
        required=True,
        help="Path to the decoder model for the first pass",
    )

    # The joiner is validated unconditionally by check_first_pass_args(),
    # so it is required just like the other three model files.
    parser.add_argument(
        "--first-joiner",
        type=str,
        required=True,
        help="Path to the joiner model for the first pass",
    )

    parser.add_argument(
        "--first-decoding-method",
        type=str,
        default="greedy_search",
        help="""Decoding method for the first pass. Valid values are
        greedy_search and modified_beam_search""",
    )

    parser.add_argument(
        "--first-max-active-paths",
        type=int,
        default=4,
        help="""Used only when --first-decoding-method is modified_beam_search.
        It specifies number of active paths to keep during decoding.
        """,
    )


def add_second_pass_transducer_model_args(parser: argparse.ArgumentParser):
    """Register the optional transducer model files for the second pass.

    Adds ``--second-encoder``, ``--second-decoder`` and ``--second-joiner``,
    in that order. Each one is a string option defaulting to the empty
    string.

    Args:
      parser:
        The parser to which the options are added.
    """
    for component in ("encoder", "decoder", "joiner"):
        parser.add_argument(
            f"--second-{component}",
            default="",
            type=str,
            help=f"Path to the transducer {component} model for the second pass",
        )


def add_second_pass_paraformer_model_args(parser: argparse.ArgumentParser):
    """Register ``--second-paraformer``, the Paraformer model for pass two.

    The option is a string that defaults to the empty string.

    Args:
      parser:
        The parser to which the option is added.
    """
    option = dict(
        default="",
        type=str,
        help="Path to the model.onnx for Paraformer for the second pass",
    )
    parser.add_argument("--second-paraformer", **option)


def add_second_pass_nemo_ctc_model_args(parser: argparse.ArgumentParser):
    """Register ``--second-nemo-ctc``, the NeMo CTC model for pass two.

    The option is a string that defaults to the empty string.

    Args:
      parser:
        The parser to which the option is added.
    """
    flag = "--second-nemo-ctc"
    description = "Path to the model.onnx for NeMo CTC for the second pass"
    parser.add_argument(flag, help=description, type=str, default="")


def add_second_pass_whisper_model_args(parser: argparse.ArgumentParser):
    """Register the optional Whisper options for the second pass.

    Adds the encoder/decoder model paths plus the language, task and
    tail-padding settings. Every option has a default, so none is required
    by the parser itself.

    Args:
      parser:
        The parser to which the ``--second-whisper-*`` options are added.
    """
    parser.add_argument(
        "--second-whisper-encoder",
        default="",
        type=str,
        help="Path to whisper encoder model for the second pass",
    )

    parser.add_argument(
        "--second-whisper-decoder",
        default="",
        type=str,
        help="Path to whisper decoder model for the second pass",
    )

    # Whisper's language code for Japanese is "ja"; "jp" is not a code in
    # the tokenizer table linked below.
    parser.add_argument(
        "--second-whisper-language",
        default="",
        type=str,
        help="""It specifies the spoken language in the input audio file.
        Example values: en, fr, de, zh, ja.
        Available languages for multilingual models can be found at
        https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
        If not specified, we infer the language from the input audio file.
        """,
    )

    parser.add_argument(
        "--second-whisper-task",
        default="transcribe",
        choices=["transcribe", "translate"],
        type=str,
        help="""For multilingual models, if you specify translate, the output
        will be in English.
        """,
    )

    parser.add_argument(
        "--second-whisper-tail-paddings",
        default=-1,
        type=int,
        help="""Number of tail padding frames.
        We have removed the 30-second constraint from whisper, so you need to
        choose the amount of tail padding frames by yourself.
        Use -1 to use a default value for tail padding.
        """,
    )


def add_second_pass_non_streaming_model_args(parser: argparse.ArgumentParser):
    """Register every option of the second-pass (non-streaming) model.

    The options of all supported model families (transducer, NeMo CTC,
    Paraformer and Whisper) are added, followed by ``--second-tokens``,
    which is shared by all of them.

    Args:
      parser:
        The parser to which the ``--second-*`` options are added.
    """
    add_second_pass_transducer_model_args(parser)
    add_second_pass_nemo_ctc_model_args(parser)
    add_second_pass_paraformer_model_args(parser)
    add_second_pass_whisper_model_args(parser)

    # check_second_pass_args() demands this file regardless of which model
    # family is chosen, so let argparse report an omission as a usage error.
    parser.add_argument(
        "--second-tokens",
        type=str,
        required=True,
        help="Path to tokens.txt for the second pass",
    )


def get_args():
    """Build the command-line parser and parse the process arguments.

    The parser offers ``--provider`` plus every first-pass and second-pass
    model option, and shows defaults in ``--help`` output.

    Returns:
      The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    cli.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="Valid values: cpu, cuda, coreml",
    )

    # First-pass options are registered before second-pass ones.
    for register in (
        add_first_pass_streaming_model_args,
        add_second_pass_non_streaming_model_args,
    ):
        register(cli)

    parsed = cli.parse_args()
    return parsed


def check_first_pass_args(args):
    """Verify that every first-pass model file was given and exists.

    Args:
      args:
        Parsed command-line arguments providing ``first_tokens``,
        ``first_encoder``, ``first_decoder`` and ``first_joiner``.

    Raises:
      ValueError: Propagated from assert_file_exists() for the first option
        that is missing or does not name an existing file.
    """
    required_flags = (
        "--first-tokens",
        "--first-encoder",
        "--first-decoder",
        "--first-joiner",
    )
    for flag in required_flags:
        # "--first-tokens" -> attribute "first_tokens"
        attribute = flag[2:].replace("-", "_")
        assert_file_exists(getattr(args, attribute), flag)


def check_second_pass_args(args):
    """Verify the files of the selected second-pass model.

    ``--second-tokens`` is always checked. After that, the first model
    family whose primary option is non-empty is validated, in the order
    transducer, Paraformer, NeMo CTC, Whisper. Later families are ignored.

    Args:
      args:
        Parsed command-line arguments providing the ``second_*`` attributes.

    Raises:
      ValueError: If a required file is missing or does not exist, or if
        no second-pass model was specified at all.
    """

    def require(*flags):
        # "--second-nemo-ctc" -> attribute "second_nemo_ctc"
        for flag in flags:
            assert_file_exists(getattr(args, flag[2:].replace("-", "_")), flag)

    require("--second-tokens")

    if args.second_encoder:
        require("--second-encoder", "--second-decoder", "--second-joiner")
        return

    if args.second_paraformer:
        require("--second-paraformer")
        return

    if args.second_nemo_ctc:
        require("--second-nemo-ctc")
        return

    if args.second_whisper_encoder:
        require("--second-whisper-encoder", "--second-whisper-decoder")
        return

    raise ValueError("Please specify the model for the second pass")


def create_first_pass_recognizer(args):
    """Create the streaming transducer recognizer used for the first pass.

    Endpoint detection is enabled here; main() relies on it to decide when
    a segment is handed to the second pass.

    Args:
      args:
        Parsed command-line arguments providing the ``first_*`` options and
        ``provider``.

    Returns:
      The object returned by
      ``sherpa_onnx.OnlineRecognizer.from_transducer``.
    """
    # Please replace the model files if needed.
    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    # for download links.
    build = sherpa_onnx.OnlineRecognizer.from_transducer

    options = dict(
        tokens=args.first_tokens,
        encoder=args.first_encoder,
        decoder=args.first_decoder,
        joiner=args.first_joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.first_decoding_method,
        max_active_paths=args.first_max_active_paths,
        provider=args.provider,
        # Endpointing thresholds; see the sherpa-onnx endpointing
        # documentation for the meaning of each rule.
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=20,
    )
    return build(**options)


def create_second_pass_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    """Create the non-streaming recognizer used for the second pass.

    The first model family whose primary option is non-empty wins, in the
    order transducer, Paraformer, NeMo CTC, Whisper. Note that
    ``args.provider`` is not forwarded to any of these factories.

    Args:
      args:
        Parsed command-line arguments providing the ``second_*`` options.

    Returns:
      The recognizer built for the selected model family.

    Raises:
      ValueError: If no second-pass model was specified.
    """
    if args.second_encoder:
        return sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=args.second_encoder,
            decoder=args.second_decoder,
            joiner=args.second_joiner,
            tokens=args.second_tokens,
            sample_rate=16000,
            feature_dim=80,
            decoding_method="greedy_search",
            max_active_paths=4,
        )

    if args.second_paraformer:
        return sherpa_onnx.OfflineRecognizer.from_paraformer(
            paraformer=args.second_paraformer,
            tokens=args.second_tokens,
            num_threads=1,
            sample_rate=16000,
            feature_dim=80,
            decoding_method="greedy_search",
        )

    if args.second_nemo_ctc:
        return sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
            model=args.second_nemo_ctc,
            tokens=args.second_tokens,
            num_threads=1,
            sample_rate=16000,
            feature_dim=80,
            decoding_method="greedy_search",
        )

    if args.second_whisper_encoder:
        return sherpa_onnx.OfflineRecognizer.from_whisper(
            encoder=args.second_whisper_encoder,
            decoder=args.second_whisper_decoder,
            tokens=args.second_tokens,
            num_threads=1,
            decoding_method="greedy_search",
            language=args.second_whisper_language,
            task=args.second_whisper_task,
            tail_paddings=args.second_whisper_tail_paddings,
        )

    raise ValueError("Please specify at least one model for the second pass")


def run_second_pass(
    recognizer: sherpa_onnx.OfflineRecognizer,
    samples: np.ndarray,
    sample_rate: int,
):
    """Decode one complete segment with the second-pass recognizer.

    Args:
      recognizer:
        The non-streaming recognizer.
      samples:
        Audio samples of the segment. main() passes a 1-D float32 array.
      sample_rate:
        Sample rate of ``samples``.

    Returns:
      The recognized text of the segment.
    """
    offline_stream = recognizer.create_stream()
    offline_stream.accept_waveform(sample_rate, samples)

    # Decoding is done in one shot; the result is read from the stream.
    recognizer.decode_stream(offline_stream)
    text = offline_stream.result.text

    return text


def main():
    """Run two-pass recognition on the default input device, forever.

    Audio is read in 100 ms chunks and fed to the first-pass streaming
    recognizer, whose partial result is redrawn on the current console
    line. When the first pass reports an endpoint and has recognized some
    text, the buffered audio of the segment is re-decoded by the second
    pass and that result replaces the partial one as the final line.

    The loop never terminates on its own; it is stopped by an exception
    such as KeyboardInterrupt.
    """
    args = get_args()
    check_first_pass_args(args)
    check_second_pass_args(args)

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number

    input_device_idx = sd.default.device[0]
    device_name = devices[input_device_idx]["name"]
    print(f"Use default device: {device_name}")

    print("Creating recognizers. Please wait...")
    first_recognizer = create_first_pass_recognizer(args)
    second_recognizer = create_second_pass_recognizer(args)

    print("Started! Please speak")

    sample_rate = 16000
    chunk_size = int(0.1 * sample_rate)  # 0.1 second = 100 ms
    online_stream = first_recognizer.create_stream()

    shown_text = ""  # first-pass text currently displayed on the line
    segment_id = 0
    pending_chunks = []  # audio received since the last finished segment

    with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as mic:
        while True:
            chunk, _ = mic.read(chunk_size)  # a blocking read
            chunk = chunk.reshape(-1)
            online_stream.accept_waveform(sample_rate, chunk)

            pending_chunks.append(chunk)

            while first_recognizer.is_ready(online_stream):
                first_recognizer.decode_stream(online_stream)

            is_endpoint = first_recognizer.is_endpoint(online_stream)

            partial = first_recognizer.get_result(online_stream).lower().strip()

            if partial != shown_text:
                # Blank out the old text first so that a shorter new result
                # leaves no stale characters behind.
                blank = " " * len(shown_text)
                print(f"\r{segment_id}:{blank}", end="", flush=True)
                shown_text = partial
                print(f"\r{segment_id}:{partial}", end="", flush=True)

            if not is_endpoint:
                continue

            if partial:
                audio = np.concatenate(pending_chunks)
                # There are internal sample buffers inside the streaming
                # feature extractor, so we cannot send all samples to
                # the 2nd pass. Here 8000 is just an empirical value
                # that should work for most streaming models in sherpa-onnx
                pending_chunks = [audio[-8000:]]
                final = run_second_pass(
                    recognizer=second_recognizer,
                    samples=audio[:-8000],
                    sample_rate=sample_rate,
                )
                final = final.lower().strip()

                blank = " " * len(shown_text)
                print(f"\r{segment_id}:{blank}", end="", flush=True)
                # No end="" here: the final text closes the line.
                print(f"\r{segment_id}:{final}", flush=True)
                segment_id += 1
            else:
                # Endpoint without any recognized text: discard the audio.
                pending_chunks = []

            first_recognizer.reset(online_stream)


if __name__ == "__main__":
    # main() loops until it is interrupted, so Ctrl + C is the normal way
    # to stop the program; report it with a short message instead of a
    # KeyboardInterrupt traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")