#!/usr/bin/env python3
#
# Copyright (c)  2023  Xiaomi Corporation

"""
This file demonstrates how to use sherpa-onnx Python APIs
with VAD and non-streaming ASR models for speech recognition
from a microphone.

Note that you need a non-streaming model for this script.

(1) For paraformer

    ./python-api-examples/vad-with-non-streaming-asr.py  \
      --silero-vad-model=/path/to/silero_vad.onnx \
      --tokens=/path/to/tokens.txt \
      --paraformer=/path/to/paraformer.onnx \
      --num-threads=2 \
      --decoding-method=greedy_search \
      --debug=false \
      --sample-rate=16000 \
      --feature-dim=80

(2) For transducer models from icefall

    ./python-api-examples/vad-with-non-streaming-asr.py  \
      --silero-vad-model=/path/to/silero_vad.onnx \
      --tokens=/path/to/tokens.txt \
      --encoder=/path/to/encoder.onnx \
      --decoder=/path/to/decoder.onnx \
      --joiner=/path/to/joiner.onnx \
      --num-threads=2 \
      --decoding-method=greedy_search \
      --debug=false \
      --sample-rate=16000 \
      --feature-dim=80

(3) For Whisper models

    ./python-api-examples/vad-with-non-streaming-asr.py  \
      --silero-vad-model=/path/to/silero_vad.onnx \
      --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
      --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
      --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
      --whisper-task=transcribe \
      --num-threads=2

Please refer to
https://k2-fsa.github.io/sherpa/onnx/index.html
to install sherpa-onnx and to download non-streaming pre-trained models
used in this file.

Please visit
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
to download silero_vad.onnx

For instance,

wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
"""
import argparse
import sys
from pathlib import Path

import numpy as np

try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print("  pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

import sherpa_onnx


def _str2bool(value: str) -> bool:
    """Parse a command-line boolean such as ``true``/``false``.

    ``type=bool`` must not be used with argparse: it calls ``bool()`` on the
    raw string, so any non-empty value (including ``"false"``) is True.

    Args:
      value:
        The raw string given on the command line.

    Returns:
      The parsed boolean. An empty string is treated as False.

    Raises:
      argparse.ArgumentTypeError: If the value is not a recognized boolean.
    """
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"Expected a boolean value (true/false), but got: {value!r}"
    )


def get_args():
    """Define and parse the command-line options of this script.

    Returns:
      The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    # main() checks this file unconditionally, so it is mandatory for every
    # model type. Requiring it here yields a clear usage error instead of a
    # TypeError from Path(None).
    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--encoder",
        default="",
        type=str,
        help="Path to the transducer encoder model",
    )

    parser.add_argument(
        "--decoder",
        default="",
        type=str,
        help="Path to the transducer decoder model",
    )

    parser.add_argument(
        "--joiner",
        default="",
        type=str,
        help="Path to the transducer joiner model",
    )

    parser.add_argument(
        "--paraformer",
        default="",
        type=str,
        help="Path to the model.onnx from Paraformer",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    parser.add_argument(
        "--whisper-encoder",
        default="",
        type=str,
        help="Path to whisper encoder model",
    )

    parser.add_argument(
        "--whisper-decoder",
        default="",
        type=str,
        help="Path to whisper decoder model",
    )

    parser.add_argument(
        "--whisper-language",
        default="",
        type=str,
        help="""It specifies the spoken language in the input file.
        Example values: en, fr, de, zh, jp.
        Available languages for multilingual models can be found at
        https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
        If not specified, we infer the language from the input audio file.
        """,
    )

    parser.add_argument(
        "--whisper-task",
        default="transcribe",
        choices=["transcribe", "translate"],
        type=str,
        help="""For multilingual models, if you specify translate, the output
        will be in English.
        """,
    )

    parser.add_argument(
        "--whisper-tail-paddings",
        default=-1,
        type=int,
        help="""Number of tail padding frames.
        We have removed the 30-second constraint from whisper, so you need to
        choose the amount of tail padding frames by yourself.
        Use -1 to use a default value for tail padding.
        """,
    )

    parser.add_argument(
        "--blank-penalty",
        type=float,
        default=0.0,
        help="""
        The penalty applied on blank symbol during decoding.
        Note: It is a positive value that would be applied to logits like
        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
        [batch_size, vocab] and blank id is 0).
        """,
    )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Valid values are greedy_search and modified_beam_search.
        modified_beam_search is valid only for transducer models.
        """,
    )

    parser.add_argument(
        "--debug",
        type=_str2bool,
        default=False,
        help="True to show debug messages when loading models.",
    )

    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="""Sample rate of the feature extractor. Must match the one
        expected by the model.""",
    )

    parser.add_argument(
        "--feature-dim",
        type=int,
        default=80,
        help="Feature dimension. Must match the one expected by the model",
    )

    return parser.parse_args()


def assert_file_exists(filename: str):
    """Assert that ``filename`` refers to an existing regular file.

    Args:
      filename:
        Path of the file to check.

    Raises:
      AssertionError: If the path is not an existing file.
    """
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )


def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    """Create an offline recognizer from the model files given in ``args``.

    The model type is selected by the first non-empty option, checked in this
    order: ``--encoder`` (transducer), ``--paraformer``, ``--whisper-encoder``.

    Args:
      args:
        The namespace returned by :func:`get_args`.

    Returns:
      The constructed ``sherpa_onnx.OfflineRecognizer``.

    Raises:
      ValueError: If none of the model options is given.
      AssertionError: If options of different model types are mixed, or if a
        required model file does not exist.
    """
    if args.encoder:
        # Transducer: options of the other model types must not be given.
        assert len(args.paraformer) == 0, args.paraformer
        assert len(args.whisper_encoder) == 0, args.whisper_encoder
        assert len(args.whisper_decoder) == 0, args.whisper_decoder

        assert_file_exists(args.encoder)
        assert_file_exists(args.decoder)
        assert_file_exists(args.joiner)

        recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=args.encoder,
            decoder=args.decoder,
            joiner=args.joiner,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            blank_penalty=args.blank_penalty,
            debug=args.debug,
        )
    elif args.paraformer:
        assert len(args.whisper_encoder) == 0, args.whisper_encoder
        assert len(args.whisper_decoder) == 0, args.whisper_decoder

        assert_file_exists(args.paraformer)

        recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
            paraformer=args.paraformer,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )
    elif args.whisper_encoder:
        assert_file_exists(args.whisper_encoder)
        assert_file_exists(args.whisper_decoder)

        recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
            encoder=args.whisper_encoder,
            decoder=args.whisper_decoder,
            tokens=args.tokens,
            num_threads=args.num_threads,
            decoding_method=args.decoding_method,
            debug=args.debug,
            language=args.whisper_language,
            task=args.whisper_task,
            tail_paddings=args.whisper_tail_paddings,
        )
    else:
        raise ValueError("Please specify at least one model")

    return recognizer


def main():
    """Read audio from the default microphone and print recognized speech.

    Audio is read in 100 ms chunks and fed to the VAD in pieces of
    ``window_size`` samples. Every segment the VAD produces is decoded with
    the offline recognizer, and each non-empty result is printed with a
    running index. The loop runs until the process is interrupted.
    """
    # Parse the command line first so that --help and usage errors are
    # reported even on a machine without any audio device.
    args = get_args()

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number

    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    assert_file_exists(args.tokens)
    assert_file_exists(args.silero_vad_model)

    assert args.num_threads > 0, args.num_threads

    assert (
        args.sample_rate == 16000
    ), f"Only sample rate 16000 is supported. Given: {args.sample_rate}"

    print("Creating recognizer. Please wait...")
    recognizer = create_recognizer(args)

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.silero_vad.min_silence_duration = 0.25
    config.sample_rate = args.sample_rate

    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)

    samples_per_read = int(0.1 * args.sample_rate)  # 0.1 second = 100 ms

    print("Started! Please speak")

    # Start with an empty float32 array so that concatenating the float32
    # microphone samples does not promote the buffer to float64.
    buffer = np.zeros(0, dtype=np.float32)
    texts = []
    with sd.InputStream(channels=1, dtype="float32", samplerate=args.sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            samples = samples.reshape(-1)

            buffer = np.concatenate([buffer, samples])
            # Feed the VAD one window at a time; the remainder stays in the
            # buffer until the next read.
            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]

            # Decode every segment the VAD has queued so far.
            while not vad.empty():
                stream = recognizer.create_stream()
                stream.accept_waveform(args.sample_rate, vad.front.samples)

                vad.pop()
                recognizer.decode_stream(stream)

                text = stream.result.text.strip().lower()
                if len(text):
                    idx = len(texts)
                    texts.append(text)
                    print(f"{idx}: {text}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")