diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d98b00c..c57ae359 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -146,6 +146,7 @@ include(CheckIncludeFileCXX)
 if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android)
   check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
   if(SHERPA_ONNX_HAS_ALSA)
+    message(STATUS "With Alsa")
     add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
   else()
     message(WARNING "\
diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
new file mode 100755
index 00000000..45962755
--- /dev/null
+++ b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+
+# Real-time speech recognition from a microphone with sherpa-onnx Python API
+# with endpoint detection.
+#
+# Note: This script uses ALSA and works only on Linux systems, especially
+# for embedded Linux systems and for running Linux on Windows using WSL.
+#
+# Please refer to
+# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
+# to download pre-trained models
+
+import argparse
+import sys
+from pathlib import Path
+import sherpa_onnx
+
+
+def assert_file_exists(filename: str):
+    assert Path(filename).is_file(), (
+        f"{filename} does not exist!\n"
+        "Please refer to "
+        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
+    )
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to tokens.txt",
+    )
+
+    parser.add_argument(
+        "--encoder",
+        type=str,
+        required=True,
+        help="Path to the encoder model",
+    )
+
+    parser.add_argument(
+        "--decoder",
+        type=str,
+        required=True,
+        help="Path to the decoder model",
+    )
+
+    parser.add_argument(
+        "--joiner",
+        type=str,
+        required=True,
+        help="Path to the joiner model",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="Valid values are greedy_search and modified_beam_search",
+    )
+
+    parser.add_argument(
+        "--provider",
+        type=str,
+        default="cpu",
+        help="Valid values: cpu, cuda, coreml",
+    )
+
+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one words/phrases per line, and for each
+        phrase the bpe/cjkchar are separated by a space. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
+    parser.add_argument(
+        "--blank-penalty",
+        type=float,
+        default=0.0,
+        help="""
+        The penalty applied on blank symbol during decoding.
+        Note: It is a positive value that would be applied to logits like
+        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
+        [batch_size, vocab] and blank id is 0).
+        """,
+    )
+
+    parser.add_argument(
+        "--device-name",
+        type=str,
+        required=True,
+        help="""
+The device name specifies which microphone to use in case there are several
+on your system. You can use
+
+  arecord -l
+
+to find all available microphones on your computer. For instance, if it outputs
+
+**** List of CAPTURE Hardware Devices ****
+card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+
+and if you want to select card 3 and the device 0 on that card, please use:
+
+  plughw:3,0
+
+as the device_name.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def create_recognizer(args):
+    assert_file_exists(args.encoder)
+    assert_file_exists(args.decoder)
+    assert_file_exists(args.joiner)
+    assert_file_exists(args.tokens)
+    # Please replace the model files if needed.
+    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
+    # for download links.
+    recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
+        tokens=args.tokens,
+        encoder=args.encoder,
+        decoder=args.decoder,
+        joiner=args.joiner,
+        num_threads=1,
+        sample_rate=16000,
+        feature_dim=80,
+        enable_endpoint_detection=True,
+        rule1_min_trailing_silence=2.4,
+        rule2_min_trailing_silence=1.2,
+        rule3_min_utterance_length=300,  # it essentially disables this rule
+        decoding_method=args.decoding_method,
+        provider=args.provider,
+        hotwords_file=args.hotwords_file,
+        hotwords_score=args.hotwords_score,
+        blank_penalty=args.blank_penalty,
+    )
+    return recognizer
+
+
+def main():
+    args = get_args()
+    device_name = args.device_name
+    print(f"device_name: {device_name}")
+    alsa = sherpa_onnx.Alsa(device_name)
+
+    print("Creating recognizer")
+    recognizer = create_recognizer(args)
+    print("Started! Please speak")
+
+    sample_rate = 16000
+    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
+
+    stream = recognizer.create_stream()
+
+    last_result = ""
+    segment_id = 0
+    while True:
+        samples = alsa.read(samples_per_read)  # a blocking read
+        stream.accept_waveform(sample_rate, samples)
+        while recognizer.is_ready(stream):
+            recognizer.decode_stream(stream)
+
+        is_endpoint = recognizer.is_endpoint(stream)
+
+        result = recognizer.get_result(stream)
+
+        if result and (last_result != result):
+            last_result = result
+            print("\r{}:{}".format(segment_id, result), end="", flush=True)
+        if is_endpoint:
+            if result:
+                print("\r{}:{}".format(segment_id, result), flush=True)
+                segment_id += 1
+            recognizer.reset(stream)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\nCaught Ctrl + C. Exiting")
diff --git a/sherpa-onnx/csrc/session.cc b/sherpa-onnx/csrc/session.cc
index 94987ebc..759c66dd 100644
--- a/sherpa-onnx/csrc/session.cc
+++ b/sherpa-onnx/csrc/session.cc
@@ -16,7 +16,7 @@
 #endif
 
 #if __ANDROID_API__ >= 27
-#include "nnapi_provider_factory.h"
+#include "nnapi_provider_factory.h"  // NOLINT
 #endif
 
 namespace sherpa_onnx {
diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc
index 5ecf99a5..76695d5c 100644
--- a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc
+++ b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc
@@ -276,8 +276,8 @@ as the device_name.
       }
     }
 
-    using namespace std::chrono_literals;
-    std::this_thread::sleep_for(20ms);  // sleep for 20ms
+    using namespace std::chrono_literals;  // NOLINT
+    std::this_thread::sleep_for(20ms);     // sleep for 20ms
   }
 
   t.join();
diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc
index dc17230c..2f24a21a 100644
--- a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc
+++ b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc
@@ -192,8 +192,8 @@ as the device_name.
       }
     }
 
-    using namespace std::chrono_literals;
-    std::this_thread::sleep_for(20ms);  // sleep for 20ms
+    using namespace std::chrono_literals;  // NOLINT
+    std::this_thread::sleep_for(20ms);     // sleep for 20ms
   }
   t.join();
   t2.join();
diff --git a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc
index ab61eb87..2e784ebb 100644
--- a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc
+++ b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc
@@ -53,10 +53,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
 
 and if you want to select card 3 and the device 0 on that card, please use:
 
-  hw:3,0
-
-or
-
   plughw:3,0
 
 as the device_name.
diff --git a/sherpa-onnx/python/csrc/CMakeLists.txt b/sherpa-onnx/python/csrc/CMakeLists.txt
index 30f64621..bba7903a 100644
--- a/sherpa-onnx/python/csrc/CMakeLists.txt
+++ b/sherpa-onnx/python/csrc/CMakeLists.txt
@@ -1,6 +1,6 @@
 include_directories(${CMAKE_SOURCE_DIR})
 
-pybind11_add_module(_sherpa_onnx
+set(srcs
   circular-buffer.cc
   display.cc
   endpoint.cc
@@ -37,6 +37,13 @@ pybind11_add_module(_sherpa_onnx
   vad-model.cc
   voice-activity-detector.cc
 )
+if(SHERPA_ONNX_HAS_ALSA)
+  list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc)
+else()
+  list(APPEND srcs faked-alsa.cc)
+endif()
+
+pybind11_add_module(_sherpa_onnx ${srcs})
 
 if(APPLE)
   execute_process(
@@ -54,6 +61,14 @@ endif()
 
 target_link_libraries(_sherpa_onnx PRIVATE sherpa-onnx-core)
 
+if(SHERPA_ONNX_HAS_ALSA)
+  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
+    target_link_libraries(_sherpa_onnx PRIVATE -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
+  else()
+    target_link_libraries(_sherpa_onnx PRIVATE asound)
+  endif()
+endif()
+
 install(TARGETS _sherpa_onnx
   DESTINATION ../
 )
diff --git a/sherpa-onnx/python/csrc/alsa.cc b/sherpa-onnx/python/csrc/alsa.cc
new file mode 100644
index 00000000..b6f752fa
--- /dev/null
+++ b/sherpa-onnx/python/csrc/alsa.cc
@@ -0,0 +1,32 @@
+// sherpa-onnx/python/csrc/alsa.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/alsa.h"
+
+#include <vector>
+
+#include "sherpa-onnx/csrc/alsa.h"
+
+namespace sherpa_onnx {
+
+// Exposes sherpa_onnx::Alsa to Python as `Alsa` with a `read` method and
+// read-only `expected_sample_rate` / `actual_sample_rate` properties.
+void PybindAlsa(py::module *m) {
+  using PyClass = Alsa;
+  py::class_<PyClass>(*m, "Alsa")
+      .def(py::init<const char *>(), py::arg("device_name"),
+           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "read",
+          [](PyClass &self, int32_t num_samples) -> std::vector<float> {
+            return self.Read(num_samples);
+          },
+          py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
+      .def_property_readonly("expected_sample_rate",
+                             &PyClass::GetExpectedSampleRate)
+      .def_property_readonly("actual_sample_rate",
+                             &PyClass::GetActualSampleRate);
+}
+
+}  // namespace sherpa_onnx
diff --git a/sherpa-onnx/python/csrc/alsa.h b/sherpa-onnx/python/csrc/alsa.h
new file mode 100644
index 00000000..e0106c12
--- /dev/null
+++ b/sherpa-onnx/python/csrc/alsa.h
@@ -0,0 +1,17 @@
+// sherpa-onnx/python/csrc/alsa.h
+//
+// Copyright (c) 2024 Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
+#define SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+
+// Registers the `Alsa` class on the Python module `m`.
+void PybindAlsa(py::module *m);
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
diff --git a/sherpa-onnx/python/csrc/faked-alsa.cc b/sherpa-onnx/python/csrc/faked-alsa.cc
new file mode 100644
index 00000000..7325ce17
--- /dev/null
+++ b/sherpa-onnx/python/csrc/faked-alsa.cc
@@ -0,0 +1,52 @@
+// sherpa-onnx/python/csrc/faked-alsa.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+
+#include <cstdlib>
+#include <vector>
+
+#include "sherpa-onnx/csrc/macros.h"
+#include "sherpa-onnx/python/csrc/alsa.h"
+
+namespace sherpa_onnx {
+
+// Placeholder bound to Python as `Alsa` when the real ALSA wrapper is not
+// compiled in. Constructing it logs an error message and calls exit(-1).
+class FakedAlsa {
+ public:
+  explicit FakedAlsa(const char *) {
+    SHERPA_ONNX_LOGE("This function is for Linux only.");
+#if (SHERPA_ONNX_ENABLE_ALSA == 0) && (defined(__unix__) || defined(__unix))
+    SHERPA_ONNX_LOGE(R"doc(
+sherpa-onnx is compiled without alsa support. To enable that, please run
+  (1) sudo apt-get install alsa-utils libasound2-dev
+  (2) rebuild sherpa-onnx
+)doc");
+#endif
+    exit(-1);
+  }
+
+  std::vector<float> Read(int32_t) const { return {}; }
+  int32_t GetExpectedSampleRate() const { return -1; }
+  int32_t GetActualSampleRate() const { return -1; }
+};
+
+// Registers FakedAlsa under the Python class name "Alsa" with the same
+// methods and properties as the real binding.
+void PybindAlsa(py::module *m) {
+  using PyClass = FakedAlsa;
+  py::class_<PyClass>(*m, "Alsa")
+      .def(py::init<const char *>(), py::arg("device_name"))
+      .def(
+          "read",
+          [](PyClass &self, int32_t num_samples) -> std::vector<float> {
+            return self.Read(num_samples);
+          },
+          py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
+      .def_property_readonly("expected_sample_rate",
+                             &PyClass::GetExpectedSampleRate)
+      .def_property_readonly("actual_sample_rate",
+                             &PyClass::GetActualSampleRate);
+}
+
+}  // namespace sherpa_onnx
diff --git a/sherpa-onnx/python/csrc/sherpa-onnx.cc b/sherpa-onnx/python/csrc/sherpa-onnx.cc
index bdc38bbe..7b0d7c0a 100644
--- a/sherpa-onnx/python/csrc/sherpa-onnx.cc
+++ b/sherpa-onnx/python/csrc/sherpa-onnx.cc
@@ -4,6 +4,7 @@
 
 #include "sherpa-onnx/python/csrc/sherpa-onnx.h"
 
+#include "sherpa-onnx/python/csrc/alsa.h"
 #include "sherpa-onnx/python/csrc/circular-buffer.h"
 #include "sherpa-onnx/python/csrc/display.h"
 #include "sherpa-onnx/python/csrc/endpoint.h"
@@ -54,6 +55,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
   PybindOfflineTts(&m);
   PybindSpeakerEmbeddingExtractor(&m);
   PybindSpeakerEmbeddingManager(&m);
+
+  PybindAlsa(&m);
 }
 
 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/python/sherpa_onnx/__init__.py b/sherpa-onnx/python/sherpa_onnx/__init__.py
index 926edbb8..ee22bd43 100644
--- a/sherpa-onnx/python/sherpa_onnx/__init__.py
+++ b/sherpa-onnx/python/sherpa_onnx/__init__.py
@@ -1,4 +1,5 @@
 from _sherpa_onnx import (
+    Alsa,
     CircularBuffer,
     Display,
     OfflineStream,