diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19a97017..6736731d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,7 @@ option(SHERPA_ONNX_ENABLE_PYTHON "Whether to build Python" OFF)
 option(SHERPA_ONNX_ENABLE_TESTS "Whether to build tests" OFF)
 option(SHERPA_ONNX_ENABLE_CHECK "Whether to build with assert" ON)
 option(BUILD_SHARED_LIBS "Whether to build shared libraries" OFF)
+option(SHERPA_ONNX_ENABLE_PORTAUDIO "Whether to build with portaudio" ON)
 
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
@@ -49,6 +50,7 @@ message(STATUS "BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}")
 message(STATUS "SHERPA_ONNX_ENABLE_PYTHON ${SHERPA_ONNX_ENABLE_PYTHON}")
 message(STATUS "SHERPA_ONNX_ENABLE_TESTS ${SHERPA_ONNX_ENABLE_TESTS}")
 message(STATUS "SHERPA_ONNX_ENABLE_CHECK ${SHERPA_ONNX_ENABLE_CHECK}")
+message(STATUS "SHERPA_ONNX_ENABLE_PORTAUDIO ${SHERPA_ONNX_ENABLE_PORTAUDIO}")
 
 set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -68,6 +70,10 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
 include(kaldi-native-fbank)
 include(onnxruntime)
 
+if(SHERPA_ONNX_ENABLE_PORTAUDIO)
+  include(portaudio)
+endif()
+
 if(SHERPA_ONNX_ENABLE_PYTHON)
   include(pybind11)
 endif()
diff --git a/cmake/portaudio.cmake b/cmake/portaudio.cmake
new file mode 100644
index 00000000..594dbe73
--- /dev/null
+++ b/cmake/portaudio.cmake
@@ -0,0 +1,67 @@
+# Download PortAudio (pa_stable_v190700_20210406) and add it to the build
+# with add_subdirectory().
+#
+# If the tarball already exists at one of possible_file_locations below, that
+# local copy is used instead of the network URL; the same URL_HASH is passed
+# in both cases. BUILD_SHARED_LIBS selects a shared or a static PortAudio.
+function(download_portaudio)
+  include(FetchContent)
+
+  set(portaudio_URL "http://files.portaudio.com/archives/pa_stable_v190700_20210406.tgz")
+  set(portaudio_HASH "SHA256=47efbf42c77c19a05d22e627d42873e991ec0c1357219c0d74ce6a2948cb2def")
+
+  # If you don't have access to the Internet, please download it to your
+  # local drive and modify the following line according to your needs.
+  set(possible_file_locations
+    $ENV{HOME}/Downloads/pa_stable_v190700_20210406.tgz
+    $ENV{HOME}/asr/pa_stable_v190700_20210406.tgz
+    ${PROJECT_SOURCE_DIR}/pa_stable_v190700_20210406.tgz
+    ${PROJECT_BINARY_DIR}/pa_stable_v190700_20210406.tgz
+    /tmp/pa_stable_v190700_20210406.tgz
+  )
+
+  foreach(f IN LISTS possible_file_locations)
+    if(EXISTS "${f}")
+      set(portaudio_URL "file://${f}")
+      break()
+    endif()
+  endforeach()
+
+  # Force exactly one of the PA_BUILD_* cache options on, following BUILD_SHARED_LIBS.
+  if(BUILD_SHARED_LIBS)
+    set(PA_BUILD_SHARED ON CACHE BOOL "" FORCE)
+    set(PA_BUILD_STATIC OFF CACHE BOOL "" FORCE)
+  else()
+    set(PA_BUILD_SHARED OFF CACHE BOOL "" FORCE)
+    set(PA_BUILD_STATIC ON CACHE BOOL "" FORCE)
+  endif()
+
+  FetchContent_Declare(portaudio
+    URL ${portaudio_URL}
+    URL_HASH ${portaudio_HASH}
+  )
+
+  FetchContent_GetProperties(portaudio)
+  if(NOT portaudio_POPULATED)
+    message(STATUS "Downloading portaudio from ${portaudio_URL}")
+    FetchContent_Populate(portaudio)
+  endif()
+  message(STATUS "portaudio is downloaded to ${portaudio_SOURCE_DIR}")
+  message(STATUS "portaudio's binary dir is ${portaudio_BINARY_DIR}")
+
+  if(APPLE)
+    # Avoids a CMake warning on macOS (presumably the MACOSX_RPATH policy
+    # warning -- TODO confirm).
+    set(CMAKE_MACOSX_RPATH ON)
+  endif()
+
+  # EXCLUDE_FROM_ALL: only PortAudio targets that something links against
+  # are built by default.
+  add_subdirectory(${portaudio_SOURCE_DIR} ${portaudio_BINARY_DIR} EXCLUDE_FROM_ALL)
+endfunction()
+
+download_portaudio()
+
+# Note
+# See http://portaudio.com/docs/v19-doxydocs/tutorial_start.html
+# for how to use portaudio
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index 4a9c007c..6663262c 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -65,6 +65,24 @@ if(SHERPA_ONNX_HAS_ALSA)
   install(TARGETS sherpa-onnx-alsa DESTINATION bin)
 endif()
 
+if(SHERPA_ONNX_ENABLE_PORTAUDIO)
+  add_executable(sherpa-onnx-microphone
+    sherpa-onnx-microphone.cc
+    microphone.cc
+  )
+
+  if(BUILD_SHARED_LIBS)
+    set(PA_LIB portaudio)
+  else()
+    set(PA_LIB portaudio_static)
+  endif()
+
+  target_link_libraries(sherpa-onnx-microphone PRIVATE ${PA_LIB} sherpa-onnx-core)
+
+  install(TARGETS sherpa-onnx-microphone DESTINATION bin)
+endif()
+
+
 if(SHERPA_ONNX_ENABLE_TESTS)
   set(sherpa_onnx_test_srcs
     cat-test.cc
diff --git a/sherpa-onnx/csrc/microphone.cc b/sherpa-onnx/csrc/microphone.cc
new file mode 100644
index 00000000..956574f4
--- /dev/null
+++ b/sherpa-onnx/csrc/microphone.cc
@@ -0,0 +1,32 @@
+// sherpa-onnx/csrc/microphone.cc
+//
+// Copyright (c) 2022-2023 Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/microphone.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "portaudio.h"  // NOLINT
+
+namespace sherpa_onnx {
+
+// Initializes PortAudio; on failure prints the error and exits with -1.
+Microphone::Microphone() {
+  PaError err = Pa_Initialize();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-1);
+  }
+}
+
+// Shuts PortAudio down; on failure prints the error and exits with -1.
+Microphone::~Microphone() {
+  PaError err = Pa_Terminate();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-1);
+  }
+}
+
+}  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/microphone.h b/sherpa-onnx/csrc/microphone.h
new file mode 100644
index 00000000..61cedbf3
--- /dev/null
+++ b/sherpa-onnx/csrc/microphone.h
@@ -0,0 +1,24 @@
+// sherpa-onnx/csrc/microphone.h
+//
+// Copyright (c) 2022-2023 Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_MICROPHONE_H_
+#define SHERPA_ONNX_CSRC_MICROPHONE_H_
+
+namespace sherpa_onnx {
+
+// Scoped PortAudio session: the constructor calls Pa_Initialize() and the
+// destructor calls Pa_Terminate(). If either call fails, the PortAudio error
+// text is printed to stderr and the process exits.
+//
+// NOTE(review): the class is implicitly copyable, and every copy would call
+// Pa_Terminate() in its destructor; consider deleting the copy operations.
+class Microphone {
+ public:
+  Microphone();
+  ~Microphone();
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_MICROPHONE_H_
diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone.cc
new file mode 100644
index 00000000..03ad3a46
--- /dev/null
+++ b/sherpa-onnx/csrc/sherpa-onnx-microphone.cc
@@ -0,0 +1,181 @@
+// sherpa-onnx/csrc/sherpa-onnx-microphone.cc
+//
+// Copyright (c) 2022-2023 Xiaomi Corporation
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cctype>  // std::tolower
+#include <string>
+
+#include "portaudio.h"  // NOLINT
+#include "sherpa-onnx/csrc/display.h"
+#include "sherpa-onnx/csrc/microphone.h"
+#include "sherpa-onnx/csrc/online-recognizer.h"
+
+// Set by the SIGINT handler and polled by the audio callback and the main
+// loop. Atomic so the store in the handler is safe and visible to both.
+std::atomic<bool> stop(false);
+
+// PortAudio input callback: forwards the captured samples to the stream
+// passed through user_data and asks PortAudio to finish once `stop` is set.
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void *user_data) {
+  auto stream = reinterpret_cast<sherpa_onnx::OnlineStream *>(user_data);
+
+  // main() opens the stream as mono paFloat32, so the buffer is read as
+  // frames_per_buffer floats.
+  stream->AcceptWaveform(16000, reinterpret_cast<const float *>(input_buffer),
+                         frames_per_buffer);
+
+  return stop ? paComplete : paContinue;
+}
+
+// SIGINT handler. It only sets the flag: fprintf() is not async-signal-safe,
+// so main() prints the "Caught Ctrl + C" message after its loop exits.
+static void Handler(int32_t /*sig*/) { stop = true; }
+
+int32_t main(int32_t argc, char *argv[]) {
+  if (argc < 5 || argc > 6) {
+    const char *usage = R"usage(
+Usage:
+  ./bin/sherpa-onnx-microphone \
+    /path/to/tokens.txt \
+    /path/to/encoder.onnx\
+    /path/to/decoder.onnx\
+    /path/to/joiner.onnx\
+    [num_threads]
+
+Please refer to
+https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
+for a list of pre-trained models to download.
+)usage";
+    fprintf(stderr, "%s\n", usage);
+    fprintf(stderr, "argc, %d\n", argc);
+
+    return 0;
+  }
+  signal(SIGINT, Handler);
+
+  sherpa_onnx::OnlineRecognizerConfig config;
+  config.tokens = argv[1];
+
+  config.model_config.debug = false;
+  config.model_config.encoder_filename = argv[2];
+  config.model_config.decoder_filename = argv[3];
+  config.model_config.joiner_filename = argv[4];
+
+  // Default to 2 threads; a positive optional 5th argument overrides it.
+  config.model_config.num_threads = 2;
+  if (argc == 6 && atoi(argv[5]) > 0) {
+    config.model_config.num_threads = atoi(argv[5]);
+  }
+
+  config.enable_endpoint = true;
+
+  config.endpoint_config.rule1.min_trailing_silence = 2.4;
+  config.endpoint_config.rule2.min_trailing_silence = 1.2;
+  config.endpoint_config.rule3.min_utterance_length = 300;
+
+  fprintf(stderr, "%s\n", config.ToString().c_str());
+
+  sherpa_onnx::OnlineRecognizer recognizer(config);
+  auto s = recognizer.CreateStream();
+
+  // Keeps PortAudio initialized until main() returns.
+  sherpa_onnx::Microphone mic;
+
+  PaDeviceIndex num_devices = Pa_GetDeviceCount();
+  fprintf(stderr, "Num devices: %d\n", num_devices);
+
+  PaStreamParameters param;
+
+  param.device = Pa_GetDefaultInputDevice();
+  if (param.device == paNoDevice) {
+    fprintf(stderr, "No default input device found\n");
+    exit(EXIT_FAILURE);
+  }
+  fprintf(stderr, "Use default device: %d\n", param.device);
+
+  const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
+  fprintf(stderr, " Name: %s\n", info->name);
+  fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);
+
+  param.channelCount = 1;
+  param.sampleFormat = paFloat32;
+
+  param.suggestedLatency = info->defaultLowInputLatency;
+  param.hostApiSpecificStreamInfo = nullptr;
+  const float sample_rate = 16000;
+
+  PaStream *stream;
+  PaError err =
+      Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
+                    sample_rate,
+                    0,          // frames per buffer
+                    paClipOff,  // we won't output out of range samples
+                                // so don't bother clipping them
+                    RecordCallback, s.get());
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(EXIT_FAILURE);
+  }
+
+  err = Pa_StartStream(stream);
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(EXIT_FAILURE);
+  }
+  // Announce the start only once Pa_StartStream() has succeeded.
+  fprintf(stderr, "Started\n");
+
+  std::string last_text;
+  int32_t segment_index = 0;
+  sherpa_onnx::Display display;
+  while (!stop) {
+    while (recognizer.IsReady(s.get())) {
+      recognizer.DecodeStream(s.get());
+    }
+
+    auto text = recognizer.GetResult(s.get()).text;
+    bool is_endpoint = recognizer.IsEndpoint(s.get());
+
+    if (!text.empty() && last_text != text) {
+      last_text = text;
+
+      // std::tolower() is undefined for negative char values, so go through
+      // unsigned char.
+      std::transform(text.begin(), text.end(), text.begin(),
+                     [](unsigned char c) {
+                       return static_cast<char>(std::tolower(c));
+                     });
+
+      display.Print(segment_index, text);
+    }
+
+    if (!text.empty() && is_endpoint) {
+      ++segment_index;
+      recognizer.Reset(s.get());
+    }
+
+    Pa_Sleep(20);  // sleep for 20ms
+  }
+
+  // Printed here instead of inside the signal handler (see Handler above).
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+
+  err = Pa_CloseStream(stream);
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(EXIT_FAILURE);
+  }
+
+  return 0;
+}