diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt
index bb6c980f..d6d50b48 100644
--- a/cxx-api-examples/CMakeLists.txt
+++ b/cxx-api-examples/CMakeLists.txt
@@ -49,6 +49,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
     portaudio_static
   )
 
+  add_executable(parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api
+    ./parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
+  )
+  target_link_libraries(parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api
+    sherpa-onnx-cxx-api
+    portaudio_static
+  )
+
   add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api
     ./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
     ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
@@ -57,6 +66,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
     sherpa-onnx-cxx-api
     portaudio_static
   )
+
+  add_executable(zipformer-transducer-simulate-streaming-microphone-cxx-api
+    ./zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
+  )
+  target_link_libraries(zipformer-transducer-simulate-streaming-microphone-cxx-api
+    sherpa-onnx-cxx-api
+    portaudio_static
+  )
 endif()
 
 if(SHERPA_ONNX_HAS_ALSA)
diff --git a/cxx-api-examples/parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
new file mode 100644
index 00000000..59bb2c2e
--- /dev/null
+++ b/cxx-api-examples/parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
@@ -0,0 +1,253 @@
+// cxx-api-examples/parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
+// Copyright (c) 2025 Xiaomi Corporation
+//
+// This file demonstrates how to use parakeet-tdt-ctc with sherpa-onnx's C++
+// API for simulated streaming speech recognition from a microphone: a VAD
+// segments the audio and a non-streaming recognizer decodes each segment.
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
+// tar xvf sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
+// rm sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
+//
+// clang-format on
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atomic>
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <iostream>
+#include <mutex>  // NOLINT
+#include <queue>
+#include <vector>
+
+#include "portaudio.h"       // NOLINT
+#include "sherpa-display.h"  // NOLINT
+#include "sherpa-onnx/c-api/cxx-api.h"
+#include "sherpa-onnx/csrc/microphone.h"
+
+// Audio chunks handed from the PortAudio callback to the main loop.
+std::queue<std::vector<float>> samples_queue;
+std::condition_variable condition_variable;
+std::mutex mutex;
+// Written by the SIGINT handler and read by the audio callback and the main
+// loop, so it is atomic rather than a plain bool.
+std::atomic<bool> stop{false};
+
+// SIGINT handler: requests shutdown and wakes up the main loop.
+static void Handler(int32_t /*sig*/) {
+  stop = true;
+  condition_variable.notify_one();
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+}
+
+// PortAudio input callback: queues the captured frames for the main loop and
+// returns paComplete once shutdown has been requested.
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void * /*user_data*/) {
+  std::lock_guard<std::mutex> lock(mutex);
+  samples_queue.emplace(
+      reinterpret_cast<const float *>(input_buffer),
+      reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
+  condition_variable.notify_one();
+
+  return stop ? paComplete : paContinue;
+}
+
+// Creates the silero VAD; exits the process if it cannot be created.
+static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  VadModelConfig config;
+  config.silero_vad.model = "./silero_vad.onnx";
+  config.silero_vad.threshold = 0.25;
+  config.silero_vad.min_silence_duration = 0.25;
+  config.silero_vad.min_speech_duration = 0.25;
+  config.silero_vad.max_speech_duration = 5;
+  config.sample_rate = 16000;
+  config.debug = false;
+
+  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 60);
+  if (!vad.Get()) {
+    std::cerr << "Failed to create VAD. Please check your config\n";
+    exit(-1);
+  }
+
+  return vad;
+}
+
+// Creates the non-streaming NeMo CTC recognizer; exits the process on failure.
+static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  OfflineRecognizerConfig config;
+
+  config.model_config.nemo_ctc.model =
+      "./sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/model.int8.onnx";
+  config.model_config.tokens =
+      "./sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/tokens.txt";
+
+  config.model_config.num_threads = 2;
+  config.model_config.debug = false;
+
+  std::cout << "Loading model\n";
+  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
+  if (!recognizer.Get()) {
+    std::cerr << "Please check your config\n";
+    exit(-1);
+  }
+  std::cout << "Loading model done\n";
+  return recognizer;
+}
+
+int32_t main() {
+  signal(SIGINT, Handler);
+
+  using namespace sherpa_onnx::cxx;  // NOLINT
+
+  auto vad = CreateVad();
+  auto recognizer = CreateOfflineRecognizer();
+
+  sherpa_onnx::Microphone mic;
+
+  PaDeviceIndex num_devices = Pa_GetDeviceCount();
+  if (num_devices == 0) {
+    std::cerr << " If you are using Linux, please try to modify "
+                 "./build/bin/sense-voice-simulate-streaming-alsa-cxx-api\n";
+    return -1;
+  }
+
+  int32_t device_index = Pa_GetDefaultInputDevice();
+  const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
+  if (pDeviceIndex) {
+    fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
+    device_index = atoi(pDeviceIndex);
+  }
+  mic.PrintDevices(device_index);
+
+  float mic_sample_rate = 16000;
+  const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
+  if (sample_rate_str) {
+    mic_sample_rate = atof(sample_rate_str);
+    fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
+  }
+
+  float sample_rate = 16000;
+  LinearResampler resampler;
+  if (mic_sample_rate != sample_rate) {
+    float min_freq = std::min(mic_sample_rate, sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
+                                        lowpass_cutoff, lowpass_filter_width);
+  }
+  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
+                      nullptr)) {
+    std::cerr << "Failed to open microphone device\n";
+    return -1;
+  }
+
+  int32_t window_size = 512;  // samples, please don't change
+
+  int32_t offset = 0;
+  std::vector<float> buffer;
+  bool speech_started = false;
+
+  auto started_time = std::chrono::steady_clock::now();
+
+  SherpaDisplay display;
+
+  std::cout << "Started! Please speak\n";
+
+  while (!stop) {
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      while (samples_queue.empty() && !stop) {
+        condition_variable.wait(lock);
+      }
+
+      // Ctrl + C ends the wait with nothing queued; calling front() on an
+      // empty queue is undefined behavior, so leave the loop instead.
+      if (samples_queue.empty()) {
+        break;
+      }
+
+      const auto &s = samples_queue.front();
+      if (!resampler.Get()) {
+        buffer.insert(buffer.end(), s.begin(), s.end());
+      } else {
+        auto resampled = resampler.Resample(s.data(), s.size(), false);
+        buffer.insert(buffer.end(), resampled.begin(), resampled.end());
+      }
+
+      samples_queue.pop();
+    }
+
+    for (; offset + window_size < buffer.size(); offset += window_size) {
+      vad.AcceptWaveform(buffer.data() + offset, window_size);
+      if (!speech_started && vad.IsDetected()) {
+        speech_started = true;
+        started_time = std::chrono::steady_clock::now();
+      }
+    }
+    if (!speech_started) {
+      if (buffer.size() > 10 * window_size) {
+        offset -= buffer.size() - 10 * window_size;
+        buffer = {buffer.end() - 10 * window_size, buffer.end()};
+      }
+    }
+
+    auto current_time = std::chrono::steady_clock::now();
+    const float elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
+                                                              started_time)
+            .count() /
+        1000.;
+
+    if (speech_started && elapsed_seconds > 0.2) {
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+      display.UpdateText(result.text);
+      display.Display();
+
+      started_time = std::chrono::steady_clock::now();
+    }
+
+    while (!vad.IsEmpty()) {
+      auto segment = vad.Front();
+
+      vad.Pop();
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, segment.samples.data(),
+                            segment.samples.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+
+      display.UpdateText(result.text);
+      display.FinalizeCurrentSentence();
+      display.Display();
+
+      buffer.clear();
+      offset = 0;
+      speech_started = false;
+    }
+  }
+
+  return 0;
+}
diff --git a/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
index bc58d9ec..ccbe2813 100644
--- a/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
+++ b/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
@@ -136,11 +136,7 @@ int32_t main() {
     fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
     mic_sample_rate = atof(sample_rate_str);
   }
-  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
-                      nullptr) == false) {
-    std::cerr << "Failed to open microphone device\n";
-    return -1;
-  }
+
   float sample_rate = 16000;
   LinearResampler resampler;
   if (mic_sample_rate != sample_rate) {
@@ -152,6 +148,12 @@ int32_t main() {
                                         lowpass_cutoff, lowpass_filter_width);
   }
 
+  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
+                      nullptr)) {
+    std::cerr << "Failed to open microphone device\n";
+    return -1;
+  }
+
   int32_t window_size = 512;  // samples, please don't change
 
   int32_t offset = 0;
diff --git a/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc
index ed9aa891..9cc847a3 100644
--- a/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc
+++ b/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc
@@ -142,8 +142,8 @@ int32_t main() {
     resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
                                         lowpass_cutoff, lowpass_filter_width);
   }
-  if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
-                     nullptr) == false) {
+  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
+                      nullptr)) {
     std::cerr << "Failed to open microphone device\n";
     return -1;
   }
diff --git a/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
index f7eb117e..006b7747 100644
--- a/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
+++ b/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
@@ -140,8 +140,8 @@ int32_t main() {
     resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
                                         lowpass_cutoff, lowpass_filter_width);
   }
-  if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
-                     nullptr) == false) {
+  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
+                      nullptr)) {
     std::cerr << "Failed to open microphone device\n";
     return -1;
   }
diff --git a/cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
new file mode 100644
index 00000000..ac0ebeea
--- /dev/null
+++ b/cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
@@ -0,0 +1,261 @@
+// cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
+// Copyright (c) 2025 Xiaomi Corporation
+//
+// This file demonstrates how to use a Zipformer transducer with sherpa-onnx's
+// C++ API for simulated streaming speech recognition from a microphone: a VAD
+// segments the audio and a non-streaming recognizer decodes each segment.
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
+// tar xvf sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
+// rm sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
+//
+// clang-format on
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atomic>
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <iostream>
+#include <mutex>  // NOLINT
+#include <queue>
+#include <vector>
+
+#include "portaudio.h"       // NOLINT
+#include "sherpa-display.h"  // NOLINT
+#include "sherpa-onnx/c-api/cxx-api.h"
+#include "sherpa-onnx/csrc/microphone.h"
+
+// Audio chunks handed from the PortAudio callback to the main loop.
+std::queue<std::vector<float>> samples_queue;
+std::condition_variable condition_variable;
+std::mutex mutex;
+// Written by the SIGINT handler and read by the audio callback and the main
+// loop, so it is atomic rather than a plain bool.
+std::atomic<bool> stop{false};
+
+// SIGINT handler: requests shutdown and wakes up the main loop.
+static void Handler(int32_t /*sig*/) {
+  stop = true;
+  condition_variable.notify_one();
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+}
+
+// PortAudio input callback: queues the captured frames for the main loop and
+// returns paComplete once shutdown has been requested.
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void * /*user_data*/) {
+  std::lock_guard<std::mutex> lock(mutex);
+  samples_queue.emplace(
+      reinterpret_cast<const float *>(input_buffer),
+      reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
+  condition_variable.notify_one();
+
+  return stop ? paComplete : paContinue;
+}
+
+// Creates the silero VAD; exits the process if it cannot be created.
+static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  VadModelConfig config;
+  config.silero_vad.model = "./silero_vad.onnx";
+  config.silero_vad.threshold = 0.5;
+  config.silero_vad.min_silence_duration = 0.1;
+  config.silero_vad.min_speech_duration = 0.25;
+  config.silero_vad.max_speech_duration = 8;
+  config.sample_rate = 16000;
+  config.debug = false;
+
+  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
+  if (!vad.Get()) {
+    std::cerr << "Failed to create VAD. Please check your config\n";
+    exit(-1);
+  }
+
+  return vad;
+}
+
+// Creates the non-streaming transducer recognizer; exits the process on failure.
+static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  OfflineRecognizerConfig config;
+
+  config.model_config.transducer.encoder =
+      "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/"
+      "encoder-epoch-99-avg-1.int8.onnx";
+
+  config.model_config.transducer.decoder =
+      "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/"
+      "decoder-epoch-99-avg-1.onnx";
+
+  config.model_config.transducer.joiner =
+      "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/"
+      "joiner-epoch-99-avg-1.int8.onnx";
+  config.model_config.tokens =
+      "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt";
+
+  config.model_config.num_threads = 2;
+  config.model_config.debug = false;
+
+  std::cout << "Loading model\n";
+  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
+  if (!recognizer.Get()) {
+    std::cerr << "Please check your config\n";
+    exit(-1);
+  }
+  std::cout << "Loading model done\n";
+  return recognizer;
+}
+
+int32_t main() {
+  signal(SIGINT, Handler);
+
+  using namespace sherpa_onnx::cxx;  // NOLINT
+
+  auto vad = CreateVad();
+  auto recognizer = CreateOfflineRecognizer();
+
+  sherpa_onnx::Microphone mic;
+
+  PaDeviceIndex num_devices = Pa_GetDeviceCount();
+  if (num_devices == 0) {
+    std::cerr << " If you are using Linux, please try "
+                 "./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n";
+    return -1;
+  }
+
+  int32_t device_index = Pa_GetDefaultInputDevice();
+  const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
+  if (pDeviceIndex) {
+    fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
+    device_index = atoi(pDeviceIndex);
+  }
+  mic.PrintDevices(device_index);
+
+  float mic_sample_rate = 16000;
+  const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
+  if (sample_rate_str) {
+    mic_sample_rate = atof(sample_rate_str);
+    fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
+  }
+  float sample_rate = 16000;
+  LinearResampler resampler;
+  if (mic_sample_rate != sample_rate) {
+    float min_freq = std::min(mic_sample_rate, sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
+                                        lowpass_cutoff, lowpass_filter_width);
+  }
+  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
+                      nullptr)) {
+    std::cerr << "Failed to open microphone device\n";
+    return -1;
+  }
+
+  int32_t window_size = 512;  // samples, please don't change
+
+  int32_t offset = 0;
+  std::vector<float> buffer;
+  bool speech_started = false;
+
+  auto started_time = std::chrono::steady_clock::now();
+
+  SherpaDisplay display;
+
+  std::cout << "Started! Please speak\n";
+
+  while (!stop) {
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      while (samples_queue.empty() && !stop) {
+        condition_variable.wait(lock);
+      }
+
+      // Ctrl + C ends the wait with nothing queued; calling front() on an
+      // empty queue is undefined behavior, so leave the loop instead.
+      if (samples_queue.empty()) {
+        break;
+      }
+
+      const auto &s = samples_queue.front();
+      if (!resampler.Get()) {
+        buffer.insert(buffer.end(), s.begin(), s.end());
+      } else {
+        auto resampled = resampler.Resample(s.data(), s.size(), false);
+        buffer.insert(buffer.end(), resampled.begin(), resampled.end());
+      }
+
+      samples_queue.pop();
+    }
+
+    for (; offset + window_size < buffer.size(); offset += window_size) {
+      vad.AcceptWaveform(buffer.data() + offset, window_size);
+      if (!speech_started && vad.IsDetected()) {
+        speech_started = true;
+        started_time = std::chrono::steady_clock::now();
+      }
+    }
+    if (!speech_started) {
+      if (buffer.size() > 10 * window_size) {
+        offset -= buffer.size() - 10 * window_size;
+        buffer = {buffer.end() - 10 * window_size, buffer.end()};
+      }
+    }
+
+    auto current_time = std::chrono::steady_clock::now();
+    const float elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
+                                                              started_time)
+            .count() /
+        1000.;
+
+    if (speech_started && elapsed_seconds > 0.2) {
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+      display.UpdateText(result.text);
+      display.Display();
+
+      started_time = std::chrono::steady_clock::now();
+    }
+
+    while (!vad.IsEmpty()) {
+      auto segment = vad.Front();
+
+      vad.Pop();
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, segment.samples.data(),
+                            segment.samples.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+
+      display.UpdateText(result.text);
+      display.FinalizeCurrentSentence();
+      display.Display();
+
+      buffer.clear();
+      offset = 0;
+      speech_started = false;
+    }
+  }
+
+  return 0;
+}
+