diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index 4ae59609..69e2b644 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -36,6 +36,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) sherpa-onnx-cxx-api portaudio_static ) + + add_executable(parakeet-tdt-simulate-streaming-microphone-cxx-api + ./parakeet-tdt-simulate-streaming-microphone-cxx-api.cc + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc + ) + target_link_libraries(parakeet-tdt-simulate-streaming-microphone-cxx-api + sherpa-onnx-cxx-api + portaudio_static + ) endif() add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) diff --git a/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc new file mode 100644 index 00000000..9f150679 --- /dev/null +++ b/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc @@ -0,0 +1,286 @@ +// cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc +// Copyright (c) 2025 Xiaomi Corporation + +// +// This file demonstrates how to use parakeet-tdt with sherpa-onnx's C++ API +// for streaming speech recognition from a microphone. 
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
+// tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
+// rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
+//
+// clang-format on
+
+// NOTE(review): the angle-bracket header names were lost when this patch was
+// pasted; reconstructed below from usage -- please verify against the branch.
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <iostream>
+#include <mutex>               // NOLINT
+#include <queue>
+#include <vector>
+
+#include "portaudio.h"        // NOLINT
+#include "sherpa-display.h"   // NOLINT
+#include "sherpa-onnx/c-api/cxx-api.h"
+#include "sherpa-onnx/csrc/microphone.h"
+
+// Audio chunks produced on the portaudio callback thread and consumed by the
+// main thread; protected by `mutex` and signalled via `condition_variable`.
+std::queue<std::vector<float>> samples_queue;
+std::condition_variable condition_variable;
+std::mutex mutex;
+
+// Written by the SIGINT handler and read by both the audio callback thread
+// and the main loop, so it must be atomic (a plain bool is a data race).
+std::atomic<bool> stop{false};
+
+// SIGINT (Ctrl+C) handler: request shutdown and wake the waiting main loop.
+static void Handler(int32_t /*sig*/) {
+  stop = true;
+  condition_variable.notify_one();
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+}
+
+// Runs on the portaudio thread: copy the captured float32 samples into the
+// shared queue and wake the consumer. Returns paComplete once stop is set.
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void * /*user_data*/) {
+  std::lock_guard<std::mutex> lock(mutex);
+  samples_queue.emplace(
+      reinterpret_cast<const float *>(input_buffer),
+      reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
+  condition_variable.notify_one();
+
+  return stop ? paComplete : paContinue;
+}
+
+// Build a silero-vad based voice activity detector; exits on failure.
+static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  VadModelConfig config;
+  config.silero_vad.model = "./silero_vad.onnx";
+  config.silero_vad.threshold = 0.5;
+  config.silero_vad.min_silence_duration = 0.25;
+  config.silero_vad.min_speech_duration = 0.25;
+  config.silero_vad.max_speech_duration = 5;
+  config.sample_rate = 16000;
+  config.debug = false;
+
+  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 60);
+  if (!vad.Get()) {
+    std::cerr << "Failed to create VAD. 
Please check your config\n";
+    exit(-1);
+  }
+
+  return vad;
+}
+
+// Build the offline (non-streaming) parakeet-tdt recognizer; exits on failure.
+static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  OfflineRecognizerConfig config;
+
+  config.model_config.transducer.encoder =
+      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/encoder.int8.onnx";
+  config.model_config.transducer.decoder =
+      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/decoder.int8.onnx";
+  config.model_config.transducer.joiner =
+      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/joiner.int8.onnx";
+  config.model_config.tokens =
+      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/tokens.txt";
+
+  config.model_config.model_type = "nemo_transducer";
+
+  config.model_config.num_threads = 2;
+  config.model_config.debug = false;
+
+  std::cout << "Loading model\n";
+  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
+  if (!recognizer.Get()) {
+    std::cerr << "Please check your config\n";
+    exit(-1);
+  }
+  std::cout << "Loading model done\n";
+  return recognizer;
+}
+
+int32_t main() {
+  signal(SIGINT, Handler);
+
+  using namespace sherpa_onnx::cxx;  // NOLINT
+
+  auto vad = CreateVad();
+  auto recognizer = CreateOfflineRecognizer();
+
+  sherpa_onnx::Microphone mic;
+
+  PaDeviceIndex num_devices = Pa_GetDeviceCount();
+  std::cout << "Num devices: " << num_devices << "\n";
+  if (num_devices == 0) {
+    std::cerr << " If you are using Linux, please try "
+                 "./build/bin/sense-voice-simulate-streaming-alsa-cxx-api\n";
+    return -1;
+  }
+
+  int32_t device_index = Pa_GetDefaultInputDevice();
+
+  // Allow overriding the capture device via an environment variable.
+  const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
+  if (pDeviceIndex) {
+    fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
+    device_index = atoi(pDeviceIndex);
+  }
+
+  for (int32_t i = 0; i != num_devices; ++i) {
+    const PaDeviceInfo *info = Pa_GetDeviceInfo(i);
+    fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i,
+            info->name);
+  }
+
+  PaStreamParameters param;
+  param.device = device_index;
+
+  fprintf(stderr, "Use device: %d\n", param.device);
+
+  const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
+  fprintf(stderr, " Name: %s\n", info->name);
+  fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);
+
+  param.channelCount = 1;
+  param.sampleFormat = paFloat32;
+
+  param.suggestedLatency = info->defaultLowInputLatency;
+  param.hostApiSpecificStreamInfo = nullptr;
+  float mic_sample_rate = 16000;
+  const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
+  if (sample_rate_str) {
+    // Parse first, then log: the previous order logged the stale default
+    // (16000) instead of the rate the user actually requested.
+    mic_sample_rate = atof(sample_rate_str);
+    fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
+  }
+  float sample_rate = 16000;
+  LinearResampler resampler;
+  if (mic_sample_rate != sample_rate) {
+    float min_freq = std::min(mic_sample_rate, sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
+                                        lowpass_cutoff, lowpass_filter_width);
+  }
+
+  PaStream *stream;
+  PaError err =
+      Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
+                    mic_sample_rate,
+                    0,          // frames per buffer
+                    paClipOff,  // we won't output out of range samples
+                                // so don't bother clipping them
+                    RecordCallback,  // RecordCallback is run in a separate
+                                     // thread created by portaudio
+                    nullptr);
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(EXIT_FAILURE);
+  }
+
+  err = Pa_StartStream(stream);
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(EXIT_FAILURE);
+  }
+
+  int32_t window_size = 512;  // samples, please don't change
+
+  int32_t offset = 0;
+  std::vector<float> buffer;
+  bool speech_started = false;
+
+  auto started_time = std::chrono::steady_clock::now();
+
+  SherpaDisplay display;
+
+  std::cout << "Started! 
Please speak\n";
+
+  while (!stop) {
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      while (samples_queue.empty() && !stop) {
+        condition_variable.wait(lock);
+      }
+      if (samples_queue.empty()) {
+        // Woken by Ctrl+C with no pending audio: calling front() on an
+        // empty queue would be undefined behavior, so leave the loop.
+        break;
+      }
+
+      const auto &s = samples_queue.front();
+      if (!resampler.Get()) {
+        buffer.insert(buffer.end(), s.begin(), s.end());
+      } else {
+        auto resampled = resampler.Resample(s.data(), s.size(), false);
+        buffer.insert(buffer.end(), resampled.begin(), resampled.end());
+      }
+
+      samples_queue.pop();
+    }
+
+    // Feed the VAD in fixed 512-sample windows.
+    for (; offset + window_size < buffer.size(); offset += window_size) {
+      vad.AcceptWaveform(buffer.data() + offset, window_size);
+      if (!speech_started && vad.IsDetected()) {
+        speech_started = true;
+        started_time = std::chrono::steady_clock::now();
+      }
+    }
+    if (!speech_started) {
+      // While waiting for speech, keep only the most recent 10 windows so
+      // the buffer does not grow without bound.
+      if (buffer.size() > 10 * window_size) {
+        offset -= buffer.size() - 10 * window_size;
+        buffer = {buffer.end() - 10 * window_size, buffer.end()};
+      }
+    }
+
+    auto current_time = std::chrono::steady_clock::now();
+    const float elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
+                                                              started_time)
+            .count() /
+        1000.;
+
+    // Emit a partial (in-progress) result every 0.2 seconds.
+    if (speech_started && elapsed_seconds > 0.2) {
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+      display.UpdateText(result.text);
+      display.Display();
+
+      started_time = std::chrono::steady_clock::now();
+    }
+
+    // Finalize every speech segment the VAD has completed.
+    while (!vad.IsEmpty()) {
+      auto segment = vad.Front();
+
+      vad.Pop();
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, segment.samples.data(),
+                            segment.samples.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+
+      display.UpdateText(result.text);
+      display.FinalizeCurrentSentence();
+      display.Display();
+
+      buffer.clear();
+      offset = 0;
+      speech_started = false;
+    }
+  }
+
+  err = Pa_CloseStream(stream);
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(EXIT_FAILURE);
+  }
+
+  return 0;
+}
diff --git a/cxx-api-examples/sherpa-display.h b/cxx-api-examples/sherpa-display.h
index b0fc4605..996babe1 100644
--- a/cxx-api-examples/sherpa-display.h
+++ b/cxx-api-examples/sherpa-display.h
@@ -14,7 +14,8 @@ class SherpaDisplay {
   void UpdateText(const std::string &text) { current_text_ = text; }
 
   void FinalizeCurrentSentence() {
-    if (!current_text_.empty() && current_text_[0] != ' ') {
+    if (!current_text_.empty() &&
+        (current_text_[0] != ' ' || current_text_.size() > 1)) {
       sentences_.push_back({GetCurrentDateTime(), std::move(current_text_)});
     }
   }