diff --git a/.github/workflows/aarch64-linux-gnu-shared.yaml b/.github/workflows/aarch64-linux-gnu-shared.yaml index a6af5468..159138c3 100644 --- a/.github/workflows/aarch64-linux-gnu-shared.yaml +++ b/.github/workflows/aarch64-linux-gnu-shared.yaml @@ -82,6 +82,8 @@ jobs: .. make -j4 install + cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin + rm -rf install/lib/pkgconfig rm -fv install/lib/cargs.h rm -fv install/lib/libcargs.so @@ -126,6 +128,8 @@ jobs: make -j4 install + cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin + rm -rf install/lib/pkgconfig rm -fv install/lib/cargs.h rm -fv install/lib/libcargs.so @@ -242,7 +246,7 @@ jobs: file: sherpa-onnx-*linux-aarch64*.tar.bz2 # repo_name: k2-fsa/sherpa-onnx # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} - # tag: v1.11.1 + # tag: v1.11.5 - name: Test offline Moonshine if: matrix.build_type != 'Debug' diff --git a/.github/workflows/aarch64-linux-gnu-static.yaml b/.github/workflows/aarch64-linux-gnu-static.yaml index 376d4f51..a8da8ce7 100644 --- a/.github/workflows/aarch64-linux-gnu-static.yaml +++ b/.github/workflows/aarch64-linux-gnu-static.yaml @@ -83,6 +83,8 @@ jobs: make install + cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin + ls -lh install/lib rm -rf install/lib/pkgconfig @@ -164,7 +166,7 @@ jobs: file: sherpa-onnx-*linux-aarch64*.tar.bz2 # repo_name: k2-fsa/sherpa-onnx # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} - # tag: v1.10.42 + # tag: v1.11.5 - name: Test offline Moonshine if: matrix.build_type != 'Debug' diff --git a/.github/workflows/android.yaml b/.github/workflows/android.yaml index 490ca35d..eb01120f 100644 --- a/.github/workflows/android.yaml +++ b/.github/workflows/android.yaml @@ -168,7 +168,7 @@ jobs: file: sherpa-onnx-*-android.tar.bz2 # repo_name: k2-fsa/sherpa-onnx # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} - # tag: v1.11.3 + # tag: v1.11.5 build-android-aar: needs: [build-android-libs] @@ -297,7 +297,7 
@@ jobs: file: ./*.aar # repo_name: k2-fsa/sherpa-onnx # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} - # tag: v1.11.3 + # tag: v1.11.5 - name: Release android aar if: github.repository_owner == 'k2-fsa' && github.event_name == 'push' && contains(github.ref, 'refs/tags/') diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index 69e2b644..0c0a8d24 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -47,6 +47,23 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) ) endif() +if(SHERPA_ONNX_HAS_ALSA) + add_executable(sense-voice-simulate-streaming-alsa-cxx-api + ./sense-voice-simulate-streaming-alsa-cxx-api.cc + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc + ) + target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api + sherpa-onnx-cxx-api + portaudio_static + ) + + if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) + target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) + else() + target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound) + endif() +endif() + add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api) diff --git a/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc b/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc new file mode 100644 index 00000000..ab096d3e --- /dev/null +++ b/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc @@ -0,0 +1,243 @@ +// cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc +// Copyright (c) 2025 Xiaomi Corporation + +// +// This file demonstrates how to use sense voice with sherpa-onnx's C++ API +// for streaming speech recognition from a microphone. 
+// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// +// clang-format on + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> + +#include <chrono>  // NOLINT +#include <condition_variable>  // NOLINT +#include <iostream> +#include <mutex>  // NOLINT +#include <queue> +#include <thread> +#include <vector> + +#include "portaudio.h" // NOLINT +#include "sherpa-display.h" // NOLINT +#include "sherpa-onnx/c-api/cxx-api.h" +#include "sherpa-onnx/csrc/alsa.h" + +std::queue<std::vector<float>> samples_queue; +std::condition_variable condition_variable; +std::mutex mutex; +bool stop = false; + +static void Handler(int32_t /*sig*/) { + stop = true; + condition_variable.notify_one(); + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); +} + +static void RecordCallback(sherpa_onnx::Alsa *alsa) { + int32_t chunk = 0.1 * alsa->GetActualSampleRate(); + while (!stop) { + std::vector<float> samples = alsa->Read(chunk); + + std::lock_guard<std::mutex> lock(mutex); + samples_queue.emplace(std::move(samples)); + condition_variable.notify_one(); + } +} + +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() { + using namespace sherpa_onnx::cxx; // NOLINT + VadModelConfig config; + config.silero_vad.model = "./silero_vad.onnx"; + config.silero_vad.threshold = 0.5; + config.silero_vad.min_silence_duration = 0.1; + config.silero_vad.min_speech_duration = 0.25; + config.silero_vad.max_speech_duration = 8; + config.sample_rate = 16000; + config.debug = false; + + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); + if (!vad.Get()) { + std::cerr << "Failed to create VAD. 
Please check your config\n"; + exit(-1); + } + + return vad; +} + +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() { + using namespace sherpa_onnx::cxx; // NOLINT + OfflineRecognizerConfig config; + + config.model_config.sense_voice.model = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; + config.model_config.sense_voice.use_itn = false; + config.model_config.sense_voice.language = "auto"; + config.model_config.tokens = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; + + config.model_config.num_threads = 2; + config.model_config.debug = false; + + std::cout << "Loading model\n"; + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); + if (!recognizer.Get()) { + std::cerr << "Please check your config\n"; + exit(-1); + } + std::cout << "Loading model done\n"; + return recognizer; +} + +int32_t main(int32_t argc, const char *argv[]) { + const char *kUsageMessage = R"usage( +Usage: + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + +./sense-voice-simulate-streaming-alsa-cxx-api device_name + +The device name specifies which microphone to use in case there are several +on your system. You can use + + arecord -l + +to find all available microphones on your computer. For instance, if it outputs + +**** List of CAPTURE Hardware Devices **** +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + +and if you want to select card 3 and device 0 on that card, please use: + + plughw:3,0 + +as the device_name. 
+)usage"; + + if (argc != 2) { + fprintf(stderr, "%s\n", kUsageMessage); + return -1; + } + + signal(SIGINT, Handler); + + using namespace sherpa_onnx::cxx; // NOLINT + + auto vad = CreateVad(); + auto recognizer = CreateOfflineRecognizer(); + + int32_t expected_sample_rate = 16000; + + std::string device_name = argv[1]; + sherpa_onnx::Alsa alsa(device_name.c_str()); + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); + + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), + expected_sample_rate); + exit(-1); + } + + int32_t window_size = 512; // samples, please don't change + + int32_t offset = 0; + std::vector<float> buffer; + bool speech_started = false; + + auto started_time = std::chrono::steady_clock::now(); + + SherpaDisplay display; + + std::thread record_thread(RecordCallback, &alsa); + + std::cout << "Started! Please speak\n"; + + while (!stop) { + { + std::unique_lock<std::mutex> lock(mutex); + while (samples_queue.empty() && !stop) { + condition_variable.wait(lock); + } + + const auto &s = samples_queue.front(); + buffer.insert(buffer.end(), s.begin(), s.end()); + + samples_queue.pop(); + } + + for (; offset + window_size < buffer.size(); offset += window_size) { + vad.AcceptWaveform(buffer.data() + offset, window_size); + if (!speech_started && vad.IsDetected()) { + speech_started = true; + started_time = std::chrono::steady_clock::now(); + } + } + if (!speech_started) { + if (buffer.size() > 10 * window_size) { + offset -= buffer.size() - 10 * window_size; + buffer = {buffer.end() - 10 * window_size, buffer.end()}; + } + } + + auto current_time = std::chrono::steady_clock::now(); + const float elapsed_seconds = + std::chrono::duration_cast<std::chrono::milliseconds>(current_time - + started_time) + .count() / + 1000.; + + if (speech_started && elapsed_seconds > 0.2) { + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size()); + 
+ recognizer.Decode(&stream); + + OfflineRecognizerResult result = recognizer.GetResult(&stream); + display.UpdateText(result.text); + display.Display(); + + started_time = std::chrono::steady_clock::now(); + } + + while (!vad.IsEmpty()) { + auto segment = vad.Front(); + + vad.Pop(); + + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(expected_sample_rate, segment.samples.data(), + segment.samples.size()); + + recognizer.Decode(&stream); + + OfflineRecognizerResult result = recognizer.GetResult(&stream); + + display.UpdateText(result.text); + display.FinalizeCurrentSentence(); + display.Display(); + + buffer.clear(); + offset = 0; + speech_started = false; + } + } + + record_thread.join(); + + return 0; +} diff --git a/cxx-api-examples/sherpa-display.h b/cxx-api-examples/sherpa-display.h index 996babe1..ca8d286c 100644 --- a/cxx-api-examples/sherpa-display.h +++ b/cxx-api-examples/sherpa-display.h @@ -45,10 +45,11 @@ class SherpaDisplay { private: static void ClearScreen() { #ifdef _MSC_VER - system("cls"); + auto ret = system("cls"); #else - system("clear"); + auto ret = system("clear"); #endif + (void)ret; } static std::string GetCurrentDateTime() {