// cxx-api-examples/nemo-canary-cxx-api.cc // // Copyright (c) 2025 Xiaomi Corporation // // This file demonstrates how to use NeMo Canary models with // sherpa-onnx's C++ API. // // clang-format off // // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 // tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 // rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 // // clang-format on // // see https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html // for details #include // NOLINT #include #include #include "sherpa-onnx/c-api/cxx-api.h" int32_t main() { using namespace sherpa_onnx::cxx; // NOLINT OfflineRecognizerConfig config; config.model_config.canary.encoder = "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx"; config.model_config.canary.decoder = "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx"; // our input audio is German, so we set src_lang to "de" config.model_config.canary.src_lang = "de"; // we can set tgt_lang either to de or en in this specific case config.model_config.canary.tgt_lang = "en"; config.model_config.tokens = "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt"; config.model_config.num_threads = 1; std::cout << "Loading model\n"; OfflineRecognizer recognizer = OfflineRecognizer::Create(config); if (!recognizer.Get()) { std::cerr << "Please check your config\n"; return -1; } std::cout << "Loading model done\n"; std::string wave_filename = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/de.wav"; Wave wave = ReadWave(wave_filename); if (wave.samples.empty()) { std::cerr << "Failed to read: '" << wave_filename << "'\n"; return -1; } std::cout << "Start recognition\n"; const auto begin = std::chrono::steady_clock::now(); OfflineStream stream = recognizer.CreateStream(); stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), wave.samples.size()); recognizer.Decode(&stream); OfflineRecognizerResult result = recognizer.GetResult(&stream); const auto end = std::chrono::steady_clock::now(); const float elapsed_seconds = std::chrono::duration_cast(end - begin) .count() / 1000.; float duration = wave.samples.size() / static_cast(wave.sample_rate); float rtf = elapsed_seconds / duration; std::cout << "text (English): " << result.text << "\n"; printf("Number of threads: %d\n", config.model_config.num_threads); printf("Duration: %.3fs\n", duration); printf("Elapsed seconds: %.3fs\n", elapsed_seconds); printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds, duration, rtf); // now output text in German config.model_config.canary.tgt_lang = "de"; recognizer.SetConfig(config); stream = recognizer.CreateStream(); stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), wave.samples.size()); recognizer.Decode(&stream); result = recognizer.GetResult(&stream); std::cout << "text (German): " << result.text << "\n"; return 0; }