diff --git a/.github/workflows/build-wheels-linux.yaml b/.github/workflows/build-wheels-linux.yaml index 0380e2a9..e3f5fd97 100644 --- a/.github/workflows/build-wheels-linux.yaml +++ b/.github/workflows/build-wheels-linux.yaml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04] + os: [ubuntu-latest] python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312", "cp313"] manylinux: [manylinux2014] #, manylinux_2_28] diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index 40978766..a3e81279 100644 --- a/.github/workflows/c-api.yaml +++ b/.github/workflows/c-api.yaml @@ -79,6 +79,48 @@ jobs: otool -L ./install/lib/libsherpa-onnx-c-api.dylib fi + - name: Test streaming zipformer with homophone replacer + shell: bash + run: | + name=streaming-zipformer-with-hr-c-api + gcc -o $name ./c-api-examples/$name.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh $name + + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then + ldd ./$name + echo "----" + readelf -d ./$name + fi + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + + ls -lh sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 + echo "---" + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2 + tar xf dict.tar.bz2 + rm dict.tar.bz2 + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + 
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./$name + + rm -rf sherpa-onnx-streaming-zipformer-* + rm -rf dict lexicon.txt test-hr.wav replace.fst + rm -v $name + - name: Test Dolphin CTC shell: bash run: | diff --git a/.github/workflows/cxx-api.yaml b/.github/workflows/cxx-api.yaml index fe755c5d..d3441c81 100644 --- a/.github/workflows/cxx-api.yaml +++ b/.github/workflows/cxx-api.yaml @@ -81,6 +81,49 @@ jobs: otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib fi + - name: Test streaming zipformer with Homophone replacer + shell: bash + run: | + name=streaming-zipformer-with-hr-cxx-api + g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-cxx-api \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh $name + + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then + ldd ./$name + echo "----" + readelf -d ./$name + fi + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + + ls -lh sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 + echo "---" + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2 + tar xf dict.tar.bz2 + rm dict.tar.bz2 + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./$name + + rm -rf sherpa-onnx-streaming-zipformer-* + rm -rf dict lexicon.txt test-hr.wav 
replace.fst + rm -v ./$name + - name: Test Dolphin CTC shell: bash run: | diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index df63444b..9e899a25 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -56,6 +56,9 @@ target_link_libraries(fire-red-asr-c-api sherpa-onnx-c-api) add_executable(sense-voice-c-api sense-voice-c-api.c) target_link_libraries(sense-voice-c-api sherpa-onnx-c-api) +add_executable(sense-voice-with-hr-c-api sense-voice-with-hr-c-api.c) +target_link_libraries(sense-voice-with-hr-c-api sherpa-onnx-c-api) + add_executable(dolphin-ctc-c-api dolphin-ctc-c-api.c) target_link_libraries(dolphin-ctc-c-api sherpa-onnx-c-api) @@ -68,6 +71,9 @@ target_link_libraries(zipformer-c-api sherpa-onnx-c-api) add_executable(streaming-zipformer-c-api streaming-zipformer-c-api.c) target_link_libraries(streaming-zipformer-c-api sherpa-onnx-c-api) +add_executable(streaming-zipformer-with-hr-c-api streaming-zipformer-with-hr-c-api.c) +target_link_libraries(streaming-zipformer-with-hr-c-api sherpa-onnx-c-api) + add_executable(paraformer-c-api paraformer-c-api.c) target_link_libraries(paraformer-c-api sherpa-onnx-c-api) diff --git a/c-api-examples/sense-voice-with-hr-c-api.c b/c-api-examples/sense-voice-with-hr-c-api.c new file mode 100644 index 00000000..66c68f80 --- /dev/null +++ b/c-api-examples/sense-voice-with-hr-c-api.c @@ -0,0 +1,99 @@ +// c-api-examples/sense-voice-with-hr-c-api.c +// +// Copyright (c) 2024-2025 Xiaomi Corporation + +// +// This file demonstrates how to use SenseVoice with sherpa-onnx's C API +// with homophone replacer. 
+// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2 +// tar xf dict.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt +// +// clang-format on + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + const char *wav_filename = "./test-hr.wav"; + const char *model_filename = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; + const char *tokens_filename = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; + const char *language = "auto"; + const char *provider = "cpu"; + int32_t use_inverse_text_normalization = 1; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + return -1; + } + + SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config; + memset(&sense_voice_config, 0, sizeof(sense_voice_config)); + sense_voice_config.model = model_filename; + sense_voice_config.language = language; + sense_voice_config.use_itn = use_inverse_text_normalization; + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + offline_model_config.debug = 1; + offline_model_config.num_threads = 1; + offline_model_config.provider = provider; + offline_model_config.tokens = tokens_filename; + offline_model_config.sense_voice = sense_voice_config; + + // Recognizer config + 
SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + recognizer_config.hr.dict_dir = "./dict"; + recognizer_config.hr.lexicon = "./lexicon.txt"; + + // Please see + // https://colab.research.google.com/drive/1jEaS3s8FbRJIcVQJv2EQx19EM_mnuARi?usp=sharing + // for how to generate your own replace.fst + recognizer_config.hr.rule_fsts = "./replace.fst"; + + const SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, wave->samples, + wave->num_samples); + SherpaOnnxDecodeOfflineStream(recognizer, stream); + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + fprintf(stderr, "Decoded text: %s\n", result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + SherpaOnnxDestroyOfflineRecognizer(recognizer); + SherpaOnnxFreeWave(wave); + + return 0; +} diff --git a/c-api-examples/streaming-zipformer-with-hr-c-api.c b/c-api-examples/streaming-zipformer-with-hr-c-api.c new file mode 100644 index 00000000..ac364068 --- /dev/null +++ b/c-api-examples/streaming-zipformer-with-hr-c-api.c @@ -0,0 +1,158 @@ +// c-api-examples/streaming-zipformer-with-hr-c-api.c +// +// Copyright (c) 2025 Xiaomi Corporation + +// +// This file demonstrates how to use streaming Zipformer with sherpa-onnx's C +// API. 
+// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +// tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +// rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2 +// tar xf dict.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt +// +// clang-format on + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + const char *wav_filename = "test-hr.wav"; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + return -1; + } + + // Online model config + SherpaOnnxOnlineModelConfig online_model_config; + memset(&online_model_config, 0, sizeof(online_model_config)); + online_model_config.debug = 0; + online_model_config.num_threads = 1; + online_model_config.provider = "cpu"; + online_model_config.tokens = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt"; + + online_model_config.transducer.encoder = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/" + "encoder-epoch-99-avg-1.int8.onnx"; + + // Note: We recommend not using int8.onnx for the decoder. 
+ online_model_config.transducer.decoder = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/" + "decoder-epoch-99-avg-1.onnx"; + + online_model_config.transducer.joiner = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/" + "joiner-epoch-99-avg-1.int8.onnx"; + + online_model_config.tokens = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt"; + + online_model_config.num_threads = 1; + + // Recognizer config + SherpaOnnxOnlineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = online_model_config; + + recognizer_config.hr.dict_dir = "./dict"; + recognizer_config.hr.lexicon = "./lexicon.txt"; + + // Please see + // https://colab.research.google.com/drive/1jEaS3s8FbRJIcVQJv2EQx19EM_mnuARi?usp=sharing + // for how to generate your own replace.fst + recognizer_config.hr.rule_fsts = "./replace.fst"; + + const SherpaOnnxOnlineRecognizer *recognizer = + SherpaOnnxCreateOnlineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + const SherpaOnnxOnlineStream *stream = + SherpaOnnxCreateOnlineStream(recognizer); + + const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50); + int32_t segment_id = 0; + +// simulate streaming. You can choose an arbitrary N +#define N 3200 + + fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n", + wave->sample_rate, wave->num_samples, + (float)wave->num_samples / wave->sample_rate); + + int32_t k = 0; + while (k < wave->num_samples) { + int32_t start = k; + int32_t end = + (start + N > wave->num_samples) ? 
wave->num_samples : (start + N); + k += N; + + SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate, + wave->samples + start, end - start); + while (SherpaOnnxIsOnlineStreamReady(recognizer, stream)) { + SherpaOnnxDecodeOnlineStream(recognizer, stream); + } + + const SherpaOnnxOnlineRecognizerResult *r = + SherpaOnnxGetOnlineStreamResult(recognizer, stream); + + if (strlen(r->text)) { + SherpaOnnxPrint(display, segment_id, r->text); + } + + if (SherpaOnnxOnlineStreamIsEndpoint(recognizer, stream)) { + if (strlen(r->text)) { + ++segment_id; + } + SherpaOnnxOnlineStreamReset(recognizer, stream); + } + + SherpaOnnxDestroyOnlineRecognizerResult(r); + } + + // add some tail padding + float tail_paddings[4800] = {0}; // 0.3 seconds at 16 kHz sample rate + SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate, tail_paddings, + 4800); + + SherpaOnnxFreeWave(wave); + + SherpaOnnxOnlineStreamInputFinished(stream); + while (SherpaOnnxIsOnlineStreamReady(recognizer, stream)) { + SherpaOnnxDecodeOnlineStream(recognizer, stream); + } + + const SherpaOnnxOnlineRecognizerResult *r = + SherpaOnnxGetOnlineStreamResult(recognizer, stream); + + if (strlen(r->text)) { + SherpaOnnxPrint(display, segment_id, r->text); + } + + SherpaOnnxDestroyOnlineRecognizerResult(r); + + SherpaOnnxDestroyDisplay(display); + SherpaOnnxDestroyOnlineStream(stream); + SherpaOnnxDestroyOnlineRecognizer(recognizer); + fprintf(stderr, "\n"); + + return 0; +} diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index 4a820649..e0dc5e66 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -3,6 +3,9 @@ include_directories(${PROJECT_SOURCE_DIR}) add_executable(streaming-zipformer-cxx-api ./streaming-zipformer-cxx-api.cc) target_link_libraries(streaming-zipformer-cxx-api sherpa-onnx-cxx-api) +add_executable(streaming-zipformer-with-hr-cxx-api ./streaming-zipformer-with-hr-cxx-api.cc) 
+target_link_libraries(streaming-zipformer-with-hr-cxx-api sherpa-onnx-cxx-api) + add_executable(speech-enhancement-gtcrn-cxx-api ./speech-enhancement-gtcrn-cxx-api.cc) target_link_libraries(speech-enhancement-gtcrn-cxx-api sherpa-onnx-cxx-api) @@ -24,6 +27,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) +add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) +target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api) + add_executable(dolphin-ctc-cxx-api ./dolphin-ctc-cxx-api.cc) target_link_libraries(dolphin-ctc-cxx-api sherpa-onnx-cxx-api) diff --git a/cxx-api-examples/sense-voice-with-hr-cxx-api.cc b/cxx-api-examples/sense-voice-with-hr-cxx-api.cc new file mode 100644 index 00000000..2de8425a --- /dev/null +++ b/cxx-api-examples/sense-voice-with-hr-cxx-api.cc @@ -0,0 +1,92 @@ +// cxx-api-examples/sense-voice-with-hr-cxx-api.cc +// +// Copyright (c) 2024-2025 Xiaomi Corporation + +// +// This file demonstrates how to use sense voice with sherpa-onnx's C++ API. 
+// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2 +// tar xf dict.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt +// +// clang-format on + +#include <chrono>  // NOLINT +#include <iostream> +#include <string> + +#include "sherpa-onnx/c-api/cxx-api.h" + +int32_t main() { + using namespace sherpa_onnx::cxx;  // NOLINT + OfflineRecognizerConfig config; + + config.model_config.sense_voice.model = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; + config.model_config.sense_voice.use_itn = true; + config.model_config.sense_voice.language = "auto"; + config.model_config.tokens = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; + config.hr.dict_dir = "./dict"; + config.hr.lexicon = "./lexicon.txt"; + + // Please see + // https://colab.research.google.com/drive/1jEaS3s8FbRJIcVQJv2EQx19EM_mnuARi?usp=sharing + // for how to generate your own replace.fst + config.hr.rule_fsts = "./replace.fst"; + + config.model_config.num_threads = 1; + + std::cout << "Loading model\n"; + OfflineRecognizer recongizer = OfflineRecognizer::Create(config); + if (!recongizer.Get()) { + std::cerr << "Please check your config\n"; + return -1; + } + std::cout << "Loading model done\n"; + + std::string wave_filename = "./test-hr.wav"; + + Wave wave = ReadWave(wave_filename); + if (wave.samples.empty()) { + std::cerr << "Failed to read: '" << wave_filename << "'\n"; + return -1; + } + + std::cout << "Start recognition\n"; + const auto 
begin = std::chrono::steady_clock::now(); + + OfflineStream stream = recongizer.CreateStream(); + stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), + wave.samples.size()); + + recongizer.Decode(&stream); + + OfflineRecognizerResult result = recongizer.GetResult(&stream); + + const auto end = std::chrono::steady_clock::now(); + const float elapsed_seconds = + std::chrono::duration_cast<std::chrono::milliseconds>(end - begin) + .count() / + 1000.; + float duration = wave.samples.size() / static_cast<float>(wave.sample_rate); + float rtf = elapsed_seconds / duration; + + std::cout << "text: " << result.text << "\n"; + printf("Number of threads: %d\n", config.model_config.num_threads); + printf("Duration: %.3fs\n", duration); + printf("Elapsed seconds: %.3fs\n", elapsed_seconds); + printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds, + duration, rtf); + + return 0; +} diff --git a/cxx-api-examples/streaming-zipformer-with-hr-cxx-api.cc b/cxx-api-examples/streaming-zipformer-with-hr-cxx-api.cc new file mode 100644 index 00000000..e9e6ed4e --- /dev/null +++ b/cxx-api-examples/streaming-zipformer-with-hr-cxx-api.cc @@ -0,0 +1,106 @@ +// cxx-api-examples/streaming-zipformer-with-hr-cxx-api.cc +// Copyright (c) 2024-2025 Xiaomi Corporation + +// +// This file demonstrates how to use streaming Zipformer +// with sherpa-onnx's C++ API. 
+// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +// tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +// rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2 +// tar xf dict.tar.bz2 +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt +// +// clang-format on + +#include <chrono>  // NOLINT +#include <iostream> +#include <string> + +#include "sherpa-onnx/c-api/cxx-api.h" + +int32_t main() { + using namespace sherpa_onnx::cxx;  // NOLINT + OnlineRecognizerConfig config; + + // please see + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english + config.model_config.transducer.encoder = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/" + "encoder-epoch-99-avg-1.int8.onnx"; + + // Note: We recommend not using int8.onnx for the decoder. 
+ config.model_config.transducer.decoder = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/" + "decoder-epoch-99-avg-1.onnx"; + + config.model_config.transducer.joiner = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/" + "joiner-epoch-99-avg-1.int8.onnx"; + + config.model_config.tokens = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt"; + + config.model_config.num_threads = 1; + + config.hr.dict_dir = "./dict"; + config.hr.lexicon = "./lexicon.txt"; + + // Please see + // https://colab.research.google.com/drive/1jEaS3s8FbRJIcVQJv2EQx19EM_mnuARi?usp=sharing + // for how to generate your own replace.fst + config.hr.rule_fsts = "./replace.fst"; + + std::cout << "Loading model\n"; + OnlineRecognizer recongizer = OnlineRecognizer::Create(config); + if (!recongizer.Get()) { + std::cerr << "Please check your config\n"; + return -1; + } + std::cout << "Loading model done\n"; + + std::string wave_filename = "./test-hr.wav"; + Wave wave = ReadWave(wave_filename); + if (wave.samples.empty()) { + std::cerr << "Failed to read: '" << wave_filename << "'\n"; + return -1; + } + + std::cout << "Start recognition\n"; + const auto begin = std::chrono::steady_clock::now(); + + OnlineStream stream = recongizer.CreateStream(); + stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), + wave.samples.size()); + stream.InputFinished(); + + while (recongizer.IsReady(&stream)) { + recongizer.Decode(&stream); + } + + OnlineRecognizerResult result = recongizer.GetResult(&stream); + + const auto end = std::chrono::steady_clock::now(); + const float elapsed_seconds = + std::chrono::duration_cast<std::chrono::milliseconds>(end - begin) + .count() / + 1000.; + float duration = wave.samples.size() / static_cast<float>(wave.sample_rate); + float rtf = elapsed_seconds / duration; + + std::cout << "text: " << result.text << "\n"; + printf("Number of threads: %d\n", config.model_config.num_threads); + printf("Duration: %.3fs\n", duration); + printf("Elapsed seconds: 
%.3fs\n", elapsed_seconds); + printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds, + duration, rtf); + + return 0; +} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index eb370b78..507275a6 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -153,6 +153,10 @@ static sherpa_onnx::OnlineRecognizerConfig GetOnlineRecognizerConfig( recognizer_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); recognizer_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); + recognizer_config.hr.dict_dir = SHERPA_ONNX_OR(config->hr.dict_dir, ""); + recognizer_config.hr.lexicon = SHERPA_ONNX_OR(config->hr.lexicon, ""); + recognizer_config.hr.rule_fsts = SHERPA_ONNX_OR(config->hr.rule_fsts, ""); + if (config->model_config.debug) { #if __OHOS__ SHERPA_ONNX_LOGE("%{public}s\n", recognizer_config.ToString().c_str()); @@ -494,6 +498,10 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( recognizer_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); recognizer_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); + recognizer_config.hr.dict_dir = SHERPA_ONNX_OR(config->hr.dict_dir, ""); + recognizer_config.hr.lexicon = SHERPA_ONNX_OR(config->hr.lexicon, ""); + recognizer_config.hr.rule_fsts = SHERPA_ONNX_OR(config->hr.rule_fsts, ""); + if (config->model_config.debug) { #if __OHOS__ SHERPA_ONNX_LOGE("%{public}s\n", recognizer_config.ToString().c_str()); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 83847ab8..2134d2b3 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -112,6 +112,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineCtcFstDecoderConfig { int32_t max_active; } SherpaOnnxOnlineCtcFstDecoderConfig; +SHERPA_ONNX_API typedef struct SherpaOnnxHomophoneReplacerConfig { + const char *dict_dir; + const char *lexicon; + const char *rule_fsts; +} SherpaOnnxHomophoneReplacerConfig; + SHERPA_ONNX_API typedef struct 
SherpaOnnxOnlineRecognizerConfig { SherpaOnnxFeatureConfig feat_config; SherpaOnnxOnlineModelConfig model_config; @@ -157,6 +163,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig { const char *hotwords_buf; /// byte size excluding the tailing '\0' int32_t hotwords_buf_size; + SherpaOnnxHomophoneReplacerConfig hr; } SherpaOnnxOnlineRecognizerConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult { @@ -461,6 +468,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { const char *rule_fsts; const char *rule_fars; float blank_penalty; + + SherpaOnnxHomophoneReplacerConfig hr; } SherpaOnnxOfflineRecognizerConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizer diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index 0f818b9f..dea70579 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -99,6 +99,10 @@ OnlineRecognizer OnlineRecognizer::Create( c.hotwords_buf = config.hotwords_buf.c_str(); c.hotwords_buf_size = config.hotwords_buf.size(); + c.hr.dict_dir = config.hr.dict_dir.c_str(); + c.hr.lexicon = config.hr.lexicon.c_str(); + c.hr.rule_fsts = config.hr.rule_fsts.c_str(); + auto p = SherpaOnnxCreateOnlineRecognizer(&c); return OnlineRecognizer(p); } @@ -261,6 +265,10 @@ OfflineRecognizer OfflineRecognizer::Create( c.blank_penalty = config.blank_penalty; + c.hr.dict_dir = config.hr.dict_dir.c_str(); + c.hr.lexicon = config.hr.lexicon.c_str(); + c.hr.rule_fsts = config.hr.rule_fsts.c_str(); + auto p = SherpaOnnxCreateOfflineRecognizer(&c); return OfflineRecognizer(p); } diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h index f1b1f040..a8fd6552 100644 --- a/sherpa-onnx/c-api/cxx-api.h +++ b/sherpa-onnx/c-api/cxx-api.h @@ -55,6 +55,12 @@ struct OnlineCtcFstDecoderConfig { int32_t max_active = 3000; }; +struct HomophoneReplacerConfig { + std::string dict_dir; + std::string lexicon; + std::string rule_fsts; +}; + struct OnlineRecognizerConfig 
{ FeatureConfig feat_config; OnlineModelConfig model_config; @@ -81,6 +87,7 @@ struct OnlineRecognizerConfig { float blank_penalty = 0; std::string hotwords_buf; + HomophoneReplacerConfig hr; }; struct OnlineRecognizerResult { @@ -280,6 +287,7 @@ struct SHERPA_ONNX_API OfflineRecognizerConfig { std::string rule_fsts; std::string rule_fars; float blank_penalty = 0; + HomophoneReplacerConfig hr; }; struct SHERPA_ONNX_API OfflineRecognizerResult {