diff --git a/.github/scripts/test-offline-ctc.sh b/.github/scripts/test-offline-ctc.sh index f85b5853..f0978930 100755 --- a/.github/scripts/test-offline-ctc.sh +++ b/.github/scripts/test-offline-ctc.sh @@ -15,6 +15,39 @@ echo "PATH: $PATH" which $EXE +for type in base small; do + log "------------------------------------------------------------" + log "Run Dolphin CTC models ($type int8)" + log "------------------------------------------------------------" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02.tar.bz2 + tar xvf sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02.tar.bz2 + rm sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02.tar.bz2 + + $EXE \ + --dolphin-model=./sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02/model.int8.onnx \ + --tokens=./sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02/tokens.txt \ + --debug=1 \ + ./sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav + + rm -rf sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02 + + log "------------------------------------------------------------" + log "Run Dolphin CTC models ($type)" + log "------------------------------------------------------------" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02.tar.bz2 + tar xvf sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02.tar.bz2 + rm sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02.tar.bz2 + + $EXE \ + --dolphin-model=./sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02/model.onnx \ + --tokens=./sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02/tokens.txt \ + --debug=1 \ + ./sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02/test_wavs/0.wav + + rm -rf sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02 +done + + log "------------------------------------------------------------" log "Run NeMo GigaAM Russian models" log 
"------------------------------------------------------------" diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index 8a132ade..3704e7fb 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -8,6 +8,15 @@ log() { echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } +log "test offline dolphin ctc" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + +python3 ./python-api-examples/offline-dolphin-ctc-decode-files.py + +rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 + log "test offline speech enhancement (GTCRN)" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx diff --git a/.github/workflows/export-dolphin-ctc-to-onnx.yaml b/.github/workflows/export-dolphin-ctc-to-onnx.yaml new file mode 100644 index 00000000..28bd07d7 --- /dev/null +++ b/.github/workflows/export-dolphin-ctc-to-onnx.yaml @@ -0,0 +1,48 @@ +name: export-dolphin-ctc-to-onnx + +on: + workflow_dispatch: + +concurrency: + group: export-dolphin-ctc-to-onnx-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-dolphin-ctc-to-onnx: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: ${{ matrix.model_type }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + model_type: [small, base] + + steps: + - uses: actions/checkout@v4 + + - name: Download ${{ matrix.model_type }} + shell: bash + run: | + git lfs install + type=${{ matrix.model_type }} + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02 + git clone
https://huggingface.co/csukuangfj/sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02 + + rm -rf sherpa-onnx-dolphin-*/.git* + + ls -lha sherpa-onnx-dolphin-*/ + + tar cjfv sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02.tar.bz2 sherpa-onnx-dolphin-$type-ctc-multi-lang-int8-2025-04-02 + tar cjfv sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02.tar.bz2 sherpa-onnx-dolphin-$type-ctc-multi-lang-2025-04-02 + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa-onnx + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} + tag: asr-models diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index bcf27136..4a5277f1 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -205,6 +205,16 @@ jobs: overwrite: true file: sherpa-onnx-*.tar.bz2 + - name: Test offline CTC + shell: bash + run: | + du -h -d1 . + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline + + .github/scripts/test-offline-ctc.sh + du -h -d1 . + - name: Test offline speech denoiser shell: bash run: | @@ -249,16 +259,6 @@ jobs: .github/scripts/test-offline-moonshine.sh du -h -d1 . - - name: Test offline CTC - shell: bash - run: | - du -h -d1 . - export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline - - .github/scripts/test-offline-ctc.sh - du -h -d1 . 
- - name: Test C++ API shell: bash run: | diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index bc7bbfd7..ca7f0767 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -162,6 +162,14 @@ jobs: overwrite: true file: sherpa-onnx-*osx-universal2*.tar.bz2 + - name: Test offline CTC + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline + + .github/scripts/test-offline-ctc.sh + - name: Test offline speech denoiser shell: bash run: | @@ -226,14 +234,6 @@ jobs: .github/scripts/test-online-punctuation.sh - - name: Test offline CTC - shell: bash - run: | - export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline - - .github/scripts/test-offline-ctc.sh - - name: Test online CTC shell: bash run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e2e1e5c..966f750c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,7 @@ +if (CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + set(CMAKE_POLICY_VERSION_MINIMUM 3.5) +endif() + cmake_minimum_required(VERSION 3.13 FATAL_ERROR) set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment version. Used only for macOS") diff --git a/python-api-examples/offline-dolphin-ctc-decode-files.py b/python-api-examples/offline-dolphin-ctc-decode-files.py new file mode 100755 index 00000000..9d9b4815 --- /dev/null +++ b/python-api-examples/offline-dolphin-ctc-decode-files.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +""" +This file shows how to use a non-streaming CTC model from Dolphin +to decode files. 
+ +Please download model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +""" + +from pathlib import Path +import time + +import sherpa_onnx +import soundfile as sf + + +def create_recognizer(): + model = "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx" + tokens = "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt" + test_wav = ( + "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav" + ) + + if not Path(model).is_file() or not Path(test_wav).is_file(): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + return ( + sherpa_onnx.OfflineRecognizer.from_dolphin_ctc( + model=model, + tokens=tokens, + debug=True, + ), + test_wav, + ) + + +def main(): + recognizer, wave_filename = create_recognizer() + + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + + # audio is a 1-D float32 numpy array normalized to the range [-1, 1] + # sample_rate does not need to be 16000 Hz + + start = time.time() + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, audio) + recognizer.decode_stream(stream) + end = time.time() + + print(wave_filename) + print(stream.result) + + elapsed_seconds = end - start + audio_duration = len(audio) / sample_rate + real_time_factor = elapsed_seconds / audio_duration + + print(f"Elapsed seconds: {elapsed_seconds:.3f}") + print(f"Audio duration in seconds: {audio_duration:.3f}") + print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +if __name__ == "__main__": + main() diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 9aa192c0..a84a9f4a 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -27,6 +27,8 @@ set(sources offline-ctc-fst-decoder.cc offline-ctc-greedy-search-decoder.cc 
offline-ctc-model.cc + offline-dolphin-model-config.cc + offline-dolphin-model.cc offline-fire-red-asr-greedy-search-decoder.cc offline-fire-red-asr-model-config.cc offline-fire-red-asr-model.cc diff --git a/sherpa-onnx/csrc/offline-ctc-model.cc b/sherpa-onnx/csrc/offline-ctc-model.cc index 10748829..fb3089fa 100644 --- a/sherpa-onnx/csrc/offline-ctc-model.cc +++ b/sherpa-onnx/csrc/offline-ctc-model.cc @@ -20,6 +20,7 @@ #include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/offline-dolphin-model.h" #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h" #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h" #include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" @@ -110,6 +111,10 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, std::unique_ptr OfflineCtcModel::Create( const OfflineModelConfig &config) { + if (!config.dolphin.model.empty()) { + return std::make_unique(config); + } + // TODO(fangjun): Refactor it. We don't need to use model_type here ModelType model_type = ModelType::kUnknown; @@ -160,6 +165,10 @@ std::unique_ptr OfflineCtcModel::Create( template std::unique_ptr OfflineCtcModel::Create( Manager *mgr, const OfflineModelConfig &config) { + if (!config.dolphin.model.empty()) { + return std::make_unique(mgr, config); + } + // TODO(fangjun): Refactor it. 
We don't need to use model_type here ModelType model_type = ModelType::kUnknown; diff --git a/sherpa-onnx/csrc/offline-ctc-model.h b/sherpa-onnx/csrc/offline-ctc-model.h index 5ad4fcdc..a9d79402 100644 --- a/sherpa-onnx/csrc/offline-ctc-model.h +++ b/sherpa-onnx/csrc/offline-ctc-model.h @@ -64,6 +64,10 @@ class OfflineCtcModel { // return true for models from https://github.com/salute-developers/GigaAM // return false otherwise virtual bool IsGigaAM() const { return false; } + + // For Dolphin models, they use global CMVN + virtual void NormalizeFeatures(float *features, int32_t num_frames, + int32_t feat_dim) const {} }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-dolphin-model-config.cc b/sherpa-onnx/csrc/offline-dolphin-model-config.cc new file mode 100644 index 00000000..03f4cb57 --- /dev/null +++ b/sherpa-onnx/csrc/offline-dolphin-model-config.cc @@ -0,0 +1,35 @@ +// sherpa-onnx/csrc/offline-dolphin-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-dolphin-model-config.h" + +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/macros.h" + +namespace sherpa_onnx { + +void OfflineDolphinModelConfig::Register(ParseOptions *po) { + po->Register("dolphin-model", &model, + "Path to model.onnx of Dolphin CTC branch."); +} + +bool OfflineDolphinModelConfig::Validate() const { + if (!FileExists(model)) { + SHERPA_ONNX_LOGE("Dolphin model '%s' does not exist", model.c_str()); + return false; + } + + return true; +} + +std::string OfflineDolphinModelConfig::ToString() const { + std::ostringstream os; + + os << "OfflineDolphinModelConfig("; + os << "model=\"" << model << "\")"; + + return os.str(); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-dolphin-model-config.h b/sherpa-onnx/csrc/offline-dolphin-model-config.h new file mode 100644 index 00000000..ddd39e29 --- /dev/null +++ b/sherpa-onnx/csrc/offline-dolphin-model-config.h @@ -0,0 +1,27 @@ +// 
sherpa-onnx/csrc/offline-dolphin-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_CONFIG_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_CONFIG_H_ + +#include + +#include "sherpa-onnx/csrc/parse-options.h" + +namespace sherpa_onnx { + +struct OfflineDolphinModelConfig { + std::string model; + + OfflineDolphinModelConfig() = default; + explicit OfflineDolphinModelConfig(const std::string &model) : model(model) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_CONFIG_H_ diff --git a/sherpa-onnx/csrc/offline-dolphin-model-meta-data.h b/sherpa-onnx/csrc/offline-dolphin-model-meta-data.h new file mode 100644 index 00000000..8dcbc581 --- /dev/null +++ b/sherpa-onnx/csrc/offline-dolphin-model-meta-data.h @@ -0,0 +1,21 @@ +// sherpa-onnx/csrc/offline-dolphin-model-meta-data.h +// +// Copyright (c) 2024 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_META_DATA_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_META_DATA_H_ + +#include +#include + +namespace sherpa_onnx { + +struct OfflineDolphinModelMetaData { + int32_t vocab_size; + int32_t subsampling_factor = 4; + std::vector mean; + std::vector inv_stddev; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/offline-dolphin-model.cc b/sherpa-onnx/csrc/offline-dolphin-model.cc new file mode 100644 index 00000000..b8abd5b3 --- /dev/null +++ b/sherpa-onnx/csrc/offline-dolphin-model.cc @@ -0,0 +1,165 @@ +// sherpa-onnx/csrc/offline-dolphin-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-dolphin-model.h" + +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include 
"rawfile/raw_file_manager.h" +#endif + +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/session.h" +#include "sherpa-onnx/csrc/text-utils.h" + +namespace sherpa_onnx { + +class OfflineDolphinModel::Impl { + public: + explicit Impl(const OfflineModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(config_.dolphin.model); + Init(buf.data(), buf.size()); + } + + template + Impl(Manager *mgr, const OfflineModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(mgr, config_.dolphin.model); + Init(buf.data(), buf.size()); + } + + std::vector Forward(Ort::Value features, + Ort::Value features_length) { + std::array inputs = { + std::move(features), + std::move(features_length), + }; + + return sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), + output_names_ptr_.data(), output_names_ptr_.size()); + } + + int32_t VocabSize() const { return meta_data_.vocab_size; } + + int32_t SubsamplingFactor() const { return meta_data_.subsampling_factor; } + + void NormalizeFeatures(float *features, int32_t num_frames, + int32_t feat_dim) const { + auto p = features; + const auto &mean = meta_data_.mean; + const auto &invstd = meta_data_.inv_stddev; + + for (int32_t f = 0; f < num_frames; ++f) { + for (int32_t d = 0; d < feat_dim; ++d) { + p[d] = (p[d] - mean[d]) * invstd[d]; + } + p += feat_dim; + } + } + + OrtAllocator *Allocator() { return allocator_; } + + private: + void Init(void *model_data, size_t model_data_length) { + sess_ = std::make_unique(env_, model_data, model_data_length, + sess_opts_); + + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); + + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); + + // get meta data + 
Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); + if (config_.debug) { + std::ostringstream os; + PrintModelMetadata(os, meta_data); +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); +#else + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); +#endif + } + + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below + SHERPA_ONNX_READ_META_DATA(meta_data_.vocab_size, "vocab_size"); + + SHERPA_ONNX_READ_META_DATA_VEC_FLOAT(meta_data_.mean, "mean"); + SHERPA_ONNX_READ_META_DATA_VEC_FLOAT(meta_data_.inv_stddev, "invstd"); + } + + private: + OfflineModelConfig config_; + Ort::Env env_; + Ort::SessionOptions sess_opts_; + Ort::AllocatorWithDefaultOptions allocator_; + + std::unique_ptr sess_; + + std::vector input_names_; + std::vector input_names_ptr_; + + std::vector output_names_; + std::vector output_names_ptr_; + + OfflineDolphinModelMetaData meta_data_; +}; + +OfflineDolphinModel::OfflineDolphinModel(const OfflineModelConfig &config) + : impl_(std::make_unique(config)) {} + +template +OfflineDolphinModel::OfflineDolphinModel(Manager *mgr, + const OfflineModelConfig &config) + : impl_(std::make_unique(mgr, config)) {} + +OfflineDolphinModel::~OfflineDolphinModel() = default; + +std::vector OfflineDolphinModel::Forward( + Ort::Value features, Ort::Value features_length) { + return impl_->Forward(std::move(features), std::move(features_length)); +} + +int32_t OfflineDolphinModel::VocabSize() const { return impl_->VocabSize(); } + +int32_t OfflineDolphinModel::SubsamplingFactor() const { + return impl_->SubsamplingFactor(); +} + +void OfflineDolphinModel::NormalizeFeatures(float *features, int32_t num_frames, + int32_t feat_dim) const { + return impl_->NormalizeFeatures(features, num_frames, feat_dim); +} + +OrtAllocator *OfflineDolphinModel::Allocator() const { + return impl_->Allocator(); +} + +#if __ANDROID_API__ >= 9 +template OfflineDolphinModel::OfflineDolphinModel( + AAssetManager *mgr, const OfflineModelConfig &config); +#endif 
+ +#if __OHOS__ +template OfflineDolphinModel::OfflineDolphinModel( + NativeResourceManager *mgr, const OfflineModelConfig &config); +#endif + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-dolphin-model.h b/sherpa-onnx/csrc/offline-dolphin-model.h new file mode 100644 index 00000000..5ab8e1b9 --- /dev/null +++ b/sherpa-onnx/csrc/offline-dolphin-model.h @@ -0,0 +1,67 @@ +// sherpa-onnx/csrc/offline-dolphin-model.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_H_ + +#include +#include + +#include "onnxruntime_cxx_api.h" // NOLINT +#include "sherpa-onnx/csrc/offline-ctc-model.h" +#include "sherpa-onnx/csrc/offline-dolphin-model-meta-data.h" +#include "sherpa-onnx/csrc/offline-model-config.h" + +namespace sherpa_onnx { + +class OfflineDolphinModel : public OfflineCtcModel { + public: + explicit OfflineDolphinModel(const OfflineModelConfig &config); + + template + OfflineDolphinModel(Manager *mgr, const OfflineModelConfig &config); + + ~OfflineDolphinModel() override; + + /** Run the forward method of the model. + * + * @param features A tensor of shape (N, T, C). + * @param features_length A 1-D tensor of shape (N,) containing number of + * valid frames in `features` before padding. + * Its dtype is int64_t. + * + * @return Return a vector containing: + * - log_probs: A 3-D tensor of shape (N, T', vocab_size). + * - log_probs_length A 1-D tensor of shape (N,). Its dtype is int64_t + */ + std::vector Forward(Ort::Value features, + Ort::Value features_length) override; + + /** Return the vocabulary size of the model + */ + int32_t VocabSize() const override; + + /** SubsamplingFactor of the model + * + * For Citrinet, the subsampling factor is usually 4. + * For Conformer CTC, the subsampling factor is usually 8. 
+ */ + int32_t SubsamplingFactor() const override; + + /** Return an allocator for allocating memory + */ + OrtAllocator *Allocator() const override; + + bool SupportBatchProcessing() const override { return true; } + + void NormalizeFeatures(float *features, int32_t num_frames, + int32_t feat_dim) const override; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_DOLPHIN_MODEL_H_ diff --git a/sherpa-onnx/csrc/offline-model-config.cc b/sherpa-onnx/csrc/offline-model-config.cc index 2aee77c8..9ab59b3b 100644 --- a/sherpa-onnx/csrc/offline-model-config.cc +++ b/sherpa-onnx/csrc/offline-model-config.cc @@ -21,6 +21,7 @@ void OfflineModelConfig::Register(ParseOptions *po) { wenet_ctc.Register(po); sense_voice.Register(po); moonshine.Register(po); + dolphin.Register(po); po->Register("telespeech-ctc", &telespeech_ctc, "Path to model.onnx for telespeech ctc"); @@ -109,6 +110,10 @@ bool OfflineModelConfig::Validate() const { return moonshine.Validate(); } + if (!dolphin.model.empty()) { + return dolphin.Validate(); + } + if (!telespeech_ctc.empty() && !FileExists(telespeech_ctc)) { SHERPA_ONNX_LOGE("telespeech_ctc: '%s' does not exist", telespeech_ctc.c_str()); @@ -136,6 +141,7 @@ std::string OfflineModelConfig::ToString() const { os << "wenet_ctc=" << wenet_ctc.ToString() << ", "; os << "sense_voice=" << sense_voice.ToString() << ", "; os << "moonshine=" << moonshine.ToString() << ", "; + os << "dolphin=" << dolphin.ToString() << ", "; os << "telespeech_ctc=\"" << telespeech_ctc << "\", "; os << "tokens=\"" << tokens << "\", "; os << "num_threads=" << num_threads << ", "; diff --git a/sherpa-onnx/csrc/offline-model-config.h b/sherpa-onnx/csrc/offline-model-config.h index e99e39a5..c12a480a 100644 --- a/sherpa-onnx/csrc/offline-model-config.h +++ b/sherpa-onnx/csrc/offline-model-config.h @@ -6,6 +6,7 @@ #include +#include "sherpa-onnx/csrc/offline-dolphin-model-config.h" #include 
"sherpa-onnx/csrc/offline-fire-red-asr-model-config.h" #include "sherpa-onnx/csrc/offline-moonshine-model-config.h" #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model-config.h" @@ -30,6 +31,7 @@ struct OfflineModelConfig { OfflineWenetCtcModelConfig wenet_ctc; OfflineSenseVoiceModelConfig sense_voice; OfflineMoonshineModelConfig moonshine; + OfflineDolphinModelConfig dolphin; std::string telespeech_ctc; std::string tokens; @@ -62,6 +64,7 @@ struct OfflineModelConfig { const OfflineWenetCtcModelConfig &wenet_ctc, const OfflineSenseVoiceModelConfig &sense_voice, const OfflineMoonshineModelConfig &moonshine, + const OfflineDolphinModelConfig &dolphin, const std::string &telespeech_ctc, const std::string &tokens, int32_t num_threads, bool debug, const std::string &provider, const std::string &model_type, @@ -77,6 +80,7 @@ struct OfflineModelConfig { wenet_ctc(wenet_ctc), sense_voice(sense_voice), moonshine(moonshine), + dolphin(dolphin), telespeech_ctc(telespeech_ctc), tokens(tokens), num_threads(num_threads), diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 3dca0dfc..30df001f 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -118,6 +118,19 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { } } + if (!config_.model_config.dolphin.model.empty()) { + config_.feat_config.low_freq = 0; + config_.feat_config.high_freq = 8000; + config_.feat_config.remove_dc_offset = false; + config_.feat_config.dither = 0; + config_.feat_config.preemph_coeff = 0; + config_.feat_config.window_type = "hann"; + config_.feat_config.feature_dim = 80; + config_.feat_config.is_librosa = true; + config_.feat_config.frame_length_ms = 31.25; // 16000/512 = 31.25 + config_.feat_config.snip_edges = false; + } + if (!config_.model_config.wenet_ctc.model.empty()) { // WeNet CTC models assume input samples are in the range // [-32768, 32767], so we set 
normalize_samples to false @@ -157,7 +170,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { } else { SHERPA_ONNX_LOGE("Only greedy_search is supported at present. Given %s", config_.decoding_method.c_str()); - exit(-1); + SHERPA_ONNX_EXIT(-1); } } @@ -166,7 +179,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { } void DecodeStreams(OfflineStream **ss, int32_t n) const override { - if (!model_->SupportBatchProcessing()) { + if (!model_->SupportBatchProcessing() || (n == 1)) { // If the model does not support batch process, // we process each stream independently. for (int32_t i = 0; i != n; ++i) { @@ -190,6 +203,9 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { std::vector f = ss[i]->GetFrames(); int32_t num_frames = f.size() / feat_dim; + + model_->NormalizeFeatures(f.data(), num_frames, feat_dim); + features_vec[i] = std::move(f); features_length_vec[i] = num_frames; @@ -241,6 +257,8 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { int32_t num_frames = f.size() / feat_dim; + model_->NormalizeFeatures(f.data(), num_frames, feat_dim); + std::array shape = {1, num_frames, feat_dim}; Ort::Value x = Ort::Value::CreateTensor(memory_info, f.data(), f.size(), diff --git a/sherpa-onnx/csrc/offline-recognizer-impl.cc b/sherpa-onnx/csrc/offline-recognizer-impl.cc index b74bbbbb..c4dba8aa 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.cc +++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc @@ -49,7 +49,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( if (!config.model_config.nemo_ctc.model.empty() || !config.model_config.zipformer_ctc.model.empty() || !config.model_config.tdnn.model.empty() || - !config.model_config.wenet_ctc.model.empty()) { + !config.model_config.wenet_ctc.model.empty() || + !config.model_config.dolphin.model.empty()) { return std::make_unique(config); } @@ -234,7 +235,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( if (!config.model_config.nemo_ctc.model.empty() || 
!config.model_config.zipformer_ctc.model.empty() || !config.model_config.tdnn.model.empty() || - !config.model_config.wenet_ctc.model.empty()) { + !config.model_config.wenet_ctc.model.empty() || + !config.model_config.dolphin.model.empty()) { return std::make_unique(mgr, config); } diff --git a/sherpa-onnx/csrc/offline-sense-voice-model-config.h b/sherpa-onnx/csrc/offline-sense-voice-model-config.h index 2f724e44..f19e959e 100644 --- a/sherpa-onnx/csrc/offline-sense-voice-model-config.h +++ b/sherpa-onnx/csrc/offline-sense-voice-model-config.h @@ -23,9 +23,8 @@ struct OfflineSenseVoiceModelConfig { bool use_itn = false; OfflineSenseVoiceModelConfig() = default; - explicit OfflineSenseVoiceModelConfig(const std::string &model, - const std::string &language, - bool use_itn) + OfflineSenseVoiceModelConfig(const std::string &model, + const std::string &language, bool use_itn) : model(model), language(language), use_itn(use_itn) {} void Register(ParseOptions *po); diff --git a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h index e6fd0b50..a6f86c2b 100644 --- a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h @@ -41,6 +41,9 @@ OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, std::string text; for (auto i : src.tokens) { auto sym = sym_table[i]; + if (sym == "") { + continue; + } text.append(sym); diff --git a/sherpa-onnx/csrc/rknn/silero-vad-model-rknn.h b/sherpa-onnx/csrc/rknn/silero-vad-model-rknn.h index a11b34e6..4ffb5f24 100644 --- a/sherpa-onnx/csrc/rknn/silero-vad-model-rknn.h +++ b/sherpa-onnx/csrc/rknn/silero-vad-model-rknn.h @@ -4,6 +4,8 @@ #ifndef SHERPA_ONNX_CSRC_RKNN_SILERO_VAD_MODEL_RKNN_H_ #define SHERPA_ONNX_CSRC_RKNN_SILERO_VAD_MODEL_RKNN_H_ +#include + #include "rknn_api.h" // NOLINT #include "sherpa-onnx/csrc/online-model-config.h" #include "sherpa-onnx/csrc/vad-model.h" diff --git 
a/sherpa-onnx/python/csrc/CMakeLists.txt b/sherpa-onnx/python/csrc/CMakeLists.txt index c941b0db..c0b5c01c 100644 --- a/sherpa-onnx/python/csrc/CMakeLists.txt +++ b/sherpa-onnx/python/csrc/CMakeLists.txt @@ -9,6 +9,7 @@ set(srcs features.cc keyword-spotter.cc offline-ctc-fst-decoder-config.cc + offline-dolphin-model-config.cc offline-fire-red-asr-model-config.cc offline-lm-config.cc offline-model-config.cc diff --git a/sherpa-onnx/python/csrc/offline-dolphin-model-config.cc b/sherpa-onnx/python/csrc/offline-dolphin-model-config.cc new file mode 100644 index 00000000..72a767a8 --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-dolphin-model-config.cc @@ -0,0 +1,23 @@ +// sherpa-onnx/python/csrc/offline-dolphin-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-dolphin-model-config.h" + +#include +#include + +#include "sherpa-onnx/python/csrc/offline-dolphin-model-config.h" + +namespace sherpa_onnx { + +void PybindOfflineDolphinModelConfig(py::module *m) { + using PyClass = OfflineDolphinModelConfig; + py::class_(*m, "OfflineDolphinModelConfig") + .def(py::init<>()) + .def(py::init(), py::arg("model")) + .def_readwrite("model", &PyClass::model) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-dolphin-model-config.h b/sherpa-onnx/python/csrc/offline-dolphin-model-config.h new file mode 100644 index 00000000..76f89126 --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-dolphin-model-config.h @@ -0,0 +1,16 @@ +// sherpa-onnx/python/csrc/offline-dolphin-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_DOLPHIN_MODEL_CONFIG_H_ +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_DOLPHIN_MODEL_CONFIG_H_ + +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" + +namespace sherpa_onnx { + +void PybindOfflineDolphinModelConfig(py::module *m); + +} + +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_DOLPHIN_MODEL_CONFIG_H_ diff 
--git a/sherpa-onnx/python/csrc/offline-model-config.cc b/sherpa-onnx/python/csrc/offline-model-config.cc index 92f8a2a8..c73eafd7 100644 --- a/sherpa-onnx/python/csrc/offline-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-model-config.cc @@ -8,6 +8,7 @@ #include #include "sherpa-onnx/csrc/offline-model-config.h" +#include "sherpa-onnx/python/csrc/offline-dolphin-model-config.h" #include "sherpa-onnx/python/csrc/offline-fire-red-asr-model-config.h" #include "sherpa-onnx/python/csrc/offline-moonshine-model-config.h" #include "sherpa-onnx/python/csrc/offline-nemo-enc-dec-ctc-model-config.h" @@ -32,6 +33,7 @@ void PybindOfflineModelConfig(py::module *m) { PybindOfflineWenetCtcModelConfig(m); PybindOfflineSenseVoiceModelConfig(m); PybindOfflineMoonshineModelConfig(m); + PybindOfflineDolphinModelConfig(m); using PyClass = OfflineModelConfig; py::class_(*m, "OfflineModelConfig") @@ -44,7 +46,8 @@ void PybindOfflineModelConfig(py::module *m) { const OfflineZipformerCtcModelConfig &, const OfflineWenetCtcModelConfig &, const OfflineSenseVoiceModelConfig &, - const OfflineMoonshineModelConfig &, const std::string &, + const OfflineMoonshineModelConfig &, + const OfflineDolphinModelConfig &, const std::string &, const std::string &, int32_t, bool, const std::string &, const std::string &, const std::string &, const std::string &>(), @@ -58,6 +61,7 @@ void PybindOfflineModelConfig(py::module *m) { py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), py::arg("sense_voice") = OfflineSenseVoiceModelConfig(), py::arg("moonshine") = OfflineMoonshineModelConfig(), + py::arg("dolphin") = OfflineDolphinModelConfig(), py::arg("telespeech_ctc") = "", py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false, py::arg("provider") = "cpu", py::arg("model_type") = "", @@ -72,6 +76,7 @@ void PybindOfflineModelConfig(py::module *m) { .def_readwrite("wenet_ctc", &PyClass::wenet_ctc) .def_readwrite("sense_voice", &PyClass::sense_voice) .def_readwrite("moonshine", 
&PyClass::moonshine) + .def_readwrite("dolphin", &PyClass::dolphin) .def_readwrite("telespeech_ctc", &PyClass::telespeech_ctc) .def_readwrite("tokens", &PyClass::tokens) .def_readwrite("num_threads", &PyClass::num_threads) diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index d8ab6709..69bb3ef8 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -6,6 +6,7 @@ from typing import List, Optional from _sherpa_onnx import ( FeatureExtractorConfig, OfflineCtcFstDecoderConfig, + OfflineDolphinModelConfig, OfflineFireRedAsrModelConfig, OfflineLMConfig, OfflineModelConfig, @@ -408,6 +409,78 @@ class OfflineRecognizer(object): self.config = recognizer_config return self + @classmethod + def from_dolphin_ctc( + cls, + model: str, + tokens: str, + num_threads: int = 1, + sample_rate: int = 16000, + feature_dim: int = 80, + decoding_method: str = "greedy_search", + debug: bool = False, + provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", + ): + """ + Please refer to + ``_ + to download pre-trained models. + + Args: + model: + Path to ``model.onnx`` or ``model.int8.onnx``. + tokens: + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two + columns:: + + symbol integer_id + + num_threads: + Number of threads for neural network computation. + sample_rate: + Sample rate of the training data used to train the model. + feature_dim: + Dimension of the feature used to train the model. + decoding_method: + Valid values are greedy_search. + debug: + True to show debug messages. + provider: + onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. 
+ If there are multiple archives, they are separated by a comma. + """ + self = cls.__new__(cls) + model_config = OfflineModelConfig( + dolphin=OfflineDolphinModelConfig(model=model), + tokens=tokens, + num_threads=num_threads, + debug=debug, + provider=provider, + ) + + feat_config = FeatureExtractorConfig( + sampling_rate=sample_rate, + feature_dim=feature_dim, + ) + + recognizer_config = OfflineRecognizerConfig( + feat_config=feat_config, + model_config=model_config, + decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, + ) + self.recognizer = _Recognizer(recognizer_config) + self.config = recognizer_config + return self + @classmethod def from_nemo_ctc( cls,