From 8e6826521e250003c9b2ab62b2b17c5557490590 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 29 May 2025 10:34:22 +0800 Subject: [PATCH] Update kaldi-native-fbank. (#2259) Now it supports FFT of an even number, not necessarily a power of 2. --- cmake/kaldi-native-fbank.cmake | 16 ++++++++-------- cmake/sherpa-onnx-static-no-tts.pc.in | 2 +- cmake/sherpa-onnx-static.pc.in | 2 +- scripts/nemo/GigaAM/test-onnx-ctc.py | 4 +--- scripts/nemo/GigaAM/test-onnx-rnnt.py | 4 +--- sherpa-onnx/csrc/features.cc | 2 ++ sherpa-onnx/csrc/features.h | 2 ++ sherpa-onnx/csrc/offline-recognizer-ctc-impl.h | 6 ++++++ .../offline-recognizer-transducer-nemo-impl.h | 6 ++++++ 9 files changed, 28 insertions(+), 16 deletions(-) diff --git a/cmake/kaldi-native-fbank.cmake b/cmake/kaldi-native-fbank.cmake index f7aba1b5..e97e7e67 100644 --- a/cmake/kaldi-native-fbank.cmake +++ b/cmake/kaldi-native-fbank.cmake @@ -1,9 +1,9 @@ function(download_kaldi_native_fbank) include(FetchContent) - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.1.tar.gz") - set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.1.tar.gz") - set(kaldi_native_fbank_HASH "SHA256=37c1aa230b00fe062791d800d8fc50aa3de215918d3dce6440699e67275d859e") + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.2.tar.gz") + set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.2.tar.gz") + set(kaldi_native_fbank_HASH "SHA256=f4bd7d53fe8aeaecc4eda9680c72696bb86bf74e86371d81aacacd6f4ca3914d") set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) # If you don't have access to the Internet, # please pre-download kaldi-native-fbank set(possible_file_locations - $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.1.tar.gz - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.1.tar.gz - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.1.tar.gz - /tmp/kaldi-native-fbank-1.21.1.tar.gz - /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.1.tar.gz + $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.2.tar.gz + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.2.tar.gz + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.2.tar.gz + /tmp/kaldi-native-fbank-1.21.2.tar.gz + /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.2.tar.gz ) foreach(f IN LISTS possible_file_locations) diff --git a/cmake/sherpa-onnx-static-no-tts.pc.in b/cmake/sherpa-onnx-static-no-tts.pc.in index f100cd95..069d620d 100644 --- a/cmake/sherpa-onnx-static-no-tts.pc.in +++ b/cmake/sherpa-onnx-static-no-tts.pc.in @@ -22,4 +22,4 @@ Cflags: -I"${includedir}" # Note: -lcargs is required only for the following file # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lkissfft-float -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ diff --git a/cmake/sherpa-onnx-static.pc.in b/cmake/sherpa-onnx-static.pc.in index 1f788b00..42beb084 100644 --- a/cmake/sherpa-onnx-static.pc.in +++ b/cmake/sherpa-onnx-static.pc.in @@ -22,4 +22,4 @@ Cflags: -I"${includedir}" # Note: -lcargs is required only for the following file # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lkissfft-float -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ diff --git a/scripts/nemo/GigaAM/test-onnx-ctc.py b/scripts/nemo/GigaAM/test-onnx-ctc.py index 731d4e7e..ea69b70b 100755 --- a/scripts/nemo/GigaAM/test-onnx-ctc.py +++ b/scripts/nemo/GigaAM/test-onnx-ctc.py @@ -18,9 +18,7 @@ def create_fbank(): opts.frame_opts.preemph_coeff = 0 opts.frame_opts.window_type = "hann" - # Even though GigaAM uses 400 for fft, here we use 512 - # since kaldi-native-fbank only supports fft for power of 2. - opts.frame_opts.round_to_power_of_two = True + opts.frame_opts.round_to_power_of_two = False opts.mel_opts.low_freq = 0 opts.mel_opts.high_freq = 8000 diff --git a/scripts/nemo/GigaAM/test-onnx-rnnt.py b/scripts/nemo/GigaAM/test-onnx-rnnt.py index f2bf7b2a..cf831e1e 100755 --- a/scripts/nemo/GigaAM/test-onnx-rnnt.py +++ b/scripts/nemo/GigaAM/test-onnx-rnnt.py @@ -19,9 +19,7 @@ def create_fbank(): opts.frame_opts.preemph_coeff = 0 opts.frame_opts.window_type = "hann" - # Even though GigaAM uses 400 for fft, here we use 512 - # since kaldi-native-fbank only supports fft for power of 2. - opts.frame_opts.round_to_power_of_two = True + opts.frame_opts.round_to_power_of_two = False opts.mel_opts.low_freq = 0 opts.mel_opts.high_freq = 8000 diff --git a/sherpa-onnx/csrc/features.cc b/sherpa-onnx/csrc/features.cc index 16632513..95c6f82a 100644 --- a/sherpa-onnx/csrc/features.cc +++ b/sherpa-onnx/csrc/features.cc @@ -197,6 +197,7 @@ class FeatureExtractor::Impl { opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; opts_.frame_opts.preemph_coeff = config_.preemph_coeff; opts_.frame_opts.window_type = config_.window_type; + opts_.frame_opts.round_to_power_of_two = config_.round_to_power_of_two; opts_.mel_opts.num_bins = config_.feature_dim; @@ -216,6 +217,7 @@ class FeatureExtractor::Impl { mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; mfcc_opts_.frame_opts.preemph_coeff = config_.preemph_coeff; mfcc_opts_.frame_opts.window_type = config_.window_type; + mfcc_opts_.frame_opts.round_to_power_of_two = config_.round_to_power_of_two; mfcc_opts_.mel_opts.num_bins = config_.feature_dim; diff --git a/sherpa-onnx/csrc/features.h b/sherpa-onnx/csrc/features.h index fb5ff2fe..d10b486b 100644 --- a/sherpa-onnx/csrc/features.h +++ b/sherpa-onnx/csrc/features.h @@ -79,6 +79,8 @@ struct FeatureExtractorConfig { bool is_mfcc = false; + bool round_to_power_of_two = true; + std::string ToString() const; void Register(ParseOptions *po); diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index b2a1884e..491ac27b 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -109,6 +109,12 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { config_.feat_config.preemph_coeff = 0; config_.feat_config.window_type = "hann"; config_.feat_config.feature_dim = 64; + + // see + // https://github.com/salute-developers/GigaAM/blob/main/gigaam/preprocess.py#L68 + // + // GigaAM uses n_fft 400 + config_.feat_config.round_to_power_of_two = false; } else { config_.feat_config.low_freq = 0; config_.feat_config.high_freq = 0; diff --git a/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h b/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h index eda0295d..b080a9b3 100644 --- a/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h @@ -156,6 +156,12 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl { config_.feat_config.preemph_coeff = 0; config_.feat_config.window_type = "hann"; config_.feat_config.feature_dim = 64; + + // see + // https://github.com/salute-developers/GigaAM/blob/main/gigaam/preprocess.py#L68 + // + // GigaAM uses n_fft 400 + config_.feat_config.round_to_power_of_two = false; } else { config_.feat_config.low_freq = 0; // config_.feat_config.high_freq = 8000;