Support Zipformer transducer ASR with whisper features. (#2321)

Adds support for Zipformer transducer ASR models that use Whisper-style
features by introducing a new feature flag, parsing the model's "feature"
metadata, and applying Whisper feature normalization to each decoded chunk.

- Introduce UseWhisperFeature in the model interface and Zipformer implementation
- Parse "feature" metadata to set the whisper flag and wire it into the recognizer
- Update feature extraction logic to handle Whisper filterbanks with early returns
This commit is contained in:
Fangjun Kuang
2025-06-27 10:40:41 +08:00
committed by GitHub
parent 54bf3732d9
commit f835642b1c
5 changed files with 31 additions and 0 deletions

View File

@@ -16,6 +16,7 @@
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-whisper-model.h"
#include "sherpa-onnx/csrc/online-lm.h"
#include "sherpa-onnx/csrc/online-recognizer-impl.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
@@ -133,6 +134,10 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
config.decoding_method.c_str());
exit(-1);
}
if (model_->UseWhisperFeature()) {
config_.feat_config.is_whisper = true;
}
}
template <typename Manager>
@@ -182,6 +187,10 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
config.decoding_method.c_str());
exit(-1);
}
if (model_->UseWhisperFeature()) {
config_.feat_config.is_whisper = true;
}
}
std::unique_ptr<OnlineStream> CreateStream() const override {
@@ -292,6 +301,11 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
std::vector<float> features =
ss[i]->GetFrames(num_processed_frames, chunk_size);
if (config_.feat_config.is_whisper) {
OfflineWhisperModel::NormalizeFeatures(features.data(), chunk_size,
feature_dim);
}
// Question: should num_processed_frames include chunk_shift?
ss[i]->GetNumProcessedFrames() += chunk_shift;