Remove the 30-second constraint from whisper. (#471)

2023-12-07 17:47:08 +08:00
parent a7d69359c9
commit 3ae984f148
10 changed files with 178 additions and 78 deletions
--- a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
+++ b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
@@ -115,7 +115,27 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {

    NormalizeFeatures(f.data(), num_frames, feat_dim);

-    std::array<int64_t, 3> shape{1, max_num_frames, feat_dim};
+    // Note that 50 is an empirical value.
+    // See also ../../scripts/whisper/test.py
+    //
+    // You can replace 50 with other values, e.g., 100.
+    //
+    // Since we have removed the 30-second constraint, we need
+    // tail_padding_frames so that whisper is able to detect the eot token.
+    int32_t tail_padding_frames = 50;
+    if (model_->IsMultiLingual()) {
+      // 300 is an empirical value. If it throws, please use a larger value.
+      tail_padding_frames = 300;
+    }
+
+    if (config_.model_config.whisper.tail_paddings > 0) {
+      tail_padding_frames = config_.model_config.whisper.tail_paddings;
+    }
+
+    int32_t actual_frames =
+        std::min(num_frames + tail_padding_frames, max_num_frames);
+
+    std::array<int64_t, 3> shape{1, actual_frames, feat_dim};

    Ort::Value mel = Ort::Value::CreateTensor<float>(
        model_->Allocator(), shape.data(), shape.size());
@@ -123,7 +143,7 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
    std::copy(f.begin(), f.end(), p_mel);

    memset(p_mel + f.size(), 0,
-           (max_num_frames - num_frames) * feat_dim * sizeof(float));
+           (actual_frames - num_frames) * feat_dim * sizeof(float));
    mel = Transpose12(model_->Allocator(), &mel);

    try {
--- a/sherpa-onnx/csrc/offline-whisper-model-config.cc
+++ b/sherpa-onnx/csrc/offline-whisper-model-config.cc
@@ -32,6 +32,14 @@ void OfflineWhisperModelConfig::Register(ParseOptions *po) {
               "Valid values: transcribe, translate. "
               "Note that for non-multilingual models, it supports "
               "only 'transcribe'");
+
+  po->Register(
+      "whisper-tail-paddings", &tail_paddings,
+      "Suggest value: 50 for English models. 300 for multilingual models. "
+      "Since we have removed the 30-second constraint, we need to add some "
+      "tail padding frames "
+      "so that whisper can detect the eot token. Leave it to -1 to use 50 for "
+      "English models and 300 for multilingual models.");
 }

 bool OfflineWhisperModelConfig::Validate() const {
@@ -63,7 +71,8 @@ std::string OfflineWhisperModelConfig::ToString() const {
  os << "encoder=\"" << encoder << "\", ";
  os << "decoder=\"" << decoder << "\", ";
  os << "language=\"" << language << "\", ";
-  os << "task=\"" << task << "\")";
+  os << "task=\"" << task << "\", ";
+  os << "tail_paddings=" << tail_paddings << ")";

  return os.str();
 }
--- a/sherpa-onnx/csrc/offline-whisper-model-config.h
+++ b/sherpa-onnx/csrc/offline-whisper-model-config.h
@@ -28,12 +28,26 @@ struct OfflineWhisperModelConfig {
  // Note: For non-multilingual models, it supports only "transcribe"
  std::string task = "transcribe";

+  // Number of tail padding frames.
+  //
+  // Since we removed the 30-second constraint, we need to add some padding
+  // frames at the end.
+  //
+  // Recommended values:
+  //   - 50 for English models
+  //   - 300 for multilingual models
+  int32_t tail_paddings = -1;
+
  OfflineWhisperModelConfig() = default;
  OfflineWhisperModelConfig(const std::string &encoder,
                            const std::string &decoder,
                            const std::string &language,
-                            const std::string &task)
-      : encoder(encoder), decoder(decoder), language(language), task(task) {}
+                            const std::string &task, int32_t tail_paddings)
+      : encoder(encoder),
+        decoder(decoder),
+        language(language),
+        task(task),
+        tail_paddings(tail_paddings) {}

  void Register(ParseOptions *po);
  bool Validate() const;