Limit number of tokens per second for whisper. (#1958)

Otherwise, the decoder spends excessive time in the decoding loop
when the EOT token is never predicted.
This commit is contained in:
Fangjun Kuang
2025-03-04 15:45:28 +08:00
committed by GitHub
parent 49177530ff
commit 209eaaae1d
4 changed files with 14 additions and 6 deletions

View File

@@ -19,7 +19,8 @@ void OfflineWhisperGreedySearchDecoder::SetConfig(
std::vector<OfflineWhisperDecoderResult>
OfflineWhisperGreedySearchDecoder::Decode(Ort::Value cross_k,
Ort::Value cross_v) {
Ort::Value cross_v,
int32_t num_feature_frames) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
@@ -99,7 +100,12 @@ OfflineWhisperGreedySearchDecoder::Decode(Ort::Value cross_k,
int32_t n_text_ctx = model_->TextCtx();
std::vector<int32_t> predicted_tokens;
for (int32_t i = 0; i < n_text_ctx / 2; ++i) {
// assume at most 6 tokens per second
int32_t num_possible_tokens = num_feature_frames / 100 * 6;
num_possible_tokens = std::min<int32_t>(num_possible_tokens, n_text_ctx / 2);
for (int32_t i = 0; i < num_possible_tokens; ++i) {
if (max_token_id == model_->EOT()) {
break;
}