Fix C api for Go and MFC to support streaming paraformer (#268)

This commit is contained in:
Fangjun Kuang
2023-08-14 17:02:23 +08:00
committed by GitHub
parent eb5ae18015
commit bc791d4996
13 changed files with 315 additions and 74 deletions

View File

@@ -306,12 +306,10 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
"\r\n";
msg += "to download a non-streaming model, i.e., an offline model.\r\n";
msg += "You need to rename them after downloading\r\n\r\n";
msg += "It supports transducer, paraformer, and whisper models.\r\n\r\n";
msg +=
"You need to rename them to encoder.onnx, decoder.onnx, and "
"joiner.onnx correspoondingly.\r\n\r\n";
msg += "It supports both transducer models and paraformer models.\r\n\r\n";
msg +=
"We give two examples below to show you how to download models\r\n\r\n";
"We give three examples below to show you how to download models\r\n\r\n";
msg += "(1) Transducer\r\n\r\n";
msg +=
"We use "
@@ -346,13 +344,82 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
"https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/"
"resolve/main/tokens.txt\r\n\r\n";
msg += "\r\n Now rename them\r\n";
msg += "mv model.onnx paraformer.onnx\r\n";
msg += "mv model.onnx paraformer.onnx\r\n\r\n";
msg += "(3) Whisper\r\n\r\n";
msg +=
"wget "
"https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/"
"main/tiny.en-encoder.onnx\r\n";
msg +=
"wget "
"https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/"
"main/tiny.en-decoder.onnx\r\n";
msg +=
"wget "
"https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/"
"main/tiny.en-tokens.txt\r\n";
msg += "\r\n Now rename them\r\n";
msg += "mv tiny.en-encoder.onnx whisper-encoder.onnx\r\n";
msg += "mv tiny.en-decoder.onnx whisper-decoder.onnx\r\n";
msg += "\r\n";
msg += "That's it!\r\n";
AppendLineToMultilineEditCtrl(msg);
}
// Configures the non-streaming (offline) recognizer for a Whisper model.
//
// Looks for whisper-encoder.onnx / whisper-decoder.onnx (preferring the
// int8-quantized variants when present) plus tokens.txt in the current
// directory.  If any required file is missing, the download help message is
// shown and no recognizer is created.
void CNonStreamingSpeechRecognitionDlg::InitWhisper() {
  std::string encoder_path = "./whisper-encoder.onnx";
  std::string decoder_path = "./whisper-decoder.onnx";
  std::string tokens_path = "./tokens.txt";

  bool all_files_found = true;

  // Prefer the int8-quantized encoder when available.
  if (Exists("./whisper-encoder.int8.onnx")) {
    encoder_path = "./whisper-encoder.int8.onnx";
  } else if (!Exists(encoder_path)) {
    std::string msg = encoder_path + " does not exist!";
    AppendLineToMultilineEditCtrl(msg);
    all_files_found = false;
  }

  // Prefer the int8-quantized decoder when available.
  if (Exists("./whisper-decoder.int8.onnx")) {
    decoder_path = "./whisper-decoder.int8.onnx";
  } else if (!Exists(decoder_path)) {
    std::string msg = decoder_path + " does not exist!";
    AppendLineToMultilineEditCtrl(msg);
    all_files_found = false;
  }

  if (!Exists(tokens_path)) {
    std::string msg = tokens_path + " does not exist!";
    AppendLineToMultilineEditCtrl(msg);
    all_files_found = false;
  }

  if (!all_files_found) {
    ShowInitRecognizerHelpMessage();
    return;
  }

  memset(&config_, 0, sizeof(config_));

  // Feature extraction settings.
  config_.feat_config.sample_rate = 16000;
  config_.feat_config.feature_dim = 80;

  // Model settings.
  // NOTE(review): config_ stores c_str() pointers into these local strings,
  // which die when this function returns; presumably CreateOfflineRecognizer
  // copies everything it needs before then — confirm, since config_ is a
  // member that outlives this scope.
  config_.model_config.whisper.encoder = encoder_path.c_str();
  config_.model_config.whisper.decoder = decoder_path.c_str();
  config_.model_config.tokens = tokens_path.c_str();
  config_.model_config.num_threads = 1;
  config_.model_config.debug = 1;
  config_.model_config.model_type = "whisper";

  // Decoding settings.
  config_.decoding_method = "greedy_search";
  config_.max_active_paths = 4;

  recognizer_ = CreateOfflineRecognizer(&config_);
}
void CNonStreamingSpeechRecognitionDlg::InitParaformer() {
std::string paraformer = "./paraformer.onnx";
std::string tokens = "./tokens.txt";
@@ -401,6 +468,11 @@ void CNonStreamingSpeechRecognitionDlg::InitRecognizer() {
return;
}
if (Exists("./whisper-encoder.onnx") || Exists("./whisper-encoder.int8.onnx")) {
InitWhisper();
return;
}
// assume it is transducer
std::string encoder = "./encoder.onnx";

View File

@@ -69,5 +69,6 @@ class CNonStreamingSpeechRecognitionDlg : public CDialogEx {
void InitRecognizer();
void InitParaformer();
void InitWhisper();
void ShowInitRecognizerHelpMessage();
};

View File

@@ -234,7 +234,137 @@ bool CStreamingSpeechRecognitionDlg::Exists(const std::string &filename) {
return is.good();
}
// Shows step-by-step download/rename instructions for streaming (online)
// models and disables the start button, since no usable model was found.
// Covers two examples: a zipformer transducer and a bilingual paraformer.
//
// Fix: the three transducer wget commands previously printed
// "wget https:// huggingface.co/..." with a stray space after "https://",
// so the commands failed when copied verbatim.  The space is removed.
void CStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
  my_btn_.EnableWindow(FALSE);
  std::string msg =
      "\r\nPlease go to\r\n"
      "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
      "\r\n";
  msg += "to download a streaming model, i.e., an online model.\r\n";
  msg += "You need to rename them after downloading\r\n\r\n";
  msg += "It supports both transducer and paraformer models.\r\n\r\n";
  msg +=
      "We give two examples below to show you how to download models\r\n\r\n";

  // Example 1: streaming zipformer transducer.
  msg += "(1) Transducer\r\n\r\n";
  msg +=
      "https://huggingface.co/pkufool/"
      "icefall-asr-zipformer-streaming-wenetspeech-20230615";
  msg += "\r\n\r\n";
  msg +=
      "wget "
      "https://huggingface.co/pkufool/"
      "icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/"
      "encoder-epoch-12-avg-4-chunk-16-left-128.onnx\r\n";
  msg +=
      "wget "
      "https://huggingface.co/pkufool/"
      "icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/"
      "decoder-epoch-12-avg-4-chunk-16-left-128.onnx\r\n";
  msg +=
      "wget "
      "https://huggingface.co/pkufool/"
      "icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/"
      "joiner-epoch-12-avg-4-chunk-16-left-128.onnx\r\n";
  msg +=
      "wget "
      "https://huggingface.co/pkufool/"
      "icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/"
      "data/lang_char/tokens.txt\r\n";
  msg += "\r\nNow rename them.\r\n";
  msg += "mv encoder-epoch-12-avg-4-chunk-16-left-128.onnx encoder.onnx\r\n";
  msg += "mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx\r\n";
  msg += "mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx\r\n";
  msg += "\r\n";

  // Example 2: streaming bilingual (zh-en) paraformer.
  msg += "(2) Paraformer\r\n\r\n";
  msg +=
      "wget "
      "https://huggingface.co/csukuangfj/"
      "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/"
      "encoder.int8.onnx\r\n";
  msg +=
      "wget "
      "https://huggingface.co/csukuangfj/"
      "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/"
      "decoder.int8.onnx\r\n";
  msg +=
      "wget "
      "https://huggingface.co/csukuangfj/"
      "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/"
      "tokens.txt\r\n";
  msg += "\r\nNow rename them.\r\n";
  msg += "mv encoder.int8.onnx paraformer-encoder.onnx\r\n";
  msg += "mv decoder.int8.onnx paraformer-decoder.onnx\r\n\r\n";
  msg += "That's it!\r\n";
  AppendLineToMultilineEditCtrl(msg);
}
// Configures the streaming (online) recognizer for a paraformer model.
//
// Looks for paraformer-encoder.onnx / paraformer-decoder.onnx (preferring
// the int8-quantized variants when present) plus tokens.txt in the current
// directory.  If any required file is missing, the download help message is
// shown and no recognizer is created.
void CStreamingSpeechRecognitionDlg::InitParaformer() {
  std::string encoder_path = "./paraformer-encoder.onnx";
  std::string decoder_path = "./paraformer-decoder.onnx";
  std::string tokens_path = "./tokens.txt";

  bool all_files_found = true;

  // Prefer the int8-quantized encoder when available.
  if (Exists("./paraformer-encoder.int8.onnx")) {
    encoder_path = "./paraformer-encoder.int8.onnx";
  } else if (!Exists(encoder_path)) {
    std::string msg = encoder_path + " does not exist!";
    AppendLineToMultilineEditCtrl(msg);
    all_files_found = false;
  }

  // Prefer the int8-quantized decoder when available.
  if (Exists("./paraformer-decoder.int8.onnx")) {
    decoder_path = "./paraformer-decoder.int8.onnx";
  } else if (!Exists(decoder_path)) {
    std::string msg = decoder_path + " does not exist!";
    AppendLineToMultilineEditCtrl(msg);
    all_files_found = false;
  }

  if (!Exists(tokens_path)) {
    std::string msg = tokens_path + " does not exist!";
    AppendLineToMultilineEditCtrl(msg);
    all_files_found = false;
  }

  if (!all_files_found) {
    ShowInitRecognizerHelpMessage();
    return;
  }

  SherpaOnnxOnlineRecognizerConfig config;
  memset(&config, 0, sizeof(config));

  // Feature extraction settings.
  config.feat_config.sample_rate = 16000;
  config.feat_config.feature_dim = 80;

  // Model settings.
  config.model_config.debug = 0;
  config.model_config.num_threads = 1;
  config.model_config.provider = "cpu";
  config.model_config.tokens = tokens_path.c_str();
  config.model_config.paraformer.encoder = encoder_path.c_str();
  config.model_config.paraformer.decoder = decoder_path.c_str();

  // Decoding and endpoint-detection settings.
  config.decoding_method = "greedy_search";
  config.max_active_paths = 4;
  config.enable_endpoint = 1;
  config.rule1_min_trailing_silence = 1.2f;
  config.rule2_min_trailing_silence = 0.8f;
  config.rule3_min_utterance_length = 300.0f;

  recognizer_ = CreateOnlineRecognizer(&config);
}
void CStreamingSpeechRecognitionDlg::InitRecognizer() {
if (Exists("./paraformer-encoder.onnx") || Exists("./paraformer-encoder.int8.onnx")) {
InitParaformer();
return;
}
std::string encoder = "./encoder.onnx";
std::string decoder = "./decoder.onnx";
std::string joiner = "./joiner.onnx";
@@ -266,55 +396,12 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() {
}
if (!is_ok) {
my_btn_.EnableWindow(FALSE);
std::string msg =
"\r\nPlease go to\r\n"
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
"\r\n";
msg += "to download a streaming model, i.e., an online model.\r\n";
msg +=
"You need to rename them to encoder.onnx, decoder.onnx, and "
"joiner.onnx correspoondingly.\r\n\r\n";
msg +=
"We use the following model as an example to show you how to do "
"that.\r\n";
msg +=
"https://huggingface.co/pkufool/"
"icefall-asr-zipformer-streaming-wenetspeech-20230615";
msg += "\r\n\r\n";
msg +=
"wget https:// "
"huggingface.co/pkufool/"
"icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/"
"encoder-epoch-12-avg-4-chunk-16-left-128.onnx\r\n";
msg +=
"wget https:// "
"huggingface.co/pkufool/"
"icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/"
"decoder-epoch-12-avg-4-chunk-16-left-128.onnx\r\n";
msg +=
"wget https:// "
"huggingface.co/pkufool/"
"icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/"
"joiner-epoch-12-avg-4-chunk-16-left-128.onnx\r\n";
msg +=
"wget "
"https://huggingface.co/pkufool/"
"icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/"
"data/lang_char/tokens.txt\r\n";
msg += "\r\nNow rename them.\r\n";
msg += "mv encoder-epoch-12-avg-4-chunk-16-left-128.onnx encoder.onnx\r\n";
msg += "mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx\r\n";
msg += "mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx\r\n";
msg += "\r\n";
msg += "That's it!\r\n";
AppendLineToMultilineEditCtrl(msg);
ShowInitRecognizerHelpMessage();
return;
}
SherpaOnnxOnlineRecognizerConfig config;
memset(&config, 0, sizeof(config));
config.model_config.debug = 0;
config.model_config.num_threads = 1;
config.model_config.provider = "cpu";
@@ -331,9 +418,9 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() {
config.rule3_min_utterance_length = 300.0f;
config.model_config.tokens = tokens.c_str();
config.model_config.encoder = encoder.c_str();
config.model_config.decoder = decoder.c_str();
config.model_config.joiner = joiner.c_str();
config.model_config.transducer.encoder = encoder.c_str();
config.model_config.transducer.decoder = decoder.c_str();
config.model_config.transducer.joiner = joiner.c_str();
recognizer_ = CreateOnlineRecognizer(&config);
}

View File

@@ -67,6 +67,8 @@ class CStreamingSpeechRecognitionDlg : public CDialogEx {
bool Exists(const std::string &filename);
void InitRecognizer();
void InitParaformer();
void ShowInitRecognizerHelpMessage();
};
class RecognizerThread : public CWinThread {