Support spoken language identification with whisper (#694)

2024-03-24 22:57:00 +08:00
parent 3cdad9b5d1
commit 0d258dd150
36 changed files with 1173 additions and 200 deletions
--- a/sherpa-onnx/csrc/spoken-language-identification.h
+++ b/sherpa-onnx/csrc/spoken-language-identification.h
@@ -0,0 +1,89 @@
+// sherpa-onnx/csrc/spoken-language-identification.h
+//
+// Copyright (c)  2024  Xiaomi Corporation
+#ifndef SHERPA_ONNX_CSRC_SPOKEN_LANGUAGE_IDENTIFICATION_H_
+#define SHERPA_ONNX_CSRC_SPOKEN_LANGUAGE_IDENTIFICATION_H_
+
+#include <memory>
+#include <string>
+
+#include "sherpa-onnx/csrc/offline-stream.h"
+#include "sherpa-onnx/csrc/parse-options.h"
+
+namespace sherpa_onnx {
+
+struct SpokenLanguageIdentificationWhisperConfig {
+  // Requires a multi-lingual whisper model.
+  // That is, it supports only tiny, base, small, medium, large.
+  // Note: It does NOT support tiny.en, base.en, small.en, medium.en
+  std::string encoder;
+  std::string decoder;
+
+  // Number of tail padding frames.
+  //
+  // Since we remove the 30-second constraint, we need to add some paddings
+  // at the end.
+  //
+  // Recommended values:
+  //   - 50 for English models
+  //   - 300 for multilingual models
+  int32_t tail_paddings = -1;
+
+  SpokenLanguageIdentificationWhisperConfig() = default;
+
+  SpokenLanguageIdentificationWhisperConfig(const std::string &encoder,
+                                            const std::string &decoder,
+                                            int32_t tail_paddings)
+      : encoder(encoder), decoder(decoder), tail_paddings(tail_paddings) {}
+
+  void Register(ParseOptions *po);
+  bool Validate() const;
+  std::string ToString() const;
+};
+
+struct SpokenLanguageIdentificationConfig {
+  SpokenLanguageIdentificationWhisperConfig whisper;
+
+  int32_t num_threads = 1;
+  bool debug = false;
+  std::string provider = "cpu";
+
+  SpokenLanguageIdentificationConfig() = default;
+
+  SpokenLanguageIdentificationConfig(
+      const SpokenLanguageIdentificationWhisperConfig &whisper,
+      int32_t num_threads, bool debug, const std::string &provider)
+      : whisper(whisper),
+        num_threads(num_threads),
+        debug(debug),
+        provider(provider) {}
+
+  void Register(ParseOptions *po);
+  bool Validate() const;
+  std::string ToString() const;
+};
+
+class SpokenLanguageIdentificationImpl;
+
+class SpokenLanguageIdentification {
+ public:
+  explicit SpokenLanguageIdentification(
+      const SpokenLanguageIdentificationConfig &config);
+
+  ~SpokenLanguageIdentification();
+
+  // Create a stream to accept audio samples and compute features
+  std::unique_ptr<OfflineStream> CreateStream() const;
+
+  // Return a string containing the language, e.g., en, zh, de,
+  // etc.
+  // Note: en is for English, zh is for Chinese, de is for German, etc.
+  std::string Compute(OfflineStream *s) const;
+
+ private:
+  std::unique_ptr<SpokenLanguageIdentificationImpl> impl_;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_SPOKEN_LANGUAGE_IDENTIFICATION_H_