Support spoken language identification with whisper (#694)

2024-03-24 22:57:00 +08:00
parent 3cdad9b5d1
commit 0d258dd150
36 changed files with 1173 additions and 200 deletions
--- a/sherpa-onnx/python/csrc/CMakeLists.txt
+++ b/sherpa-onnx/python/csrc/CMakeLists.txt
@@ -33,6 +33,7 @@ set(srcs
  silero-vad-model-config.cc
  speaker-embedding-extractor.cc
  speaker-embedding-manager.cc
+  spoken-language-identification.cc
  vad-model-config.cc
  vad-model.cc
  voice-activity-detector.cc
--- a/sherpa-onnx/python/csrc/sherpa-onnx.cc
+++ b/sherpa-onnx/python/csrc/sherpa-onnx.cc
@@ -22,6 +22,7 @@
 #include "sherpa-onnx/python/csrc/online-stream.h"
 #include "sherpa-onnx/python/csrc/speaker-embedding-extractor.h"
 #include "sherpa-onnx/python/csrc/speaker-embedding-manager.h"
+#include "sherpa-onnx/python/csrc/spoken-language-identification.h"
 #include "sherpa-onnx/python/csrc/vad-model-config.h"
 #include "sherpa-onnx/python/csrc/vad-model.h"
 #include "sherpa-onnx/python/csrc/voice-activity-detector.h"
@@ -55,6 +56,7 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
  PybindOfflineTts(&m);
  PybindSpeakerEmbeddingExtractor(&m);
  PybindSpeakerEmbeddingManager(&m);
+  PybindSpokenLanguageIdentification(&m);

  PybindAlsa(&m);
 }
--- a/sherpa-onnx/python/csrc/spoken-language-identification.cc
+++ b/sherpa-onnx/python/csrc/spoken-language-identification.cc
@@ -0,0 +1,60 @@
+// sherpa-onnx/python/csrc/spoken-language-identification.cc
+//
+// Copyright (c)  2024  Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/spoken-language-identification.h"
+
+#include <string>
+
+#include "sherpa-onnx/csrc/spoken-language-identification.h"
+
+namespace sherpa_onnx {
+
+static void PybindSpokenLanguageIdentificationWhisperConfig(py::module *m) {
+  // Exposes the whisper config fields and its Validate/ToString to Python.
+  using Config = SpokenLanguageIdentificationWhisperConfig;
+  py::class_<Config> cls(*m, "SpokenLanguageIdentificationWhisperConfig");
+  cls.def(py::init<>());
+  cls.def(py::init<const std::string &, const std::string &, int32_t>(),
+          py::arg("encoder"), py::arg("decoder"),
+          py::arg("tail_paddings") = -1);
+  cls.def_readwrite("encoder", &Config::encoder);
+  cls.def_readwrite("decoder", &Config::decoder);
+  cls.def_readwrite("tail_paddings", &Config::tail_paddings);
+  cls.def("validate", &Config::Validate);
+  cls.def("__str__", &Config::ToString);
+}
+
+static void PybindSpokenLanguageIdentificationConfig(py::module *m) {
+  // Binds the top-level config; the nested whisper config is bound first.
+  PybindSpokenLanguageIdentificationWhisperConfig(m);
+  using PyClass = SpokenLanguageIdentificationConfig;
+  // In the keyword constructor below only `whisper` has no default value.
+  py::class_<PyClass>(*m, "SpokenLanguageIdentificationConfig")
+      .def(py::init<>())
+      .def(py::init<const SpokenLanguageIdentificationWhisperConfig &, int32_t,
+                    bool, const std::string &>(),
+           py::arg("whisper"), py::arg("num_threads") = 1,
+           py::arg("debug") = false, py::arg("provider") = "cpu")
+      .def_readwrite("whisper", &PyClass::whisper)
+      .def_readwrite("num_threads", &PyClass::num_threads)
+      .def_readwrite("debug", &PyClass::debug)
+      .def_readwrite("provider", &PyClass::provider)
+      .def("validate", &PyClass::Validate)
+      .def("__str__", &PyClass::ToString);
+}
+
+void PybindSpokenLanguageIdentification(py::module *m) {
+  // The config classes are registered before the class that consumes them.
+  PybindSpokenLanguageIdentificationConfig(m);
+  using Slid = SpokenLanguageIdentification;
+  // Every bound call below runs under a py::gil_scoped_release guard.
+  using NoGil = py::call_guard<py::gil_scoped_release>;
+  py::class_<Slid> cls(*m, "SpokenLanguageIdentification");
+  cls.def(py::init<const SpokenLanguageIdentificationConfig &>(),
+          py::arg("config"), NoGil());
+  cls.def("create_stream", &Slid::CreateStream, NoGil());
+  cls.def("compute", &Slid::Compute, NoGil());
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/spoken-language-identification.h
+++ b/sherpa-onnx/python/csrc/spoken-language-identification.h
@@ -0,0 +1,16 @@
+// sherpa-onnx/python/csrc/spoken-language-identification.h
+//
+// Copyright (c)  2024  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_SPOKEN_LANGUAGE_IDENTIFICATION_H_
+#define SHERPA_ONNX_PYTHON_CSRC_SPOKEN_LANGUAGE_IDENTIFICATION_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+// Registers the spoken language identification classes on the module *m.
+void PybindSpokenLanguageIdentification(py::module *m);
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_SPOKEN_LANGUAGE_IDENTIFICATION_H_