Add C++ example for streaming ASR with SenseVoice. (#2199)

2025-05-11 00:23:32 +08:00
parent fc2121c307
commit 028b8f2718
16 changed files with 514 additions and 60 deletions
--- a/sherpa-onnx/c-api/cxx-api.cc
+++ b/sherpa-onnx/c-api/cxx-api.cc
@@ -678,4 +678,42 @@ void VoiceActivityDetector::Flush() const {
  SherpaOnnxVoiceActivityDetectorFlush(p_);
 }

+// Builds a resampler with the C API, forwarding all four arguments unchanged,
+// and wraps the returned handle in a LinearResampler.
+LinearResampler LinearResampler::Create(int32_t samp_rate_in_hz,
+                                        int32_t samp_rate_out_hz,
+                                        float filter_cutoff_hz,
+                                        int32_t num_zeros) {
+  return LinearResampler(SherpaOnnxCreateLinearResampler(
+      samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros));
+}
+
+LinearResampler::LinearResampler(const SherpaOnnxLinearResampler *p)
+    : MoveOnly<LinearResampler, SherpaOnnxLinearResampler>(p) {}
+
+void LinearResampler::Destroy(const SherpaOnnxLinearResampler *p) const {
+  SherpaOnnxDestroyLinearResampler(p);  // forward p to the C API destroy call
+}
+
+// Resets the wrapped resampler by forwarding p_ to the C API reset call.
+void LinearResampler::Reset() const {
+  SherpaOnnxLinearResamplerReset(p_);
+}
+
+// Forwards input, input_dim and flush to the C API resampler, copies the n
+// samples it reports into a std::vector, and frees the C-side result before
+// returning the copy.
+std::vector<float> LinearResampler::Resample(const float *input,
+                                             int32_t input_dim,
+                                             bool flush) const {
+  auto *result =
+      SherpaOnnxLinearResamplerResample(p_, input, input_dim, flush);
+
+  const float *first = result->samples;
+  std::vector<float> samples(first, first + result->n);
+
+  // The samples were copied above, so the C result is no longer needed.
+  SherpaOnnxLinearResamplerResampleFree(result);
+
+  return samples;
+}
+
+// Returns the input sample rate that the C API reports for this resampler.
+int32_t LinearResampler::GetInputSamplingRate() const {
+  const int32_t rate =
+      SherpaOnnxLinearResamplerResampleGetInputSampleRate(p_);
+  return rate;
+}
+
+// Returns the output sample rate that the C API reports for this resampler.
+int32_t LinearResampler::GetOutputSamplingRate() const {
+  const int32_t rate =
+      SherpaOnnxLinearResamplerResampleGetOutputSampleRate(p_);
+  return rate;
+}
+
 }  // namespace sherpa_onnx::cxx
--- a/sherpa-onnx/c-api/cxx-api.h
+++ b/sherpa-onnx/c-api/cxx-api.h
@@ -111,6 +111,7 @@ SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave);
 template <typename Derived, typename T>
 class SHERPA_ONNX_API MoveOnly {
 public:
+  MoveOnly() = default;
  explicit MoveOnly(const T *p) : p_(p) {}

  ~MoveOnly() { Destroy(); }
@@ -591,6 +592,28 @@ class SHERPA_ONNX_API VoiceActivityDetector
  explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
 };

+// C++ wrapper around the C API's SherpaOnnxLinearResampler handle. Handle
+// storage comes from the MoveOnly base class.
+class SHERPA_ONNX_API LinearResampler
+    : public MoveOnly<LinearResampler, SherpaOnnxLinearResampler> {
+ public:
+  // Uses MoveOnly's defaulted constructor; no C resampler is created here.
+  LinearResampler() = default;
+
+  // Creates a C resampler from the four arguments, which are forwarded
+  // unchanged to SherpaOnnxCreateLinearResampler, and wraps the result.
+  static LinearResampler Create(int32_t samp_rate_in_hz,
+                                int32_t samp_rate_out_hz,
+                                float filter_cutoff_hz, int32_t num_zeros);
+
+  // Runs the C resampler on input (input_dim and flush are forwarded as-is)
+  // and returns a copy of the samples it produced.
+  std::vector<float> Resample(const float *input, int32_t input_dim,
+                              bool flush) const;
+
+  // Forwards to the C API reset call for the wrapped handle.
+  void Reset() const;
+
+  // Sample rates as reported by the C API for the wrapped handle.
+  int32_t GetInputSamplingRate() const;
+  int32_t GetOutputSamplingRate() const;
+
+  // Passes p to SherpaOnnxDestroyLinearResampler.
+  // NOTE(review): presumably invoked by MoveOnly's destructor -- confirm.
+  void Destroy(const SherpaOnnxLinearResampler *p) const;
+
+ private:
+  // Wraps an existing C handle; used by Create().
+  explicit LinearResampler(const SherpaOnnxLinearResampler *p);
+};
+
 }  // namespace sherpa_onnx::cxx

 #endif  // SHERPA_ONNX_C_API_CXX_API_H_
--- a/sherpa-onnx/csrc/homophone-replacer.cc
+++ b/sherpa-onnx/csrc/homophone-replacer.cc
@@ -166,20 +166,32 @@ class HomophoneReplacer::Impl {
    }

    // convert words to pronunciations
-    std::vector<std::string> pronunciations;
+    std::vector<std::string> current_words;
+    std::vector<std::string> current_pronunciations;

    for (const auto &w : words) {
+      if (w.size() < 3 ||
+          reinterpret_cast<const uint8_t *>(w.data())[0] < 128) {
+        if (!current_words.empty()) {
+          ans += ApplyImpl(current_words, current_pronunciations);
+          current_words.clear();
+          current_pronunciations.clear();
+        }
+        ans += w;
+        continue;
+      }
+
      auto p = ConvertWordToPronunciation(w);
      if (config_.debug) {
        SHERPA_ONNX_LOGE("%s %s", w.c_str(), p.c_str());
      }
-      pronunciations.push_back(std::move(p));
+
+      current_words.push_back(w);
+      current_pronunciations.push_back(std::move(p));
    }

-    for (const auto &r : replacer_list_) {
-      ans = r->Normalize(words, pronunciations);
-      // TODO(fangjun): We support only 1 rule fst at present.
-      break;
+    if (!current_words.empty()) {
+      ans += ApplyImpl(current_words, current_pronunciations);
    }

    if (config_.debug) {
@@ -190,6 +202,16 @@ class HomophoneReplacer::Impl {
  }

 private:
+  // Runs the first entry of replacer_list_ over the given words and their
+  // pronunciations and returns its Normalize() result. Returns an empty
+  // string when replacer_list_ has no entries.
+  std::string ApplyImpl(const std::vector<std::string> &words,
+                        const std::vector<std::string> &pronunciations) const {
+    // TODO(fangjun): We support only 1 rule fst at present.
+    auto first = std::begin(replacer_list_);
+    if (first == std::end(replacer_list_)) {
+      return {};
+    }
+    return (*first)->Normalize(words, pronunciations);
+  }
  std::string ConvertWordToPronunciation(const std::string &word) const {
    if (word2pron_.count(word)) {
      return word2pron_.at(word);
@@ -239,6 +261,9 @@ class HomophoneReplacer::Impl {
      }

      while (iss >> p) {
+        if (p.back() > '4') {
+          p.push_back('1');
+        }
        pron.append(std::move(p));
      }