Add JavaScript (node-addon) API for speech enhancement GTCRN models (#1996)

2025-03-12 15:52:01 +08:00
parent fd78a482df
commit 6a97f8adcf
21 changed files with 500 additions and 119 deletions
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -922,22 +922,23 @@ struct SherpaOnnxCircularBuffer {
  std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
 };

-SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
+const SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
+    int32_t capacity) {
  SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer;
  buffer->impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);
  return buffer;
 }

-void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
+void SherpaOnnxDestroyCircularBuffer(const SherpaOnnxCircularBuffer *buffer) {
  delete buffer;
 }

-void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
+void SherpaOnnxCircularBufferPush(const SherpaOnnxCircularBuffer *buffer,
                                  const float *p, int32_t n) {
  buffer->impl->Push(p, n);
 }

-const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
+const float *SherpaOnnxCircularBufferGet(const SherpaOnnxCircularBuffer *buffer,
                                         int32_t start_index, int32_t n) {
  std::vector<float> v = buffer->impl->Get(start_index, n);

@@ -948,19 +949,20 @@ const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,

 void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; }

-void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
+void SherpaOnnxCircularBufferPop(const SherpaOnnxCircularBuffer *buffer,
+                                 int32_t n) {
  buffer->impl->Pop(n);
 }

-int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
+int32_t SherpaOnnxCircularBufferSize(const SherpaOnnxCircularBuffer *buffer) {
  return buffer->impl->Size();
 }

-int32_t SherpaOnnxCircularBufferHead(SherpaOnnxCircularBuffer *buffer) {
+int32_t SherpaOnnxCircularBufferHead(const SherpaOnnxCircularBuffer *buffer) {
  return buffer->impl->Head();
 }

-void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
+void SherpaOnnxCircularBufferReset(const SherpaOnnxCircularBuffer *buffer) {
  buffer->impl->Reset();
 }

@@ -1008,7 +1010,7 @@ sherpa_onnx::VadModelConfig GetVadModelConfig(
  return vad_config;
 }

-SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
+const SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
  auto vad_config = GetVadModelConfig(config);

@@ -1025,35 +1027,37 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
 }

 void SherpaOnnxDestroyVoiceActivityDetector(
-    SherpaOnnxVoiceActivityDetector *p) {
+    const SherpaOnnxVoiceActivityDetector *p) {
  delete p;
 }

 void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
-    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
+    const SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
  p->impl->AcceptWaveform(samples, n);
 }

 int32_t SherpaOnnxVoiceActivityDetectorEmpty(
-    SherpaOnnxVoiceActivityDetector *p) {
+    const SherpaOnnxVoiceActivityDetector *p) {
  return p->impl->Empty();
 }

 int32_t SherpaOnnxVoiceActivityDetectorDetected(
-    SherpaOnnxVoiceActivityDetector *p) {
+    const SherpaOnnxVoiceActivityDetector *p) {
  return p->impl->IsSpeechDetected();
 }

-void SherpaOnnxVoiceActivityDetectorPop(SherpaOnnxVoiceActivityDetector *p) {
+void SherpaOnnxVoiceActivityDetectorPop(
+    const SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Pop();
 }

-void SherpaOnnxVoiceActivityDetectorClear(SherpaOnnxVoiceActivityDetector *p) {
+void SherpaOnnxVoiceActivityDetectorClear(
+    const SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Clear();
 }

 const SherpaOnnxSpeechSegment *SherpaOnnxVoiceActivityDetectorFront(
-    SherpaOnnxVoiceActivityDetector *p) {
+    const SherpaOnnxVoiceActivityDetector *p) {
  const sherpa_onnx::SpeechSegment &segment = p->impl->Front();

  SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment;
@@ -1072,11 +1076,13 @@ void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
  }
 }

-void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
+void SherpaOnnxVoiceActivityDetectorReset(
+    const SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Reset();
 }

-void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
+void SherpaOnnxVoiceActivityDetectorFlush(
+    const SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Flush();
 }

@@ -1915,7 +1921,7 @@ struct SherpaOnnxLinearResampler {
  std::unique_ptr<sherpa_onnx::LinearResample> impl;
 };

-SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
+const SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
    int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
    int32_t num_zeros) {
  SherpaOnnxLinearResampler *p = new SherpaOnnxLinearResampler;
@@ -1925,12 +1931,12 @@ SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
  return p;
 }

-void SherpaOnnxDestroyLinearResampler(SherpaOnnxLinearResampler *p) {
+void SherpaOnnxDestroyLinearResampler(const SherpaOnnxLinearResampler *p) {
  delete p;
 }

 const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
-    SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
+    const SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
    int32_t flush) {
  std::vector<float> o;
  p->impl->Resample(input, input_dim, flush, &o);
@@ -2320,7 +2326,7 @@ const SherpaOnnxOfflineSpeechDenoiser *
 SherpaOnnxCreateOfflineSpeechDenoiserOHOS(
    const SherpaOnnxOfflineSpeechDenoiserConfig *config,
    NativeResourceManager *mgr) {
-  auto sd_config = GetOfflineSpeechDenoiserConfia(config);
+  auto sd_config = GetOfflineSpeechDenoiserConfig(config);

  SherpaOnnxOfflineSpeechDenoiser *sd = new SherpaOnnxOfflineSpeechDenoiser;

@@ -2361,7 +2367,8 @@ const SherpaOnnxOfflineRecognizer *SherpaOnnxCreateOfflineRecognizerOHOS(
  return recognizer;
 }

-SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetectorOHOS(
+const SherpaOnnxVoiceActivityDetector *
+SherpaOnnxCreateVoiceActivityDetectorOHOS(
    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds,
    NativeResourceManager *mgr) {
  if (mgr == nullptr) {
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -808,15 +808,15 @@ SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
 // Return an instance of circular buffer. The user has to use
 // SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
 // memory leak.
-SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
+SHERPA_ONNX_API const SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
    int32_t capacity);

 // Free the pointer returned by SherpaOnnxCreateCircularBuffer()
 SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
-    SherpaOnnxCircularBuffer *buffer);
+    const SherpaOnnxCircularBuffer *buffer);

 SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
-    SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
+    const SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);

 // Return n samples starting at the given index.
 //
@@ -824,27 +824,27 @@ SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
 // The user has to use SherpaOnnxCircularBufferFree() to free the returned
 // pointer to avoid memory leak.
 SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
-    SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
+    const SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);

 // Free the pointer returned by SherpaOnnxCircularBufferGet().
 SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);

 // Remove n elements from the buffer
 SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
-    SherpaOnnxCircularBuffer *buffer, int32_t n);
+    const SherpaOnnxCircularBuffer *buffer, int32_t n);

 // Return number of elements in the buffer.
 SHERPA_ONNX_API int32_t
-SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
+SherpaOnnxCircularBufferSize(const SherpaOnnxCircularBuffer *buffer);

 // Return the head of the buffer. It's always non-decreasing until you
 // invoke SherpaOnnxCircularBufferReset() which resets head to 0.
 SHERPA_ONNX_API int32_t
-SherpaOnnxCircularBufferHead(SherpaOnnxCircularBuffer *buffer);
+SherpaOnnxCircularBufferHead(const SherpaOnnxCircularBuffer *buffer);

 // Clear all elements in the buffer
 SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
-    SherpaOnnxCircularBuffer *buffer);
+    const SherpaOnnxCircularBuffer *buffer);

 SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
  // The start index in samples of this segment
@@ -862,40 +862,40 @@ typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
 // Return an instance of VoiceActivityDetector.
 // The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
 // the returned pointer to avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
+SHERPA_ONNX_API const SherpaOnnxVoiceActivityDetector *
 SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
                                      float buffer_size_in_seconds);

 SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
-    SherpaOnnxVoiceActivityDetector *p);
+    const SherpaOnnxVoiceActivityDetector *p);

 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
-    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
+    const SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);

 // Return 1 if there are no speech segments available.
 // Return 0 if there are speech segments.
 SHERPA_ONNX_API int32_t
-SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
+SherpaOnnxVoiceActivityDetectorEmpty(const SherpaOnnxVoiceActivityDetector *p);

 // Return 1 if there is voice detected.
 // Return 0 if voice is silent.
-SHERPA_ONNX_API int32_t
-SherpaOnnxVoiceActivityDetectorDetected(SherpaOnnxVoiceActivityDetector *p);
+SHERPA_ONNX_API int32_t SherpaOnnxVoiceActivityDetectorDetected(
+    const SherpaOnnxVoiceActivityDetector *p);

 // Remove the first speech segment.
 // It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
-    SherpaOnnxVoiceActivityDetector *p);
+    const SherpaOnnxVoiceActivityDetector *p);

 // Clear current speech segments.
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorClear(
-    SherpaOnnxVoiceActivityDetector *p);
+    const SherpaOnnxVoiceActivityDetector *p);

 // Return the first speech segment.
 // The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
 // pointer to avoid memory leak.
 SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
-SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);
+SherpaOnnxVoiceActivityDetectorFront(const SherpaOnnxVoiceActivityDetector *p);

 // Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
 SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
@@ -903,10 +903,10 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(

 // Re-initialize the voice activity detector.
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
-    SherpaOnnxVoiceActivityDetector *p);
+    const SherpaOnnxVoiceActivityDetector *p);

 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
-    SherpaOnnxVoiceActivityDetector *p);
+    const SherpaOnnxVoiceActivityDetector *p);

 // ============================================================
 // For offline Text-to-Speech (i.e., non-streaming TTS)
@@ -1481,15 +1481,16 @@ SHERPA_ONNX_API typedef struct SherpaOnnxLinearResampler
 */
 // The user has to invoke SherpaOnnxDestroyLinearResampler()
 // to free the returned pointer to avoid memory leak
-SHERPA_ONNX_API SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
-    int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
-    int32_t num_zeros);
+SHERPA_ONNX_API const SherpaOnnxLinearResampler *
+SherpaOnnxCreateLinearResampler(int32_t samp_rate_in_hz,
+                                int32_t samp_rate_out_hz,
+                                float filter_cutoff_hz, int32_t num_zeros);

 SHERPA_ONNX_API void SherpaOnnxDestroyLinearResampler(
-    SherpaOnnxLinearResampler *p);
+    const SherpaOnnxLinearResampler *p);

 SHERPA_ONNX_API void SherpaOnnxLinearResamplerReset(
-    SherpaOnnxLinearResampler *p);
+    const SherpaOnnxLinearResampler *p);

 typedef struct SherpaOnnxResampleOut {
  const float *samples;
@@ -1501,7 +1502,7 @@ typedef struct SherpaOnnxResampleOut {
 // If this is the last segment, you can set flush to 1; otherwise, please
 // set flush to 0
 SHERPA_ONNX_API const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
-    SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
+    const SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
    int32_t flush);

 SHERPA_ONNX_API void SherpaOnnxLinearResamplerResampleFree(
@@ -1724,7 +1725,7 @@ SherpaOnnxCreateOfflineRecognizerOHOS(
 // Return an instance of VoiceActivityDetector.
 // The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
 // the returned pointer to avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
+SHERPA_ONNX_API const SherpaOnnxVoiceActivityDetector *
 SherpaOnnxCreateVoiceActivityDetectorOHOS(
    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds,
    NativeResourceManager *mgr);