Add JavaScript API for SenseVoice (#1157)

2024-07-21 10:14:14 +08:00
parent 8f4d332aab
commit c3260ef842
6 changed files with 210 additions and 1 deletions
--- a/.github/scripts/test-nodejs-addon-npm.sh
+++ b/.github/scripts/test-nodejs-addon-npm.sh
@@ -20,6 +20,13 @@ if [[ $arch != "ia32" && $platform != "win32" ]]; then
  node ./test_asr_non_streaming_nemo_ctc.js
  rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  node ./test_asr_non_streaming_sense_voice.js
  rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
  tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
  rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -95,6 +95,7 @@ The following tables list the examples in this folder.
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
 |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
 ## Non-Streaming speech-to-text from a microphone with VAD
@@ -104,6 +105,7 @@ The following tables list the examples in this folder.
 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
 |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
 ## Text-to-speech
@@ -252,6 +254,20 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_paraformer_microphone.js
 ```
 ### Non-streaming speech recognition with SenseVoice
 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
 tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
 rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
 node ./test_asr_non_streaming_sense_voice.js
 # To run VAD + non-streaming ASR with SenseVoice using a microphone
 npm install naudiodon2
 node ./test_vad_asr_non_streaming_sense_voice_microphone.js
 ```
 ### Text-to-speech with piper VITS models (TTS)
 ```bash
--- a/nodejs-addon-examples/package.json
+++ b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
  "dependencies": {
-    "sherpa-onnx-node": "^1.0.30"
+    "sherpa-onnx-node": "^1.10.17"
  }
 }
--- a/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
+++ b/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
@@ -0,0 +1,48 @@
 // Copyright (c)  2024  Xiaomi Corporation
 const sherpa_onnx = require('sherpa-onnx-node');
 // Please download test files from
 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 //
 // Decodes one wave file with a non-streaming SenseVoice model, then prints
 // the recognition result together with the real-time factor (RTF).
 const config = {
  'featConfig': {
    'sampleRate': 16000,
    'featureDim': 80,
  },
  'modelConfig': {
    'senseVoice': {
      'model':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      'useInverseTextNormalization': 1,
    },
    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
 };
 const waveFilename =
    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';
 const recognizer = new sherpa_onnx.OfflineRecognizer(config);
 console.log('Started');
 // The measured interval covers stream creation, reading the wave file and
 // decoding; constructing the recognizer above is not included.
 const start = Date.now();
 const stream = recognizer.createStream();
 const wave = sherpa_onnx.readWave(waveFilename);
 stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
 recognizer.decode(stream);
 // `const` keeps `result` local to this module instead of leaking an implicit
 // global (which would also throw a ReferenceError in strict mode).
 const result = recognizer.getResult(stream);
 const stop = Date.now();
 console.log('Done');
 const elapsed_seconds = (stop - start) / 1000;
 // Audio length in seconds: number of samples divided by samples per second.
 const duration = wave.samples.length / wave.sampleRate;
 // RTF = processing time / audio duration.
 const real_time_factor = elapsed_seconds / duration;
 console.log('Wave duration', duration.toFixed(3), 'seconds');
 console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
 console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
 console.log(waveFilename);
 console.log('result\n', result);
--- a/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js
@@ -0,0 +1,111 @@
 // Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
 //
 const portAudio = require('naudiodon2');
 // console.log(portAudio.getDevices());
 const sherpa_onnx = require('sherpa-onnx-node');
 // Builds the non-streaming recognizer used to decode each speech segment.
 //
 // The model and token files are expected in the current directory; please
 // download them from
 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 //
 // Returns a new sherpa_onnx.OfflineRecognizer configured for SenseVoice.
 function createRecognizer() {
  // Feature extraction settings.
  const featConfig = {
    sampleRate: 16000,
    featureDim: 80,
  };
  // SenseVoice-specific model settings.
  const senseVoice = {
    model:
        './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
    useInverseTextNormalization: 1,
  };
  // Settings shared by all offline model types.
  const modelConfig = {
    senseVoice,
    tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    numThreads: 2,
    provider: 'cpu',
    debug: 1,
  };
  return new sherpa_onnx.OfflineRecognizer({featConfig, modelConfig});
 }
 // Builds the voice activity detector that splits microphone audio into
 // speech segments.
 //
 // please download silero_vad.onnx from
 // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 //
 // Returns a new sherpa_onnx.Vad created with a 60-second buffer size.
 function createVad() {
  const sileroVad = {
    model: './silero_vad.onnx',
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
    windowSize: 512,
  };
  const vadConfig = {
    sileroVad,
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };
  const vadBufferSeconds = 60;
  return new sherpa_onnx.Vad(vadConfig, vadBufferSeconds);
 }
 // Main script: capture microphone audio, split it into speech segments with
 // the VAD, decode every segment and save it to a wave file.
 const recognizer = createRecognizer();
 const vad = createVad();
 // Staging buffer for microphone samples until a full VAD window is available.
 const bufferSizeInSeconds = 30;
 const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
 const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
 });
 // Running number of the segments that produced non-empty text.
 let index = 0;
 // Replaces characters that are not allowed in file names on common platforms
 // (path separators, ':' and the other characters reserved on Windows) with
 // '_'. The recognized text is arbitrary and the timestamp contains ':', so the
 // raw name is not always a valid path.
 function toSafeFilename(name) {
  return name.replace(/[\\\/:*?"<>|]/g, '_');
 }
 ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));
  // Feed the VAD one window at a time.
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }
  // Decode every speech segment the VAD has completed so far.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    const stream = recognizer.createStream();
    stream.acceptWaveform({
      samples: segment.samples,
      sampleRate: recognizer.config.featConfig.sampleRate
    });
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    // Segments with empty text are neither printed nor saved.
    if (r.text.length > 0) {
      const text = r.text.toLowerCase().trim();
      console.log(`${index}: ${text}`);
      const timestamp = new Date()
                            .toLocaleTimeString('en-US', {hour12: false})
                            .split(' ')[0];
      const filename = `${toSafeFilename(`${index}-${text}-${timestamp}`)}.wav`;
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
      index += 1;
    }
  }
 });
 ai.start();
 console.log('Started! Please speak');
--- a/scripts/node-addon-api/src/non-streaming-asr.cc
+++ b/scripts/node-addon-api/src/non-streaming-asr.cc
@@ -96,6 +96,24 @@ static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
  return c;
 }
 // Extracts the SenseVoice model settings from the `senseVoice` property of
 // `obj`.
 //
 // Returns a zero-filled config when `obj` has no `senseVoice` property or when
 // that property is not an object.
 static SherpaOnnxOfflineSenseVoiceModelConfig GetOfflineSenseVoiceModelConfig(
    Napi::Object obj) {
  SherpaOnnxOfflineSenseVoiceModelConfig c;
  // Zero every field so that anything not assigned below stays zeroed.
  memset(&c, 0, sizeof(c));
  if (!obj.Has("senseVoice") || !obj.Get("senseVoice").IsObject()) {
    return c;
  }
  Napi::Object o = obj.Get("senseVoice").As<Napi::Object>();
  // NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are defined outside this
  // view. They appear to take (C struct field, JS property name) and to refer
  // to the locals `c` and `o` by name -- confirm before renaming either local.
  // The string variants presumably allocate with new[], since `model` and
  // `language` are released with delete[] in CreateOfflineRecognizerWrapper.
  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  SHERPA_ONNX_ASSIGN_ATTR_STR(language, language);
  SHERPA_ONNX_ASSIGN_ATTR_INT32(use_itn, useInverseTextNormalization);
  return c;
 }
 static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineModelConfig c;
  memset(&c, 0, sizeof(c));
@@ -111,6 +129,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
  c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
  c.whisper = GetOfflineWhisperModelConfig(o);
  c.tdnn = GetOfflineTdnnModelConfig(o);
  c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
  SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -225,6 +244,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
    delete[] c.model_config.tdnn.model;
  }
  if (c.model_config.sense_voice.model) {
    delete[] c.model_config.sense_voice.model;
  }
  if (c.model_config.sense_voice.language) {
    delete[] c.model_config.sense_voice.language;
  }
  if (c.model_config.tokens) {
    delete[] c.model_config.tokens;
  }