From 6f261d39f392eb9c8c7a713d4eaecb3f13677854 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 27 Oct 2024 11:31:01 +0800 Subject: [PATCH] Add JavaScript API for Moonshine models (#1480) --- .github/scripts/test-nodejs-addon-npm.sh | 20 ++- .github/scripts/test-nodejs-npm.sh | 24 ++++ nodejs-addon-examples/README.md | 30 ++++ .../test_asr_non_streaming_moonshine.js | 50 +++++++ ..._asr_non_streaming_moonshine_microphone.js | 113 +++++++++++++++ ...st_vad_with_non_streaming_asr_moonshine.js | 132 ++++++++++++++++++ nodejs-examples/README.md | 51 +++++++ nodejs-examples/test-offline-moonshine.js | 37 +++++ ...st-vad-with-non-streaming-asr-moonshine.js | 128 +++++++++++++++++ scripts/node-addon-api/src/macros.h | 7 + .../node-addon-api/src/non-streaming-asr.cc | 129 +++++++---------- wasm/asr/sherpa-onnx-asr.js | 75 +++++++++- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 11 +- 13 files changed, 719 insertions(+), 88 deletions(-) create mode 100644 nodejs-addon-examples/test_asr_non_streaming_moonshine.js create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_moonshine_microphone.js create mode 100644 nodejs-addon-examples/test_vad_with_non_streaming_asr_moonshine.js create mode 100644 nodejs-examples/test-offline-moonshine.js create mode 100644 nodejs-examples/test-vad-with-non-streaming-asr-moonshine.js diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index 42c753eb..755cde74 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -10,6 +10,19 @@ arch=$(node -p "require('os').arch()") platform=$(node -p "require('os').platform()") node_version=$(node -p "process.versions.node.split('.')[0]") +echo "----------non-streaming asr moonshine + vad----------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +rm 
sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +node ./test_vad_with_non_streaming_asr_moonshine.js +rm -rf sherpa-onnx-* +rm *.wav +rm *.onnx + echo "----------non-streaming speaker diarization----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 @@ -24,7 +37,7 @@ node ./test_offline_speaker_diarization.js rm -rfv *.onnx *.wav sherpa-onnx-pyannote-* -echo "----------non-streaming asr + vad----------" +echo "----------non-streaming asr whisper + vad----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 rm sherpa-onnx-whisper-tiny.en.tar.bz2 @@ -218,6 +231,11 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2 node ./test_asr_non_streaming_whisper.js rm -rf sherpa-onnx-whisper-tiny.en +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +node ./test_asr_non_streaming_moonshine.js +rm -rf sherpa-onnx-* ls -lh diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 03dec04a..518d173b 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -21,6 +21,23 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segm node ./test-offline-speaker-diarization.js rm -rfv *.wav *.onnx sherpa-onnx-pyannote-* +echo '-----vad+moonshine----------' + +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 +rm 
sherpa-onnx-whisper-tiny.en.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+node ./test-vad-with-non-streaming-asr-moonshine.js
+rm Obama.wav
+rm silero_vad.onnx
+rm -rf sherpa-onnx-moonshine-*
+
 echo '-----vad+whisper----------'
 
 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -90,6 +107,13 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2
 node ./test-offline-whisper.js
 rm -rf sherpa-onnx-whisper-tiny.en
 
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+node ./test-offline-moonshine.js
+rm -rf sherpa-onnx-moonshine-*
+
 # online asr
 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
 tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
index 8851c626..f436bcae 100644
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -112,6 +112,8 @@ The following tables list the examples in this folder.
|[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model| |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)| |[./test_vad_with_non_streaming_asr_whisper.js](./test_vad_with_non_streaming_asr_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper) + [Silero VAD](https://github.com/snakers4/silero-vad)| +|[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| +|[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| @@ -122,6 +124,7 @@ The following tables list the examples in this folder. 
|---|---| |[./test_vad_asr_non_streaming_transducer_microphone.js](./test_vad_asr_non_streaming_transducer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer transducer model| |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| +|[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)| |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| @@ -260,6 +263,33 @@ npm install naudiodon2 node ./test_vad_asr_non_streaming_whisper_microphone.js ``` +### Non-streaming speech recognition with Moonshine + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + +node ./test_asr_non_streaming_moonshine.js + +# To run VAD + non-streaming ASR with Moonshine using a microphone +npm install naudiodon2 +node ./test_vad_asr_non_streaming_moonshine_microphone.js +``` + +### Non-streaming speech 
recognition with Moonshine + VAD + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +node ./test_vad_with_non_streaming_asr_moonshine.js +``` + ### Non-streaming speech recognition with Whisper + VAD ```bash diff --git a/nodejs-addon-examples/test_asr_non_streaming_moonshine.js b/nodejs-addon-examples/test_asr_non_streaming_moonshine.js new file mode 100644 index 00000000..9e676b8c --- /dev/null +++ b/nodejs-addon-examples/test_asr_non_streaming_moonshine.js @@ -0,0 +1,50 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); + +// Please download test files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'moonshine': { + 'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx', + 'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx', + 'uncachedDecoder': + './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx', + 'cachedDecoder': + './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx', + }, + 'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 1, + } +}; + +const waveFilename = './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav'; + +const recognizer = new sherpa_onnx.OfflineRecognizer(config); +console.log('Started') +let start = Date.now(); +const stream = recognizer.createStream(); +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + +recognizer.decode(stream); +result = 
recognizer.getResult(stream)
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_moonshine_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_moonshine_microphone.js
new file mode 100644
index 00000000..b3cbcca5
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_moonshine_microphone.js
@@ -0,0 +1,113 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'moonshine': {
+        'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+        'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+        'uncachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+        'cachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+ sileroVad: { + model: './silero_vad.onnx', + threshold: 0.5, + minSpeechDuration: 0.25, + minSilenceDuration: 0.5, + windowSize: 512, + }, + sampleRate: 16000, + debug: true, + numThreads: 1, + }; + + const bufferSizeInSeconds = 60; + + return new sherpa_onnx.Vad(config, bufferSizeInSeconds); +} + +const recognizer = createRecognizer(); +const vad = createVad(); + +const bufferSizeInSeconds = 30; +const buffer = + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: vad.config.sampleRate + } +}); + +let printed = false; +let index = 0; +ai.on('data', data => { + const windowSize = vad.config.sileroVad.windowSize; + buffer.push(new Float32Array(data.buffer)); + while (buffer.size() > windowSize) { + const samples = buffer.get(buffer.head(), windowSize); + buffer.pop(windowSize); + vad.acceptWaveform(samples); + } + + while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + const stream = recognizer.createStream(); + stream.acceptWaveform({ + samples: segment.samples, + sampleRate: recognizer.config.featConfig.sampleRate + }); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + if (r.text.length > 0) { + const text = r.text.toLowerCase().trim(); + console.log(`${index}: ${text}`); + + const filename = `${index}-${text}-${ + new Date() + .toLocaleTimeString('en-US', {hour12: false}) + .split(' ')[0]}.wav`; + sherpa_onnx.writeWave( + filename, + {samples: segment.samples, sampleRate: vad.config.sampleRate}); + + index += 1; + } + } +}); + +ai.start(); +console.log('Started! 
Please speak') diff --git a/nodejs-addon-examples/test_vad_with_non_streaming_asr_moonshine.js b/nodejs-addon-examples/test_vad_with_non_streaming_asr_moonshine.js new file mode 100644 index 00000000..3b051060 --- /dev/null +++ b/nodejs-addon-examples/test_vad_with_non_streaming_asr_moonshine.js @@ -0,0 +1,132 @@ +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx-node'); + +function createRecognizer() { + // Please download test files from + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'moonshine': { + 'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx', + 'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx', + 'uncachedDecoder': + './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx', + 'cachedDecoder': + './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx', + }, + 'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 1, + } + }; + + return new sherpa_onnx.OfflineRecognizer(config); +} + +function createVad() { + // please download silero_vad.onnx from + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + const config = { + sileroVad: { + model: './silero_vad.onnx', + threshold: 0.5, + minSpeechDuration: 0.25, + minSilenceDuration: 0.5, + maxSpeechDuration: 5, + windowSize: 512, + }, + sampleRate: 16000, + debug: true, + numThreads: 1, + }; + + const bufferSizeInSeconds = 60; + + return new sherpa_onnx.Vad(config, bufferSizeInSeconds); +} + +const recognizer = createRecognizer(); +const vad = createVad(); + +// please download ./Obama.wav from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +const waveFilename = './Obama.wav'; +const wave = sherpa_onnx.readWave(waveFilename); + +if (wave.sampleRate != 
recognizer.config.featConfig.sampleRate) {
+  throw new Error(
+      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
+}
+
+console.log('Started')
+let start = Date.now();
+
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    let start_time = segment.start / wave.sampleRate;
+    let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+    start_time = start_time.toFixed(2);
+    end_time = end_time.toFixed(2);
+
+    const stream = recognizer.createStream();
+    stream.acceptWaveform(
+        {samples: segment.samples, sampleRate: wave.sampleRate});
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${start_time} -- ${end_time}: ${text}`);
+    }
+  }
+}
+
+vad.flush();
+
+while (!vad.isEmpty()) {
+  const segment = vad.front();
+  vad.pop();
+
+  let start_time = segment.start / wave.sampleRate;
+  let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+  start_time = start_time.toFixed(2);
+  end_time = end_time.toFixed(2);
+
+  const stream = recognizer.createStream();
+  stream.acceptWaveform(
+      {samples: segment.samples, sampleRate: wave.sampleRate});
+
+  recognizer.decode(stream);
+  const r = recognizer.getResult(stream);
+  if (r.text.length > 0) {
+    const text = r.text.toLowerCase().trim();
+    console.log(`${start_time} -- ${end_time}: ${text}`);
+  }
+}
+
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3),
'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 496a0062..a953573b 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -133,7 +133,25 @@ tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 node ./test-offline-transducer.js ``` +## ./test-vad-with-non-streaming-asr-whisper.js + +[./test-vad-with-non-streaming-asr-whisper.js](./test-vad-with-non-streaming-asr-whisper.js) +shows how to use VAD + whisper to decode a very long file. + +You can use the following command to run it: + +```bash +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +node ./test-vad-with-non-streaming-asr-whisper.js +``` + ## ./test-offline-whisper.js + [./test-offline-whisper.js](./test-offline-whisper.js) demonstrates how to decode a file with a Whisper model. In the code we use [sherpa-onnx-whisper-tiny.en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html). @@ -146,7 +164,40 @@ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 node ./test-offline-whisper.js ``` +## ./test-offline-moonshine.js + +[./test-offline-moonshine.js](./test-offline-moonshine.js) demonstrates +how to decode a file with a Moonshine model. In the code we use +[sherpa-onnx-moonshine-tiny-en-int8](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2). 
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+node ./test-offline-moonshine.js
+```
+
+## ./test-vad-with-non-streaming-asr-moonshine.js
+
+[./test-vad-with-non-streaming-asr-moonshine.js](./test-vad-with-non-streaming-asr-moonshine.js)
+shows how to use VAD + Moonshine to decode a very long file.
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test-vad-with-non-streaming-asr-moonshine.js
+```
+
 ## ./test-online-paraformer-microphone.js
+
 [./test-online-paraformer-microphone.js](./test-online-paraformer-microphone.js) demonstrates how to do real-time speech recognition from microphone with a streaming Paraformer model.
In the code we use diff --git a/nodejs-examples/test-offline-moonshine.js b/nodejs-examples/test-offline-moonshine.js new file mode 100644 index 00000000..8f5d2f00 --- /dev/null +++ b/nodejs-examples/test-offline-moonshine.js @@ -0,0 +1,37 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineRecognizer() { + let modelConfig = { + moonshine: { + preprocessor: './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx', + encoder: './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx', + uncachedDecoder: + './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx', + cachedDecoder: + './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx', + }, + tokens: './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt', + }; + + let config = { + modelConfig: modelConfig, + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + +recognizer = createOfflineRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); + +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); + +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-vad-with-non-streaming-asr-moonshine.js b/nodejs-examples/test-vad-with-non-streaming-asr-moonshine.js new file mode 100644 index 00000000..0d6bd764 --- /dev/null +++ b/nodejs-examples/test-vad-with-non-streaming-asr-moonshine.js @@ -0,0 +1,128 @@ +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + // Please download test files from + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + const config = { + 'modelConfig': { + 'moonshine': { + 'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx', + 
'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx', + 'uncachedDecoder': + './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx', + 'cachedDecoder': + './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx', + }, + 'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt', + 'debug': 0, + } + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + +function createVad() { + // please download silero_vad.onnx from + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + const config = { + sileroVad: { + model: './silero_vad.onnx', + threshold: 0.5, + minSpeechDuration: 0.25, + minSilenceDuration: 0.5, + maxSpeechDuration: 5, + windowSize: 512, + }, + sampleRate: 16000, + debug: true, + numThreads: 1, + bufferSizeInSeconds: 60, + }; + + return sherpa_onnx.createVad(config); +} + +const recognizer = createRecognizer(); +const vad = createVad(); + +// please download ./Obama.wav from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +const waveFilename = './Obama.wav'; +const wave = sherpa_onnx.readWave(waveFilename); + +if (wave.sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error( + 'Expected sample rate: ${recognizer.config.featConfig.sampleRate}. 
Given: ${wave.sampleRate}'); +} + +console.log('Started') +let start = Date.now(); + +const windowSize = vad.config.sileroVad.windowSize; +for (let i = 0; i < wave.samples.length; i += windowSize) { + const thisWindow = wave.samples.subarray(i, i + windowSize); + vad.acceptWaveform(thisWindow); + + while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + + let start_time = segment.start / wave.sampleRate; + let end_time = start_time + segment.samples.length / wave.sampleRate; + + start_time = start_time.toFixed(2); + end_time = end_time.toFixed(2); + + const stream = recognizer.createStream(); + stream.acceptWaveform(wave.sampleRate, segment.samples); + + recognizer.decode(stream); + const r = recognizer.getResult(stream); + if (r.text.length > 0) { + const text = r.text.toLowerCase().trim(); + console.log(`${start_time} -- ${end_time}: ${text}`); + } + + stream.free(); + } +} + +vad.flush(); + +while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + + let start_time = segment.start / wave.sampleRate; + let end_time = start_time + segment.samples.length / wave.sampleRate; + + start_time = start_time.toFixed(2); + end_time = end_time.toFixed(2); + + const stream = recognizer.createStream(); + stream.acceptWaveform(wave.sampleRate, segment.samples); + + recognizer.decode(stream); + const r = recognizer.getResult(stream); + if (r.text.length > 0) { + const text = r.text.toLowerCase().trim(); + console.log(`${start_time} -- ${end_time}: ${text}`); + } +} + +let stop = Date.now(); +console.log('Done') + +const elapsed_seconds = (stop - start) / 1000; +const duration = wave.samples.length / wave.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + +vad.free(); +recognizer.free(); diff --git 
a/scripts/node-addon-api/src/macros.h b/scripts/node-addon-api/src/macros.h index ac0dbd56..1f4f401e 100644 --- a/scripts/node-addon-api/src/macros.h +++ b/scripts/node-addon-api/src/macros.h @@ -41,4 +41,11 @@ } \ } while (0) +#define SHERPA_ONNX_DELETE_C_STR(p) \ + do { \ + if (p) { \ + delete[] p; \ + } \ + } while (0) + #endif // SCRIPTS_NODE_ADDON_API_SRC_MACROS_H_ diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc index 28c0b31e..a95c892a 100644 --- a/scripts/node-addon-api/src/non-streaming-asr.cc +++ b/scripts/node-addon-api/src/non-streaming-asr.cc @@ -80,6 +80,25 @@ static SherpaOnnxOfflineWhisperModelConfig GetOfflineWhisperModelConfig( return c; } +static SherpaOnnxOfflineMoonshineModelConfig GetOfflineMoonshineModelConfig( + Napi::Object obj) { + SherpaOnnxOfflineMoonshineModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("moonshine") || !obj.Get("moonshine").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("moonshine").As(); + + SHERPA_ONNX_ASSIGN_ATTR_STR(preprocessor, preprocessor); + SHERPA_ONNX_ASSIGN_ATTR_STR(encoder, encoder); + SHERPA_ONNX_ASSIGN_ATTR_STR(uncached_decoder, uncachedDecoder); + SHERPA_ONNX_ASSIGN_ATTR_STR(cached_decoder, cachedDecoder); + + return c; +} + static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig( Napi::Object obj) { SherpaOnnxOfflineTdnnModelConfig c; @@ -130,6 +149,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { c.whisper = GetOfflineWhisperModelConfig(o); c.tdnn = GetOfflineTdnnModelConfig(o); c.sense_voice = GetOfflineSenseVoiceModelConfig(o); + c.moonshine = GetOfflineMoonshineModelConfig(o); SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); @@ -206,97 +226,42 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { const SherpaOnnxOfflineRecognizer *recognizer = SherpaOnnxCreateOfflineRecognizer(&c); - if 
(c.model_config.transducer.encoder) { - delete[] c.model_config.transducer.encoder; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.joiner); - if (c.model_config.transducer.decoder) { - delete[] c.model_config.transducer.decoder; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.model); - if (c.model_config.transducer.joiner) { - delete[] c.model_config.transducer.joiner; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.nemo_ctc.model); - if (c.model_config.paraformer.model) { - delete[] c.model_config.paraformer.model; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.language); + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.task); - if (c.model_config.nemo_ctc.model) { - delete[] c.model_config.nemo_ctc.model; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.tdnn.model); - if (c.model_config.whisper.encoder) { - delete[] c.model_config.whisper.encoder; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.model); + SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.language); - if (c.model_config.whisper.decoder) { - delete[] c.model_config.whisper.decoder; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.preprocessor); + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.uncached_decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.cached_decoder); - if (c.model_config.whisper.language) { - delete[] c.model_config.whisper.language; - } + SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); + SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); + SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type); + SHERPA_ONNX_DELETE_C_STR(c.model_config.modeling_unit); + SHERPA_ONNX_DELETE_C_STR(c.model_config.bpe_vocab); + 
SHERPA_ONNX_DELETE_C_STR(c.model_config.telespeech_ctc); - if (c.model_config.whisper.task) { - delete[] c.model_config.whisper.task; - } + SHERPA_ONNX_DELETE_C_STR(c.lm_config.model); - if (c.model_config.tdnn.model) { - delete[] c.model_config.tdnn.model; - } - - if (c.model_config.sense_voice.model) { - delete[] c.model_config.sense_voice.model; - } - - if (c.model_config.sense_voice.language) { - delete[] c.model_config.sense_voice.language; - } - - if (c.model_config.tokens) { - delete[] c.model_config.tokens; - } - - if (c.model_config.provider) { - delete[] c.model_config.provider; - } - - if (c.model_config.model_type) { - delete[] c.model_config.model_type; - } - - if (c.model_config.modeling_unit) { - delete[] c.model_config.modeling_unit; - } - - if (c.model_config.bpe_vocab) { - delete[] c.model_config.bpe_vocab; - } - - if (c.model_config.telespeech_ctc) { - delete[] c.model_config.telespeech_ctc; - } - - if (c.lm_config.model) { - delete[] c.lm_config.model; - } - - if (c.decoding_method) { - delete[] c.decoding_method; - } - - if (c.hotwords_file) { - delete[] c.hotwords_file; - } - - if (c.rule_fsts) { - delete[] c.rule_fsts; - } - - if (c.rule_fars) { - delete[] c.rule_fars; - } + SHERPA_ONNX_DELETE_C_STR(c.decoding_method); + SHERPA_ONNX_DELETE_C_STR(c.hotwords_file); + SHERPA_ONNX_DELETE_C_STR(c.rule_fsts); + SHERPA_ONNX_DELETE_C_STR(c.rule_fars); if (!recognizer) { Napi::TypeError::New(env, "Please check your config!") diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 9b966090..f0c34c97 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -35,6 +35,10 @@ function freeConfig(config, Module) { freeConfig(config.whisper, Module) } + if ('moonshine' in config) { + freeConfig(config.moonshine, Module) + } + if ('tdnn' in config) { freeConfig(config.tdnn, Module) } @@ -563,7 +567,7 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { const n = encoderLen + decoderLen + languageLen + 
taskLen; const buffer = Module._malloc(n); - const len = 5 * 4; // 4 pointers + const len = 5 * 4; // 4 pointers + 1 int32 const ptr = Module._malloc(len); let offset = 0; @@ -598,6 +602,55 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { } } +function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) { + const preprocessorLen = Module.lengthBytesUTF8(config.preprocessor || '') + 1; + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; + const uncachedDecoderLen = + Module.lengthBytesUTF8(config.uncachedDecoder || '') + 1; + const cachedDecoderLen = + Module.lengthBytesUTF8(config.cachedDecoder || '') + 1; + + const n = + preprocessorLen + encoderLen + uncachedDecoderLen + cachedDecoderLen; + const buffer = Module._malloc(n); + + const len = 4 * 4; // 4 pointers + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8( + config.preprocessor || '', buffer + offset, preprocessorLen); + offset += preprocessorLen; + + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); + offset += encoderLen; + + Module.stringToUTF8( + config.uncachedDecoder || '', buffer + offset, uncachedDecoderLen); + offset += uncachedDecoderLen; + + Module.stringToUTF8( + config.cachedDecoder || '', buffer + offset, cachedDecoderLen); + offset += cachedDecoderLen; + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += preprocessorLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + offset += encoderLen; + + Module.setValue(ptr + 8, buffer + offset, 'i8*'); + offset += uncachedDecoderLen; + + Module.setValue(ptr + 12, buffer + offset, 'i8*'); + offset += cachedDecoderLen; + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineTdnnModelConfig(config, Module) { const n = Module.lengthBytesUTF8(config.model || '') + 1; const buffer = Module._malloc(n); @@ -693,6 +746,15 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { }; } + if 
(!('moonshine' in config)) { + config.moonshine = { + preprocessor: '', + encoder: '', + uncachedDecoder: '', + cachedDecoder: '', + }; + } + if (!('tdnn' in config)) { config.tdnn = { model: '', @@ -724,8 +786,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const senseVoice = initSherpaOnnxOfflineSenseVoiceModelConfig(config.senseVoice, Module); + const moonshine = + initSherpaOnnxOfflineMoonshineModelConfig(config.moonshine, Module); + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + - tdnn.len + 8 * 4 + senseVoice.len; + tdnn.len + 8 * 4 + senseVoice.len + moonshine.len; const ptr = Module._malloc(len); @@ -745,7 +810,6 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module._CopyHeap(tdnn.ptr, tdnn.len, ptr + offset); offset += tdnn.len; - const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1; @@ -817,11 +881,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { offset += 4; Module._CopyHeap(senseVoice.ptr, senseVoice.len, ptr + offset); + offset += senseVoice.len; + + Module._CopyHeap(moonshine.ptr, moonshine.len, ptr + offset); return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, - senseVoice: senseVoice, + senseVoice: senseVoice, moonshine: moonshine, } } diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index 43c9d42c..ff8cd493 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -15,6 +15,7 @@ static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); 
+static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, ""); @@ -25,7 +26,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + sizeof(SherpaOnnxOfflineWhisperModelConfig) + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 + - sizeof(SherpaOnnxOfflineSenseVoiceModelConfig), + sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + + sizeof(SherpaOnnxOfflineMoonshineModelConfig), ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == @@ -66,6 +68,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { auto whisper = &model_config->whisper; auto tdnn = &model_config->tdnn; auto sense_voice = &model_config->sense_voice; + auto moonshine = &model_config->moonshine; fprintf(stdout, "----------offline transducer model config----------\n"); fprintf(stdout, "encoder: %s\n", transducer->encoder); @@ -93,6 +96,12 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "language: %s\n", sense_voice->language); fprintf(stdout, "use_itn: %d\n", sense_voice->use_itn); + fprintf(stdout, "----------offline moonshine model config----------\n"); + fprintf(stdout, "preprocessor: %s\n", moonshine->preprocessor); + fprintf(stdout, "encoder: %s\n", moonshine->encoder); + fprintf(stdout, "uncached_decoder: %s\n", moonshine->uncached_decoder); + fprintf(stdout, "cached_decoder: %s\n", moonshine->cached_decoder); + fprintf(stdout, "tokens: %s\n", model_config->tokens); fprintf(stdout, "num_threads: %d\n", model_config->num_threads); fprintf(stdout, "provider: %s\n", model_config->provider);