diff --git a/.github/scripts/node-addon/run.sh b/.github/scripts/node-addon/run.sh index 6b1c8615..67a28776 100755 --- a/.github/scripts/node-addon/run.sh +++ b/.github/scripts/node-addon/run.sh @@ -18,7 +18,7 @@ fi SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" -# SHERPA_ONNX_VERSION=1.0.21 +# SHERPA_ONNX_VERSION=1.0.22 if [ -z $owner ]; then owner=k2-fsa diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index e13af102..9adbaeb4 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -6,6 +6,22 @@ d=nodejs-addon-examples echo "dir: $d" cd $d +echo "----------audio tagging----------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 +tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 +rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 + +node ./test_audio_tagging_zipformer.js +rm -rf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2 +tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2 +rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2 + +node ./test_audio_tagging_ced.js +rm -rf sherpa-onnx-ced-mini-audio-tagging-2024-04-19 + echo "----------speaker identification----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx diff --git a/.github/workflows/npm-addon-macos.yaml b/.github/workflows/npm-addon-macos.yaml index 520196e0..a075ea8b 100644 --- a/.github/workflows/npm-addon-macos.yaml +++ b/.github/workflows/npm-addon-macos.yaml @@ -33,6 +33,11 @@ jobs: with: 
python-version: ${{ matrix.python-version }} + - name: Update pip + shell: bash + run: | + pip install -U pip + - uses: actions/setup-node@v4 with: registry-url: 'https://registry.npmjs.org' diff --git a/.github/workflows/npm-addon.yaml b/.github/workflows/npm-addon.yaml index 6a79ae48..ddcb4a85 100644 --- a/.github/workflows/npm-addon.yaml +++ b/.github/workflows/npm-addon.yaml @@ -55,7 +55,7 @@ jobs: SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" - # SHERPA_ONNX_VERSION=1.0.21 + # SHERPA_ONNX_VERSION=1.0.22 src_dir=.github/scripts/node-addon sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index 0d9c73a3..62d0e4d6 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -27,7 +27,82 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH ``` -# Voice Activity detection (VAD) +# Examples + +The following tables list the examples in this folder. + +## Voice activity detection (VAD) + +|File| Description| +|---|---| +|[./test_vad_microphone.js](./test_vad_microphone.js)| VAD with a microphone. 
It uses [silero-vad](https://github.com/snakers4/silero-vad)| + +## Speaker identification + +|File| Description| +|---|---| +|[ ./test_speaker_identification.js]( ./test_speaker_identification.js)| Speaker identification from a file| + +## Spoken language identification + +|File| Description| +|---|---| +|[./test_vad_spoken_language_identification_microphone.js](./test_vad_spoken_language_identification_microphone.js)|Spoken language identification from a microphone using a multi-lingual [Whisper](https://github.com/openai/whisper) model| + +## Audio tagging + +|File| Description| +|---|---| +|[./test_audio_tagging_zipformer.js](./test_audio_tagging_zipformer.js)| Audio tagging with a Zipformer model| +|[./test_audio_tagging_ced.js](./test_audio_tagging_ced.js)| Audio tagging with a [CED](https://github.com/RicherMans/CED) model| + +## Streaming speech-to-text from files + +|File| Description| +|---|---| +|[./test_asr_streaming_transducer.js](./test_asr_streaming_transducer.js)| Streaming speech recognition from a file using a Zipformer transducer model| +|[./test_asr_streaming_ctc.js](./test_asr_streaming_ctc.js)| Streaming speech recognition from a file using a Zipformer CTC model with greedy search| +|[./test_asr_streaming_ctc_hlg.js](./test_asr_streaming_ctc_hlg.js)| Streaming speech recognition from a file using a Zipformer CTC model with HLG decoding| +|[./test_asr_streaming_paraformer.js](./test_asr_streaming_paraformer.js)|Streaming speech recognition from a file using a [Paraformer](https://github.com/alibaba-damo-academy/FunASR) model| + +## Streaming speech-to-text from a microphone + +|File| Description| +|---|---| +|[./test_asr_streaming_transducer_microphone.js](./test_asr_streaming_transducer_microphone.js)| Streaming speech recognition from a microphone using a Zipformer transducer model| +|[./test_asr_streaming_ctc_microphone.js](./test_asr_streaming_ctc_microphone.js)| Streaming speech recognition from a microphone using a Zipformer CTC model 
with greedy search| +|[./test_asr_streaming_ctc_hlg_microphone.js](./test_asr_streaming_ctc_hlg_microphone.js)|Streaming speech recognition from a microphone using a Zipformer CTC model with HLG decoding| +|[./test_asr_streaming_paraformer_microphone.js](./test_asr_streaming_paraformer_microphone.js)| Streaming speech recognition from a microphone using a [Paraformer](https://github.com/alibaba-damo-academy/FunASR) model| + +## Non-Streaming speech-to-text from files + +|File| Description| +|---|---| +|[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model| +|[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)| +|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| +|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| + +## Non-Streaming speech-to-text from a microphone with VAD + +|File| Description| +|---|---| +|[./test_vad_asr_non_streaming_transducer_microphone.js](./test_vad_asr_non_streaming_transducer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer transducer model| +|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| +|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy 
search| +|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| + +## Text-to-speech + +|File| Description| +|---|---| +|[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model| +|[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model| +|[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)| +|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model| + + +### Voice Activity detection (VAD) ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx @@ -39,7 +114,27 @@ npm install naudiodon2 node ./test_vad_microphone.js ``` -## Streaming speech recognition with Zipformer transducer +### Audio tagging with zipformer + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 +tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 +rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 + +node ./test_audio_tagging_zipformer.js +``` + +### Audio tagging with CED + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2 +tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2 +rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2 + +node ./test_audio_tagging_ced.js +``` + +### Streaming speech recognition with Zipformer transducer 
```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 @@ -54,7 +149,7 @@ npm install naudiodon2 node ./test_asr_streaming_transducer_microphone.js ``` -## Streaming speech recognition with Zipformer CTC +### Streaming speech recognition with Zipformer CTC ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 @@ -73,7 +168,7 @@ node ./test_asr_streaming_ctc_microphone.js node ./test_asr_streaming_ctc_hlg_microphone.js ``` -## Streaming speech recognition with Paraformer +### Streaming speech recognition with Paraformer ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 @@ -88,7 +183,7 @@ npm install naudiodon2 node ./test_asr_streaming_paraformer_microphone.js ``` -## Non-streaming speech recognition with Zipformer transducer +### Non-streaming speech recognition with Zipformer transducer ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 @@ -102,7 +197,7 @@ npm install naudiodon2 node ./test_vad_asr_non_streaming_transducer_microphone.js ``` -## Non-streaming speech recognition with Whisper +### Non-streaming speech recognition with Whisper ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 @@ -116,7 +211,7 @@ npm install naudiodon2 node ./test_vad_asr_non_streaming_whisper_microphone.js ``` -## Non-streaming speech recognition with NeMo CTC models +### Non-streaming speech recognition with NeMo CTC models ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 @@ -130,7 +225,7 @@ npm install naudiodon2 node ./test_vad_asr_non_streaming_nemo_ctc_microphone.js ``` -## 
Non-streaming speech recognition with Paraformer +### Non-streaming speech recognition with Paraformer ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 @@ -144,7 +239,7 @@ npm install naudiodon2 node ./test_vad_asr_non_streaming_paraformer_microphone.js ``` -## Text-to-speech with piper VITS models (TTS) +### Text-to-speech with piper VITS models (TTS) ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2 @@ -154,7 +249,7 @@ rm vits-piper-en_GB-cori-medium.tar.bz2 node ./test_tts_non_streaming_vits_piper_en.js ``` -## Text-to-speech with piper Coqui-ai/TTS models (TTS) +### Text-to-speech with piper Coqui-ai/TTS models (TTS) ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2 @@ -164,7 +259,7 @@ rm vits-coqui-de-css10.tar.bz2 node ./test_tts_non_streaming_vits_coqui_de.js ``` -## Text-to-speech with vits Chinese models (1/2) +### Text-to-speech with vits Chinese models (1/2) ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 @@ -174,7 +269,7 @@ rm sherpa-onnx-vits-zh-ll.tar.bz2 node ./test_tts_non_streaming_vits_zh_ll.js ``` -## Text-to-speech with vits Chinese models (2/2) +### Text-to-speech with vits Chinese models (2/2) ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 @@ -184,7 +279,7 @@ rm vits-icefall-zh-aishell3.tar.bz2 node ./test_tts_non_streaming_vits_zh_aishell3.js ``` -## Spoken language identification with Whisper multi-lingual models +### Spoken language identification with Whisper multi-lingual models ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 @@ -202,7 +297,7 @@ npm install naudiodon2 node ./test_vad_spoken_language_identification_microphone.js ``` -## Speaker 
identification +### Speaker identification You can find more models at diff --git a/nodejs-addon-examples/test_audio_tagging_ced.js b/nodejs-addon-examples/test_audio_tagging_ced.js new file mode 100644 index 00000000..4e4d085a --- /dev/null +++ b/nodejs-addon-examples/test_audio_tagging_ced.js @@ -0,0 +1,63 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); + +// Please download models files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models +function createAudioTagging() { + const config = { + model: { + ced: './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/model.int8.onnx', + numThreads: 1, + debug: true, + }, + labels: + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/class_labels_indices.csv', + topK: 5, + }; + return new sherpa_onnx.AudioTagging(config); +} + +const at = createAudioTagging(); + +const testWaves = [ + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/2.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/3.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/4.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/5.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/6.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/7.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/8.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/9.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/10.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/11.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/12.wav', + './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/13.wav', +]; + +console.log('------'); + +for (let filename of testWaves) { + const start = performance.now(); + const stream = at.createStream(); + const wave = sherpa_onnx.readWave(filename); + 
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + const events = at.compute(stream); + const stop = performance.now(); + + const elapsed_seconds = (stop - start) / 1000; + const duration = wave.samples.length / wave.sampleRate; + const real_time_factor = elapsed_seconds / duration; + + console.log('input file:', filename); + console.log('Probability\t\tName'); + for (let e of events) { + console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`); + } + console.log('Wave duration', duration.toFixed(3), 'seconds') + console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') + console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + console.log('------'); +} diff --git a/nodejs-addon-examples/test_audio_tagging_zipformer.js b/nodejs-addon-examples/test_audio_tagging_zipformer.js new file mode 100644 index 00000000..25fb5219 --- /dev/null +++ b/nodejs-addon-examples/test_audio_tagging_zipformer.js @@ -0,0 +1,66 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); + +// Please download models files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models +function createAudioTagging() { + const config = { + model: { + zipformer: { + model: + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx' + }, + numThreads: 1, + debug: true, + }, + labels: + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv', + topK: 5, + }; + return new sherpa_onnx.AudioTagging(config); +} + +const at = createAudioTagging(); + +const testWaves = [ + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/1.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/2.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/3.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/4.wav', + 
'./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/5.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/6.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/7.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/8.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/9.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/10.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/11.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/12.wav', + './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/13.wav', +]; + +console.log('------'); + +for (let filename of testWaves) { + const start = performance.now(); + const stream = at.createStream(); + const wave = sherpa_onnx.readWave(filename); + stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + const events = at.compute(stream); + const stop = performance.now(); + + const elapsed_seconds = (stop - start) / 1000; + const duration = wave.samples.length / wave.sampleRate; + const real_time_factor = elapsed_seconds / duration; + + console.log('input file:', filename); + console.log('Probability\t\tName'); + for (let e of events) { + console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`); + } + console.log('Wave duration', duration.toFixed(3), 'seconds') + console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') + console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) + console.log('------'); +} diff --git a/scripts/node-addon-api/CMakeLists.txt b/scripts/node-addon-api/CMakeLists.txt index ffd888db..9d03a1ee 100644 --- a/scripts/node-addon-api/CMakeLists.txt +++ b/scripts/node-addon-api/CMakeLists.txt @@ -18,6 +18,7 @@ add_definitions(-DNAPI_VERSION=3) include_directories(${CMAKE_JS_INC}) set(srcs + src/audio-tagging.cc src/non-streaming-asr.cc src/non-streaming-tts.cc 
src/sherpa-onnx-node-addon-api.cc diff --git a/scripts/node-addon-api/lib/audio-tagg.js b/scripts/node-addon-api/lib/audio-tagg.js new file mode 100644 index 00000000..b55daf5b --- /dev/null +++ b/scripts/node-addon-api/lib/audio-tagg.js @@ -0,0 +1,26 @@ +const addon = require('./addon.js'); +const non_streaming_asr = require('./non-streaming-asr.js'); + +class AudioTagging { + constructor(config) { + this.handle = addon.createAudioTagging(config); + this.config = config; + } + + createStream() { + return new non_streaming_asr.OfflineStream( + addon.audioTaggingCreateOfflineStream(this.handle)); + } + + /* Return an array. Each element is + * an object {name: "xxx", prob: xxx, index: xxx}; + * + */ + compute(stream, topK = -1) { + return addon.audioTaggingCompute(this.handle, stream.handle, topK); + } +} + +module.exports = { + AudioTagging, +} diff --git a/scripts/node-addon-api/lib/sherpa-onnx.js b/scripts/node-addon-api/lib/sherpa-onnx.js index 31cfd936..33e398cf 100644 --- a/scripts/node-addon-api/lib/sherpa-onnx.js +++ b/scripts/node-addon-api/lib/sherpa-onnx.js @@ -5,6 +5,7 @@ const non_streaming_tts = require('./non-streaming-tts.js'); const vad = require('./vad.js'); const slid = require('./spoken-language-identification.js'); const sid = require('./speaker-identification.js'); +const at = require('./audio-tagg.js'); module.exports = { OnlineRecognizer: streaming_asr.OnlineRecognizer, @@ -18,4 +19,5 @@ module.exports = { SpokenLanguageIdentification: slid.SpokenLanguageIdentification, SpeakerEmbeddingExtractor: sid.SpeakerEmbeddingExtractor, SpeakerEmbeddingManager: sid.SpeakerEmbeddingManager, + AudioTagging: at.AudioTagging, } diff --git a/scripts/node-addon-api/src/audio-tagging.cc b/scripts/node-addon-api/src/audio-tagging.cc new file mode 100644 index 00000000..1ec6251b --- /dev/null +++ b/scripts/node-addon-api/src/audio-tagging.cc @@ -0,0 +1,227 @@ +// scripts/node-addon-api/src/audio-tagging.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include 
+ +#include "macros.h" // NOLINT +#include "napi.h" // NOLINT +#include "sherpa-onnx/c-api/c-api.h" + +static SherpaOnnxOfflineZipformerAudioTaggingModelConfig +GetAudioTaggingZipformerModelConfig(Napi::Object obj) { + SherpaOnnxOfflineZipformerAudioTaggingModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("zipformer") || !obj.Get("zipformer").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("zipformer").As(); + + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); + + return c; +} + +static SherpaOnnxAudioTaggingModelConfig GetAudioTaggingModelConfig( + Napi::Object obj) { + SherpaOnnxAudioTaggingModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("model") || !obj.Get("model").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("model").As(); + c.zipformer = GetAudioTaggingZipformerModelConfig(o); + + SHERPA_ONNX_ASSIGN_ATTR_STR(ced, ced); + + SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); + + if (o.Has("debug") && + (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) { + if (o.Get("debug").IsBoolean()) { + c.debug = o.Get("debug").As().Value(); + } else { + c.debug = o.Get("debug").As().Int32Value(); + } + } + SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider); + + return c; +} + +static Napi::External CreateAudioTaggingWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. 
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsObject()) { + Napi::TypeError::New(env, "You should pass an object as the only argument.") + .ThrowAsJavaScriptException(); + + return {}; + } + + Napi::Object o = info[0].As(); + + SherpaOnnxAudioTaggingConfig c; + memset(&c, 0, sizeof(c)); + c.model = GetAudioTaggingModelConfig(o); + + SHERPA_ONNX_ASSIGN_ATTR_STR(labels, labels); + SHERPA_ONNX_ASSIGN_ATTR_INT32(top_k, topK); + + const SherpaOnnxAudioTagging *at = SherpaOnnxCreateAudioTagging(&c); + + if (c.model.zipformer.model) { + delete[] c.model.zipformer.model; + } + + if (c.model.ced) { + delete[] c.model.ced; + } + + if (c.model.provider) { + delete[] c.model.provider; + } + + if (c.labels) { + delete[] c.labels; + } + + if (!at) { + Napi::TypeError::New(env, "Please check your config!") + .ThrowAsJavaScriptException(); + + return {}; + } + + return Napi::External::New( + env, const_cast(at), + [](Napi::Env env, SherpaOnnxAudioTagging *at) { + SherpaOnnxDestroyAudioTagging(at); + }); +} + +static Napi::External +AudioTaggingCreateOfflineStreamWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. 
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New( + env, "You should pass an audio tagging pointer as the only argument") + .ThrowAsJavaScriptException(); + + return {}; + } + + SherpaOnnxAudioTagging *at = + info[0].As>().Data(); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxAudioTaggingCreateOfflineStream(at); + + return Napi::External::New( + env, const_cast(stream), + [](Napi::Env env, SherpaOnnxOfflineStream *stream) { + DestroyOfflineStream(stream); + }); +} + +static Napi::Object AudioTaggingComputeWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 3) { + std::ostringstream os; + os << "Expect only 3 arguments. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New( + env, "You should pass an audio tagging pointer as the first argument") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[1].IsExternal()) { + Napi::TypeError::New( + env, "You should pass an offline stream pointer as the second argument") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[2].IsNumber()) { + Napi::TypeError::New(env, + "You should pass an integer as the third argument") + .ThrowAsJavaScriptException(); + + return {}; + } + + SherpaOnnxAudioTagging *at = + info[0].As>().Data(); + + SherpaOnnxOfflineStream *stream = + info[1].As>().Data(); + + int32_t top_k = info[2].As().Int32Value(); + + const SherpaOnnxAudioEvent *const *events = + SherpaOnnxAudioTaggingCompute(at, stream, top_k); + + auto p = events; + int32_t k = 0; + while (p && *p) { + ++k; + ++p; + } + + Napi::Array ans = Napi::Array::New(env, k); + for (int32_t i = 0; i != k; ++i) { + Napi::Object obj = Napi::Object::New(env); + obj.Set(Napi::String::New(env, "name"), + Napi::String::New(env, 
events[i]->name)); + obj.Set(Napi::String::New(env, "index"), + Napi::Number::New(env, events[i]->index)); + obj.Set(Napi::String::New(env, "prob"), + Napi::Number::New(env, events[i]->prob)); + ans[i] = obj; + } + + SherpaOnnxAudioTaggingFreeResults(events); + + return ans; +} + +void InitAudioTagging(Napi::Env env, Napi::Object exports) { + exports.Set(Napi::String::New(env, "createAudioTagging"), + Napi::Function::New(env, CreateAudioTaggingWrapper)); + + exports.Set(Napi::String::New(env, "audioTaggingCreateOfflineStream"), + Napi::Function::New(env, AudioTaggingCreateOfflineStreamWrapper)); + + exports.Set(Napi::String::New(env, "audioTaggingCompute"), + Napi::Function::New(env, AudioTaggingComputeWrapper)); +} diff --git a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc index 450c63b6..d353eed2 100644 --- a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc +++ b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc @@ -19,6 +19,8 @@ void InitSpokenLanguageID(Napi::Env env, Napi::Object exports); void InitSpeakerID(Napi::Env env, Napi::Object exports); +void InitAudioTagging(Napi::Env env, Napi::Object exports); + Napi::Object Init(Napi::Env env, Napi::Object exports) { InitStreamingAsr(env, exports); InitNonStreamingAsr(env, exports); @@ -28,6 +30,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) { InitWaveWriter(env, exports); InitSpokenLanguageID(env, exports); InitSpeakerID(env, exports); + InitAudioTagging(env, exports); return exports; }