Add audio tagging APIs for node-addon-api (#875)
This commit is contained in:
2
.github/scripts/node-addon/run.sh
vendored
2
.github/scripts/node-addon/run.sh
vendored
@@ -18,7 +18,7 @@ fi
|
||||
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
|
||||
echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
|
||||
|
||||
# SHERPA_ONNX_VERSION=1.0.21
|
||||
# SHERPA_ONNX_VERSION=1.0.22
|
||||
|
||||
if [ -z $owner ]; then
|
||||
owner=k2-fsa
|
||||
|
||||
16
.github/scripts/test-nodejs-addon-npm.sh
vendored
16
.github/scripts/test-nodejs-addon-npm.sh
vendored
@@ -6,6 +6,22 @@ d=nodejs-addon-examples
|
||||
echo "dir: $d"
|
||||
cd $d
|
||||
|
||||
echo "----------audio tagging----------"
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
|
||||
tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
|
||||
rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
|
||||
|
||||
node ./test_audio_tagging_zipformer.js
|
||||
rm -rf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
|
||||
tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
|
||||
rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
|
||||
|
||||
node ./test_audio_tagging_ced.js
|
||||
rm -rf sherpa-onnx-ced-mini-audio-tagging-2024-04-19
|
||||
|
||||
echo "----------speaker identification----------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||
|
||||
|
||||
5
.github/workflows/npm-addon-macos.yaml
vendored
5
.github/workflows/npm-addon-macos.yaml
vendored
@@ -33,6 +33,11 @@ jobs:
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Update pip
|
||||
shell: bash
|
||||
run: |
|
||||
pip install -U pip
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
registry-url: 'https://registry.npmjs.org'
|
||||
|
||||
2
.github/workflows/npm-addon.yaml
vendored
2
.github/workflows/npm-addon.yaml
vendored
@@ -55,7 +55,7 @@ jobs:
|
||||
|
||||
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
|
||||
echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
|
||||
# SHERPA_ONNX_VERSION=1.0.21
|
||||
# SHERPA_ONNX_VERSION=1.0.22
|
||||
|
||||
src_dir=.github/scripts/node-addon
|
||||
sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json
|
||||
|
||||
@@ -27,7 +27,82 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH
|
||||
export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH
|
||||
```
|
||||
|
||||
# Voice Activity detection (VAD)
|
||||
# Examples
|
||||
|
||||
The following tables list the examples in this folder.
|
||||
|
||||
## Voice activity detection (VAD)
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_vad_microphone.js](./test_vad_microphone.js)| VAD with a microphone. It uses [silero-vad](https://github.com/snakers4/silero-vad)|
|
||||
|
||||
## Speaker identification
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_speaker_identification.js](./test_speaker_identification.js)| Speaker identification from a file|
|
||||
|
||||
## Spoken language identification
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_vad_spoken_language_identification_microphone.js](./test_vad_spoken_language_identification_microphone.js)|Spoken language identification from a microphone using a multi-lingual [Whisper](https://github.com/openai/whisper) model|
|
||||
|
||||
## Audio tagging
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_audio_tagging_zipformer.js](./test_audio_tagging_zipformer.js)| Audio tagging with a Zipformer model|
|
||||
|[./test_audio_tagging_ced.js](./test_audio_tagging_ced.js)| Audio tagging with a [CED](https://github.com/RicherMans/CED) model|
|
||||
|
||||
## Streaming speech-to-text from files
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_asr_streaming_transducer.js](./test_asr_streaming_transducer.js)| Streaming speech recognition from a file using a Zipformer transducer model|
|
||||
|[./test_asr_streaming_ctc.js](./test_asr_streaming_ctc.js)| Streaming speech recognition from a file using a Zipformer CTC model with greedy search|
|
||||
|[./test_asr_streaming_ctc_hlg.js](./test_asr_streaming_ctc_hlg.js)| Streaming speech recognition from a file using a Zipformer CTC model with HLG decoding|
|
||||
|[./test_asr_streaming_paraformer.js](./test_asr_streaming_paraformer.js)|Streaming speech recognition from a file using a [Paraformer](https://github.com/alibaba-damo-academy/FunASR) model|
|
||||
|
||||
## Streaming speech-to-text from a microphone
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_asr_streaming_transducer_microphone.js](./test_asr_streaming_transducer_microphone.js)| Streaming speech recognition from a microphone using a Zipformer transducer model|
|
||||
|[./test_asr_streaming_ctc_microphone.js](./test_asr_streaming_ctc_microphone.js)| Streaming speech recognition from a microphone using a Zipformer CTC model with greedy search|
|
||||
|[./test_asr_streaming_ctc_hlg_microphone.js](./test_asr_streaming_ctc_hlg_microphone.js)|Streaming speech recognition from a microphone using a Zipformer CTC model with HLG decoding|
|
||||
|[./test_asr_streaming_paraformer_microphone.js](./test_asr_streaming_paraformer_microphone.js)| Streaming speech recognition from a microphone using a [Paraformer](https://github.com/alibaba-damo-academy/FunASR) model|
|
||||
|
||||
## Non-Streaming speech-to-text from files
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model|
|
||||
|[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
|
||||
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|
||||
|
||||
## Non-Streaming speech-to-text from a microphone with VAD
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_vad_asr_non_streaming_transducer_microphone.js](./test_vad_asr_non_streaming_transducer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer transducer model|
|
||||
|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
|
||||
|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|
||||
|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|
||||
|
||||
## Text-to-speech
|
||||
|
||||
|File| Description|
|
||||
|---|---|
|
||||
|[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model|
|
||||
|[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model|
|
||||
|[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)|
|
||||
|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
|
||||
|
||||
|
||||
### Voice Activity detection (VAD)
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
@@ -39,7 +114,27 @@ npm install naudiodon2
|
||||
node ./test_vad_microphone.js
|
||||
```
|
||||
|
||||
## Streaming speech recognition with Zipformer transducer
|
||||
### Audio tagging with zipformer
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
|
||||
tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
|
||||
rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
|
||||
|
||||
node ./test_audio_tagging_zipformer.js
|
||||
```
|
||||
|
||||
### Audio tagging with CED
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
|
||||
tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
|
||||
rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
|
||||
|
||||
node ./test_audio_tagging_ced.js
|
||||
```
|
||||
|
||||
### Streaming speech recognition with Zipformer transducer
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
@@ -54,7 +149,7 @@ npm install naudiodon2
|
||||
node ./test_asr_streaming_transducer_microphone.js
|
||||
```
|
||||
|
||||
## Streaming speech recognition with Zipformer CTC
|
||||
### Streaming speech recognition with Zipformer CTC
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
@@ -73,7 +168,7 @@ node ./test_asr_streaming_ctc_microphone.js
|
||||
node ./test_asr_streaming_ctc_hlg_microphone.js
|
||||
```
|
||||
|
||||
## Streaming speech recognition with Paraformer
|
||||
### Streaming speech recognition with Paraformer
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||
@@ -88,7 +183,7 @@ npm install naudiodon2
|
||||
node ./test_asr_streaming_paraformer_microphone.js
|
||||
```
|
||||
|
||||
## Non-streaming speech recognition with Zipformer transducer
|
||||
### Non-streaming speech recognition with Zipformer transducer
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
|
||||
@@ -102,7 +197,7 @@ npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_transducer_microphone.js
|
||||
```
|
||||
|
||||
## Non-streaming speech recognition with Whisper
|
||||
### Non-streaming speech recognition with Whisper
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
@@ -116,7 +211,7 @@ npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_whisper_microphone.js
|
||||
```
|
||||
|
||||
## Non-streaming speech recognition with NeMo CTC models
|
||||
### Non-streaming speech recognition with NeMo CTC models
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
|
||||
@@ -130,7 +225,7 @@ npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_nemo_ctc_microphone.js
|
||||
```
|
||||
|
||||
## Non-streaming speech recognition with Paraformer
|
||||
### Non-streaming speech recognition with Paraformer
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
@@ -144,7 +239,7 @@ npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_paraformer_microphone.js
|
||||
```
|
||||
|
||||
## Text-to-speech with piper VITS models (TTS)
|
||||
### Text-to-speech with piper VITS models (TTS)
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
|
||||
@@ -154,7 +249,7 @@ rm vits-piper-en_GB-cori-medium.tar.bz2
|
||||
node ./test_tts_non_streaming_vits_piper_en.js
|
||||
```
|
||||
|
||||
## Text-to-speech with piper Coqui-ai/TTS models (TTS)
|
||||
### Text-to-speech with piper Coqui-ai/TTS models (TTS)
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2
|
||||
@@ -164,7 +259,7 @@ rm vits-coqui-de-css10.tar.bz2
|
||||
node ./test_tts_non_streaming_vits_coqui_de.js
|
||||
```
|
||||
|
||||
## Text-to-speech with vits Chinese models (1/2)
|
||||
### Text-to-speech with vits Chinese models (1/2)
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
@@ -174,7 +269,7 @@ rm sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
node ./test_tts_non_streaming_vits_zh_ll.js
|
||||
```
|
||||
|
||||
## Text-to-speech with vits Chinese models (2/2)
|
||||
### Text-to-speech with vits Chinese models (2/2)
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
@@ -184,7 +279,7 @@ rm vits-icefall-zh-aishell3.tar.bz2
|
||||
node ./test_tts_non_streaming_vits_zh_aishell3.js
|
||||
```
|
||||
|
||||
## Spoken language identification with Whisper multi-lingual models
|
||||
### Spoken language identification with Whisper multi-lingual models
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
|
||||
@@ -202,7 +297,7 @@ npm install naudiodon2
|
||||
node ./test_vad_spoken_language_identification_microphone.js
|
||||
```
|
||||
|
||||
## Speaker identification
|
||||
### Speaker identification
|
||||
|
||||
You can find more models at
|
||||
<https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models>
|
||||
|
||||
63
nodejs-addon-examples/test_audio_tagging_ced.js
Normal file
63
nodejs-addon-examples/test_audio_tagging_ced.js
Normal file
@@ -0,0 +1,63 @@
|
||||
// Copyright (c) 2024 Xiaomi Corporation

// Audio-tagging example using a CED model.
//
// Please download model files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
const sherpa_onnx = require('sherpa-onnx-node');

const modelDir = './sherpa-onnx-ced-mini-audio-tagging-2024-04-19';

// Creates an AudioTagging instance configured for the CED int8 model.
function createAudioTagging() {
  const config = {
    model: {
      ced: `${modelDir}/model.int8.onnx`,
      numThreads: 1,
      debug: true,
    },
    labels: `${modelDir}/class_labels_indices.csv`,
    topK: 5,
  };
  return new sherpa_onnx.AudioTagging(config);
}

const at = createAudioTagging();

// test_wavs/1.wav .. test_wavs/13.wav shipped with the model.
const testWaves =
    Array.from({length: 13}, (_, i) => `${modelDir}/test_wavs/${i + 1}.wav`);

console.log('------');

for (const filename of testWaves) {
  const start = performance.now();
  const stream = at.createStream();
  const wave = sherpa_onnx.readWave(filename);
  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
  const events = at.compute(stream);
  const stop = performance.now();

  const elapsed_seconds = (stop - start) / 1000;
  const duration = wave.samples.length / wave.sampleRate;
  const real_time_factor = elapsed_seconds / duration;

  console.log('input file:', filename);
  console.log('Probability\t\tName');
  for (const e of events) {
    console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`);
  }
  console.log('Wave duration', duration.toFixed(3), 'seconds')
  console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
  console.log(
      `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
      real_time_factor.toFixed(3))
  console.log('------');
}
|
||||
66
nodejs-addon-examples/test_audio_tagging_zipformer.js
Normal file
66
nodejs-addon-examples/test_audio_tagging_zipformer.js
Normal file
@@ -0,0 +1,66 @@
|
||||
// Copyright (c) 2024 Xiaomi Corporation

// Audio-tagging example using a Zipformer model.
//
// Please download model files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
const sherpa_onnx = require('sherpa-onnx-node');

const modelDir = './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15';

// Creates an AudioTagging instance configured for the Zipformer int8 model.
function createAudioTagging() {
  const config = {
    model: {
      zipformer: {
        model: `${modelDir}/model.int8.onnx`,
      },
      numThreads: 1,
      debug: true,
    },
    labels: `${modelDir}/class_labels_indices.csv`,
    topK: 5,
  };
  return new sherpa_onnx.AudioTagging(config);
}

const at = createAudioTagging();

// test_wavs/1.wav .. test_wavs/13.wav shipped with the model.
const testWaves =
    Array.from({length: 13}, (_, i) => `${modelDir}/test_wavs/${i + 1}.wav`);

console.log('------');

for (const filename of testWaves) {
  const start = performance.now();
  const stream = at.createStream();
  const wave = sherpa_onnx.readWave(filename);
  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
  const events = at.compute(stream);
  const stop = performance.now();

  const elapsed_seconds = (stop - start) / 1000;
  const duration = wave.samples.length / wave.sampleRate;
  const real_time_factor = elapsed_seconds / duration;

  console.log('input file:', filename);
  console.log('Probability\t\tName');
  for (const e of events) {
    console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`);
  }
  console.log('Wave duration', duration.toFixed(3), 'seconds')
  console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
  console.log(
      `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
      real_time_factor.toFixed(3))
  console.log('------');
}
|
||||
@@ -18,6 +18,7 @@ add_definitions(-DNAPI_VERSION=3)
|
||||
include_directories(${CMAKE_JS_INC})
|
||||
|
||||
set(srcs
|
||||
src/audio-tagging.cc
|
||||
src/non-streaming-asr.cc
|
||||
src/non-streaming-tts.cc
|
||||
src/sherpa-onnx-node-addon-api.cc
|
||||
|
||||
26
scripts/node-addon-api/lib/audio-tagg.js
Normal file
26
scripts/node-addon-api/lib/audio-tagg.js
Normal file
@@ -0,0 +1,26 @@
|
||||
const addon = require('./addon.js');
|
||||
const non_streaming_asr = require('./non-streaming-asr.js');
|
||||
|
||||
class AudioTagging {
|
||||
constructor(config) {
|
||||
this.handle = addon.createAudioTagging(config);
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
createStream() {
|
||||
return new non_streaming_asr.OfflineStream(
|
||||
addon.audioTaggingCreateOfflineStream(this.handle));
|
||||
}
|
||||
|
||||
/* Return an array. Each element is
|
||||
* an object {name: "xxx", prob: xxx, index: xxx};
|
||||
*
|
||||
*/
|
||||
compute(stream, topK = -1) {
|
||||
return addon.audioTaggingCompute(this.handle, stream.handle, topK);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
AudioTagging,
|
||||
}
|
||||
@@ -5,6 +5,7 @@ const non_streaming_tts = require('./non-streaming-tts.js');
|
||||
const vad = require('./vad.js');
|
||||
const slid = require('./spoken-language-identification.js');
|
||||
const sid = require('./speaker-identification.js');
|
||||
const at = require('./audio-tagg.js');
|
||||
|
||||
module.exports = {
|
||||
OnlineRecognizer: streaming_asr.OnlineRecognizer,
|
||||
@@ -18,4 +19,5 @@ module.exports = {
|
||||
SpokenLanguageIdentification: slid.SpokenLanguageIdentification,
|
||||
SpeakerEmbeddingExtractor: sid.SpeakerEmbeddingExtractor,
|
||||
SpeakerEmbeddingManager: sid.SpeakerEmbeddingManager,
|
||||
AudioTagging: at.AudioTagging,
|
||||
}
|
||||
|
||||
227
scripts/node-addon-api/src/audio-tagging.cc
Normal file
227
scripts/node-addon-api/src/audio-tagging.cc
Normal file
@@ -0,0 +1,227 @@
|
||||
// scripts/node-addon-api/src/audio-tagging.cc
|
||||
//
|
||||
// Copyright (c) 2024 Xiaomi Corporation
|
||||
#include <sstream>
|
||||
|
||||
#include "macros.h" // NOLINT
|
||||
#include "napi.h" // NOLINT
|
||||
#include "sherpa-onnx/c-api/c-api.h"
|
||||
|
||||
// Builds the zipformer sub-config from the JS "model" object `obj`.
//
// Expects obj.zipformer to be an object of the form
//   {model: 'path/to/model.onnx'}
// Returns an all-zero config when the field is absent or not an object,
// which leaves the zipformer model unset.
static SherpaOnnxOfflineZipformerAudioTaggingModelConfig
GetAudioTaggingZipformerModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineZipformerAudioTaggingModelConfig c;
  memset(&c, 0, sizeof(c));  // zero-initialize so absent fields stay null

  if (!obj.Has("zipformer") || !obj.Get("zipformer").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("zipformer").As<Napi::Object>();

  // NOTE: the macro reads attribute "model" from `o` and stores a
  // heap-allocated copy into c.model; the caller delete[]-s it after
  // the C API has consumed the config (see CreateAudioTaggingWrapper).
  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);

  return c;
}
|
||||
|
||||
// Converts the JS config's "model" object into a
// SherpaOnnxAudioTaggingModelConfig.
//
// Recognized fields on obj.model:
//   zipformer  - nested object, see GetAudioTaggingZipformerModelConfig()
//   ced        - path to a CED model (string)
//   numThreads - int32
//   debug      - boolean or number
//   provider   - string
// Returns an all-zero config when obj.model is absent or not an object.
static SherpaOnnxAudioTaggingModelConfig GetAudioTaggingModelConfig(
    Napi::Object obj) {
  SherpaOnnxAudioTaggingModelConfig c;
  memset(&c, 0, sizeof(c));  // zero-initialize so absent fields stay null

  if (!obj.Has("model") || !obj.Get("model").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("model").As<Napi::Object>();
  c.zipformer = GetAudioTaggingZipformerModelConfig(o);

  // The macros read from `o` and write into `c`; string attributes are
  // heap-allocated copies that the caller must delete[].
  SHERPA_ONNX_ASSIGN_ATTR_STR(ced, ced);

  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  // Accept either a JS boolean or a number for debug.
  if (o.Has("debug") &&
      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
    if (o.Get("debug").IsBoolean()) {
      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
    } else {
      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
    }
  }
  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}
|
||||
|
||||
// JS binding: createAudioTagging(config) -> external handle.
//
// info[0]: a plain JS object with fields {model, labels, topK}; see
// GetAudioTaggingModelConfig() for the model sub-fields.
//
// Returns a Napi::External wrapping the native SherpaOnnxAudioTagging;
// its finalizer destroys the native object when the JS value is GC'd.
// Throws a TypeError (as a JS exception) on bad arguments or if the C
// API rejects the config.
static Napi::External<SherpaOnnxAudioTagging> CreateAudioTaggingWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
  if (info.Length() != 1) {
    std::ostringstream os;
    os << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsObject()) {
    Napi::TypeError::New(env, "You should pass an object as the only argument.")
        .ThrowAsJavaScriptException();

    return {};
  }

  Napi::Object o = info[0].As<Napi::Object>();

  SherpaOnnxAudioTaggingConfig c;
  memset(&c, 0, sizeof(c));
  c.model = GetAudioTaggingModelConfig(o);

  // The macros read "labels"/"topK" from `o` into `c`; string values are
  // heap-allocated copies released below.
  SHERPA_ONNX_ASSIGN_ATTR_STR(labels, labels);
  SHERPA_ONNX_ASSIGN_ATTR_INT32(top_k, topK);

  const SherpaOnnxAudioTagging *at = SherpaOnnxCreateAudioTagging(&c);

  // The C API copies what it needs, so the temporary config strings can be
  // freed regardless of whether creation succeeded.
  if (c.model.zipformer.model) {
    delete[] c.model.zipformer.model;
  }

  if (c.model.ced) {
    delete[] c.model.ced;
  }

  if (c.model.provider) {
    delete[] c.model.provider;
  }

  if (c.labels) {
    delete[] c.labels;
  }

  if (!at) {
    Napi::TypeError::New(env, "Please check your config!")
        .ThrowAsJavaScriptException();

    return {};
  }

  // Hand ownership to JS; the finalizer runs when the external is GC'd.
  return Napi::External<SherpaOnnxAudioTagging>::New(
      env, const_cast<SherpaOnnxAudioTagging *>(at),
      [](Napi::Env env, SherpaOnnxAudioTagging *at) {
        SherpaOnnxDestroyAudioTagging(at);
      });
}
|
||||
|
||||
// JS binding: audioTaggingCreateOfflineStream(handle) -> stream handle.
//
// info[0]: external pointer to a SherpaOnnxAudioTagging.
// Returns a Napi::External wrapping a new SherpaOnnxOfflineStream; the
// finalizer releases the stream when the JS value is garbage collected.
static Napi::External<SherpaOnnxOfflineStream>
AudioTaggingCreateOfflineStreamWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 1) {
    std::ostringstream os;
    os << "Expect only 1 argument. Given: " << info.Length();
    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "You should pass an audio tagging pointer as the only argument")
        .ThrowAsJavaScriptException();
    return {};
  }

  auto *tagger = info[0].As<Napi::External<SherpaOnnxAudioTagging>>().Data();

  const SherpaOnnxOfflineStream *s =
      SherpaOnnxAudioTaggingCreateOfflineStream(tagger);

  // Hand ownership of the stream to JS via the finalizer.
  return Napi::External<SherpaOnnxOfflineStream>::New(
      env, const_cast<SherpaOnnxOfflineStream *>(s),
      [](Napi::Env /*env*/, SherpaOnnxOfflineStream *p) {
        DestroyOfflineStream(p);
      });
}
|
||||
|
||||
// JS binding: audioTaggingCompute(handle, streamHandle, topK) -> Array.
//
// info[0]: external pointer to a SherpaOnnxAudioTagging
// info[1]: external pointer to a SherpaOnnxOfflineStream
// info[2]: topK (int32), forwarded to the C API; the JS wrapper passes -1
//          by default
//
// Returns a JS array of objects {name: string, index: number, prob: number},
// one per detected audio event. Throws a TypeError on bad arguments.
static Napi::Object AudioTaggingComputeWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
  if (info.Length() != 3) {
    std::ostringstream os;
    os << "Expect only 3 arguments. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "You should pass an audio tagging pointer as the first argument")
        .ThrowAsJavaScriptException();

    return {};
  }

  if (!info[1].IsExternal()) {
    // Fixed grammar in the error message ("a offline" -> "an offline").
    Napi::TypeError::New(
        env, "You should pass an offline stream pointer as the second argument")
        .ThrowAsJavaScriptException();

    return {};
  }

  if (!info[2].IsNumber()) {
    Napi::TypeError::New(env,
                         "You should pass an integer as the third argument")
        .ThrowAsJavaScriptException();

    return {};
  }

  SherpaOnnxAudioTagging *at =
      info[0].As<Napi::External<SherpaOnnxAudioTagging>>().Data();

  SherpaOnnxOfflineStream *stream =
      info[1].As<Napi::External<SherpaOnnxOfflineStream>>().Data();

  int32_t top_k = info[2].As<Napi::Number>().Int32Value();

  const SherpaOnnxAudioEvent *const *events =
      SherpaOnnxAudioTaggingCompute(at, stream, top_k);

  // The C API returns a null-terminated array of event pointers; count them.
  auto p = events;
  int32_t k = 0;
  while (p && *p) {
    ++k;
    ++p;
  }

  // Copy each native event into a plain JS object.
  Napi::Array ans = Napi::Array::New(env, k);
  for (int32_t i = 0; i != k; ++i) {
    Napi::Object obj = Napi::Object::New(env);
    obj.Set(Napi::String::New(env, "name"),
            Napi::String::New(env, events[i]->name));
    obj.Set(Napi::String::New(env, "index"),
            Napi::Number::New(env, events[i]->index));
    obj.Set(Napi::String::New(env, "prob"),
            Napi::Number::New(env, events[i]->prob));
    ans[i] = obj;
  }

  // The native result array is owned by the C API; release it now that the
  // values have been copied into JS objects.
  SherpaOnnxAudioTaggingFreeResults(events);

  return ans;
}
|
||||
|
||||
// Registers the audio-tagging bindings on the module's `exports` object:
//   createAudioTagging(config)              -> external handle
//   audioTaggingCreateOfflineStream(handle) -> stream handle
//   audioTaggingCompute(handle, stream, k)  -> array of events
void InitAudioTagging(Napi::Env env, Napi::Object exports) {
  auto define = [&](const char *name, auto func) {
    exports.Set(Napi::String::New(env, name), Napi::Function::New(env, func));
  };

  define("createAudioTagging", CreateAudioTaggingWrapper);
  define("audioTaggingCreateOfflineStream",
         AudioTaggingCreateOfflineStreamWrapper);
  define("audioTaggingCompute", AudioTaggingComputeWrapper);
}
|
||||
@@ -19,6 +19,8 @@ void InitSpokenLanguageID(Napi::Env env, Napi::Object exports);
|
||||
|
||||
void InitSpeakerID(Napi::Env env, Napi::Object exports);
|
||||
|
||||
void InitAudioTagging(Napi::Env env, Napi::Object exports);
|
||||
|
||||
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
||||
InitStreamingAsr(env, exports);
|
||||
InitNonStreamingAsr(env, exports);
|
||||
@@ -28,6 +30,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
||||
InitWaveWriter(env, exports);
|
||||
InitSpokenLanguageID(env, exports);
|
||||
InitSpeakerID(env, exports);
|
||||
InitAudioTagging(env, exports);
|
||||
|
||||
return exports;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user