JavaScript API with WebAssembly for speaker diarization (#1414)
#1408 uses [node-addon-api](https://github.com/nodejs/node-addon-api) to call the C API from JavaScript, whereas this pull request uses WebAssembly to do so.
This commit is contained in:
12
.github/scripts/test-nodejs-npm.sh
vendored
12
.github/scripts/test-nodejs-npm.sh
vendored
@@ -9,6 +9,18 @@ git status
|
|||||||
ls -lh
|
ls -lh
|
||||||
ls -lh node_modules
|
ls -lh node_modules
|
||||||
|
|
||||||
|
echo '-----speaker diarization----------'
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
node ./test-offline-speaker-diarization.js
|
||||||
|
rm -rfv *.wav *.onnx sherpa-onnx-pyannote-*
|
||||||
|
|
||||||
echo '-----vad+whisper----------'
|
echo '-----vad+whisper----------'
|
||||||
|
|
||||||
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||||
|
|||||||
2
.github/workflows/test-build-wheel.yaml
vendored
2
.github/workflows/test-build-wheel.yaml
vendored
@@ -139,7 +139,7 @@ jobs:
|
|||||||
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
|
||||||
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
|
||||||
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
|
||||||
export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH
|
||||||
|
|
||||||
which sherpa-onnx
|
which sherpa-onnx
|
||||||
sherpa-onnx --help
|
sherpa-onnx --help
|
||||||
|
|||||||
2
.github/workflows/test-pip-install.yaml
vendored
2
.github/workflows/test-pip-install.yaml
vendored
@@ -104,7 +104,7 @@ jobs:
|
|||||||
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
|
||||||
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
|
||||||
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
|
||||||
export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
|
export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH
|
||||||
|
|
||||||
sherpa-onnx --help
|
sherpa-onnx --help
|
||||||
sherpa-onnx-keyword-spotter --help
|
sherpa-onnx-keyword-spotter --help
|
||||||
|
|||||||
@@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa
|
|||||||
for text-to-speech and speech-to-text.
|
for text-to-speech and speech-to-text.
|
||||||
|
|
||||||
|
|
||||||
|
# Speaker diarization
|
||||||
|
|
||||||
|
In the following, we demonstrate how to run speaker diarization.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
node ./test-offline-speaker-diarization.js
|
||||||
|
```
|
||||||
|
|
||||||
# Text-to-speech
|
# Text-to-speech
|
||||||
|
|
||||||
In the following, we demonstrate how to run text-to-speech.
|
In the following, we demonstrate how to run text-to-speech.
|
||||||
|
|||||||
64
nodejs-examples/test-offline-speaker-diarization.js
Normal file
64
nodejs-examples/test-offline-speaker-diarization.js
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
// Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
const sherpa_onnx = require('sherpa-onnx');
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
/* Please use the following commands to download files
|
||||||
|
used in this script
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
*/
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
const config = {
|
||||||
|
segmentation: {
|
||||||
|
pyannote: {
|
||||||
|
model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
|
||||||
|
debug: 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
embedding: {
|
||||||
|
model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
|
||||||
|
debug: 1,
|
||||||
|
},
|
||||||
|
clustering: {
|
||||||
|
// Since we know that the test wave file
|
||||||
|
// ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
|
||||||
|
// here. If you don't have such information, please set numClusters to -1
|
||||||
|
numClusters: 4,
|
||||||
|
|
||||||
|
// If numClusters is not -1, then threshold is ignored.
|
||||||
|
//
|
||||||
|
// A larger threshold leads to fewer clusters, i.e., fewer speakers
|
||||||
|
// A smaller threshold leads to more clusters, i.e., more speakers
|
||||||
|
// You need to tune it yourself.
|
||||||
|
threshold: 0.5,
|
||||||
|
},
|
||||||
|
|
||||||
|
// If a segment is shorter than minDurationOn, we discard it
|
||||||
|
minDurationOn: 0.2, // in seconds
|
||||||
|
|
||||||
|
// If the gap between two segments is less than minDurationOff, then we
|
||||||
|
// merge these two segments into a single one
|
||||||
|
minDurationOff: 0.5, // in seconds
|
||||||
|
};
|
||||||
|
|
||||||
|
const waveFilename = './0-four-speakers-zh.wav';
|
||||||
|
|
||||||
|
const sd = sherpa_onnx.createOfflineSpeakerDiarization(config);
|
||||||
|
console.log('Started')
|
||||||
|
|
||||||
|
const wave = sherpa_onnx.readWave(waveFilename);
|
||||||
|
if (sd.sampleRate != wave.sampleRate) {
|
||||||
|
throw new Error(
|
||||||
|
`Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const segments = sd.process(wave.samples);
|
||||||
|
console.log(segments);
|
||||||
@@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
|
|||||||
const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
|
const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
|
||||||
const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
|
const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
|
||||||
const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
|
const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
|
||||||
|
const sherpa_onnx_speaker_diarization =
|
||||||
|
require('./sherpa-onnx-speaker-diarization.js');
|
||||||
|
|
||||||
function createOnlineRecognizer(config) {
|
function createOnlineRecognizer(config) {
|
||||||
return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
|
return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
|
||||||
@@ -32,6 +34,11 @@ function createVad(config) {
|
|||||||
return sherpa_onnx_vad.createVad(wasmModule, config);
|
return sherpa_onnx_vad.createVad(wasmModule, config);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function createOfflineSpeakerDiarization(config) {
|
||||||
|
return sherpa_onnx_speaker_diarization.createOfflineSpeakerDiarization(
|
||||||
|
wasmModule, config);
|
||||||
|
}
|
||||||
|
|
||||||
function readWave(filename) {
|
function readWave(filename) {
|
||||||
return sherpa_onnx_wave.readWave(filename, wasmModule);
|
return sherpa_onnx_wave.readWave(filename, wasmModule);
|
||||||
}
|
}
|
||||||
@@ -51,4 +58,5 @@ module.exports = {
|
|||||||
writeWave,
|
writeWave,
|
||||||
createCircularBuffer,
|
createCircularBuffer,
|
||||||
createVad,
|
createVad,
|
||||||
|
createOfflineSpeakerDiarization,
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -70,6 +70,17 @@ set(exported_functions
|
|||||||
SherpaOnnxDestroySpeechSegment
|
SherpaOnnxDestroySpeechSegment
|
||||||
SherpaOnnxVoiceActivityDetectorReset
|
SherpaOnnxVoiceActivityDetectorReset
|
||||||
SherpaOnnxVoiceActivityDetectorFlush
|
SherpaOnnxVoiceActivityDetectorFlush
|
||||||
|
# Speaker diarization
|
||||||
|
SherpaOnnxCreateOfflineSpeakerDiarization
|
||||||
|
SherpaOnnxDestroyOfflineSpeakerDiarization
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationDestroyResult
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationDestroySegment
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationProcess
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationSetConfig
|
||||||
#
|
#
|
||||||
SherpaOnnxFileExists
|
SherpaOnnxFileExists
|
||||||
SherpaOnnxReadWave
|
SherpaOnnxReadWave
|
||||||
@@ -109,6 +120,7 @@ install(
|
|||||||
${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
|
${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
|
||||||
${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
|
${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
|
||||||
${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
|
${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
|
||||||
|
${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
|
||||||
${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
|
${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
|
||||||
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
|
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
|
||||||
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
|
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ Remember to rename the downloaded files.
|
|||||||
|
|
||||||
The following is an example.
|
The following is an example.
|
||||||
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd wasm/speaker-diarization/assets/
|
cd wasm/speaker-diarization/assets/
|
||||||
|
|
||||||
@@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
|||||||
cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
|
cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
|
||||||
rm -rf sherpa-onnx-pyannote-segmentation-3-0
|
rm -rf sherpa-onnx-pyannote-segmentation-3-0
|
||||||
|
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
|
mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
|
||||||
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) {
|
|||||||
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
|
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
|
||||||
offset += 4;
|
offset += 4;
|
||||||
|
|
||||||
Module.setValue(ptr + offset, config.debug || 1, 'i32');
|
Module.setValue(ptr + offset, config.debug || 0, 'i32');
|
||||||
offset += 4;
|
offset += 4;
|
||||||
|
|
||||||
const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
|
const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
|
||||||
@@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) {
|
|||||||
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
|
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
|
||||||
offset += 4;
|
offset += 4;
|
||||||
|
|
||||||
Module.setValue(ptr + offset, config.debug || 1, 'i32');
|
Module.setValue(ptr + offset, config.debug || 0, 'i32');
|
||||||
offset += 4;
|
offset += 4;
|
||||||
|
|
||||||
Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
|
Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
|
||||||
@@ -270,11 +270,15 @@ class OfflineSpeakerDiarization {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function createOfflineSpeakerDiarization(Module, myConfig) {
|
function createOfflineSpeakerDiarization(Module, myConfig) {
|
||||||
const config = {
|
let config = {
|
||||||
segmentation: {
|
segmentation: {
|
||||||
pyannote: {model: './segmentation.onnx'},
|
pyannote: {model: './segmentation.onnx'},
|
||||||
|
debug: 1,
|
||||||
|
},
|
||||||
|
embedding: {
|
||||||
|
model: './embedding.onnx',
|
||||||
|
debug: 1,
|
||||||
},
|
},
|
||||||
embedding: {model: './embedding.onnx'},
|
|
||||||
clustering: {numClusters: -1, threshold: 0.5},
|
clustering: {numClusters: -1, threshold: 0.5},
|
||||||
minDurationOn: 0.3,
|
minDurationOn: 0.3,
|
||||||
minDurationOff: 0.5,
|
minDurationOff: 0.5,
|
||||||
|
|||||||
Reference in New Issue
Block a user