JavaScript API (node-addon) for speaker diarization (#1408)
This commit is contained in:
22
.github/scripts/node-addon/package-optional.json
vendored
22
.github/scripts/node-addon/package-optional.json
vendored
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"name": "sherpa-onnx-PLATFORM2-ARCH",
|
"name": "sherpa-onnx-PLATFORM2-ARCH",
|
||||||
"version": "SHERPA_ONNX_VERSION",
|
"version": "SHERPA_ONNX_VERSION",
|
||||||
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
|
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
@@ -16,8 +16,18 @@
|
|||||||
"transcription",
|
"transcription",
|
||||||
"real-time speech recognition",
|
"real-time speech recognition",
|
||||||
"without internet connection",
|
"without internet connection",
|
||||||
|
"locally",
|
||||||
|
"local",
|
||||||
"embedded systems",
|
"embedded systems",
|
||||||
"open source",
|
"open source",
|
||||||
|
"diarization",
|
||||||
|
"speaker diarization",
|
||||||
|
"speaker recognition",
|
||||||
|
"speaker",
|
||||||
|
"speaker segmentation",
|
||||||
|
"speaker verification",
|
||||||
|
"spoken language identification",
|
||||||
|
"sherpa",
|
||||||
"zipformer",
|
"zipformer",
|
||||||
"asr",
|
"asr",
|
||||||
"tts",
|
"tts",
|
||||||
@@ -30,13 +40,13 @@
|
|||||||
"offline",
|
"offline",
|
||||||
"privacy",
|
"privacy",
|
||||||
"open source",
|
"open source",
|
||||||
"vad",
|
|
||||||
"speaker id",
|
|
||||||
"language id",
|
|
||||||
"node-addon-api",
|
|
||||||
"streaming speech recognition",
|
"streaming speech recognition",
|
||||||
"speech",
|
"speech",
|
||||||
"recognition"
|
"recognition",
|
||||||
|
"vad",
|
||||||
|
"node-addon-api",
|
||||||
|
"speaker id",
|
||||||
|
"language id"
|
||||||
],
|
],
|
||||||
"author": "The next-gen Kaldi team",
|
"author": "The next-gen Kaldi team",
|
||||||
"license": "Apache-2.0",
|
"license": "Apache-2.0",
|
||||||
|
|||||||
22
.github/scripts/node-addon/package.json
vendored
22
.github/scripts/node-addon/package.json
vendored
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"name": "sherpa-onnx-node",
|
"name": "sherpa-onnx-node",
|
||||||
"version": "SHERPA_ONNX_VERSION",
|
"version": "SHERPA_ONNX_VERSION",
|
||||||
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
|
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
|
||||||
"main": "sherpa-onnx.js",
|
"main": "sherpa-onnx.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
@@ -16,8 +16,18 @@
|
|||||||
"transcription",
|
"transcription",
|
||||||
"real-time speech recognition",
|
"real-time speech recognition",
|
||||||
"without internet connection",
|
"without internet connection",
|
||||||
|
"locally",
|
||||||
|
"local",
|
||||||
"embedded systems",
|
"embedded systems",
|
||||||
"open source",
|
"open source",
|
||||||
|
"diarization",
|
||||||
|
"speaker diarization",
|
||||||
|
"speaker recognition",
|
||||||
|
"speaker",
|
||||||
|
"speaker segmentation",
|
||||||
|
"speaker verification",
|
||||||
|
"spoken language identification",
|
||||||
|
"sherpa",
|
||||||
"zipformer",
|
"zipformer",
|
||||||
"asr",
|
"asr",
|
||||||
"tts",
|
"tts",
|
||||||
@@ -30,13 +40,13 @@
|
|||||||
"offline",
|
"offline",
|
||||||
"privacy",
|
"privacy",
|
||||||
"open source",
|
"open source",
|
||||||
"vad",
|
|
||||||
"speaker id",
|
|
||||||
"language id",
|
|
||||||
"node-addon-api",
|
|
||||||
"streaming speech recognition",
|
"streaming speech recognition",
|
||||||
"speech",
|
"speech",
|
||||||
"recognition"
|
"recognition",
|
||||||
|
"vad",
|
||||||
|
"node-addon-api",
|
||||||
|
"speaker id",
|
||||||
|
"language id"
|
||||||
],
|
],
|
||||||
"author": "The next-gen Kaldi team",
|
"author": "The next-gen Kaldi team",
|
||||||
"license": "Apache-2.0",
|
"license": "Apache-2.0",
|
||||||
|
|||||||
14
.github/scripts/test-nodejs-addon-npm.sh
vendored
14
.github/scripts/test-nodejs-addon-npm.sh
vendored
@@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()")
|
|||||||
platform=$(node -p "require('os').platform()")
|
platform=$(node -p "require('os').platform()")
|
||||||
node_version=$(node -p "process.versions.node.split('.')[0]")
|
node_version=$(node -p "process.versions.node.split('.')[0]")
|
||||||
|
|
||||||
|
echo "----------non-streaming speaker diarization----------"
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
node ./test_offline_speaker_diarization.js
|
||||||
|
|
||||||
|
rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
|
||||||
|
|
||||||
echo "----------non-streaming asr + vad----------"
|
echo "----------non-streaming asr + vad----------"
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||||
|
|||||||
@@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@<REPLACE-THIS-WI
|
|||||||
|
|
||||||
The following tables list the examples in this folder.
|
The following tables list the examples in this folder.
|
||||||
|
|
||||||
|
## Speaker diarization
|
||||||
|
|
||||||
|
|File| Description|
|
||||||
|
|---|---|
|
||||||
|
|[./test_offline_speaker_diarization.js](./test_offline_speaker_diarization.js)| It demonstrates how to use the sherpa-onnx JavaScript API for speaker diarization. It supports speaker segmentation models from [pyannote-audio](https://github.com/pyannote/pyannote-audio).|
|
||||||
|
|
||||||
## Add punctuations to text
|
## Add punctuations to text
|
||||||
|
|
||||||
|File| Description|
|
|File| Description|
|
||||||
@@ -130,6 +136,21 @@ The following tables list the examples in this folder.
|
|||||||
|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
|
|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
|
||||||
|
|
||||||
|
|
||||||
|
### Speaker diarization
|
||||||
|
|
||||||
|
```bash
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
node ./test_offline_speaker_diarization.js
|
||||||
|
```
|
||||||
|
|
||||||
### Voice Activity detection (VAD)
|
### Voice Activity detection (VAD)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
62
nodejs-addon-examples/test_offline_speaker_diarization.js
Normal file
62
nodejs-addon-examples/test_offline_speaker_diarization.js
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
// Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
const sherpa_onnx = require('sherpa-onnx-node');
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
/* Please use the following commands to download files
|
||||||
|
used in this script
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
*/
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
// Speaker segmentation model (pyannote).
const segmentationConfig = {
  pyannote: {
    model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
  },
};

// Speaker embedding extractor model.
const embeddingConfig = {
  model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
};

const clusteringConfig = {
  // The test wave ./0-four-speakers-zh.wav is known to contain 4 speakers,
  // hence numClusters is 4. Use -1 when the number of speakers is unknown.
  numClusters: 4,

  // threshold is ignored unless numClusters is -1.
  //
  // Larger threshold  -> fewer clusters, i.e., fewer speakers.
  // Smaller threshold -> more clusters, i.e., more speakers.
  // It has to be tuned for your own data.
  threshold: 0.5,
};

const config = {
  segmentation: segmentationConfig,
  embedding: embeddingConfig,
  clustering: clusteringConfig,

  // Segments shorter than minDurationOn (in seconds) are discarded.
  minDurationOn: 0.2,

  // Two segments separated by a gap shorter than minDurationOff (in seconds)
  // are merged into a single segment.
  minDurationOff: 0.5,
};

const waveFilename = './0-four-speakers-zh.wav';

const diarizer = new sherpa_onnx.OfflineSpeakerDiarization(config);
console.log('Started')

const audio = sherpa_onnx.readWave(waveFilename);

// The models expect audio at diarizer.sampleRate; refuse anything else.
const sampleRateMismatch = diarizer.sampleRate != audio.sampleRate;
if (sampleRateMismatch) {
  throw new Error(
      `Expected sample rate: ${diarizer.sampleRate}, given: ${audio.sampleRate}`);
}

// Each entry describes one speech segment (see OfflineSpeakerDiarization.process).
const segments = diarizer.process(audio.samples);
console.log(segments);
|
||||||
@@ -21,6 +21,7 @@ set(srcs
|
|||||||
src/audio-tagging.cc
|
src/audio-tagging.cc
|
||||||
src/keyword-spotting.cc
|
src/keyword-spotting.cc
|
||||||
src/non-streaming-asr.cc
|
src/non-streaming-asr.cc
|
||||||
|
src/non-streaming-speaker-diarization.cc
|
||||||
src/non-streaming-tts.cc
|
src/non-streaming-tts.cc
|
||||||
src/punctuation.cc
|
src/punctuation.cc
|
||||||
src/sherpa-onnx-node-addon-api.cc
|
src/sherpa-onnx-node-addon-api.cc
|
||||||
|
|||||||
@@ -0,0 +1,32 @@
|
|||||||
|
const addon = require('./addon.js');
|
||||||
|
|
||||||
|
/**
 * JavaScript wrapper around the native offline speaker diarization object
 * exposed by the addon.
 */
class OfflineSpeakerDiarization {
  /**
   * @param {Object} config Forwarded unchanged to
   *     addon.createOfflineSpeakerDiarization(). It is also kept on the
   *     instance as `this.config`.
   */
  constructor(config) {
    const handle = addon.createOfflineSpeakerDiarization(config);
    const sampleRate = addon.getOfflineSpeakerDiarizationSampleRate(handle);

    this.handle = handle;
    this.config = config;

    // Sample rate reported by the native object for this handle.
    this.sampleRate = sampleRate;
  }

  /**
   * Run speaker diarization on a whole utterance.
   *
   * @param {Float32Array} samples A 1-d float32 array. Every element should
   *     be in the range [-1, 1]. Its sample rate is assumed to be equal to
   *     this.sampleRate.
   * @returns {Array<Object>} An array of objects, each of the form
   *
   *     {
   *       "start": start_time_in_seconds,
   *       "end": end_time_in_seconds,
   *       "speaker": an_integer,
   *     }
   */
  process(samples) {
    const segments =
        addon.offlineSpeakerDiarizationProcess(this.handle, samples);
    return segments;
  }
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
OfflineSpeakerDiarization,
|
||||||
|
}
|
||||||
@@ -8,6 +8,7 @@ const sid = require('./speaker-identification.js');
|
|||||||
const at = require('./audio-tagg.js');
|
const at = require('./audio-tagg.js');
|
||||||
const punct = require('./punctuation.js');
|
const punct = require('./punctuation.js');
|
||||||
const kws = require('./keyword-spotter.js');
|
const kws = require('./keyword-spotter.js');
|
||||||
|
const sd = require('./non-streaming-speaker-diarization.js');
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
OnlineRecognizer: streaming_asr.OnlineRecognizer,
|
OnlineRecognizer: streaming_asr.OnlineRecognizer,
|
||||||
@@ -24,4 +25,5 @@ module.exports = {
|
|||||||
AudioTagging: at.AudioTagging,
|
AudioTagging: at.AudioTagging,
|
||||||
Punctuation: punct.Punctuation,
|
Punctuation: punct.Punctuation,
|
||||||
KeywordSpotter: kws.KeywordSpotter,
|
KeywordSpotter: kws.KeywordSpotter,
|
||||||
|
OfflineSpeakerDiarization: sd.OfflineSpeakerDiarization,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"main": "lib/sherpa-onnx.js",
|
"main": "lib/sherpa-onnx.js",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
|
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"cmake-js": "^6.0.0",
|
"cmake-js": "^6.0.0",
|
||||||
"node-addon-api": "^1.1.0",
|
"node-addon-api": "^1.1.0",
|
||||||
@@ -21,8 +21,18 @@
|
|||||||
"transcription",
|
"transcription",
|
||||||
"real-time speech recognition",
|
"real-time speech recognition",
|
||||||
"without internet connection",
|
"without internet connection",
|
||||||
|
"locally",
|
||||||
|
"local",
|
||||||
"embedded systems",
|
"embedded systems",
|
||||||
"open source",
|
"open source",
|
||||||
|
"diarization",
|
||||||
|
"speaker diarization",
|
||||||
|
"speaker recognition",
|
||||||
|
"speaker",
|
||||||
|
"speaker segmentation",
|
||||||
|
"speaker verification",
|
||||||
|
"spoken language identification",
|
||||||
|
"sherpa",
|
||||||
"zipformer",
|
"zipformer",
|
||||||
"asr",
|
"asr",
|
||||||
"tts",
|
"tts",
|
||||||
|
|||||||
265
scripts/node-addon-api/src/non-streaming-speaker-diarization.cc
Normal file
265
scripts/node-addon-api/src/non-streaming-speaker-diarization.cc
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
// scripts/node-addon-api/src/non-streaming-speaker-diarization.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "macros.h" // NOLINT
|
||||||
|
#include "napi.h" // NOLINT
|
||||||
|
#include "sherpa-onnx/c-api/c-api.h"
|
||||||
|
|
||||||
|
// Builds the pyannote segmentation model config from `obj.pyannote`.
//
// If `obj` has no `pyannote` property, or that property is not an object,
// the returned struct is left entirely zeroed.
static SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
GetOfflineSpeakerSegmentationPyannoteModelConfig(Napi::Object obj) {
  // Value-initialization zeroes every field.
  SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig c = {};

  if (obj.Has("pyannote") && obj.Get("pyannote").IsObject()) {
    // NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are given only the
    // field names, so they presumably read from a local named `o` and write
    // to a local named `c` -- keep these two names.
    Napi::Object o = obj.Get("pyannote").As<Napi::Object>();

    SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  }

  return c;
}
|
||||||
|
|
||||||
|
// Builds the speaker segmentation model config from `obj.segmentation`.
//
// Fields read from the JS object: `pyannote` (nested object), `numThreads`,
// `debug` (boolean or number) and `provider`.
// If `obj.segmentation` is missing or is not an object, the returned struct
// is left entirely zeroed.
static SherpaOnnxOfflineSpeakerSegmentationModelConfig
GetOfflineSpeakerSegmentationModelConfig(Napi::Object obj) {
  // Value-initialization zeroes every field.
  SherpaOnnxOfflineSpeakerSegmentationModelConfig c = {};

  const bool has_section =
      obj.Has("segmentation") && obj.Get("segmentation").IsObject();
  if (!has_section) {
    return c;
  }

  // NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are given only the
  // field names, so they presumably read from a local named `o` and write to
  // a local named `c` -- keep these two names.
  Napi::Object o = obj.Get("segmentation").As<Napi::Object>();

  c.pyannote = GetOfflineSpeakerSegmentationPyannoteModelConfig(o);

  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  // `debug` may be given either as a JS boolean or as a JS number; any other
  // type is ignored.
  if (o.Has("debug")) {
    Napi::Value debug = o.Get("debug");
    if (debug.IsBoolean()) {
      c.debug = debug.As<Napi::Boolean>().Value();
    } else if (debug.IsNumber()) {
      c.debug = debug.As<Napi::Number>().Int32Value();
    }
  }

  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}
|
||||||
|
|
||||||
|
// Builds the speaker embedding extractor config from `obj.embedding`.
//
// Fields read from the JS object: `model`, `numThreads`, `debug` (boolean or
// number) and `provider`.
// If `obj.embedding` is missing or is not an object, the returned struct is
// left entirely zeroed.
static SherpaOnnxSpeakerEmbeddingExtractorConfig
GetSpeakerEmbeddingExtractorConfig(Napi::Object obj) {
  // Value-initialization zeroes every field.
  SherpaOnnxSpeakerEmbeddingExtractorConfig c = {};

  const bool has_section =
      obj.Has("embedding") && obj.Get("embedding").IsObject();
  if (!has_section) {
    return c;
  }

  // NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are given only the
  // field names, so they presumably read from a local named `o` and write to
  // a local named `c` -- keep these two names.
  Napi::Object o = obj.Get("embedding").As<Napi::Object>();

  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  // `debug` may be given either as a JS boolean or as a JS number; any other
  // type is ignored.
  if (o.Has("debug")) {
    Napi::Value debug = o.Get("debug");
    if (debug.IsBoolean()) {
      c.debug = debug.As<Napi::Boolean>().Value();
    } else if (debug.IsNumber()) {
      c.debug = debug.As<Napi::Number>().Int32Value();
    }
  }

  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}
|
||||||
|
|
||||||
|
// Builds the clustering config from `obj.clustering`.
//
// Fields read from the JS object: `numClusters` and `threshold`.
// If `obj.clustering` is missing or is not an object, the returned struct is
// left entirely zeroed.
static SherpaOnnxFastClusteringConfig GetFastClusteringConfig(
    Napi::Object obj) {
  // Value-initialization zeroes every field.
  SherpaOnnxFastClusteringConfig c = {};

  if (obj.Has("clustering") && obj.Get("clustering").IsObject()) {
    // NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are given only the
    // field names, so they presumably read from a local named `o` and write
    // to a local named `c` -- keep these two names.
    Napi::Object o = obj.Get("clustering").As<Napi::Object>();

    SHERPA_ONNX_ASSIGN_ATTR_INT32(num_clusters, numClusters);
    SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold);
  }

  return c;
}
|
||||||
|
|
||||||
|
// JS: createOfflineSpeakerDiarization(config)
//
// Expects exactly one argument, a config object. Reads its `segmentation`,
// `embedding`, `clustering`, `minDurationOn` and `minDurationOff` properties,
// creates the native diarization object and returns it wrapped in a
// Napi::External. The wrapped object is destroyed by the External's
// finalizer.
//
// On a bad argument, or when the native object cannot be created, a JS
// TypeError is raised and an empty value is returned.
static Napi::External<SherpaOnnxOfflineSpeakerDiarization>
CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const auto num_args = info.Length();
  if (num_args != 1) {
    std::ostringstream message;
    message << "Expect only 1 argument. Given: " << num_args;

    Napi::TypeError::New(env, message.str()).ThrowAsJavaScriptException();
    return {};
  }

  if (!info[0].IsObject()) {
    Napi::TypeError::New(env, "Expect an object as the argument")
        .ThrowAsJavaScriptException();
    return {};
  }

  // NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are given only the
  // field names, so they presumably read from a local named `o` and write to
  // a local named `c` -- keep these two names.
  Napi::Object o = info[0].As<Napi::Object>();

  // Value-initialization zeroes every field.
  SherpaOnnxOfflineSpeakerDiarizationConfig c = {};

  c.segmentation = GetOfflineSpeakerSegmentationModelConfig(o);
  c.embedding = GetSpeakerEmbeddingExtractorConfig(o);
  c.clustering = GetFastClusteringConfig(o);

  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_on, minDurationOn);
  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_off, minDurationOff);

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      SherpaOnnxCreateOfflineSpeakerDiarization(&c);

  // Release the string fields of the config now that the native object has
  // been created (or creation has failed). Fields that were never set are
  // still null, and delete[] on a null pointer does nothing, so no guard is
  // needed.
  delete[] c.segmentation.pyannote.model;
  delete[] c.segmentation.provider;
  delete[] c.embedding.model;
  delete[] c.embedding.provider;

  if (sd == nullptr) {
    Napi::TypeError::New(env, "Please check your config!")
        .ThrowAsJavaScriptException();
    return {};
  }

  // The External takes a non-const pointer; its finalizer destroys the
  // native object.
  auto *handle = const_cast<SherpaOnnxOfflineSpeakerDiarization *>(sd);
  return Napi::External<SherpaOnnxOfflineSpeakerDiarization>::New(
      env, handle,
      [](Napi::Env /*env*/, SherpaOnnxOfflineSpeakerDiarization *ptr) {
        SherpaOnnxDestroyOfflineSpeakerDiarization(ptr);
      });
}
|
||||||
|
|
||||||
|
// JS: getOfflineSpeakerDiarizationSampleRate(handle)
//
// Expects exactly one argument: the external handle returned by
// createOfflineSpeakerDiarization(). Returns the value of
// SherpaOnnxOfflineSpeakerDiarizationGetSampleRate() for that handle as a JS
// number.
//
// On a bad argument a JS TypeError is raised and an empty value is returned.
static Napi::Number OfflineSpeakerDiarizationGetSampleRateWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const auto num_args = info.Length();
  if (num_args != 1) {
    std::ostringstream message;
    message << "Expect only 1 argument. Given: " << num_args;

    Napi::TypeError::New(env, message.str()).ThrowAsJavaScriptException();
    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "Argument 0 should be an offline speaker diarization pointer.")
        .ThrowAsJavaScriptException();
    return {};
  }

  using Handle = Napi::External<SherpaOnnxOfflineSpeakerDiarization>;
  const SherpaOnnxOfflineSpeakerDiarization *diarizer =
      info[0].As<Handle>().Data();

  const int32_t rate =
      SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(diarizer);
  return Napi::Number::New(env, rate);
}
|
||||||
|
|
||||||
|
// JS: offlineSpeakerDiarizationProcess(handle, samples)
//
// Arguments:
//   0: the external handle returned by createOfflineSpeakerDiarization()
//   1: a Float32Array holding the audio samples
//
// Returns a JS array with one object per segment, in the order produced by
// SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(). Each object has
// the properties `start`, `end` and `speaker`.
//
// On a bad argument a JS TypeError is raised and an empty value is returned.
static Napi::Array OfflineSpeakerDiarizationProcessWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 2) {
    std::ostringstream os;
    os << "Expect only 2 arguments. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "Argument 0 should be an offline speaker diarization pointer.")
        .ThrowAsJavaScriptException();

    return {};
  }

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();

  if (!info[1].IsTypedArray()) {
    Napi::TypeError::New(env, "Argument 1 should be a typed array")
        .ThrowAsJavaScriptException();

    return {};
  }

  // IsTypedArray() is also true for Uint8Array, Float64Array, etc. Casting
  // such an array to Float32Array would reinterpret its storage as floats,
  // so the element type has to be checked explicitly.
  if (info[1].As<Napi::TypedArray>().TypedArrayType() !=
      napi_float32_array) {
    Napi::TypeError::New(env, "Argument 1 should be a Float32Array")
        .ThrowAsJavaScriptException();

    return {};
  }

  Napi::Float32Array samples = info[1].As<Napi::Float32Array>();

  // ElementLength() is a size_t; make the conversion to the int32_t count
  // used by the rest of this file explicit.
  const int32_t num_samples = static_cast<int32_t>(samples.ElementLength());

  const SherpaOnnxOfflineSpeakerDiarizationResult *r =
      SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.Data(),
                                                 num_samples);

  int32_t num_segments =
      SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r);

  const SherpaOnnxOfflineSpeakerDiarizationSegment *segments =
      SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(r);

  Napi::Array ans = Napi::Array::New(env, num_segments);

  // Copy every native segment into a plain JS object so that the native
  // buffers can be released before returning.
  for (int32_t i = 0; i != num_segments; ++i) {
    Napi::Object obj = Napi::Object::New(env);
    obj.Set(Napi::String::New(env, "start"), segments[i].start);
    obj.Set(Napi::String::New(env, "end"), segments[i].end);
    obj.Set(Napi::String::New(env, "speaker"), segments[i].speaker);

    ans[i] = obj;
  }

  SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
  SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r);

  return ans;
}
|
||||||
|
|
||||||
|
// Registers the offline speaker diarization functions on `exports`:
//   createOfflineSpeakerDiarization
//   getOfflineSpeakerDiarizationSampleRate
//   offlineSpeakerDiarizationProcess
void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) {
  Napi::Function create =
      Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper);
  exports.Set("createOfflineSpeakerDiarization", create);

  Napi::Function get_sample_rate =
      Napi::Function::New(env, OfflineSpeakerDiarizationGetSampleRateWrapper);
  exports.Set("getOfflineSpeakerDiarizationSampleRate", get_sample_rate);

  Napi::Function process =
      Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper);
  exports.Set("offlineSpeakerDiarizationProcess", process);
}
|
||||||
@@ -25,6 +25,8 @@ void InitPunctuation(Napi::Env env, Napi::Object exports);
|
|||||||
|
|
||||||
void InitKeywordSpotting(Napi::Env env, Napi::Object exports);
|
void InitKeywordSpotting(Napi::Env env, Napi::Object exports);
|
||||||
|
|
||||||
|
void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports);
|
||||||
|
|
||||||
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
||||||
InitStreamingAsr(env, exports);
|
InitStreamingAsr(env, exports);
|
||||||
InitNonStreamingAsr(env, exports);
|
InitNonStreamingAsr(env, exports);
|
||||||
@@ -37,6 +39,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
|||||||
InitAudioTagging(env, exports);
|
InitAudioTagging(env, exports);
|
||||||
InitPunctuation(env, exports);
|
InitPunctuation(env, exports);
|
||||||
InitKeywordSpotting(env, exports);
|
InitKeywordSpotting(env, exports);
|
||||||
|
InitNonStreamingSpeakerDiarization(env, exports);
|
||||||
|
|
||||||
return exports;
|
return exports;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user