From 939fdd942c9436b977c737a2866b827099d4356d Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 13 May 2024 20:26:11 +0800 Subject: [PATCH] Add spoken language identification for node-addon-api (#872) --- .github/scripts/node-addon/run.sh | 2 + .github/scripts/test-nodejs-addon-npm.sh | 14 ++ .github/workflows/npm-addon.yaml | 1 + nodejs-addon-examples/README.md | 18 ++ .../test_spoken_language_identification.js | 40 ++++ nodejs-addon-examples/test_vad_microphone.js | 2 +- ...oken_language_identification_microphone.js | 114 +++++++++ scripts/node-addon-api/CMakeLists.txt | 1 + .../node-addon-api/lib/non-streaming-asr.js | 1 + scripts/node-addon-api/lib/sherpa-onnx.js | 2 + .../lib/spoken-language-identification.js | 30 +++ .../src/sherpa-onnx-node-addon-api.cc | 3 + .../src/spoken-language-identification.cc | 218 ++++++++++++++++++ 13 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 nodejs-addon-examples/test_spoken_language_identification.js create mode 100644 nodejs-addon-examples/test_vad_spoken_language_identification_microphone.js create mode 100644 scripts/node-addon-api/lib/spoken-language-identification.js create mode 100644 scripts/node-addon-api/src/spoken-language-identification.cc diff --git a/.github/scripts/node-addon/run.sh b/.github/scripts/node-addon/run.sh index 5569dd60..0c9e32fe 100755 --- a/.github/scripts/node-addon/run.sh +++ b/.github/scripts/node-addon/run.sh @@ -18,6 +18,8 @@ fi SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" +# SHERPA_ONNX_VERSION=1.0.20 + if [ -z $owner ]; then owner=k2-fsa fi diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index 0d9a8faa..65c8b0e0 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -6,6 +6,20 @@ d=nodejs-addon-examples echo "dir: $d" cd $d +echo "----------spoken language 
identification----------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.tar.bz2 +rm sherpa-onnx-whisper-tiny.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2 +tar xvf spoken-language-identification-test-wavs.tar.bz2 +rm spoken-language-identification-test-wavs.tar.bz2 + +node ./test_spoken_language_identification.js +rm -rf sherpa-onnx-whisper-tiny +rm -rf spoken-language-identification-test-wavs + echo "----------streaming asr----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 diff --git a/.github/workflows/npm-addon.yaml b/.github/workflows/npm-addon.yaml index 45067180..712dace3 100644 --- a/.github/workflows/npm-addon.yaml +++ b/.github/workflows/npm-addon.yaml @@ -52,6 +52,7 @@ jobs: SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" + # SHERPA_ONNX_VERSION=1.0.20 src_dir=.github/scripts/node-addon sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index faf3bfdb..1fdb44c6 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -183,3 +183,21 @@ rm vits-icefall-zh-aishell3.tar.bz2 node ./test_tts_non_streaming_vits_zh_aishell3.js ``` + +## Spoken language identification with Whisper multi-lingual models + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.tar.bz2 +rm sherpa-onnx-whisper-tiny.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2 +tar xvf 
spoken-language-identification-test-wavs.tar.bz2 +rm spoken-language-identification-test-wavs.tar.bz2 + +node ./test_spoken_language_identification.js + +# To run VAD + spoken language identification using a microphone +npm install naudiodon2 +node ./test_vad_spoken_language_identification_microphone.js +``` diff --git a/nodejs-addon-examples/test_spoken_language_identification.js b/nodejs-addon-examples/test_spoken_language_identification.js new file mode 100644 index 00000000..f5ea546d --- /dev/null +++ b/nodejs-addon-examples/test_spoken_language_identification.js @@ -0,0 +1,40 @@ +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx-node'); + +function createSpokenLanguageID() { + const config = { + whisper: { + encoder: './sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx', + decoder: './sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx', + }, + debug: true, + numThreads: 1, + provider: 'cpu', + }; + return new sherpa_onnx.SpokenLanguageIdentification(config); +} + +const slid = createSpokenLanguageID(); + +const testWaves = [ + './spoken-language-identification-test-wavs/ar-arabic.wav', + './spoken-language-identification-test-wavs/de-german.wav', + './spoken-language-identification-test-wavs/en-english.wav', + './spoken-language-identification-test-wavs/fr-french.wav', + './spoken-language-identification-test-wavs/pt-portuguese.wav', + './spoken-language-identification-test-wavs/es-spanish.wav', + './spoken-language-identification-test-wavs/zh-chinese.wav', +]; + +const display = new Intl.DisplayNames(['en'], {type: 'language'}) + +for (let f of testWaves) { + const stream = slid.createStream(); + + const wave = sherpa_onnx.readWave(f); + stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + + const lang = slid.compute(stream); + console.log(f.split('/')[2], lang, display.of(lang)); +} diff --git a/nodejs-addon-examples/test_vad_microphone.js 
b/nodejs-addon-examples/test_vad_microphone.js index 196ec3cd..4b31515d 100644 --- a/nodejs-addon-examples/test_vad_microphone.js +++ b/nodejs-addon-examples/test_vad_microphone.js @@ -26,7 +26,7 @@ function createVad() { return new sherpa_onnx.Vad(config, bufferSizeInSeconds); } -vad = createVad(); +const vad = createVad(); const bufferSizeInSeconds = 30; const buffer = diff --git a/nodejs-addon-examples/test_vad_spoken_language_identification_microphone.js b/nodejs-addon-examples/test_vad_spoken_language_identification_microphone.js new file mode 100644 index 00000000..6a8d59e5 --- /dev/null +++ b/nodejs-addon-examples/test_vad_spoken_language_identification_microphone.js @@ -0,0 +1,114 @@ +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) + +const portAudio = require('naudiodon2'); +// console.log(portAudio.getDevices()); + +const sherpa_onnx = require('sherpa-onnx-node'); + +function createVad() { + // please download silero_vad.onnx from + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + const config = { + sileroVad: { + model: './silero_vad.onnx', + threshold: 0.5, + minSpeechDuration: 0.25, + minSilenceDuration: 0.5, + windowSize: 512, + }, + sampleRate: 16000, + debug: true, + numThreads: 1, + }; + + const bufferSizeInSeconds = 60; + + return new sherpa_onnx.Vad(config, bufferSizeInSeconds); +} + +// Please download test files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +function createSpokenLanguageID() { + const config = { + whisper: { + encoder: './sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx', + decoder: './sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx', + }, + debug: true, + numThreads: 1, + provider: 'cpu', + }; + return new sherpa_onnx.SpokenLanguageIdentification(config); +} + +const slid = createSpokenLanguageID(); +const vad = createVad(); + +const display = new Intl.DisplayNames(['en'], {type: 'language'}) + +const bufferSizeInSeconds = 30; +const buffer = 
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate,
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples)
+    if (vad.isDetected() && !printed) {
+      console.log(`${index}: Detected speech`)
+      printed = true;
+    }
+
+    if (!vad.isDetected()) {
+      printed = false;
+    }
+
+    while (!vad.isEmpty()) {
+      const segment = vad.front();
+      vad.pop();
+
+      const stream = slid.createStream();
+      stream.acceptWaveform(
+          {samples: segment.samples, sampleRate: vad.config.sampleRate});
+      const lang = slid.compute(stream);
+      const fullLang = display.of(lang);
+
+      const filename = `${index}-${fullLang}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate});
+      const duration = segment.samples.length / vad.config.sampleRate;
+      console.log(`${index} End of speech. Duration: ${
+          duration} seconds.\n  Detected language: ${fullLang}`);
+      console.log(`Saved to ${filename}`);
+      index += 1;
+    }
+  }
+});
+
+ai.on('close', () => {
+  console.log('Free resources');
+});
+
+ai.start();
+console.log('Started!
Please speak') diff --git a/scripts/node-addon-api/CMakeLists.txt b/scripts/node-addon-api/CMakeLists.txt index de86cf63..4ac4969e 100644 --- a/scripts/node-addon-api/CMakeLists.txt +++ b/scripts/node-addon-api/CMakeLists.txt @@ -21,6 +21,7 @@ set(srcs src/non-streaming-asr.cc src/non-streaming-tts.cc src/sherpa-onnx-node-addon-api.cc + src/spoken-language-identification.cc src/streaming-asr.cc src/vad.cc src/wave-reader.cc diff --git a/scripts/node-addon-api/lib/non-streaming-asr.js b/scripts/node-addon-api/lib/non-streaming-asr.js index 852343e6..c7721906 100644 --- a/scripts/node-addon-api/lib/non-streaming-asr.js +++ b/scripts/node-addon-api/lib/non-streaming-asr.js @@ -37,4 +37,5 @@ class OfflineRecognizer { module.exports = { OfflineRecognizer, + OfflineStream, } diff --git a/scripts/node-addon-api/lib/sherpa-onnx.js b/scripts/node-addon-api/lib/sherpa-onnx.js index fa7d3e8b..9580e014 100644 --- a/scripts/node-addon-api/lib/sherpa-onnx.js +++ b/scripts/node-addon-api/lib/sherpa-onnx.js @@ -3,6 +3,7 @@ const streaming_asr = require('./streaming-asr.js'); const non_streaming_asr = require('./non-streaming-asr.js'); const non_streaming_tts = require('./non-streaming-tts.js'); const vad = require('./vad.js'); +const slid = require('./spoken-language-identification.js'); module.exports = { OnlineRecognizer: streaming_asr.OnlineRecognizer, @@ -13,4 +14,5 @@ module.exports = { Display: streaming_asr.Display, Vad: vad.Vad, CircularBuffer: vad.CircularBuffer, + SpokenLanguageIdentification: slid.SpokenLanguageIdentification, } diff --git a/scripts/node-addon-api/lib/spoken-language-identification.js b/scripts/node-addon-api/lib/spoken-language-identification.js new file mode 100644 index 00000000..1fd93172 --- /dev/null +++ b/scripts/node-addon-api/lib/spoken-language-identification.js @@ -0,0 +1,30 @@ +const addon = require('./addon.js'); +const non_streaming_asr = require('./non-streaming-asr.js'); + +class SpokenLanguageIdentification { + constructor(config) { + 
this.handle = addon.createSpokenLanguageIdentification(config);
+    this.config = config;
+  }
+
+  createStream() {
+    return new non_streaming_asr.OfflineStream(
+        addon.createSpokenLanguageIdentificationOfflineStream(this.handle));
+  }
+
+  // return a string containing the language code (2 characters),
+  // e.g., en, de, fr, es, zh
+  // en -> English
+  // de -> German
+  // fr -> French
+  // es -> Spanish
+  // zh -> Chinese
+  compute(stream) {
+    return addon.spokenLanguageIdentificationCompute(
+        this.handle, stream.handle);
+  }
+}
+
+module.exports = {
+  SpokenLanguageIdentification,
+}
diff --git a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
index 90ad4d99..4ddd82f9 100644
--- a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
+++ b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
@@ -15,6 +15,8 @@ void InitWaveReader(Napi::Env env, Napi::Object exports);
 void InitWaveWriter(Napi::Env env, Napi::Object exports);
 
+void InitSpokenLanguageID(Napi::Env env, Napi::Object exports);
+
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitStreamingAsr(env, exports);
   InitNonStreamingAsr(env, exports);
@@ -22,6 +24,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitVad(env, exports);
   InitWaveReader(env, exports);
   InitWaveWriter(env, exports);
+  InitSpokenLanguageID(env, exports);
 
   return exports;
 }
diff --git a/scripts/node-addon-api/src/spoken-language-identification.cc b/scripts/node-addon-api/src/spoken-language-identification.cc
new file mode 100644
index 00000000..f256ace5
--- /dev/null
+++ b/scripts/node-addon-api/src/spoken-language-identification.cc
@@ -0,0 +1,218 @@
+// scripts/node-addon-api/src/spoken-language-identification.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+
+#include <sstream>
+
+#include "napi.h"  // NOLINT
+#include "sherpa-onnx/c-api/c-api.h"
+
+static SherpaOnnxSpokenLanguageIdentificationWhisperConfig
+GetSpokenLanguageIdentificationWhisperConfig(Napi::Object obj) {
+  SherpaOnnxSpokenLanguageIdentificationWhisperConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("whisper") || !obj.Get("whisper").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("whisper").As<Napi::Object>();
+
+  if (o.Has("encoder") && o.Get("encoder").IsString()) {
+    Napi::String encoder = o.Get("encoder").As<Napi::String>();
+    std::string s = encoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.encoder = p;
+  }
+
+  if (o.Has("decoder") && o.Get("decoder").IsString()) {
+    Napi::String decoder = o.Get("decoder").As<Napi::String>();
+    std::string s = decoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.decoder = p;
+  }
+
+  if (o.Has("tailPaddings") && o.Get("tailPaddings").IsNumber()) {
+    c.tail_paddings = o.Get("tailPaddings").As<Napi::Number>().Int32Value();
+  }
+
+  return c;
+}
+
+static Napi::External<SherpaOnnxSpokenLanguageIdentification>
+CreateSpokenLanguageIdentificationWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument.
Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsObject()) {
+    Napi::TypeError::New(env, "You should pass an object as the only argument.")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  Napi::Object o = info[0].As<Napi::Object>();
+
+  SherpaOnnxSpokenLanguageIdentificationConfig c;
+  memset(&c, 0, sizeof(c));
+  c.whisper = GetSpokenLanguageIdentificationWhisperConfig(o);
+
+  if (o.Has("numThreads") && o.Get("numThreads").IsNumber()) {
+    c.num_threads = o.Get("numThreads").As<Napi::Number>().Int32Value();
+  }
+
+  if (o.Has("debug") &&
+      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
+    if (o.Get("debug").IsBoolean()) {
+      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
+    } else {
+      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
+    }
+  }
+
+  if (o.Has("provider") && o.Get("provider").IsString()) {
+    Napi::String provider = o.Get("provider").As<Napi::String>();
+    std::string s = provider.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.provider = p;
+  }
+
+  const SherpaOnnxSpokenLanguageIdentification *slid =
+      SherpaOnnxCreateSpokenLanguageIdentification(&c);
+
+  if (c.whisper.encoder) {
+    delete[] c.whisper.encoder;
+  }
+
+  if (c.whisper.decoder) {
+    delete[] c.whisper.decoder;
+  }
+
+  if (c.provider) {
+    delete[] c.provider;
+  }
+
+  if (!slid) {
+    Napi::TypeError::New(env, "Please check your config!")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  return Napi::External<SherpaOnnxSpokenLanguageIdentification>::New(
+      env, const_cast<SherpaOnnxSpokenLanguageIdentification *>(slid),
+      [](Napi::Env env, SherpaOnnxSpokenLanguageIdentification *slid) {
+        SherpaOnnxDestroySpokenLanguageIdentification(slid);
+      });
+}
+
+static Napi::External<SherpaOnnxOfflineStream>
+SpokenLanguageIdentificationCreateOfflineStreamWrapper(
+    const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument.
Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(
+        env,
+        "You should pass an offline language ID pointer as the only argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  SherpaOnnxSpokenLanguageIdentification *slid =
+      info[0]
+          .As<Napi::External<SherpaOnnxSpokenLanguageIdentification>>()
+          .Data();
+
+  SherpaOnnxOfflineStream *stream =
+      SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid);
+
+  return Napi::External<SherpaOnnxOfflineStream>::New(
+      env, stream, [](Napi::Env env, SherpaOnnxOfflineStream *stream) {
+        DestroyOfflineStream(stream);
+      });
+}
+
+static Napi::String SpokenLanguageIdentificationComputeWrapper(
+    const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 2) {
+    std::ostringstream os;
+    os << "Expect only 2 arguments. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(
+        env, "Argument 0 should be an offline spoken language ID pointer.")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[1].IsExternal()) {
+    Napi::TypeError::New(env, "Argument 1 should be an offline stream pointer.")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  SherpaOnnxSpokenLanguageIdentification *slid =
+      info[0]
+          .As<Napi::External<SherpaOnnxSpokenLanguageIdentification>>()
+          .Data();
+
+  SherpaOnnxOfflineStream *stream =
+      info[1].As<Napi::External<SherpaOnnxOfflineStream>>().Data();
+
+  const SherpaOnnxSpokenLanguageIdentificationResult *r =
+      SherpaOnnxSpokenLanguageIdentificationCompute(slid, stream);
+
+  std::string lang = r->lang;
+  SherpaOnnxDestroySpokenLanguageIdentificationResult(r);
+
+  return Napi::String::New(env, lang);
+}
+
+void InitSpokenLanguageID(Napi::Env env, Napi::Object exports) {
+  exports.Set(
+      Napi::String::New(env, "createSpokenLanguageIdentification"),
+      Napi::Function::New(env, CreateSpokenLanguageIdentificationWrapper));
+
+  exports.Set(
+      Napi::String::New(env,
"createSpokenLanguageIdentificationOfflineStream"), + Napi::Function::New( + env, SpokenLanguageIdentificationCreateOfflineStreamWrapper)); + + exports.Set( + Napi::String::New(env, "spokenLanguageIdentificationCompute"), + Napi::Function::New(env, SpokenLanguageIdentificationComputeWrapper)); +}