support nodejs (#438)

Fangjun Kuang
2023-11-21 23:20:08 +08:00
committed by GitHub
parent 38ad05bdf8
commit fe977b8e8e
34 changed files with 2704 additions and 4 deletions

nodejs-examples/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
node_modules
package-lock.json

nodejs-examples/README.md Normal file

@@ -0,0 +1,247 @@
# Introduction
This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
Before you continue, please first install the npm package `sherpa-onnx`:
```bash
npm install sherpa-onnx
```
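To verify that the installation succeeded, you can try loading the package in a
short script. Below is a minimal sketch; the file name `check-install.js` is just
an example, and the exact names printed depend on the installed version:
```js
// check-install.js: verify that the sherpa-onnx package can be loaded.
const sherpa_onnx = require('sherpa-onnx');
// Print the exported names, e.g., OfflineRecognizer, OnlineRecognizer, OfflineTts, ...
console.log(Object.keys(sherpa_onnx));
```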
In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx)
for text-to-speech and speech-to-text.
**Caution**: If you get the following error:
```
/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67
if (match = err.match(/^(([^ \t()])+\.so([^ \t:()])*):([ \t])*/)) {
^
TypeError: Cannot read properties of null (reading 'match')
at new DynamicLibrary (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67:21)
at Object.Library (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/library.js:47:10)
at Object.<anonymous> (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/sherpa-onnx3/index.js:268:28)
at Module._compile (node:internal/modules/cjs/loader:1376:14)
at Module._extensions..js (node:internal/modules/cjs/loader:1435:10)
at Module.load (node:internal/modules/cjs/loader:1207:32)
at Module._load (node:internal/modules/cjs/loader:1023:12)
at Module.require (node:internal/modules/cjs/loader:1235:19)
at require (node:internal/modules/helpers:176:18)
at Object.<anonymous> (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/test-offline-tts-zh.js:3:21)
```
Please downgrade Node.js to version v13.14.0. See also
https://github.com/node-ffi-napi/node-ffi-napi/issues/244
and
https://github.com/node-ffi-napi/node-ffi-napi/issues/97.
# Text-to-speech
In the following, we demonstrate how to run text-to-speech.
## ./test-offline-tts-en.js
[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use
a VITS pretrained model
[VCTK](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers)
for text-to-speech.
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2
tar xvf vits-vctk.tar.bz2
node ./test-offline-tts-en.js
```
## ./test-offline-tts-zh.js
[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use
a VITS pretrained model
[aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
for text-to-speech.
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2
node ./test-offline-tts-zh.js
```
# Speech-to-text
In the following, we demonstrate how to decode files and how to perform
real-time speech recognition from a microphone with `nodejs`. Two additional
npm packages are required:
```bash
npm install wav naudiodon2
```
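All of the file-decoding examples below share the same structure: read 16-bit PCM
audio with `wav.Reader`, convert each chunk into a `Float32Array` normalized to
the range `[-1, 1]`, and pass the samples to the recognizer. The following sketch
shows that conversion step as it appears in the examples (the helper name
`toFloat32` is ours; the examples inline this code):
```js
// Convert a chunk of interleaved 16-bit PCM samples to normalized floats.
function toFloat32(chunk) {
  const int16Samples = new Int16Array(
      chunk.buffer, chunk.byteOffset,
      chunk.length / Int16Array.BYTES_PER_ELEMENT);
  const floatSamples = new Float32Array(int16Samples.length);
  for (let i = 0; i < floatSamples.length; i++) {
    floatSamples[i] = int16Samples[i] / 32768.0;  // 2^15, the int16 full scale
  }
  return floatSamples;
}
```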
## ./test-offline-nemo-ctc.js
[./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
how to decode a file with a NeMo CTC model. In the code we use
[stt_en_conformer_ctc_small](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/nemo/english.html#stt-en-conformer-ctc-small).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2
tar xvf sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2
node ./test-offline-nemo-ctc.js
```
## ./test-offline-paraformer.js
[./test-offline-paraformer.js](./test-offline-paraformer.js) demonstrates
how to decode a file with a non-streaming Paraformer model. In the code we use
[sherpa-onnx-paraformer-zh-2023-03-28](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
node ./test-offline-paraformer.js
```
## ./test-offline-transducer.js
[./test-offline-transducer.js](./test-offline-transducer.js) demonstrates
how to decode a file with a non-streaming transducer model. In the code we use
[sherpa-onnx-zipformer-en-2023-06-26](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-zipformer-en-2023-06-26-english).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
node ./test-offline-transducer.js
```
## ./test-offline-whisper.js
[./test-offline-whisper.js](./test-offline-whisper.js) demonstrates
how to decode a file with a Whisper model. In the code we use
[sherpa-onnx-whisper-tiny.en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
node ./test-offline-whisper.js
```
## ./test-online-paraformer-microphone.js
[./test-online-paraformer-microphone.js](./test-online-paraformer-microphone.js)
demonstrates how to do real-time speech recognition from a microphone
with a streaming Paraformer model. In the code we use
[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
node ./test-online-paraformer-microphone.js
```
## ./test-online-paraformer.js
[./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
how to decode a file using a streaming Paraformer model. In the code we use
[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
node ./test-online-paraformer.js
```
## ./test-online-transducer-microphone.js
[./test-online-transducer-microphone.js](./test-online-transducer-microphone.js)
demonstrates how to do real-time speech recognition from a microphone using a streaming transducer model. In the code
we use [sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
node ./test-online-transducer-microphone.js
```
## ./test-online-transducer.js
[./test-online-transducer.js](./test-online-transducer.js) demonstrates
how to decode a file using a streaming transducer model. In the code
we use [sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english).
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
node ./test-online-transducer.js
```
## ./test-vad-microphone-offline-paraformer.js
[./test-vad-microphone-offline-paraformer.js](./test-vad-microphone-offline-paraformer.js)
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
with a non-streaming Paraformer model for speech recognition from a microphone.
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
node ./test-vad-microphone-offline-paraformer.js
```
## ./test-vad-microphone-offline-transducer.js
[./test-vad-microphone-offline-transducer.js](./test-vad-microphone-offline-transducer.js)
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
with a non-streaming transducer model for speech recognition from a microphone.
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
node ./test-vad-microphone-offline-transducer.js
```
## ./test-vad-microphone-offline-whisper.js
[./test-vad-microphone-offline-whisper.js](./test-vad-microphone-offline-whisper.js)
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
with a Whisper model for speech recognition from a microphone.
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
node ./test-vad-microphone-offline-whisper.js
```
## ./test-vad-microphone.js
[./test-vad-microphone.js](./test-vad-microphone.js)
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad) for voice activity detection with a microphone.
You can use the following command to run it:
```bash
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
node ./test-vad-microphone.js
```

nodejs-examples/package.json Normal file

@@ -0,0 +1,7 @@
{
"dependencies": {
"naudiodon2": "^2.4.0",
"sherpa-onnx": "^1.8.11",
"wav": "^1.0.2"
}
}

nodejs-examples/test-offline-nemo-ctc.js Normal file

@@ -0,0 +1,97 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const nemoCtc = new sherpa_onnx.OfflineNemoEncDecCtcModelConfig();
nemoCtc.model = './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx';
const tokens = './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.nemoCtc = nemoCtc;
modelConfig.tokens = tokens;
modelConfig.modelType = 'nemo_ctc';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav';
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
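// Float32 chunks are collected here while the file is being read; since this is
// a non-streaming (offline) model, everything is decoded in one shot on 'finish'.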
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel is supported. Given ${channels}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {highWaterMark: 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
console.log(r.text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});

nodejs-examples/test-offline-paraformer.js Normal file

@@ -0,0 +1,95 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const paraformer = new sherpa_onnx.OfflineParaformerModelConfig();
paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.onnx';
const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.paraformer = paraformer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'paraformer';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav';
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel is supported. Given ${channels}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
console.log(r.text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});

nodejs-examples/test-offline-transducer.js Normal file

@@ -0,0 +1,100 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const transducer = new sherpa_onnx.OfflineTransducerModelConfig();
transducer.encoder =
'./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx';
transducer.decoder =
'./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx';
transducer.joiner =
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx';
const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.transducer = transducer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'transducer';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav';
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel is supported. Given ${channels}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
console.log(r.text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});

nodejs-examples/test-offline-tts-en.js Normal file

@@ -0,0 +1,27 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
const sherpa_onnx = require('sherpa-onnx');
function createOfflineTts() {
const vits = new sherpa_onnx.OfflineTtsVitsModelConfig();
vits.model = './vits-vctk/vits-vctk.onnx';
vits.lexicon = './vits-vctk/lexicon.txt';
vits.tokens = './vits-vctk/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineTtsModelConfig();
modelConfig.vits = vits;
const config = new sherpa_onnx.OfflineTtsConfig();
config.model = modelConfig;
return new sherpa_onnx.OfflineTts(config);
}
const tts = createOfflineTts();
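// VCTK is a multi-speaker model with 109 speakers, so valid speaker IDs are 0-108.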
const speakerId = 99;
const speed = 1.0;
const audio =
tts.generate('Good morning. How are you doing?', speakerId, speed);
audio.save('./test-en.wav');
console.log('Saved to test-en.wav successfully.');
tts.free();

nodejs-examples/test-offline-tts-zh.js Normal file

@@ -0,0 +1,27 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
const sherpa_onnx = require('sherpa-onnx');
function createOfflineTts() {
const vits = new sherpa_onnx.OfflineTtsVitsModelConfig();
vits.model = './vits-zh-aishell3/vits-aishell3.onnx';
vits.lexicon = './vits-zh-aishell3/lexicon.txt';
vits.tokens = './vits-zh-aishell3/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineTtsModelConfig();
modelConfig.vits = vits;
const config = new sherpa_onnx.OfflineTtsConfig();
config.model = modelConfig;
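// rule.fst applies text-normalization rules, e.g., expanding the digits in the
// text below into their spoken Chinese form before synthesis.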
config.ruleFsts = './vits-zh-aishell3/rule.fst';
return new sherpa_onnx.OfflineTts(config);
}
const tts = createOfflineTts();
const speakerId = 66;
const speed = 1.0;
const audio = tts.generate('3年前中国总人口是1411778724人', speakerId, speed);
audio.save('./test-zh.wav');
console.log('Saved to test-zh.wav successfully.');
tts.free();

nodejs-examples/test-offline-whisper.js Normal file

@@ -0,0 +1,97 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const whisper = new sherpa_onnx.OfflineWhisperModelConfig();
whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.whisper = whisper;
modelConfig.tokens = tokens;
modelConfig.modelType = 'whisper';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav';
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel is supported. Given ${channels}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
console.log(r.text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});

nodejs-examples/test-online-paraformer-microphone.js Normal file

@@ -0,0 +1,86 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
console.log(portAudio.getDevices());
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
const paraformer = new sherpa_onnx.OnlineParaformerModelConfig();
paraformer.encoder =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx';
paraformer.decoder =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx';
const tokens =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt';
const modelConfig = new sherpa_onnx.OnlineModelConfig();
modelConfig.paraformer = paraformer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'paraformer';
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
recognizerConfig.enableEndpoint = 1;
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const display = new sherpa_onnx.Display(50);
let lastText = '';
let segmentIndex = 0;
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
closeOnError: true, // Close the stream if an audio error is detected;
// if set to false, the error is only logged
deviceId: -1, // Use -1 or omit the deviceId to select the default device
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: recognizer.config.featConfig.sampleRate
}
});
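// Each 'data' event delivers raw Float32 samples (see sampleFormat above).
// Decode whatever is ready; when an endpoint is detected, the current segment
// is finished and the recognizer is reset for the next one.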
ai.on('data', data => {
const samples = new Float32Array(data.buffer);
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
while (recognizer.isReady(stream)) {
recognizer.decode(stream);
}
const isEndpoint = recognizer.isEndpoint(stream);
const text = recognizer.getResult(stream).text;
if (text.length > 0 && lastText != text) {
lastText = text;
display.print(segmentIndex, lastText);
}
if (isEndpoint) {
if (text.length > 0) {
lastText = text;
segmentIndex += 1;
}
recognizer.reset(stream);
}
});
ai.on('close', () => {
console.log('Free resources');
stream.free();
recognizer.free();
});
ai.start();
console.log('Started! Please speak');

nodejs-examples/test-online-paraformer.js Normal file

@@ -0,0 +1,99 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
const paraformer = new sherpa_onnx.OnlineParaformerModelConfig();
paraformer.encoder =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx';
paraformer.decoder =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx';
const tokens =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt';
const modelConfig = new sherpa_onnx.OnlineModelConfig();
modelConfig.paraformer = paraformer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'paraformer';
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav';
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
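// Unlike the offline examples, a streaming model decodes incrementally:
// decode() is called for every chunk as soon as it has been read.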
function decode(samples) {
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
while (recognizer.isReady(stream)) {
recognizer.decode(stream);
}
const r = recognizer.getResult(stream);
console.log(r.text);
}
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel is supported. Given ${channels}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
decode(floatSamples);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
decode(floatSamples);
}
});

nodejs-examples/test-online-transducer-microphone.js Normal file

@@ -0,0 +1,88 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a streaming (online) recognizer
const transducer = new sherpa_onnx.OnlineTransducerModelConfig();
transducer.encoder =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
transducer.decoder =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
transducer.joiner =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';
const tokens =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';
const modelConfig = new sherpa_onnx.OnlineModelConfig();
modelConfig.transducer = transducer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'zipformer';
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
recognizerConfig.enableEndpoint = 1;
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const display = new sherpa_onnx.Display(50);
let lastText = '';
let segmentIndex = 0;
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
closeOnError: true, // Close the stream if an audio error is detected;
// if set to false, the error is only logged
deviceId: -1, // Use -1 or omit the deviceId to select the default device
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: recognizer.config.featConfig.sampleRate
}
});
ai.on('data', data => {
const samples = new Float32Array(data.buffer);
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
while (recognizer.isReady(stream)) {
recognizer.decode(stream);
}
const isEndpoint = recognizer.isEndpoint(stream);
const text = recognizer.getResult(stream).text;
if (text.length > 0 && lastText != text) {
lastText = text;
display.print(segmentIndex, lastText);
}
if (isEndpoint) {
if (text.length > 0) {
lastText = text;
segmentIndex += 1;
}
recognizer.reset(stream);
}
});
ai.on('close', () => {
console.log('Free resources');
stream.free();
recognizer.free();
});
ai.start();
console.log('Started! Please speak');

nodejs-examples/test-online-transducer.js Normal file

@@ -0,0 +1,102 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a streaming (online) recognizer
const transducer = new sherpa_onnx.OnlineTransducerModelConfig();
transducer.encoder =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
transducer.decoder =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
transducer.joiner =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';
const tokens =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';
const modelConfig = new sherpa_onnx.OnlineModelConfig();
modelConfig.transducer = transducer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'zipformer';
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
return recognizer;
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav';
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
function decode(samples) {
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
while (recognizer.isReady(stream)) {
recognizer.decode(stream);
}
const r = recognizer.getResult(stream);
console.log(r.text);
}
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel is supported. Given ${channels}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
decode(floatSamples);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
decode(floatSamples);
}
});

nodejs-examples/test-vad-microphone-offline-paraformer.js Normal file

@@ -0,0 +1,101 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const sherpa_onnx = require('sherpa-onnx');
const portAudio = require('naudiodon2');
console.log(portAudio.getDevices());
function createOfflineRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const paraformer = new sherpa_onnx.OfflineParaformerModelConfig();
paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx';
const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.paraformer = paraformer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'paraformer';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
function createVad() {
const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig();
sileroVadModelConfig.model = './silero_vad.onnx';
sileroVadModelConfig.minSpeechDuration = 0.3; // seconds
sileroVadModelConfig.minSilenceDuration = 0.3; // seconds
sileroVadModelConfig.windowSize = 512;
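// windowSize is measured in samples: 512 samples at 16 kHz is 32 ms per VAD step.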
const vadModelConfig = new sherpa_onnx.VadModelConfig();
vadModelConfig.sileroVad = sileroVadModelConfig;
vadModelConfig.sampleRate = 16000;
const bufferSizeInSeconds = 60;
const vad = new sherpa_onnx.VoiceActivityDetector(
vadModelConfig, bufferSizeInSeconds);
return vad;
}
const recognizer = createOfflineRecognizer();
const vad = createVad();
const bufferSizeInSeconds = 30;
const buffer =
new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
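// Microphone samples accumulate in the ring buffer and are fed to the VAD in
// windowSize-sized chunks; each completed speech segment is then decoded offline.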
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: vad.config.sampleRate,
deviceId: -1, // Use -1 or omit the deviceId to select the default device
closeOnError: true // Close the stream if an audio error is detected;
// if set to false, the error is only logged
}
});
let index = 0;
ai.on('data', data => {
const windowSize = vad.config.sileroVad.windowSize;
buffer.push(new Float32Array(data.buffer));
while (buffer.size() > windowSize) {
const samples = buffer.get(buffer.head(), windowSize);
buffer.pop(windowSize);
vad.acceptWaveform(samples);
}
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
const stream = recognizer.createStream();
stream.acceptWaveform(
recognizer.config.featConfig.sampleRate, segment.samples);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
stream.free();
if (r.text.length > 0) {
console.log(`${index}: ${r.text}`);
index += 1;
}
}
});
ai.on('close', () => {
console.log('Free resources');
recognizer.free();
vad.free();
buffer.free();
});
ai.start();
console.log('Started! Please speak');

nodejs-examples/test-vad-microphone-offline-transducer.js Normal file

@@ -0,0 +1,106 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const sherpa_onnx = require('sherpa-onnx');
const portAudio = require('naudiodon2');
console.log(portAudio.getDevices());
function createOfflineRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const transducer = new sherpa_onnx.OfflineTransducerModelConfig();
transducer.encoder =
'./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx';
transducer.decoder =
'./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx';
transducer.joiner =
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx';
const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.transducer = transducer;
modelConfig.tokens = tokens;
modelConfig.modelType = 'transducer';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
function createVad() {
const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig();
sileroVadModelConfig.model = './silero_vad.onnx';
sileroVadModelConfig.minSpeechDuration = 0.3; // seconds
sileroVadModelConfig.minSilenceDuration = 0.3; // seconds
sileroVadModelConfig.windowSize = 512;
const vadModelConfig = new sherpa_onnx.VadModelConfig();
vadModelConfig.sileroVad = sileroVadModelConfig;
vadModelConfig.sampleRate = 16000;
const bufferSizeInSeconds = 60;
const vad = new sherpa_onnx.VoiceActivityDetector(
vadModelConfig, bufferSizeInSeconds);
return vad;
}
const recognizer = createOfflineRecognizer();
const vad = createVad();
const bufferSizeInSeconds = 30;
const buffer =
new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
closeOnError: true, // Close the stream if an audio error is detected;
// if set to false, the error is only logged
deviceId: -1, // Use -1 or omit the deviceId to select the default device
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: vad.config.sampleRate
}
});
let index = 0;
ai.on('data', data => {
const windowSize = vad.config.sileroVad.windowSize;
buffer.push(new Float32Array(data.buffer));
while (buffer.size() > windowSize) {
const samples = buffer.get(buffer.head(), windowSize);
buffer.pop(windowSize);
vad.acceptWaveform(samples);
}
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
const stream = recognizer.createStream();
stream.acceptWaveform(
recognizer.config.featConfig.sampleRate, segment.samples);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
stream.free();
if (r.text.length > 0) {
console.log(`${index}: ${r.text}`);
index += 1;
}
}
});
ai.on('close', () => {
console.log('Free resources');
recognizer.free();
vad.free();
buffer.free();
});
ai.start();
console.log('Started! Please speak');

nodejs-examples/test-vad-microphone-offline-whisper.js Normal file

@@ -0,0 +1,102 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const sherpa_onnx = require('sherpa-onnx');
const portAudio = require('naudiodon2');
console.log(portAudio.getDevices());
function createOfflineRecognizer() {
const featConfig = new sherpa_onnx.FeatureConfig();
featConfig.sampleRate = 16000;
featConfig.featureDim = 80;
// create a non-streaming (offline) recognizer
const whisper = new sherpa_onnx.OfflineWhisperModelConfig();
whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
const modelConfig = new sherpa_onnx.OfflineModelConfig();
modelConfig.whisper = whisper;
modelConfig.tokens = tokens;
modelConfig.modelType = 'whisper';
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
recognizerConfig.featConfig = featConfig;
recognizerConfig.modelConfig = modelConfig;
recognizerConfig.decodingMethod = 'greedy_search';
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
return recognizer;
}
function createVad() {
const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig();
sileroVadModelConfig.model = './silero_vad.onnx';
sileroVadModelConfig.minSpeechDuration = 0.3; // seconds
sileroVadModelConfig.minSilenceDuration = 0.3; // seconds
sileroVadModelConfig.windowSize = 512;
const vadModelConfig = new sherpa_onnx.VadModelConfig();
vadModelConfig.sileroVad = sileroVadModelConfig;
vadModelConfig.sampleRate = 16000;
const bufferSizeInSeconds = 60;
const vad = new sherpa_onnx.VoiceActivityDetector(
vadModelConfig, bufferSizeInSeconds);
return vad;
}
const recognizer = createOfflineRecognizer();
const vad = createVad();
const bufferSizeInSeconds = 30;
const buffer =
new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
closeOnError: true, // Close the stream if an audio error is detected;
// if set to false, the error is only logged
deviceId: -1, // Use -1 or omit the deviceId to select the default device
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: vad.config.sampleRate
}
});
let index = 0;
ai.on('data', data => {
const windowSize = vad.config.sileroVad.windowSize;
buffer.push(new Float32Array(data.buffer));
while (buffer.size() > windowSize) {
const samples = buffer.get(buffer.head(), windowSize);
buffer.pop(windowSize);
vad.acceptWaveform(samples);
}
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
const stream = recognizer.createStream();
stream.acceptWaveform(
recognizer.config.featConfig.sampleRate, segment.samples);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
stream.free();
if (r.text.length > 0) {
console.log(`${index}: ${r.text}`);
index += 1;
}
}
});
ai.on('close', () => {
console.log('Free resources');
recognizer.free();
vad.free();
buffer.free();
});
ai.start();
console.log('Started! Please speak');

nodejs-examples/test-vad-microphone.js Normal file

@@ -0,0 +1,74 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
const sherpa_onnx = require('sherpa-onnx');
const portAudio = require('naudiodon2');
console.log(portAudio.getDevices());
function createVad() {
const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig();
sileroVadModelConfig.model = './silero_vad.onnx';
sileroVadModelConfig.minSpeechDuration = 0.3; // seconds
sileroVadModelConfig.minSilenceDuration = 0.3; // seconds
sileroVadModelConfig.windowSize = 512;
const vadModelConfig = new sherpa_onnx.VadModelConfig();
vadModelConfig.sileroVad = sileroVadModelConfig;
vadModelConfig.sampleRate = 16000;
const bufferSizeInSeconds = 60;
const vad = new sherpa_onnx.VoiceActivityDetector(
vadModelConfig, bufferSizeInSeconds);
return vad;
}
const vad = createVad();
const bufferSizeInSeconds = 30;
const buffer =
new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
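// There is no recognizer in this example: it only reports when speech starts
// and ends, together with the duration of each detected segment.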
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
closeOnError: true, // Close the stream if an audio error is detected;
// if set to false, the error is only logged
deviceId: -1, // Use -1 or omit the deviceId to select the default device
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: vad.config.sampleRate
}
});
let printed = false;
let index = 0;
ai.on('data', data => {
const windowSize = vad.config.sileroVad.windowSize;
buffer.push(new Float32Array(data.buffer));
while (buffer.size() > windowSize) {
const samples = buffer.get(buffer.head(), windowSize);
buffer.pop(windowSize);
vad.acceptWaveform(samples);
if (vad.isDetected() && !printed) {
console.log(`${index}: Detected speech`);
printed = true;
}
if (!vad.isDetected()) {
printed = false;
}
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
const duration = segment.samples.length / vad.config.sampleRate;
console.log(`${index} End of speech. Duration: ${duration} seconds`);
index += 1;
}
}
});
ai.on('close', () => {
console.log('Free resources');
vad.free();
buffer.free();
});
ai.start();
console.log('Started! Please speak');