support nodejs (#438)
This commit is contained in:
2
nodejs-examples/.gitignore
vendored
Normal file
2
nodejs-examples/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
node_modules
|
||||
package-lock.json
|
||||
247
nodejs-examples/README.md
Normal file
247
nodejs-examples/README.md
Normal file
@@ -0,0 +1,247 @@
|
||||
# Introduction
|
||||
|
||||
This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|
||||
|
||||
Before you continue, please first install the npm package `sherpa-onnx` by
|
||||
|
||||
```bash
|
||||
npm install sherpa-onnx
|
||||
```
|
||||
|
||||
In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx)
|
||||
for text-to-speech and speech-to-text.
|
||||
|
||||
**Caution**: If you get the following error:
|
||||
```
|
||||
/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67
|
||||
if (match = err.match(/^(([^ \t()])+\.so([^ \t:()])*):([ \t])*/)) {
|
||||
^
|
||||
|
||||
TypeError: Cannot read properties of null (reading 'match')
|
||||
at new DynamicLibrary (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67:21)
|
||||
at Object.Library (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/library.js:47:10)
|
||||
at Object.<anonymous> (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/sherpa-onnx3/index.js:268:28)
|
||||
at Module._compile (node:internal/modules/cjs/loader:1376:14)
|
||||
at Module._extensions..js (node:internal/modules/cjs/loader:1435:10)
|
||||
at Module.load (node:internal/modules/cjs/loader:1207:32)
|
||||
at Module._load (node:internal/modules/cjs/loader:1023:12)
|
||||
at Module.require (node:internal/modules/cjs/loader:1235:19)
|
||||
at require (node:internal/modules/helpers:176:18)
|
||||
at Object.<anonymous> (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/test-offline-tts-zh.js:3:21)
|
||||
```
|
||||
|
||||
Please downgrade your node to version v13.14.0. See also
|
||||
https://github.com/node-ffi-napi/node-ffi-napi/issues/244
|
||||
and
|
||||
https://github.com/node-ffi-napi/node-ffi-napi/issues/97 .
|
||||
|
||||
# Text-to-speech
|
||||
|
||||
In the following, we demonstrate how to run text-to-speech.
|
||||
|
||||
## ./test-offline-tts-en.js
|
||||
|
||||
[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use
|
||||
a VITS pretrained model
|
||||
[VCTK](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers)
|
||||
for text-to-speech.
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2
|
||||
tar xvf vits-vctk.tar.bz2
|
||||
node ./test-offline-tts-en.js
|
||||
```
|
||||
|
||||
## ./test-offline-tts-zh.js
|
||||
|
||||
[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use
|
||||
a VITS pretrained model
|
||||
[aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
|
||||
for text-to-speech.
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xvf vits-zh-aishell3.tar.bz2
|
||||
node ./test-offline-tts-zh.js
|
||||
```
|
||||
|
||||
# Speech-to-text
|
||||
|
||||
In the following, we demonstrate how to decode files and how to perform
|
||||
speech recognition with a microphone with `nodejs`. We need to install two additional
|
||||
npm packages:
|
||||
|
||||
|
||||
```bash
|
||||
npm install wav naudiodon2
|
||||
```
|
||||
|
||||
## ./test-offline-nemo-ctc.js
|
||||
|
||||
[./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
|
||||
how to decode a file with a NeMo CTC model. In the code we use
|
||||
[stt_en_conformer_ctc_small](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/nemo/english.html#stt-en-conformer-ctc-small).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2
|
||||
node ./test-offline-nemo-ctc.js
|
||||
```
|
||||
|
||||
## ./test-offline-paraformer.js
|
||||
|
||||
[./test-offline-paraformer.js](./test-offline-paraformer.js) demonstrates
|
||||
how to decode a file with a non-streaming Paraformer model. In the code we use
|
||||
[sherpa-onnx-paraformer-zh-2023-03-28](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
node ./test-offline-paraformer.js
|
||||
```
|
||||
|
||||
## ./test-offline-transducer.js
|
||||
|
||||
[./test-offline-transducer.js](./test-offline-transducer.js) demonstrates
|
||||
how to decode a file with a non-streaming transducer model. In the code we use
|
||||
[sherpa-onnx-zipformer-en-2023-06-26](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-zipformer-en-2023-06-26-english).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
|
||||
tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
|
||||
node ./test-offline-transducer.js
|
||||
```
|
||||
|
||||
## ./test-offline-whisper.js
|
||||
[./test-offline-whisper.js](./test-offline-whisper.js) demonstrates
|
||||
how to decode a file with a Whisper model. In the code we use
|
||||
[sherpa-onnx-whisper-tiny.en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
node ./test-offline-whisper.js
|
||||
```
|
||||
|
||||
## ./test-online-paraformer-microphone.js
|
||||
[./test-online-paraformer-microphone.js](./test-online-paraformer-microphone.js)
|
||||
demonstrates how to do real-time speech recognition from microphone
|
||||
with a streaming Paraformer model. In the code we use
|
||||
[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||
node ./test-online-paraformer-microphone.js
|
||||
```
|
||||
|
||||
## ./test-online-paraformer.js
|
||||
[./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
|
||||
how to decode a file using a streaming Paraformer model. In the code we use
|
||||
[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||
node ./test-online-paraformer.js
|
||||
```
|
||||
|
||||
## ./test-online-transducer-microphone.js
|
||||
[./test-online-transducer-microphone.js](./test-online-transducer-microphone.js)
|
||||
demonstrates how to do real-time speech recognition with microphone using a streaming transducer model. In the code
|
||||
we use [sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english).
|
||||
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
node ./test-online-transducer-microphone.js
|
||||
```
|
||||
|
||||
## ./test-online-transducer.js
|
||||
[./test-online-transducer.js](./test-online-transducer.js) demonstrates
|
||||
how to decode a file using a streaming transducer model. In the code
|
||||
we use [sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
node ./test-online-transducer.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone-offline-paraformer.js
|
||||
|
||||
[./test-vad-microphone-offline-paraformer.js](./test-vad-microphone-offline-paraformer.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
|
||||
with non-streaming Paraformer for speech recognition from microphone.
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
node ./test-vad-microphone-offline-paraformer.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone-offline-transducer.js
|
||||
|
||||
[./test-vad-microphone-offline-transducer.js](./test-vad-microphone-offline-transducer.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
|
||||
with a non-streaming transducer model for speech recognition from microphone.
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
|
||||
tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
|
||||
node ./test-vad-microphone-offline-transducer.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone-offline-whisper.js
|
||||
|
||||
[./test-vad-microphone-offline-whisper.js](./test-vad-microphone-offline-whisper.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
|
||||
with whisper for speech recognition from microphone.
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
node ./test-vad-microphone-offline-whisper.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone.js
|
||||
|
||||
[./test-vad-microphone.js](./test-vad-microphone.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad).
|
||||
|
||||
You can use the following commands to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
node ./test-vad-microphone.js
|
||||
```
|
||||
7
nodejs-examples/package.json
Normal file
7
nodejs-examples/package.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"naudiodon2": "^2.4.0",
|
||||
"sherpa-onnx": "^1.8.11",
|
||||
"wav": "^1.0.2"
|
||||
}
|
||||
}
|
||||
97
nodejs-examples/test-offline-nemo-ctc.js
Normal file
97
nodejs-examples/test-offline-nemo-ctc.js
Normal file
@@ -0,0 +1,97 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds a non-streaming (offline) recognizer that uses the NeMo CTC model
 * stored in ./sherpa-onnx-nemo-ctc-en-conformer-small.
 *
 * @returns an OfflineRecognizer constructed from the assembled configuration.
 */
function createRecognizer() {
  // Feature extraction settings: 16000 Hz input, 80-dimensional features.
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Each sub-config is fully populated before it is attached to its parent.
  const ctcModel =
      Object.assign(new sherpa_onnx.OfflineNemoEncDecCtcModelConfig(), {
        model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
      });

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    nemoCtc: ctcModel,
    tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
    modelType: 'nemo_ctc',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
// Decode one wave file with the offline recognizer and print the text.
// The file must be 16-bit, single-channel PCM at the recognizer's sample rate.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const waveFilename =
    './sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav';

const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
// Float32Array chunks of normalized samples, collected until the file ends.
const buf = [];

// Validate the wave header as soon as it has been parsed.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  const expectedSampleRate = recognizer.config.featConfig.sampleRate;
  if (sampleRate !== expectedSampleRate) {
    throw new Error(`Only support sampleRate ${expectedSampleRate}. Given ${
        sampleRate}`);
  }

  if (audioFormat !== 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels !== 1) {
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth !== 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});

fs.createReadStream(waveFilename, {highWaterMark: 4096})
    .pipe(reader)
    .on('finish', () => {
      const sampleRate = recognizer.config.featConfig.sampleRate;

      // tail padding: half a second of zero samples
      buf.push(new Float32Array(sampleRate * 0.5));

      // Concatenate all chunks in a single pass instead of re-copying the
      // accumulated samples for every chunk.
      const totalLength = buf.reduce((sum, chunk) => sum + chunk.length, 0);
      const flattened = new Float32Array(totalLength);
      let offset = 0;
      for (const chunk of buf) {
        flattened.set(chunk, offset);
        offset += chunk.length;
      }

      stream.acceptWaveform(sampleRate, flattened);
      recognizer.decode(stream);
      const r = recognizer.getResult(stream);
      console.log(r.text);

      stream.free();
      recognizer.free();
    });

// Convert every chunk of 16-bit PCM samples to floats by dividing by 32768.
readable.on('readable', () => {
  let chunk;
  while ((chunk = readable.read()) != null) {
    const int16Samples = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    const floatSamples = new Float32Array(int16Samples.length);
    for (let i = 0; i < floatSamples.length; i++) {
      floatSamples[i] = int16Samples[i] / 32768.0;
    }

    buf.push(floatSamples);
  }
});
|
||||
95
nodejs-examples/test-offline-paraformer.js
Normal file
95
nodejs-examples/test-offline-paraformer.js
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds a non-streaming (offline) recognizer that uses the Paraformer model
 * stored in ./sherpa-onnx-paraformer-zh-2023-03-28.
 *
 * @returns an OfflineRecognizer constructed from the assembled configuration.
 */
function createRecognizer() {
  // Feature extraction settings: 16000 Hz input, 80-dimensional features.
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Each sub-config is fully populated before it is attached to its parent.
  const paraformerModel =
      Object.assign(new sherpa_onnx.OfflineParaformerModelConfig(), {
        model: './sherpa-onnx-paraformer-zh-2023-03-28/model.onnx',
      });

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    paraformer: paraformerModel,
    tokens: './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt',
    modelType: 'paraformer',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
// Decode one wave file with the offline recognizer and print the text.
// The file must be 16-bit, single-channel PCM at the recognizer's sample rate.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const waveFilename = './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav';

const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
// Float32Array chunks of normalized samples, collected until the file ends.
const buf = [];

// Validate the wave header as soon as it has been parsed.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  const expectedSampleRate = recognizer.config.featConfig.sampleRate;
  if (sampleRate !== expectedSampleRate) {
    throw new Error(`Only support sampleRate ${expectedSampleRate}. Given ${
        sampleRate}`);
  }

  if (audioFormat !== 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels !== 1) {
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth !== 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});

fs.createReadStream(waveFilename, {highWaterMark: 4096})
    .pipe(reader)
    .on('finish', () => {
      const sampleRate = recognizer.config.featConfig.sampleRate;

      // tail padding: half a second of zero samples
      buf.push(new Float32Array(sampleRate * 0.5));

      // Concatenate all chunks in a single pass instead of re-copying the
      // accumulated samples for every chunk.
      const totalLength = buf.reduce((sum, chunk) => sum + chunk.length, 0);
      const flattened = new Float32Array(totalLength);
      let offset = 0;
      for (const chunk of buf) {
        flattened.set(chunk, offset);
        offset += chunk.length;
      }

      stream.acceptWaveform(sampleRate, flattened);
      recognizer.decode(stream);
      const r = recognizer.getResult(stream);
      console.log(r.text);

      stream.free();
      recognizer.free();
    });

// Convert every chunk of 16-bit PCM samples to floats by dividing by 32768.
readable.on('readable', () => {
  let chunk;
  while ((chunk = readable.read()) != null) {
    const int16Samples = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    const floatSamples = new Float32Array(int16Samples.length);
    for (let i = 0; i < floatSamples.length; i++) {
      floatSamples[i] = int16Samples[i] / 32768.0;
    }

    buf.push(floatSamples);
  }
});
|
||||
100
nodejs-examples/test-offline-transducer.js
Normal file
100
nodejs-examples/test-offline-transducer.js
Normal file
@@ -0,0 +1,100 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds a non-streaming (offline) recognizer that uses the transducer model
 * (encoder, decoder, joiner) stored in ./sherpa-onnx-zipformer-en-2023-06-26.
 *
 * @returns an OfflineRecognizer constructed from the assembled configuration.
 */
function createRecognizer() {
  // Feature extraction settings: 16000 Hz input, 80-dimensional features.
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Each sub-config is fully populated before it is attached to its parent.
  const transducerModel =
      Object.assign(new sherpa_onnx.OfflineTransducerModelConfig(), {
        encoder:
            './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx',
        decoder:
            './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx',
        joiner:
            './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx',
      });

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    transducer: transducerModel,
    tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt',
    modelType: 'transducer',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
// Decode one wave file with the offline recognizer and print the text.
// The file must be 16-bit, single-channel PCM at the recognizer's sample rate.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav';

const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
// Float32Array chunks of normalized samples, collected until the file ends.
const buf = [];

// Validate the wave header as soon as it has been parsed.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  const expectedSampleRate = recognizer.config.featConfig.sampleRate;
  if (sampleRate !== expectedSampleRate) {
    throw new Error(`Only support sampleRate ${expectedSampleRate}. Given ${
        sampleRate}`);
  }

  if (audioFormat !== 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels !== 1) {
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth !== 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});

fs.createReadStream(waveFilename, {highWaterMark: 4096})
    .pipe(reader)
    .on('finish', () => {
      const sampleRate = recognizer.config.featConfig.sampleRate;

      // tail padding: half a second of zero samples
      buf.push(new Float32Array(sampleRate * 0.5));

      // Concatenate all chunks in a single pass instead of re-copying the
      // accumulated samples for every chunk.
      const totalLength = buf.reduce((sum, chunk) => sum + chunk.length, 0);
      const flattened = new Float32Array(totalLength);
      let offset = 0;
      for (const chunk of buf) {
        flattened.set(chunk, offset);
        offset += chunk.length;
      }

      stream.acceptWaveform(sampleRate, flattened);
      recognizer.decode(stream);
      const r = recognizer.getResult(stream);
      console.log(r.text);

      stream.free();
      recognizer.free();
    });

// Convert every chunk of 16-bit PCM samples to floats by dividing by 32768.
readable.on('readable', () => {
  let chunk;
  while ((chunk = readable.read()) != null) {
    const int16Samples = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    const floatSamples = new Float32Array(int16Samples.length);
    for (let i = 0; i < floatSamples.length; i++) {
      floatSamples[i] = int16Samples[i] / 32768.0;
    }

    buf.push(floatSamples);
  }
});
|
||||
27
nodejs-examples/test-offline-tts-en.js
Normal file
27
nodejs-examples/test-offline-tts-en.js
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds an offline text-to-speech engine that uses the VITS model files
 * stored in ./vits-vctk.
 *
 * @returns an OfflineTts constructed from the assembled configuration.
 */
function createOfflineTts() {
  // The VITS config is fully populated before it is attached to its parent.
  const vitsModel = Object.assign(new sherpa_onnx.OfflineTtsVitsModelConfig(), {
    model: './vits-vctk/vits-vctk.onnx',
    lexicon: './vits-vctk/lexicon.txt',
    tokens: './vits-vctk/tokens.txt',
  });

  const model = Object.assign(new sherpa_onnx.OfflineTtsModelConfig(), {
    vits: vitsModel,
  });

  const ttsConfig = Object.assign(new sherpa_onnx.OfflineTtsConfig(), {
    model: model,
  });

  return new sherpa_onnx.OfflineTts(ttsConfig);
}
|
||||
|
||||
// Synthesize one English sentence with speaker ID 99 and speed 1.0, write the
// result to ./test-en.wav, then release the engine.
const synthesizer = createOfflineTts();
const sid = 99;
const rate = 1.0;
const sentence = 'Good morning. How are you doing?';

const generated = synthesizer.generate(sentence, sid, rate);
generated.save('./test-en.wav');
console.log('Saved to test-en.wav successfully.');

synthesizer.free();
|
||||
27
nodejs-examples/test-offline-tts-zh.js
Normal file
27
nodejs-examples/test-offline-tts-zh.js
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds an offline text-to-speech engine that uses the VITS model files and
 * the rule FST stored in ./vits-zh-aishell3.
 *
 * @returns an OfflineTts constructed from the assembled configuration.
 */
function createOfflineTts() {
  // The VITS config is fully populated before it is attached to its parent.
  const vitsModel = Object.assign(new sherpa_onnx.OfflineTtsVitsModelConfig(), {
    model: './vits-zh-aishell3/vits-aishell3.onnx',
    lexicon: './vits-zh-aishell3/lexicon.txt',
    tokens: './vits-zh-aishell3/tokens.txt',
  });

  const model = Object.assign(new sherpa_onnx.OfflineTtsModelConfig(), {
    vits: vitsModel,
  });

  const ttsConfig = Object.assign(new sherpa_onnx.OfflineTtsConfig(), {
    model: model,
    ruleFsts: './vits-zh-aishell3/rule.fst',
  });

  return new sherpa_onnx.OfflineTts(ttsConfig);
}
|
||||
|
||||
// Synthesize one Chinese sentence with speaker ID 66 and speed 1.0, write the
// result to ./test-zh.wav, then release the engine.
const synthesizer = createOfflineTts();
const sid = 66;
const rate = 1.0;
const sentence = '3年前中国总人口是1411778724人';

const generated = synthesizer.generate(sentence, sid, rate);
generated.save('./test-zh.wav');
console.log('Saved to test-zh.wav successfully.');

synthesizer.free();
|
||||
97
nodejs-examples/test-offline-whisper.js
Normal file
97
nodejs-examples/test-offline-whisper.js
Normal file
@@ -0,0 +1,97 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds a non-streaming (offline) recognizer that uses the Whisper
 * encoder/decoder stored in ./sherpa-onnx-whisper-tiny.en.
 *
 * @returns an OfflineRecognizer constructed from the assembled configuration.
 */
function createRecognizer() {
  // Feature extraction settings: 16000 Hz input, 80-dimensional features.
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Each sub-config is fully populated before it is attached to its parent.
  const whisperModel =
      Object.assign(new sherpa_onnx.OfflineWhisperModelConfig(), {
        encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
        decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
      });

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    whisper: whisperModel,
    tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
    modelType: 'whisper',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
// Decode one wave file with the offline recognizer and print the text.
// The file must be 16-bit, single-channel PCM at the recognizer's sample rate.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav';

const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
// Float32Array chunks of normalized samples, collected until the file ends.
const buf = [];

// Validate the wave header as soon as it has been parsed.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  const expectedSampleRate = recognizer.config.featConfig.sampleRate;
  if (sampleRate !== expectedSampleRate) {
    throw new Error(`Only support sampleRate ${expectedSampleRate}. Given ${
        sampleRate}`);
  }

  if (audioFormat !== 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels !== 1) {
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth !== 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});

fs.createReadStream(waveFilename, {highWaterMark: 4096})
    .pipe(reader)
    .on('finish', () => {
      const sampleRate = recognizer.config.featConfig.sampleRate;

      // tail padding: half a second of zero samples
      buf.push(new Float32Array(sampleRate * 0.5));

      // Concatenate all chunks in a single pass instead of re-copying the
      // accumulated samples for every chunk.
      const totalLength = buf.reduce((sum, chunk) => sum + chunk.length, 0);
      const flattened = new Float32Array(totalLength);
      let offset = 0;
      for (const chunk of buf) {
        flattened.set(chunk, offset);
        offset += chunk.length;
      }

      stream.acceptWaveform(sampleRate, flattened);
      recognizer.decode(stream);
      const r = recognizer.getResult(stream);
      console.log(r.text);

      stream.free();
      recognizer.free();
    });

// Convert every chunk of 16-bit PCM samples to floats by dividing by 32768.
readable.on('readable', () => {
  let chunk;
  while ((chunk = readable.read()) != null) {
    const int16Samples = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    const floatSamples = new Float32Array(int16Samples.length);
    for (let i = 0; i < floatSamples.length; i++) {
      floatSamples[i] = int16Samples[i] / 32768.0;
    }

    buf.push(floatSamples);
  }
});
|
||||
86
nodejs-examples/test-online-paraformer-microphone.js
Normal file
86
nodejs-examples/test-online-paraformer-microphone.js
Normal file
@@ -0,0 +1,86 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds a streaming (online) recognizer that uses the Paraformer
 * encoder/decoder stored in
 * ./sherpa-onnx-streaming-paraformer-bilingual-zh-en, with endpoint detection
 * enabled.
 *
 * @returns an OnlineRecognizer constructed from the assembled configuration.
 */
function createRecognizer() {
  // Feature extraction settings: 16000 Hz input, 80-dimensional features.
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Each sub-config is fully populated before it is attached to its parent.
  const paraformerModel =
      Object.assign(new sherpa_onnx.OnlineParaformerModelConfig(), {
        encoder:
            './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
        decoder:
            './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
      });

  const model = Object.assign(new sherpa_onnx.OnlineModelConfig(), {
    paraformer: paraformerModel,
    tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
    modelType: 'paraformer',
  });

  const config = Object.assign(new sherpa_onnx.OnlineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
    enableEndpoint: 1,
  });

  return new sherpa_onnx.OnlineRecognizer(config);
}
|
||||
// Real-time recognition from the default microphone: every captured chunk is
// fed to the streaming recognizer and the current text is shown on screen.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const display = new sherpa_onnx.Display(50);

let lastText = '';
let segmentIndex = 0;

/**
 * Views the bytes of a Node.js Buffer as 32-bit float samples.
 *
 * Only the region that belongs to `data` is exposed: a Buffer may be a window
 * into a larger ArrayBuffer, so `data.buffer` alone is not reliable. If the
 * window is not 4-byte aligned, the bytes are copied first because a
 * Float32Array view requires an aligned offset.
 *
 * @param {Buffer} data raw bytes holding float32 samples
 * @returns {Float32Array} the samples contained in `data`
 */
function toFloat32Samples(data) {
  const bytesPerSample = Float32Array.BYTES_PER_ELEMENT;
  const count = Math.floor(data.length / bytesPerSample);
  if (data.byteOffset % bytesPerSample === 0) {
    return new Float32Array(data.buffer, data.byteOffset, count);
  }
  const aligned = new Uint8Array(count * bytesPerSample);
  aligned.set(data.subarray(0, aligned.length));
  return new Float32Array(aligned.buffer);
}

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: recognizer.config.featConfig.sampleRate
  }
});

ai.on('data', (data) => {
  const samples = toFloat32Samples(data);

  stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);

  // Decode for as long as the recognizer reports the stream as ready.
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const isEndpoint = recognizer.isEndpoint(stream);
  const text = recognizer.getResult(stream).text;

  // Redraw only when the text of the current segment has changed.
  if (text.length > 0 && lastText !== text) {
    lastText = text;
    display.print(segmentIndex, lastText);
  }

  if (isEndpoint) {
    // A non-empty segment is finished; the next text goes to a new segment.
    if (text.length > 0) {
      lastText = text;
      segmentIndex += 1;
    }
    recognizer.reset(stream);
  }
});

ai.on('close', () => {
  console.log('Free resources');
  stream.free();
  recognizer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
99
nodejs-examples/test-online-paraformer.js
Normal file
99
nodejs-examples/test-online-paraformer.js
Normal file
@@ -0,0 +1,99 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Assemble the configuration for the streaming paraformer model and build an
 * online recognizer from it. Model paths are relative to the working
 * directory.
 *
 * @returns {sherpa_onnx.OnlineRecognizer} recognizer using greedy search
 */
function createRecognizer() {
  // Feature extraction settings.
  const features = new sherpa_onnx.FeatureConfig();
  features.sampleRate = 16000;
  features.featureDim = 80;

  // Encoder/decoder files of the paraformer model.
  const paraformerConfig = new sherpa_onnx.OnlineParaformerModelConfig();
  paraformerConfig.encoder =
      './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx';
  paraformerConfig.decoder =
      './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx';

  // Each sub-config is attached only after it has been fully populated.
  const model = new sherpa_onnx.OnlineModelConfig();
  model.paraformer = paraformerConfig;
  model.tokens =
      './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt';
  model.modelType = 'paraformer';

  const config = new sherpa_onnx.OnlineRecognizerConfig();
  config.featConfig = features;
  config.modelConfig = model;
  config.decodingMethod = 'greedy_search';

  return new sherpa_onnx.OnlineRecognizer(config);
}
|
||||
// Recognizer and the stream that the whole wave file is decoded through.
// Declared with `const` so that they are no longer implicit globals.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const waveFilename =
    './sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav';

// `reader` parses the WAV container; `readable` wraps it so the audio data
// can be pulled with read().
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
|
||||
|
||||
/**
 * Feed one chunk of audio into the shared stream, decode for as long as the
 * recognizer reports the stream ready, then log the current result text.
 *
 * @param {Float32Array} samples audio samples to append to the stream
 */
function decode(samples) {
  const expectedRate = recognizer.config.featConfig.sampleRate;
  stream.acceptWaveform(expectedRate, samples);

  for (; recognizer.isReady(stream);) {
    recognizer.decode(stream);
  }

  const result = recognizer.getResult(stream);
  console.log(result.text);
}
|
||||
|
||||
// Validate the WAV header when the reader emits it. Only mono, 16-bit PCM at
// the recognizer's sample rate is accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  const expectedSampleRate = recognizer.config.featConfig.sampleRate;
  if (sampleRate !== expectedSampleRate) {
    throw new Error(
        `Only support sampleRate ${expectedSampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat !== 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels !== 1) {
    // The original interpolated the undefined name `channel`, which raised a
    // ReferenceError instead of this message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth !== 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
|
||||
|
||||
// Pipe the wave file into the WAV reader. When the reader finishes, decode
// half a second of zero samples as tail padding and release native resources.
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
    .pipe(reader)
    .on('finish', () => {
      // tail padding
      const numPaddingSamples = recognizer.config.featConfig.sampleRate * 0.5;
      const padding = new Float32Array(numPaddingSamples);
      decode(padding);

      stream.free();
      recognizer.free();
    });
|
||||
|
||||
// Drain every available chunk, convert its 16-bit samples to floats and
// decode them.
readable.on('readable', () => {
  for (let chunk = readable.read(); chunk != null; chunk = readable.read()) {
    // View the chunk's bytes as 16-bit integers without copying.
    const pcm = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    // Divide by 32768 to scale each sample into the float range.
    const floatSamples = Float32Array.from(pcm, (sample) => sample / 32768.0);

    decode(floatSamples);
  }
});
|
||||
88
nodejs-examples/test-online-transducer-microphone.js
Normal file
88
nodejs-examples/test-online-transducer-microphone.js
Normal file
@@ -0,0 +1,88 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const portAudio = require('naudiodon2');
|
||||
// console.log(portAudio.getDevices());
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Assemble the configuration for the streaming zipformer transducer model
 * and build an online recognizer with endpoint detection enabled. Model paths
 * are relative to the working directory.
 *
 * @returns {sherpa_onnx.OnlineRecognizer} recognizer using greedy search
 */
function createRecognizer() {
  // Feature extraction settings.
  const features = new sherpa_onnx.FeatureConfig();
  features.sampleRate = 16000;
  features.featureDim = 80;

  // Encoder/decoder/joiner files of the transducer model.
  const transducerConfig = new sherpa_onnx.OnlineTransducerModelConfig();
  transducerConfig.encoder =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
  transducerConfig.decoder =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
  transducerConfig.joiner =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';

  // Each sub-config is attached only after it has been fully populated.
  const model = new sherpa_onnx.OnlineModelConfig();
  model.transducer = transducerConfig;
  model.tokens =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';
  model.modelType = 'zipformer';

  const config = new sherpa_onnx.OnlineRecognizerConfig();
  config.featConfig = features;
  config.modelConfig = model;
  config.decodingMethod = 'greedy_search';
  config.enableEndpoint = 1;

  return new sherpa_onnx.OnlineRecognizer(config);
}
|
||||
// Create the streaming recognizer, the single stream that receives all
// microphone audio, and the display helper. These are declared with `const`
// so that they are no longer implicit globals.
const recognizer = createRecognizer();
const stream = recognizer.createStream();
const display = new sherpa_onnx.Display(50);

// Text most recently handed to the display; used to skip redundant prints.
let lastText = '';
// Incremented each time an endpoint closes a non-empty segment.
let segmentIndex = 0;

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: recognizer.config.featConfig.sampleRate
  }
});

// For every captured chunk: feed it to the recognizer, decode while the
// stream is ready, print the text when it changed, and reset the stream once
// an endpoint is reported.
ai.on('data', data => {
  // NOTE(review): this views the whole backing ArrayBuffer; it assumes `data`
  // starts at offset 0 and fills its buffer - confirm for the audio backend.
  const samples = new Float32Array(data.buffer);

  stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);

  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const isEndpoint = recognizer.isEndpoint(stream);
  const text = recognizer.getResult(stream).text;

  if (text.length > 0 && lastText !== text) {
    lastText = text;
    display.print(segmentIndex, lastText);
  }

  if (isEndpoint) {
    if (text.length > 0) {
      lastText = text;
      segmentIndex += 1;
    }
    recognizer.reset(stream);
  }
});

// Release the stream and the recognizer when the audio input closes.
ai.on('close', () => {
  console.log('Free resources');
  stream.free();
  recognizer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
102
nodejs-examples/test-online-transducer.js
Normal file
102
nodejs-examples/test-online-transducer.js
Normal file
@@ -0,0 +1,102 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Assemble the configuration for the streaming zipformer transducer model
 * and build an online recognizer from it. Model paths are relative to the
 * working directory.
 *
 * @returns {sherpa_onnx.OnlineRecognizer} recognizer using greedy search
 */
function createRecognizer() {
  const featConfig = new sherpa_onnx.FeatureConfig();
  featConfig.sampleRate = 16000;
  featConfig.featureDim = 80;

  // Encoder/decoder/joiner files of the transducer model.
  const transducer = new sherpa_onnx.OnlineTransducerModelConfig();
  transducer.encoder =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
  transducer.decoder =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
  transducer.joiner =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';
  const tokens =
      './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';

  const modelConfig = new sherpa_onnx.OnlineModelConfig();
  modelConfig.transducer = transducer;
  modelConfig.tokens = tokens;
  modelConfig.modelType = 'zipformer';

  const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
  recognizerConfig.featConfig = featConfig;
  recognizerConfig.modelConfig = modelConfig;
  recognizerConfig.decodingMethod = 'greedy_search';

  // The original assigned to an undeclared `recognizer` here, which created
  // an implicit global; return the new instance directly instead.
  return new sherpa_onnx.OnlineRecognizer(recognizerConfig);
}

// Recognizer and the stream that the whole wave file is decoded through.
// Declared with `const` so that they are no longer implicit globals.
const recognizer = createRecognizer();
const stream = recognizer.createStream();

const waveFilename =
    './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav';

// `reader` parses the WAV container; `readable` wraps it so the audio data
// can be pulled with read().
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
|
||||
|
||||
/**
 * Feed one chunk of audio into the shared stream, decode for as long as the
 * recognizer reports the stream ready, then log the current result text.
 *
 * @param {Float32Array} samples audio samples to append to the stream
 */
function decode(samples) {
  const expectedRate = recognizer.config.featConfig.sampleRate;
  stream.acceptWaveform(expectedRate, samples);

  for (; recognizer.isReady(stream);) {
    recognizer.decode(stream);
  }

  const result = recognizer.getResult(stream);
  console.log(result.text);
}
|
||||
|
||||
// Validate the WAV header when the reader emits it. Only mono, 16-bit PCM at
// the recognizer's sample rate is accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  const expectedSampleRate = recognizer.config.featConfig.sampleRate;
  if (sampleRate !== expectedSampleRate) {
    throw new Error(
        `Only support sampleRate ${expectedSampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat !== 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels !== 1) {
    // The original interpolated the undefined name `channel`, which raised a
    // ReferenceError instead of this message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth !== 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
|
||||
|
||||
// Pipe the wave file into the WAV reader. When the reader finishes, decode
// half a second of zero samples as tail padding and release native resources.
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
    .pipe(reader)
    .on('finish', () => {
      // tail padding
      const numPaddingSamples = recognizer.config.featConfig.sampleRate * 0.5;
      const padding = new Float32Array(numPaddingSamples);
      decode(padding);

      stream.free();
      recognizer.free();
    });
|
||||
|
||||
// Drain every available chunk, convert its 16-bit samples to floats and
// decode them.
readable.on('readable', () => {
  for (let chunk = readable.read(); chunk != null; chunk = readable.read()) {
    // View the chunk's bytes as 16-bit integers without copying.
    const pcm = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    // Divide by 32768 to scale each sample into the float range.
    const floatSamples = Float32Array.from(pcm, (sample) => sample / 32768.0);

    decode(floatSamples);
  }
});
|
||||
101
nodejs-examples/test-vad-microphone-offline-paraformer.js
Normal file
101
nodejs-examples/test-vad-microphone-offline-paraformer.js
Normal file
@@ -0,0 +1,101 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
// Load the published `sherpa-onnx` package, the name used by the install
// instructions and by the other example scripts (was 'sherpa-onnx3').
const sherpa_onnx = require('sherpa-onnx');
const portAudio = require('naudiodon2');
// Print whatever portAudio.getDevices() reports.
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Assemble the configuration for the non-streaming paraformer model and
 * build an offline recognizer from it. Model paths are relative to the
 * working directory.
 *
 * @returns {sherpa_onnx.OfflineRecognizer} recognizer using greedy search
 */
function createOfflineRecognizer() {
  // Feature extraction settings.
  const features = new sherpa_onnx.FeatureConfig();
  features.sampleRate = 16000;
  features.featureDim = 80;

  // Model file of the offline paraformer.
  const paraformerConfig = new sherpa_onnx.OfflineParaformerModelConfig();
  paraformerConfig.model =
      './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx';

  // Each sub-config is attached only after it has been fully populated.
  const model = new sherpa_onnx.OfflineModelConfig();
  model.paraformer = paraformerConfig;
  model.tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt';
  model.modelType = 'paraformer';

  const config = new sherpa_onnx.OfflineRecognizerConfig();
  config.featConfig = features;
  config.modelConfig = model;
  config.decodingMethod = 'greedy_search';

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
/**
 * Build a voice activity detector that uses the Silero VAD model file in the
 * working directory.
 *
 * @returns {sherpa_onnx.VoiceActivityDetector} detector constructed with a
 *     60 second buffer size
 */
function createVad() {
  const silero = new sherpa_onnx.SileroVadModelConfig();
  silero.model = './silero_vad.onnx';
  silero.minSpeechDuration = 0.3;   // seconds
  silero.minSilenceDuration = 0.3;  // seconds
  silero.windowSize = 512;

  // The Silero sub-config is attached only after it is fully populated.
  const config = new sherpa_onnx.VadModelConfig();
  config.sileroVad = silero;
  config.sampleRate = 16000;

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(config, bufferSizeInSeconds);
}
|
||||
|
||||
const recognizer = createOfflineRecognizer();
const vad = createVad();

// Circular buffer sized for 30 seconds of audio at the VAD sample rate; it
// holds microphone samples until a full VAD window is available.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

// `const` instead of `var`: the binding is never reassigned.
const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate,
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    closeOnError: true  // Close the stream if an audio error is detected, if
                        // set false then just log the error
  }
});

// Number of non-empty recognition results printed so far.
let index = 0;

// For every captured chunk: buffer it, feed the VAD one window at a time,
// then recognize every speech segment the VAD has queued.
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  // NOTE(review): this views the whole backing ArrayBuffer; it assumes `data`
  // starts at offset 0 and fills its buffer - confirm for the audio backend.
  buffer.push(new Float32Array(data.buffer));

  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    // Each segment gets its own stream, freed right after decoding.
    const stream = recognizer.createStream();
    stream.acceptWaveform(
        recognizer.config.featConfig.sampleRate, segment.samples);
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    stream.free();

    if (r.text.length > 0) {
      console.log(`${index}: ${r.text}`);
      index += 1;
    }
  }
});

// Release native resources when the audio input closes.
ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
106
nodejs-examples/test-vad-microphone-offline-transducer.js
Normal file
106
nodejs-examples/test-vad-microphone-offline-transducer.js
Normal file
@@ -0,0 +1,106 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Assemble the configuration for the non-streaming zipformer transducer
 * model and build an offline recognizer from it. Model paths are relative to
 * the working directory.
 *
 * @returns {sherpa_onnx.OfflineRecognizer} recognizer using greedy search
 */
function createOfflineRecognizer() {
  // Feature extraction settings.
  const features = new sherpa_onnx.FeatureConfig();
  features.sampleRate = 16000;
  features.featureDim = 80;

  // Encoder/decoder/joiner files of the offline transducer.
  const transducerConfig = new sherpa_onnx.OfflineTransducerModelConfig();
  transducerConfig.encoder =
      './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx';
  transducerConfig.decoder =
      './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx';
  transducerConfig.joiner =
      './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx';

  // Each sub-config is attached only after it has been fully populated.
  const model = new sherpa_onnx.OfflineModelConfig();
  model.transducer = transducerConfig;
  model.tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt';
  model.modelType = 'transducer';

  const config = new sherpa_onnx.OfflineRecognizerConfig();
  config.featConfig = features;
  config.modelConfig = model;
  config.decodingMethod = 'greedy_search';

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
/**
 * Build a voice activity detector that uses the Silero VAD model file in the
 * working directory.
 *
 * @returns {sherpa_onnx.VoiceActivityDetector} detector constructed with a
 *     60 second buffer size
 */
function createVad() {
  const silero = new sherpa_onnx.SileroVadModelConfig();
  silero.model = './silero_vad.onnx';
  silero.minSpeechDuration = 0.3;   // seconds
  silero.minSilenceDuration = 0.3;  // seconds
  silero.windowSize = 512;

  // The Silero sub-config is attached only after it is fully populated.
  const config = new sherpa_onnx.VadModelConfig();
  config.sileroVad = silero;
  config.sampleRate = 16000;

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(config, bufferSizeInSeconds);
}
|
||||
|
||||
const recognizer = createOfflineRecognizer();
const vad = createVad();

// Circular buffer sized for 30 seconds of audio at the VAD sample rate; it
// holds microphone samples until a full VAD window is available.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

// Number of non-empty recognition results printed so far.
let index = 0;

// For every captured chunk: buffer it, feed the VAD one window at a time,
// then recognize every speech segment the VAD has queued.
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  // NOTE(review): this views the whole backing ArrayBuffer; it assumes `data`
  // starts at offset 0 and fills its buffer - confirm for the audio backend.
  buffer.push(new Float32Array(data.buffer));

  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    // Each segment gets its own stream, freed right after decoding.
    const stream = recognizer.createStream();
    stream.acceptWaveform(
        recognizer.config.featConfig.sampleRate, segment.samples);
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    stream.free();

    if (r.text.length > 0) {
      console.log(`${index}: ${r.text}`);
      index += 1;
    }
  }
});

// Release native resources when the audio input closes.
ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
102
nodejs-examples/test-vad-microphone-offline-whisper.js
Normal file
102
nodejs-examples/test-vad-microphone-offline-whisper.js
Normal file
@@ -0,0 +1,102 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Assemble the configuration for the Whisper tiny.en model and build an
 * offline recognizer from it. Model paths are relative to the working
 * directory.
 *
 * @returns {sherpa_onnx.OfflineRecognizer} recognizer using greedy search
 */
function createOfflineRecognizer() {
  // Feature extraction settings.
  const features = new sherpa_onnx.FeatureConfig();
  features.sampleRate = 16000;
  features.featureDim = 80;

  // Encoder/decoder files of the Whisper model.
  const whisperConfig = new sherpa_onnx.OfflineWhisperModelConfig();
  whisperConfig.encoder =
      './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
  whisperConfig.decoder =
      './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';

  // Each sub-config is attached only after it has been fully populated.
  const model = new sherpa_onnx.OfflineModelConfig();
  model.whisper = whisperConfig;
  model.tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
  model.modelType = 'whisper';

  const config = new sherpa_onnx.OfflineRecognizerConfig();
  config.featConfig = features;
  config.modelConfig = model;
  config.decodingMethod = 'greedy_search';

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
/**
 * Build a voice activity detector that uses the Silero VAD model file in the
 * working directory.
 *
 * @returns {sherpa_onnx.VoiceActivityDetector} detector constructed with a
 *     60 second buffer size
 */
function createVad() {
  const silero = new sherpa_onnx.SileroVadModelConfig();
  silero.model = './silero_vad.onnx';
  silero.minSpeechDuration = 0.3;   // seconds
  silero.minSilenceDuration = 0.3;  // seconds
  silero.windowSize = 512;

  // The Silero sub-config is attached only after it is fully populated.
  const config = new sherpa_onnx.VadModelConfig();
  config.sileroVad = silero;
  config.sampleRate = 16000;

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(config, bufferSizeInSeconds);
}
|
||||
|
||||
const recognizer = createOfflineRecognizer();
const vad = createVad();

// Circular buffer sized for 30 seconds of audio at the VAD sample rate; it
// holds microphone samples until a full VAD window is available.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

// Number of non-empty recognition results printed so far.
let index = 0;

// For every captured chunk: buffer it, feed the VAD one window at a time,
// then recognize every speech segment the VAD has queued.
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  // NOTE(review): this views the whole backing ArrayBuffer; it assumes `data`
  // starts at offset 0 and fills its buffer - confirm for the audio backend.
  buffer.push(new Float32Array(data.buffer));

  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    // Each segment gets its own stream, freed right after decoding.
    const stream = recognizer.createStream();
    stream.acceptWaveform(
        recognizer.config.featConfig.sampleRate, segment.samples);
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    stream.free();

    if (r.text.length > 0) {
      console.log(`${index}: ${r.text}`);
      index += 1;
    }
  }
});

// Release native resources when the audio input closes.
ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
74
nodejs-examples/test-vad-microphone.js
Normal file
74
nodejs-examples/test-vad-microphone.js
Normal file
@@ -0,0 +1,74 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Build a voice activity detector that uses the Silero VAD model file in the
 * working directory.
 *
 * @returns {sherpa_onnx.VoiceActivityDetector} detector constructed with a
 *     60 second buffer size
 */
function createVad() {
  const silero = new sherpa_onnx.SileroVadModelConfig();
  silero.model = './silero_vad.onnx';
  silero.minSpeechDuration = 0.3;   // seconds
  silero.minSilenceDuration = 0.3;  // seconds
  silero.windowSize = 512;

  // The Silero sub-config is attached only after it is fully populated.
  const config = new sherpa_onnx.VadModelConfig();
  config.sileroVad = silero;
  config.sampleRate = 16000;

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(config, bufferSizeInSeconds);
}
|
||||
// Declared with `const` so that `vad` is no longer an implicit global.
const vad = createVad();

// Circular buffer sized for 30 seconds of audio at the VAD sample rate; it
// holds microphone samples until a full VAD window is available.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

// True once "Detected speech" has been logged for the current detection, so
// the message is printed only once until the VAD stops detecting.
let printed = false;
// Number of finished speech segments reported so far.
let index = 0;

// For every captured chunk: buffer it and feed the VAD one window at a time,
// logging when speech is detected and when a segment has ended.
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  // NOTE(review): this views the whole backing ArrayBuffer; it assumes `data`
  // starts at offset 0 and fills its buffer - confirm for the audio backend.
  buffer.push(new Float32Array(data.buffer));

  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);

    if (vad.isDetected() && !printed) {
      console.log(`${index}: Detected speech`);
      printed = true;
    }

    if (!vad.isDetected()) {
      printed = false;
    }

    // Report and discard every segment the VAD has queued.
    while (!vad.isEmpty()) {
      const segment = vad.front();
      vad.pop();
      const duration = segment.samples.length / vad.config.sampleRate;
      console.log(`${index} End of speech. Duration: ${duration} seconds`);
      index += 1;
    }
  }
});

// Release native resources when the audio input closes.
ai.on('close', () => {
  console.log('Free resources');
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
Reference in New Issue
Block a user