Add WebAssembly for NodeJS. (#628)
This commit is contained in:
1
nodejs-examples/.gitignore
vendored
1
nodejs-examples/.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
node_modules
|
||||
lib
|
||||
package-lock.json
|
||||
*.tar.bz2
|
||||
|
||||
@@ -2,38 +2,18 @@
|
||||
|
||||
This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|
||||
|
||||
Before you continue, please first install the npm package `sherpa-onnx` by
|
||||
Before you continue, please first run
|
||||
|
||||
```bash
|
||||
npm install sherpa-onnx
|
||||
cd ./nodejs-examples
|
||||
|
||||
npm i
|
||||
```
|
||||
|
||||
In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx)
|
||||
for text-to-speech and speech-to-text.
|
||||
|
||||
**Caution**: If you get the following error:
|
||||
```
|
||||
/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67
|
||||
if (match = err.match(/^(([^ \t()])+\.so([^ \t:()])*):([ \t])*/)) {
|
||||
^
|
||||
|
||||
TypeError: Cannot read properties of null (reading 'match')
|
||||
at new DynamicLibrary (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67:21)
|
||||
at Object.Library (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/library.js:47:10)
|
||||
at Object.<anonymous> (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/sherpa-onnx3/index.js:268:28)
|
||||
at Module._compile (node:internal/modules/cjs/loader:1376:14)
|
||||
at Module._extensions..js (node:internal/modules/cjs/loader:1435:10)
|
||||
at Module.load (node:internal/modules/cjs/loader:1207:32)
|
||||
at Module._load (node:internal/modules/cjs/loader:1023:12)
|
||||
at Module.require (node:internal/modules/cjs/loader:1235:19)
|
||||
at require (node:internal/modules/helpers:176:18)
|
||||
at Object.<anonymous> (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/test-offline-tts-zh.js:3:21)
|
||||
```
|
||||
|
||||
Please downgrade your node to version v13.14.0. See also
|
||||
https://github.com/node-ffi-napi/node-ffi-napi/issues/244
|
||||
and
|
||||
https://github.com/node-ffi-napi/node-ffi-napi/issues/97 .
|
||||
Note: You need `Node >= 18`.
|
||||
|
||||
# Text-to-speech
|
||||
|
||||
@@ -71,13 +51,7 @@ node ./test-offline-tts-zh.js
|
||||
# Speech-to-text
|
||||
|
||||
In the following, we demonstrate how to decode files and how to perform
|
||||
speech recognition with a microphone with `nodejs`. We need to install two additional
|
||||
npm packages:
|
||||
|
||||
|
||||
```bash
|
||||
npm install wav naudiodon2
|
||||
```
|
||||
speech recognition with a microphone with `nodejs`.
|
||||
|
||||
## ./test-offline-nemo-ctc.js
|
||||
|
||||
@@ -200,60 +174,3 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
|
||||
node ./test-online-zipformer2-ctc.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone-offline-paraformer.js
|
||||
|
||||
[./test-vad-microphone-offline-paraformer.js](./test-vad-microphone-offline-paraformer.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
|
||||
with non-streaming Paraformer for speech recognition from microphone.
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
|
||||
node ./test-vad-microphone-offline-paraformer.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone-offline-transducer.js
|
||||
|
||||
[./test-vad-microphone-offline-transducer.js](./test-vad-microphone-offline-transducer.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
|
||||
with a non-streaming transducer model for speech recognition from microphone.
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
|
||||
tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
|
||||
node ./test-vad-microphone-offline-transducer.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone-offline-whisper.js
|
||||
|
||||
[./test-vad-microphone-offline-whisper.js](./test-vad-microphone-offline-whisper.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
|
||||
with whisper for speech recognition from microphone.
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
node ./test-vad-microphone-offline-whisper.js
|
||||
```
|
||||
|
||||
## ./test-vad-microphone.js
|
||||
|
||||
[./test-vad-microphone.js](./test-vad-microphone.js)
|
||||
demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad).
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
node ./test-vad-microphone.js
|
||||
```
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"naudiodon2": "^2.4.0",
|
||||
"sherpa-onnx": "^1.8.12",
|
||||
"sherpa-onnx": "*",
|
||||
"wav": "^1.0.2"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
@@ -6,32 +6,58 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOfflineRecognizer() {
|
||||
let featConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const nemoCtc = new sherpa_onnx.OfflineNemoEncDecCtcModelConfig();
|
||||
nemoCtc.model = './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx';
|
||||
const tokens = './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt';
|
||||
let modelConfig = {
|
||||
transducer: {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
joiner: '',
|
||||
},
|
||||
paraformer: {
|
||||
model: '',
|
||||
},
|
||||
nemoCtc: {
|
||||
model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
|
||||
},
|
||||
whisper: {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
},
|
||||
tdnn: {
|
||||
model: '',
|
||||
},
|
||||
tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
|
||||
numThreads: 1,
|
||||
debug: 0,
|
||||
provider: 'cpu',
|
||||
modelType: 'nemo_ctc',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OfflineModelConfig();
|
||||
modelConfig.nemoCtc = nemoCtc;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'nemo_ctc';
|
||||
let lmConfig = {
|
||||
model: '',
|
||||
scale: 1.0,
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let config = {
|
||||
featConfig: featConfig,
|
||||
modelConfig: modelConfig,
|
||||
lmConfig: lmConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
return sherpa_onnx.createOfflineRecognizer(config);
|
||||
}
|
||||
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
const recognizer = createOfflineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename =
|
||||
'./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav';
|
||||
@@ -72,8 +98,8 @@ fs.createReadStream(waveFilename, {highWaterMark: 4096})
|
||||
|
||||
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
|
||||
recognizer.decode(stream);
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
|
||||
stream.free();
|
||||
recognizer.free();
|
||||
|
||||
@@ -6,32 +6,59 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOfflineRecognizer() {
|
||||
let featConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const paraformer = new sherpa_onnx.OfflineParaformerModelConfig();
|
||||
paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.onnx';
|
||||
const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt';
|
||||
let modelConfig = {
|
||||
transducer: {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
joiner: '',
|
||||
},
|
||||
paraformer: {
|
||||
model: './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx',
|
||||
},
|
||||
nemoCtc: {
|
||||
model: '',
|
||||
},
|
||||
whisper: {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
},
|
||||
tdnn: {
|
||||
model: '',
|
||||
},
|
||||
tokens: './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt',
|
||||
numThreads: 1,
|
||||
debug: 0,
|
||||
provider: 'cpu',
|
||||
modelType: 'paraformer',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OfflineModelConfig();
|
||||
modelConfig.paraformer = paraformer;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'paraformer';
|
||||
let lmConfig = {
|
||||
model: '',
|
||||
scale: 1.0,
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let config = {
|
||||
featConfig: featConfig,
|
||||
modelConfig: modelConfig,
|
||||
lmConfig: lmConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
return sherpa_onnx.createOfflineRecognizer(config);
|
||||
}
|
||||
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
|
||||
const recognizer = createOfflineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename = './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav';
|
||||
|
||||
@@ -71,8 +98,8 @@ fs.createReadStream(waveFilename, {'highWaterMark': 4096})
|
||||
|
||||
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
|
||||
recognizer.decode(stream);
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
|
||||
stream.free();
|
||||
recognizer.free();
|
||||
|
||||
@@ -6,37 +6,60 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOfflineRecognizer() {
|
||||
let featConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const transducer = new sherpa_onnx.OfflineTransducerModelConfig();
|
||||
transducer.encoder =
|
||||
'./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx';
|
||||
transducer.decoder =
|
||||
'./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx';
|
||||
transducer.joiner =
|
||||
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx';
|
||||
const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt';
|
||||
let modelConfig = {
|
||||
transducer: {
|
||||
encoder:
|
||||
'./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx',
|
||||
decoder:
|
||||
'./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx',
|
||||
joiner:
|
||||
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx',
|
||||
},
|
||||
paraformer: {
|
||||
model: '',
|
||||
},
|
||||
nemoCtc: {
|
||||
model: '',
|
||||
},
|
||||
whisper: {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
},
|
||||
tdnn: {
|
||||
model: '',
|
||||
},
|
||||
tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt',
|
||||
numThreads: 1,
|
||||
debug: 0,
|
||||
provider: 'cpu',
|
||||
modelType: 'transducer',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OfflineModelConfig();
|
||||
modelConfig.transducer = transducer;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'transducer';
|
||||
let lmConfig = {
|
||||
model: '',
|
||||
scale: 1.0,
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let config = {
|
||||
featConfig: featConfig,
|
||||
modelConfig: modelConfig,
|
||||
lmConfig: lmConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
return sherpa_onnx.createOfflineRecognizer(config);
|
||||
}
|
||||
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
const recognizer = createOfflineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav';
|
||||
|
||||
@@ -76,8 +99,8 @@ fs.createReadStream(waveFilename, {'highWaterMark': 4096})
|
||||
|
||||
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
|
||||
recognizer.decode(stream);
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
|
||||
stream.free();
|
||||
recognizer.free();
|
||||
|
||||
@@ -1,28 +1,45 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createOfflineTts() {
|
||||
const vits = new sherpa_onnx.OfflineTtsVitsModelConfig();
|
||||
vits.model = 'vits-piper-en_US-amy-low/en_US-amy-low.onnx'
|
||||
vits.tokens = './vits-piper-en_US-amy-low/tokens.txt';
|
||||
vits.dataDir = './vits-piper-en_US-amy-low/espeak-ng-data'
|
||||
let offlineTtsVitsModelConfig = {
|
||||
model: './vits-piper-en_US-amy-low/en_US-amy-low.onnx',
|
||||
lexicon: '',
|
||||
tokens: './vits-piper-en_US-amy-low/tokens.txt',
|
||||
dataDir: './vits-piper-en_US-amy-low/espeak-ng-data',
|
||||
noiseScale: 0.667,
|
||||
noiseScaleW: 0.8,
|
||||
lengthScale: 1.0,
|
||||
};
|
||||
let offlineTtsModelConfig = {
|
||||
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
|
||||
numThreads: 1,
|
||||
debug: 1,
|
||||
provider: 'cpu',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OfflineTtsModelConfig();
|
||||
modelConfig.vits = vits;
|
||||
let offlineTtsConfig = {
|
||||
offlineTtsModelConfig: offlineTtsModelConfig,
|
||||
ruleFsts: '',
|
||||
maxNumSentences: 1,
|
||||
};
|
||||
|
||||
const config = new sherpa_onnx.OfflineTtsConfig();
|
||||
config.model = modelConfig;
|
||||
|
||||
return new sherpa_onnx.OfflineTts(config);
|
||||
return sherpa_onnx.createOfflineTts(offlineTtsConfig);
|
||||
}
|
||||
|
||||
|
||||
const tts = createOfflineTts();
|
||||
const speakerId = 0;
|
||||
const speed = 1.0;
|
||||
const audio = tts.generate(
|
||||
'“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”',
|
||||
speakerId, speed);
|
||||
audio.save('./test-en.wav');
|
||||
const audio = tts.generate({
|
||||
text:
|
||||
'“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”',
|
||||
sid: speakerId,
|
||||
speed: speed
|
||||
});
|
||||
|
||||
tts.save('./test-en.wav', audio);
|
||||
console.log('Saved to test-en.wav successfully.');
|
||||
|
||||
tts.free();
|
||||
|
||||
@@ -3,25 +3,37 @@
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createOfflineTts() {
|
||||
const vits = new sherpa_onnx.OfflineTtsVitsModelConfig();
|
||||
vits.model = './vits-zh-aishell3/vits-aishell3.onnx';
|
||||
vits.lexicon = './vits-zh-aishell3/lexicon.txt';
|
||||
vits.tokens = './vits-zh-aishell3/tokens.txt';
|
||||
let offlineTtsVitsModelConfig = {
|
||||
model: './vits-zh-aishell3/vits-aishell3.onnx',
|
||||
lexicon: './vits-zh-aishell3/lexicon.txt',
|
||||
tokens: './vits-zh-aishell3/tokens.txt',
|
||||
dataDir: '',
|
||||
noiseScale: 0.667,
|
||||
noiseScaleW: 0.8,
|
||||
lengthScale: 1.0,
|
||||
};
|
||||
let offlineTtsModelConfig = {
|
||||
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
|
||||
numThreads: 1,
|
||||
debug: 1,
|
||||
provider: 'cpu',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OfflineTtsModelConfig();
|
||||
modelConfig.vits = vits;
|
||||
let offlineTtsConfig = {
|
||||
offlineTtsModelConfig: offlineTtsModelConfig,
|
||||
ruleFsts: './vits-zh-aishell3/rule.fst',
|
||||
maxNumSentences: 1,
|
||||
};
|
||||
|
||||
const config = new sherpa_onnx.OfflineTtsConfig();
|
||||
config.model = modelConfig;
|
||||
config.ruleFsts = './vits-zh-aishell3/rule.fst';
|
||||
|
||||
return new sherpa_onnx.OfflineTts(config);
|
||||
return sherpa_onnx.createOfflineTts(offlineTtsConfig);
|
||||
}
|
||||
|
||||
|
||||
const tts = createOfflineTts();
|
||||
const speakerId = 66;
|
||||
const speed = 1.0;
|
||||
const audio = tts.generate('3年前中国总人口是1411778724人', speakerId, speed);
|
||||
audio.save('./test-zh.wav');
|
||||
const audio = tts.generate(
|
||||
{text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed});
|
||||
tts.save('./test-zh.wav', audio);
|
||||
console.log('Saved to test-zh.wav successfully.');
|
||||
tts.free();
|
||||
|
||||
@@ -6,32 +6,58 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOfflineRecognizer() {
|
||||
let featConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const whisper = new sherpa_onnx.OfflineWhisperModelConfig();
|
||||
whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
|
||||
whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
|
||||
const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
|
||||
let modelConfig = {
|
||||
transducer: {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
joiner: '',
|
||||
},
|
||||
paraformer: {
|
||||
model: '',
|
||||
},
|
||||
nemoCtc: {
|
||||
model: '',
|
||||
},
|
||||
whisper: {
|
||||
encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
|
||||
decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
|
||||
},
|
||||
tdnn: {
|
||||
model: '',
|
||||
},
|
||||
tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
|
||||
numThreads: 1,
|
||||
debug: 0,
|
||||
provider: 'cpu',
|
||||
modelType: 'whisper',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OfflineModelConfig();
|
||||
modelConfig.whisper = whisper;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'whisper';
|
||||
let lmConfig = {
|
||||
model: '',
|
||||
scale: 1.0,
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let config = {
|
||||
featConfig: featConfig,
|
||||
modelConfig: modelConfig,
|
||||
lmConfig: lmConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
return sherpa_onnx.createOfflineRecognizer(config);
|
||||
}
|
||||
|
||||
recognizer = createRecognizer();
|
||||
|
||||
recognizer = createOfflineRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
|
||||
const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav';
|
||||
@@ -72,8 +98,8 @@ fs.createReadStream(waveFilename, {'highWaterMark': 4096})
|
||||
|
||||
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
|
||||
recognizer.decode(stream);
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
|
||||
stream.free();
|
||||
recognizer.free();
|
||||
|
||||
@@ -5,37 +5,58 @@ console.log(portAudio.getDevices());
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOnlineRecognizer() {
|
||||
let onlineTransducerModelConfig = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
joiner: '',
|
||||
};
|
||||
|
||||
const paraformer = new sherpa_onnx.OnlineParaformerModelConfig();
|
||||
paraformer.encoder =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx';
|
||||
paraformer.decoder =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx';
|
||||
const tokens =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt';
|
||||
let onlineParaformerModelConfig = {
|
||||
encoder:
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
|
||||
decoder:
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OnlineModelConfig();
|
||||
modelConfig.paraformer = paraformer;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'paraformer';
|
||||
let onlineZipformer2CtcModelConfig = {
|
||||
model: '',
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
recognizerConfig.enableEndpoint = 1;
|
||||
let onlineModelConfig = {
|
||||
transducer: onlineTransducerModelConfig,
|
||||
paraformer: onlineParaformerModelConfig,
|
||||
zipformer2Ctc: onlineZipformer2CtcModelConfig,
|
||||
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
|
||||
numThreads: 1,
|
||||
provider: 'cpu',
|
||||
debug: 1,
|
||||
modelType: 'paraformer',
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
let featureConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
let recognizerConfig = {
|
||||
featConfig: featureConfig,
|
||||
modelConfig: onlineModelConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
enableEndpoint: 1,
|
||||
rule1MinTrailingSilence: 2.4,
|
||||
rule2MinTrailingSilence: 1.2,
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
}
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
|
||||
display = new sherpa_onnx.Display(50);
|
||||
const recognizer = createOnlineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
let lastText = '';
|
||||
let segmentIndex = 0;
|
||||
@@ -61,11 +82,11 @@ ai.on('data', data => {
|
||||
}
|
||||
|
||||
const isEndpoint = recognizer.isEndpoint(stream);
|
||||
const text = recognizer.getResult(stream).text;
|
||||
const text = recognizer.getResult(stream);
|
||||
|
||||
if (text.length > 0 && lastText != text) {
|
||||
lastText = text;
|
||||
display.print(segmentIndex, lastText);
|
||||
console.log(segmentIndex, lastText);
|
||||
}
|
||||
if (isEndpoint) {
|
||||
if (text.length > 0) {
|
||||
|
||||
@@ -6,34 +6,58 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOnlineRecognizer() {
|
||||
let onlineTransducerModelConfig = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
joiner: '',
|
||||
};
|
||||
|
||||
const paraformer = new sherpa_onnx.OnlineParaformerModelConfig();
|
||||
paraformer.encoder =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx';
|
||||
paraformer.decoder =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx';
|
||||
const tokens =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt';
|
||||
let onlineParaformerModelConfig = {
|
||||
encoder:
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
|
||||
decoder:
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OnlineModelConfig();
|
||||
modelConfig.paraformer = paraformer;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'paraformer';
|
||||
let onlineZipformer2CtcModelConfig = {
|
||||
model: '',
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let onlineModelConfig = {
|
||||
transducer: onlineTransducerModelConfig,
|
||||
paraformer: onlineParaformerModelConfig,
|
||||
zipformer2Ctc: onlineZipformer2CtcModelConfig,
|
||||
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
|
||||
numThreads: 1,
|
||||
provider: 'cpu',
|
||||
debug: 1,
|
||||
modelType: 'paraformer',
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
let featureConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
let recognizerConfig = {
|
||||
featConfig: featureConfig,
|
||||
modelConfig: onlineModelConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
enableEndpoint: 1,
|
||||
rule1MinTrailingSilence: 2.4,
|
||||
rule2MinTrailingSilence: 1.2,
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
}
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
|
||||
const recognizer = createOnlineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename =
|
||||
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav';
|
||||
@@ -47,8 +71,8 @@ function decode(samples) {
|
||||
while (recognizer.isReady(stream)) {
|
||||
recognizer.decode(stream);
|
||||
}
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
}
|
||||
|
||||
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
|
||||
|
||||
@@ -5,39 +5,60 @@ const portAudio = require('naudiodon2');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOnlineRecognizer() {
|
||||
let onlineTransducerModelConfig = {
|
||||
encoder:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx',
|
||||
decoder:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx',
|
||||
joiner:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx',
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const transducer = new sherpa_onnx.OnlineTransducerModelConfig();
|
||||
transducer.encoder =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
|
||||
transducer.decoder =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
|
||||
transducer.joiner =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';
|
||||
const tokens =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';
|
||||
let onlineParaformerModelConfig = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OnlineModelConfig();
|
||||
modelConfig.transducer = transducer;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'zipformer';
|
||||
let onlineZipformer2CtcModelConfig = {
|
||||
model: '',
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
recognizerConfig.enableEndpoint = 1;
|
||||
let onlineModelConfig = {
|
||||
transducer: onlineTransducerModelConfig,
|
||||
paraformer: onlineParaformerModelConfig,
|
||||
zipformer2Ctc: onlineZipformer2CtcModelConfig,
|
||||
tokens:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
|
||||
numThreads: 1,
|
||||
provider: 'cpu',
|
||||
debug: 1,
|
||||
modelType: 'zipformer',
|
||||
};
|
||||
|
||||
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
let featureConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
let recognizerConfig = {
|
||||
featConfig: featureConfig,
|
||||
modelConfig: onlineModelConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
enableEndpoint: 1,
|
||||
rule1MinTrailingSilence: 2.4,
|
||||
rule2MinTrailingSilence: 1.2,
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
}
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
display = new sherpa_onnx.Display(50);
|
||||
|
||||
const recognizer = createOnlineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
let lastText = '';
|
||||
let segmentIndex = 0;
|
||||
@@ -63,11 +84,11 @@ ai.on('data', data => {
|
||||
}
|
||||
|
||||
const isEndpoint = recognizer.isEndpoint(stream);
|
||||
const text = recognizer.getResult(stream).text;
|
||||
const text = recognizer.getResult(stream);
|
||||
|
||||
if (text.length > 0 && lastText != text) {
|
||||
lastText = text;
|
||||
display.print(segmentIndex, lastText);
|
||||
console.log(segmentIndex, lastText);
|
||||
}
|
||||
if (isEndpoint) {
|
||||
if (text.length > 0) {
|
||||
|
||||
@@ -6,37 +6,60 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOnlineRecognizer() {
|
||||
let onlineTransducerModelConfig = {
|
||||
encoder:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx',
|
||||
decoder:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx',
|
||||
joiner:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx',
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const transducer = new sherpa_onnx.OnlineTransducerModelConfig();
|
||||
transducer.encoder =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
|
||||
transducer.decoder =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
|
||||
transducer.joiner =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';
|
||||
const tokens =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';
|
||||
let onlineParaformerModelConfig = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OnlineModelConfig();
|
||||
modelConfig.transducer = transducer;
|
||||
modelConfig.tokens = tokens;
|
||||
modelConfig.modelType = 'zipformer';
|
||||
let onlineZipformer2CtcModelConfig = {
|
||||
model: '',
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let onlineModelConfig = {
|
||||
transducer: onlineTransducerModelConfig,
|
||||
paraformer: onlineParaformerModelConfig,
|
||||
zipformer2Ctc: onlineZipformer2CtcModelConfig,
|
||||
tokens:
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
|
||||
numThreads: 1,
|
||||
provider: 'cpu',
|
||||
debug: 1,
|
||||
modelType: 'zipformer',
|
||||
};
|
||||
|
||||
recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
let featureConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
let recognizerConfig = {
|
||||
featConfig: featureConfig,
|
||||
modelConfig: onlineModelConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
enableEndpoint: 1,
|
||||
rule1MinTrailingSilence: 2.4,
|
||||
rule2MinTrailingSilence: 1.2,
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
}
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
|
||||
const recognizer = createOnlineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename =
|
||||
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav';
|
||||
@@ -50,8 +73,8 @@ function decode(samples) {
|
||||
while (recognizer.isReady(stream)) {
|
||||
recognizer.decode(stream);
|
||||
}
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
}
|
||||
|
||||
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
|
||||
|
||||
@@ -6,32 +6,58 @@ const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
function createRecognizer() {
|
||||
const featConfig = new sherpa_onnx.FeatureConfig();
|
||||
featConfig.sampleRate = 16000;
|
||||
featConfig.featureDim = 80;
|
||||
function createOnlineRecognizer() {
|
||||
let onlineTransducerModelConfig = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
joiner: '',
|
||||
};
|
||||
|
||||
// test online recognizer
|
||||
const zipformer2Ctc = new sherpa_onnx.OnlineZipformer2CtcModelConfig();
|
||||
zipformer2Ctc.model =
|
||||
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx';
|
||||
const tokens =
|
||||
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt';
|
||||
let onlineParaformerModelConfig = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
};
|
||||
|
||||
const modelConfig = new sherpa_onnx.OnlineModelConfig();
|
||||
modelConfig.zipformer2Ctc = zipformer2Ctc;
|
||||
modelConfig.tokens = tokens;
|
||||
let onlineZipformer2CtcModelConfig = {
|
||||
model:
|
||||
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx',
|
||||
};
|
||||
|
||||
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
|
||||
recognizerConfig.featConfig = featConfig;
|
||||
recognizerConfig.modelConfig = modelConfig;
|
||||
recognizerConfig.decodingMethod = 'greedy_search';
|
||||
let onlineModelConfig = {
|
||||
transducer: onlineTransducerModelConfig,
|
||||
paraformer: onlineParaformerModelConfig,
|
||||
zipformer2Ctc: onlineZipformer2CtcModelConfig,
|
||||
tokens:
|
||||
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt',
|
||||
numThreads: 1,
|
||||
provider: 'cpu',
|
||||
debug: 1,
|
||||
modelType: '',
|
||||
};
|
||||
|
||||
recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
|
||||
return recognizer;
|
||||
let featureConfig = {
|
||||
sampleRate: 16000,
|
||||
featureDim: 80,
|
||||
};
|
||||
|
||||
let recognizerConfig = {
|
||||
featConfig: featureConfig,
|
||||
modelConfig: onlineModelConfig,
|
||||
decodingMethod: 'greedy_search',
|
||||
maxActivePaths: 4,
|
||||
enableEndpoint: 1,
|
||||
rule1MinTrailingSilence: 2.4,
|
||||
rule2MinTrailingSilence: 1.2,
|
||||
rule3MinUtteranceLength: 20,
|
||||
hotwordsFile: '',
|
||||
hotwordsScore: 1.5,
|
||||
};
|
||||
|
||||
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||
}
|
||||
recognizer = createRecognizer();
|
||||
stream = recognizer.createStream();
|
||||
|
||||
const recognizer = createOnlineRecognizer();
|
||||
const stream = recognizer.createStream();
|
||||
|
||||
const waveFilename =
|
||||
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav';
|
||||
@@ -45,8 +71,8 @@ function decode(samples) {
|
||||
while (recognizer.isReady(stream)) {
|
||||
recognizer.decode(stream);
|
||||
}
|
||||
const r = recognizer.getResult(stream);
|
||||
console.log(r.text);
|
||||
const text = recognizer.getResult(stream);
|
||||
console.log(text);
|
||||
}
|
||||
|
||||
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const sherpa_onnx = require('sherpa-onnx3');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Builds the offline recognizer used by this example from the int8
 * paraformer model under ./sherpa-onnx-paraformer-zh-2023-03-28.
 *
 * Nested config objects are fully populated before they are attached to
 * their parent, in the same order as before.
 * NOTE(review): if the binding copies nested configs on assignment, this
 * ordering is required - confirm before reordering.
 *
 * @returns the `sherpa_onnx.OfflineRecognizer` created from the config.
 */
function createOfflineRecognizer() {
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Offline paraformer model (this is the non-streaming recognizer).
  const paraformerConfig =
      Object.assign(new sherpa_onnx.OfflineParaformerModelConfig(), {
        model: './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx',
      });
  const tokensPath = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt';

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    paraformer: paraformerConfig,
    tokens: tokensPath,
    modelType: 'paraformer',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
/**
 * Builds the voice activity detector for this example from
 * ./silero_vad.onnx.
 *
 * The silero config is filled in completely before it is attached to the
 * VAD model config, matching the original statement order.
 *
 * @returns the `sherpa_onnx.VoiceActivityDetector` created with the model
 *     config and a buffer size argument of 60 (seconds).
 */
function createVad() {
  const silero = Object.assign(new sherpa_onnx.SileroVadModelConfig(), {
    model: './silero_vad.onnx',
    minSpeechDuration: 0.3,   // seconds
    minSilenceDuration: 0.3,  // seconds
    windowSize: 512,
  });

  const modelConfig = Object.assign(new sherpa_onnx.VadModelConfig(), {
    sileroVad: silero,
    sampleRate: 16000,
  });

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(
      modelConfig, bufferSizeInSeconds);
}
|
||||
|
||||
// Main script: capture microphone audio, cut it into speech segments with the
// VAD, and run the offline recognizer on every finished segment.
const recognizer = createOfflineRecognizer();
const vad = createVad();

// Staging buffer for raw microphone samples; its size argument is
// 30 * the VAD sample rate.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate,
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
  },
});

// Running number of the recognized (non-empty) segments printed so far.
let index = 0;

ai.on('data', (data) => {
  const windowSize = vad.config.sileroVad.windowSize;

  // Feed the VAD in chunks of exactly `windowSize` samples; whatever does not
  // fill a chunk stays in the buffer until the next 'data' event.
  buffer.push(new Float32Array(data.buffer));
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  // Decode every speech segment the VAD has queued.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    const stream = recognizer.createStream();
    let result;
    try {
      stream.acceptWaveform(
          recognizer.config.featConfig.sampleRate, segment.samples);
      recognizer.decode(stream);
      result = recognizer.getResult(stream);
    } finally {
      // Release the per-segment stream even when decoding throws.
      stream.free();
    }

    if (result.text.length > 0) {
      console.log(`${index}: ${result.text}`);
      index += 1;
    }
  }
});

ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
@@ -1,106 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Builds the offline recognizer used by this example from the transducer
 * model (encoder/decoder/joiner) under ./sherpa-onnx-zipformer-en-2023-06-26.
 *
 * Nested config objects are fully populated before they are attached to
 * their parent, in the same order as before.
 * NOTE(review): if the binding copies nested configs on assignment, this
 * ordering is required - confirm before reordering.
 *
 * @returns the `sherpa_onnx.OfflineRecognizer` created from the config.
 */
function createOfflineRecognizer() {
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Offline transducer model files (this is the non-streaming recognizer).
  const transducerConfig =
      Object.assign(new sherpa_onnx.OfflineTransducerModelConfig(), {
        encoder:
            './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx',
        decoder:
            './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx',
        joiner:
            './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx',
      });
  const tokensPath = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt';

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    transducer: transducerConfig,
    tokens: tokensPath,
    modelType: 'transducer',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
/**
 * Builds the voice activity detector for this example from
 * ./silero_vad.onnx.
 *
 * The silero config is filled in completely before it is attached to the
 * VAD model config, matching the original statement order.
 *
 * @returns the `sherpa_onnx.VoiceActivityDetector` created with the model
 *     config and a buffer size argument of 60 (seconds).
 */
function createVad() {
  const silero = Object.assign(new sherpa_onnx.SileroVadModelConfig(), {
    model: './silero_vad.onnx',
    minSpeechDuration: 0.3,   // seconds
    minSilenceDuration: 0.3,  // seconds
    windowSize: 512,
  });

  const modelConfig = Object.assign(new sherpa_onnx.VadModelConfig(), {
    sileroVad: silero,
    sampleRate: 16000,
  });

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(
      modelConfig, bufferSizeInSeconds);
}
|
||||
|
||||
// Main script: capture microphone audio, cut it into speech segments with the
// VAD, and run the offline recognizer on every finished segment.
const recognizer = createOfflineRecognizer();
const vad = createVad();

// Staging buffer for raw microphone samples; its size argument is
// 30 * the VAD sample rate.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate,
  },
});

// Running number of the recognized (non-empty) segments printed so far.
let index = 0;

ai.on('data', (data) => {
  const windowSize = vad.config.sileroVad.windowSize;

  // Feed the VAD in chunks of exactly `windowSize` samples; whatever does not
  // fill a chunk stays in the buffer until the next 'data' event.
  buffer.push(new Float32Array(data.buffer));
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  // Decode every speech segment the VAD has queued.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    const stream = recognizer.createStream();
    let result;
    try {
      stream.acceptWaveform(
          recognizer.config.featConfig.sampleRate, segment.samples);
      recognizer.decode(stream);
      result = recognizer.getResult(stream);
    } finally {
      // Release the per-segment stream even when decoding throws.
      stream.free();
    }

    if (result.text.length > 0) {
      console.log(`${index}: ${result.text}`);
      index += 1;
    }
  }
});

ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
@@ -1,102 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Builds the offline recognizer used by this example from the int8 whisper
 * encoder/decoder under ./sherpa-onnx-whisper-tiny.en.
 *
 * Nested config objects are fully populated before they are attached to
 * their parent, in the same order as before.
 * NOTE(review): if the binding copies nested configs on assignment, this
 * ordering is required - confirm before reordering.
 *
 * @returns the `sherpa_onnx.OfflineRecognizer` created from the config.
 */
function createOfflineRecognizer() {
  const features = Object.assign(new sherpa_onnx.FeatureConfig(), {
    sampleRate: 16000,
    featureDim: 80,
  });

  // Offline whisper model files (this is the non-streaming recognizer).
  const whisperConfig =
      Object.assign(new sherpa_onnx.OfflineWhisperModelConfig(), {
        encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
        decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
      });
  const tokensPath = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';

  const model = Object.assign(new sherpa_onnx.OfflineModelConfig(), {
    whisper: whisperConfig,
    tokens: tokensPath,
    modelType: 'whisper',
  });

  const config = Object.assign(new sherpa_onnx.OfflineRecognizerConfig(), {
    featConfig: features,
    modelConfig: model,
    decodingMethod: 'greedy_search',
  });

  return new sherpa_onnx.OfflineRecognizer(config);
}
|
||||
|
||||
/**
 * Builds the voice activity detector for this example from
 * ./silero_vad.onnx.
 *
 * The silero config is filled in completely before it is attached to the
 * VAD model config, matching the original statement order.
 *
 * @returns the `sherpa_onnx.VoiceActivityDetector` created with the model
 *     config and a buffer size argument of 60 (seconds).
 */
function createVad() {
  const silero = Object.assign(new sherpa_onnx.SileroVadModelConfig(), {
    model: './silero_vad.onnx',
    minSpeechDuration: 0.3,   // seconds
    minSilenceDuration: 0.3,  // seconds
    windowSize: 512,
  });

  const modelConfig = Object.assign(new sherpa_onnx.VadModelConfig(), {
    sileroVad: silero,
    sampleRate: 16000,
  });

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(
      modelConfig, bufferSizeInSeconds);
}
|
||||
|
||||
// Main script: capture microphone audio, cut it into speech segments with the
// VAD, and run the offline recognizer on every finished segment.
const recognizer = createOfflineRecognizer();
const vad = createVad();

// Staging buffer for raw microphone samples; its size argument is
// 30 * the VAD sample rate.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate,
  },
});

// Running number of the recognized (non-empty) segments printed so far.
let index = 0;

ai.on('data', (data) => {
  const windowSize = vad.config.sileroVad.windowSize;

  // Feed the VAD in chunks of exactly `windowSize` samples; whatever does not
  // fill a chunk stays in the buffer until the next 'data' event.
  buffer.push(new Float32Array(data.buffer));
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  // Decode every speech segment the VAD has queued.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    const stream = recognizer.createStream();
    let result;
    try {
      stream.acceptWaveform(
          recognizer.config.featConfig.sampleRate, segment.samples);
      recognizer.decode(stream);
      result = recognizer.getResult(stream);
    } finally {
      // Release the per-segment stream even when decoding throws.
      stream.free();
    }

    if (result.text.length > 0) {
      console.log(`${index}: ${result.text}`);
      index += 1;
    }
  }
});

ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
@@ -1,74 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
const portAudio = require('naudiodon2');
|
||||
console.log(portAudio.getDevices());
|
||||
|
||||
/**
 * Builds the voice activity detector for this example from
 * ./silero_vad.onnx.
 *
 * The silero config is filled in completely before it is attached to the
 * VAD model config, matching the original statement order.
 *
 * @returns the `sherpa_onnx.VoiceActivityDetector` created with the model
 *     config and a buffer size argument of 60 (seconds).
 */
function createVad() {
  const silero = Object.assign(new sherpa_onnx.SileroVadModelConfig(), {
    model: './silero_vad.onnx',
    minSpeechDuration: 0.3,   // seconds
    minSilenceDuration: 0.3,  // seconds
    windowSize: 512,
  });

  const modelConfig = Object.assign(new sherpa_onnx.VadModelConfig(), {
    sileroVad: silero,
    sampleRate: 16000,
  });

  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.VoiceActivityDetector(
      modelConfig, bufferSizeInSeconds);
}
|
||||
// Main script: capture microphone audio, run it through the VAD, and report
// when speech starts and how long each finished speech segment lasted.

// Declared with `const`: the bare assignment created an implicit global,
// which throws a ReferenceError in strict mode / ES modules.
const vad = createVad();

// Staging buffer for raw microphone samples; its size argument is
// 30 * the VAD sample rate.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate,
  },
});

// True once "Detected speech" has been printed for the current speech run, so
// the message is printed once per run rather than once per window.
let printed = false;
// Running number of the finished speech segments reported so far.
let index = 0;

ai.on('data', (data) => {
  const windowSize = vad.config.sileroVad.windowSize;

  // Feed the VAD in chunks of exactly `windowSize` samples; whatever does not
  // fill a chunk stays in the buffer until the next 'data' event.
  buffer.push(new Float32Array(data.buffer));
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);

    if (vad.isDetected() && !printed) {
      console.log(`${index}: Detected speech`);
      printed = true;
    }

    if (!vad.isDetected()) {
      printed = false;
    }

    // Report every segment the VAD has queued after this window.
    while (!vad.isEmpty()) {
      const segment = vad.front();
      vad.pop();
      const duration = segment.samples.length / vad.config.sampleRate;
      console.log(`${index} End of speech. Duration: ${duration} seconds`);
      index += 1;
    }
  }
});

ai.on('close', () => {
  console.log('Free resources');
  vad.free();
  buffer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
Reference in New Issue
Block a user