Add VAD and keyword spotting for the Node package with WebAssembly (#1286)

This commit is contained in:
Fangjun Kuang
2024-08-24 23:05:54 +08:00
committed by GitHub
parent 537e163dd0
commit 5ed8e31868
40 changed files with 456 additions and 524 deletions

View File

@@ -0,0 +1,49 @@
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx');
/**
 * Creates a keyword spotter backed by the wenetspeech zipformer transducer
 * model and configured with two keyword entries.
 *
 * The model files can be downloaded from
 * https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
 *
 * @returns whatever `sherpa_onnx.createKws` returns for this configuration.
 */
function createKeywordSpotter() {
  const transducer = {
    encoder:
        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx',
    decoder:
        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx',
    joiner:
        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx',
  };

  const modelConfig = {
    transducer,
    tokens:
        './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt',
  };

  // One keyword entry per line: a token sequence followed by `@` and a label.
  // NOTE(review): the label after `@` is presumably what gets reported on a
  // hit - confirm against the keyword spotter documentation.
  const keywords = 'w én s ēn t è k ǎ s uǒ @文森特卡索\n' +
      'f ǎ g uó @法国';

  return sherpa_onnx.createKws({modelConfig, keywords});
}
// Driver: feed one test wave file through the keyword spotter and print every
// keyword that was detected.
const kws = createKeywordSpotter();
const stream = kws.createStream();

try {
  const waveFilename =
      './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav';
  const wave = sherpa_onnx.readWave(waveFilename);
  stream.acceptWaveform(wave.sampleRate, wave.samples);

  // Append 0.4 s of silence after the audio.
  // NOTE(review): presumably this lets the streaming model emit results for
  // the last frames of the utterance - confirm.
  // The buffer is sized from wave.sampleRate, so it is submitted with that
  // same rate; labelling it with a different rate would change its duration.
  const tailPadding = new Float32Array(Math.floor(wave.sampleRate * 0.4));
  stream.acceptWaveform(wave.sampleRate, tailPadding);

  const detectedKeywords = [];
  while (kws.isReady(stream)) {
    kws.decode(stream);
    const keyword = kws.getResult(stream).keyword;
    // An empty keyword means nothing was spotted in this decoding step.
    if (keyword !== '') {
      detectedKeywords.push(keyword);
    }
  }
  console.log(detectedKeywords);
} finally {
  // Release the handles even when reading or decoding throws.
  stream.free();
  kws.free();
}

View File

@@ -7,27 +7,13 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
nemoCtc: {
model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
},
tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'nemo_ctc',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
modelConfig: {
nemoCtc: {
model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
},
tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
}
};
return sherpa_onnx.createOfflineRecognizer(config);
@@ -38,63 +24,12 @@ const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
// Validate the WAV header once the reader has parsed it. Only the
// recognizer's configured sample rate, PCM encoding (audioFormat 1), a single
// channel and 16-bit samples are accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Interpolate the destructured `channels`. The previous `${channel}`
    // named an undeclared identifier, so this path raised a ReferenceError
    // instead of the intended message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
fs.createReadStream(waveFilename, {highWaterMark: 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();

View File

@@ -7,27 +7,15 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
paraformer: {
model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx',
},
tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'paraformer',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
ruleFsts: './itn_zh_number.fst',
};
@@ -41,62 +29,12 @@ const stream = recognizer.createStream();
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav
const waveFilename = './itn-zh-number.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
// Validate the WAV header once the reader has parsed it. Only the
// recognizer's configured sample rate, PCM encoding (audioFormat 1), a single
// channel and 16-bit samples are accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Interpolate the destructured `channels`. The previous `${channel}`
    // named an undeclared identifier, so this path raised a ReferenceError
    // instead of the intended message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();

View File

@@ -1,98 +1,32 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
paraformer: {
model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx',
},
tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'paraformer',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
};
return sherpa_onnx.createOfflineRecognizer(config);
}
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
// Validate the WAV header once the reader has parsed it. Only the
// recognizer's configured sample rate, PCM encoding (audioFormat 1), a single
// channel and 16-bit samples are accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Interpolate the destructured `channels`. The previous `${channel}`
    // named an undeclared identifier, so this path raised a ReferenceError
    // instead of the intended message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();

View File

@@ -1,17 +1,8 @@
// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang)
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
senseVoice: {
model:
@@ -20,82 +11,26 @@ function createOfflineRecognizer() {
useInverseTextNormalization: 1,
},
tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
};
return sherpa_onnx.createOfflineRecognizer(config);
}
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
// Validate the WAV header once the reader has parsed it. Only the
// recognizer's configured sample rate, PCM encoding (audioFormat 1), a single
// channel and 16-bit samples are accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Interpolate the destructured `channels`. The previous `${channel}`
    // named an undeclared identifier, so this path raised a ReferenceError
    // instead of the intended message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();

View File

@@ -1,17 +1,8 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
transducer: {
encoder:
@@ -22,19 +13,11 @@ function createOfflineRecognizer() {
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx',
},
tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'transducer',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',
hotwordsScore: 1.5,
};
return sherpa_onnx.createOfflineRecognizer(config);
@@ -43,62 +26,12 @@ const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
// Validate the WAV header once the reader has parsed it. Only the
// recognizer's configured sample rate, PCM encoding (audioFormat 1), a single
// channel and 16-bit samples are accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Interpolate the destructured `channels`. The previous `${channel}`
    // named an undeclared identifier, so this path raised a ReferenceError
    // instead of the intended message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();

View File

@@ -1,17 +1,8 @@
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
whisper: {
encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
@@ -21,83 +12,25 @@ function createOfflineRecognizer() {
tailPaddings: -1,
},
tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'whisper',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
};
return sherpa_onnx.createOfflineRecognizer(config);
}
// Declare both handles with `const`: assigning to undeclared names creates
// implicit globals and is a ReferenceError in strict mode / ES modules.
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
// Validate the WAV header once the reader has parsed it. Only the
// recognizer's configured sample rate, PCM encoding (audioFormat 1), a single
// channel and 16-bit samples are accepted; anything else throws.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Interpolate the destructured `channels`. The previous `${channel}`
    // named an undeclared identifier, so this path raised a ReferenceError
    // instead of the intended message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();

View File

@@ -16,22 +16,10 @@ function createOnlineRecognizer() {
let onlineModelConfig = {
paraformer: onlineParaformerModelConfig,
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: 'paraformer',
};
let featureConfig = {
sampleRate: 16000,
featureDim: 80,
};
let recognizerConfig = {
featConfig: featureConfig,
modelConfig: onlineModelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
enableEndpoint: 1,
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,

View File

@@ -17,26 +17,10 @@ function createOnlineRecognizer() {
let onlineModelConfig = {
paraformer: onlineParaformerModelConfig,
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: 'paraformer',
};
let featureConfig = {
sampleRate: 16000,
featureDim: 80,
};
let recognizerConfig = {
featConfig: featureConfig,
modelConfig: onlineModelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
enableEndpoint: 1,
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);

View File

@@ -20,26 +20,10 @@ function createOnlineRecognizer() {
transducer: onlineTransducerModelConfig,
tokens:
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: 'zipformer',
};
let featureConfig = {
sampleRate: 16000,
featureDim: 80,
};
let recognizerConfig = {
featConfig: featureConfig,
modelConfig: onlineModelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
enableEndpoint: 1,
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);

View File

@@ -0,0 +1,124 @@
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
const sherpa_onnx = require('sherpa-onnx');
/**
 * Creates an offline recognizer that uses the whisper tiny.en int8 model.
 *
 * The model files can be downloaded from
 * https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 *
 * @returns whatever `sherpa_onnx.createOfflineRecognizer` returns for this
 *     configuration.
 */
function createRecognizer() {
  const whisper = {
    encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
    decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
    tailPaddings: 2000,
  };

  const modelConfig = {
    whisper,
    tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
    debug: 0,
  };

  return sherpa_onnx.createOfflineRecognizer({modelConfig});
}
/**
 * Creates a voice activity detector that uses the silero VAD model.
 *
 * silero_vad.onnx can be downloaded from
 * https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 *
 * @returns whatever `sherpa_onnx.createVad` returns for this configuration.
 */
function createVad() {
  const sileroVad = {
    model: './silero_vad.onnx',
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
    windowSize: 512,
  };

  return sherpa_onnx.createVad({
    sileroVad,
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
    bufferSizeInSeconds: 60,
  });
}
/**
 * Decodes one speech segment and prints `<start> -- <end>: <text>` when the
 * recognizer produced any text.
 *
 * @param recognizer offline recognizer used to decode the segment.
 * @param segment a VAD segment; `start` is its first sample index and
 *     `samples` holds its audio.
 * @param {number} sampleRate sample rate of the segment's audio, in Hz.
 */
function transcribeSegment(recognizer, segment, sampleRate) {
  const startTime = segment.start / sampleRate;
  const endTime = startTime + segment.samples.length / sampleRate;

  const stream = recognizer.createStream();
  try {
    stream.acceptWaveform(sampleRate, segment.samples);
    recognizer.decode(stream);

    const result = recognizer.getResult(stream);
    if (result.text.length > 0) {
      const text = result.text.toLowerCase().trim();
      console.log(`${startTime.toFixed(2)} -- ${endTime.toFixed(2)}: ${text}`);
    }
  } finally {
    // One stream is created per segment, so each one must be released here;
    // the old post-flush loop skipped this and leaked a stream per segment.
    stream.free();
  }
}

/**
 * Transcribes every segment currently queued in the VAD, emptying the queue.
 *
 * @param vad voice activity detector holding the queued segments.
 * @param recognizer offline recognizer used to decode each segment.
 * @param {number} sampleRate sample rate of the audio fed to the VAD, in Hz.
 */
function drainVad(vad, recognizer, sampleRate) {
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    transcribeSegment(recognizer, segment, sampleRate);
  }
}

const recognizer = createRecognizer();
const vad = createVad();

// please download ./Obama.wav from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const waveFilename = './Obama.wav';
const wave = sherpa_onnx.readWave(waveFilename);

if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
  // Must be a template literal: inside single quotes the ${...} placeholders
  // were printed verbatim instead of the actual sample rates.
  throw new Error(
      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
}

console.log('Started');
const start = Date.now();

// Feed the wave to the VAD one window at a time and transcribe segments as
// soon as the VAD makes them available.
const windowSize = vad.config.sileroVad.windowSize;
for (let i = 0; i < wave.samples.length; i += windowSize) {
  const thisWindow = wave.samples.subarray(i, i + windowSize);
  vad.acceptWaveform(thisWindow);
  drainVad(vad, recognizer, wave.sampleRate);
}

// Flush the VAD after the last window, then transcribe whatever it queued.
vad.flush();
drainVad(vad, recognizer, wave.sampleRate);

const stop = Date.now();
console.log('Done');

// Real-time factor = processing time / audio duration.
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

vad.free();
recognizer.free();