// Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
|
|
// console.log(portAudio.getDevices());
|
|
|
|
const sherpa_onnx = require('sherpa-onnx');
|
|
|
|
function createRecognizer() {
|
|
const featConfig = new sherpa_onnx.FeatureConfig();
|
|
featConfig.sampleRate = 16000;
|
|
featConfig.featureDim = 80;
|
|
|
|
// test online recognizer
|
|
const transducer = new sherpa_onnx.OnlineTransducerModelConfig();
|
|
transducer.encoder =
|
|
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx';
|
|
transducer.decoder =
|
|
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx';
|
|
transducer.joiner =
|
|
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx';
|
|
const tokens =
|
|
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt';
|
|
|
|
const modelConfig = new sherpa_onnx.OnlineModelConfig();
|
|
modelConfig.transducer = transducer;
|
|
modelConfig.tokens = tokens;
|
|
modelConfig.modelType = 'zipformer';
|
|
|
|
const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig();
|
|
recognizerConfig.featConfig = featConfig;
|
|
recognizerConfig.modelConfig = modelConfig;
|
|
recognizerConfig.decodingMethod = 'greedy_search';
|
|
recognizerConfig.enableEndpoint = 1;
|
|
|
|
const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig);
|
|
return recognizer;
|
|
}
|
|
recognizer = createRecognizer();
|
|
stream = recognizer.createStream();
|
|
display = new sherpa_onnx.Display(50);
|
|
|
|
let lastText = '';
|
|
let segmentIndex = 0;
|
|
|
|
const ai = new portAudio.AudioIO({
|
|
inOptions: {
|
|
channelCount: 1,
|
|
closeOnError: true, // Close the stream if an audio error is detected, if
|
|
// set false then just log the error
|
|
deviceId: -1, // Use -1 or omit the deviceId to select the default device
|
|
sampleFormat: portAudio.SampleFormatFloat32,
|
|
sampleRate: recognizer.config.featConfig.sampleRate
|
|
}
|
|
});
|
|
|
|
ai.on('data', data => {
|
|
const samples = new Float32Array(data.buffer);
|
|
|
|
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
|
|
|
|
while (recognizer.isReady(stream)) {
|
|
recognizer.decode(stream);
|
|
}
|
|
|
|
const isEndpoint = recognizer.isEndpoint(stream);
|
|
const text = recognizer.getResult(stream).text;
|
|
|
|
if (text.length > 0 && lastText != text) {
|
|
lastText = text;
|
|
display.print(segmentIndex, lastText);
|
|
}
|
|
if (isEndpoint) {
|
|
if (text.length > 0) {
|
|
lastText = text;
|
|
segmentIndex += 1;
|
|
}
|
|
recognizer.reset(stream)
|
|
}
|
|
});
|
|
|
|
ai.on('close', () => {
|
|
console.log('Free resources');
|
|
stream.free();
|
|
recognizer.free();
|
|
});
|
|
|
|
ai.start();
|
|
console.log('Started! Please speak')
|