feat: add mic example for better compatibility (#1909)
Co-authored-by: wanghsinche <wanghsinche>
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
# Introduction
|
# Introduction
|
||||||
|
|
||||||
Note: You need `Node >= 18`.
|
Note: You need `Node >= 18`.
|
||||||
|
|
||||||
|
Note: For Mac M1 and other silicon chip series, do check the example `test-online-paraformer-microphone-mic.js`
|
||||||
|
|
||||||
This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|
This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|
||||||
|
|
||||||
@@ -278,6 +280,25 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
|||||||
node ./test-online-paraformer-microphone.js
|
node ./test-online-paraformer-microphone.js
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## ./test-online-paraformer-microphone-mic.js
|
||||||
|
|
||||||
|
[./test-online-paraformer-microphone-mic.js](./test-online-paraformer-microphone-mic.js)
|
||||||
|
demonstrates how to do real-time speech recognition from microphone
|
||||||
|
with a streaming Paraformer model. In the code we use
|
||||||
|
[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
|
||||||
|
|
||||||
|
It uses `mic` for better compatibility, do check its [npm](https://www.npmjs.com/package/mic) before running it.
|
||||||
|
|
||||||
|
You can use the following command to run it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||||
|
rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
|
||||||
|
node ./test-online-paraformer-microphone-mic.js
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
## ./test-online-paraformer.js
|
## ./test-online-paraformer.js
|
||||||
[./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
|
[./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
|
||||||
how to decode a file using a streaming Paraformer model. In the code we use
|
how to decode a file using a streaming Paraformer model. In the code we use
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
{
|
{
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"mic": "^2.1.2",
|
||||||
"naudiodon2": "^2.4.0",
|
"naudiodon2": "^2.4.0",
|
||||||
"sherpa-onnx": "^1.10.45",
|
"sherpa-onnx": "^1.10.45",
|
||||||
"wav": "^1.0.2"
|
"wav": "^1.0.2"
|
||||||
|
|||||||
206
nodejs-examples/test-online-paraformer-microphone-mic.js
Normal file
206
nodejs-examples/test-online-paraformer-microphone-mic.js
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||||
|
const mic = require('mic'); // It uses `mic` for better compatibility, do check its [npm](https://www.npmjs.com/package/mic) before running it.
|
||||||
|
const sherpa_onnx = require('sherpa-onnx');
|
||||||
|
|
||||||
|
function createOnlineRecognizer() {
|
||||||
|
let onlineParaformerModelConfig = {
|
||||||
|
encoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
|
||||||
|
decoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
|
||||||
|
};
|
||||||
|
|
||||||
|
let onlineModelConfig = {
|
||||||
|
paraformer: onlineParaformerModelConfig,
|
||||||
|
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
|
||||||
|
};
|
||||||
|
|
||||||
|
let recognizerConfig = {
|
||||||
|
modelConfig: onlineModelConfig,
|
||||||
|
enableEndpoint: 1,
|
||||||
|
rule1MinTrailingSilence: 2.4,
|
||||||
|
rule2MinTrailingSilence: 1.2,
|
||||||
|
rule3MinUtteranceLength: 20,
|
||||||
|
};
|
||||||
|
|
||||||
|
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SpeechSession class, work as a session manager with the formatOutput function
|
||||||
|
* Sample output:
|
||||||
|
=== Automated Speech Recognition ===
|
||||||
|
Current Session #1
|
||||||
|
Time: 8:44:46 PM
|
||||||
|
------------------------
|
||||||
|
Recognized Sentences:
|
||||||
|
[8:44:43 PM] 1. it's so great three result is great great 她还支持中文
|
||||||
|
[8:44:46 PM] 2. 很厉
|
||||||
|
------------------------
|
||||||
|
Recognizing: 真的很厉害太厉害
|
||||||
|
|
||||||
|
*/
|
||||||
|
class SpeechSession {
|
||||||
|
constructor() {
|
||||||
|
this.startTime = Date.now();
|
||||||
|
this.sentences = [];
|
||||||
|
this.currentText = '';
|
||||||
|
this.lastUpdateTime = Date.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
addOrUpdateText(text) {
|
||||||
|
this.currentText = text;
|
||||||
|
this.lastUpdateTime = Date.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
finalizeSentence() {
|
||||||
|
if (this.currentText.trim()) {
|
||||||
|
this.sentences.push({
|
||||||
|
text: this.currentText.trim(),
|
||||||
|
timestamp: new Date().toLocaleTimeString()
|
||||||
|
});
|
||||||
|
}
|
||||||
|
this.currentText = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
shouldStartNewSession() {
|
||||||
|
return Date.now() - this.lastUpdateTime > 10000; // 10 seconds of silence
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function formatOutput() {
|
||||||
|
clearConsole();
|
||||||
|
console.log('\n=== Automated Speech Recognition ===');
|
||||||
|
console.log(`Current Session #${sessionCount}`);
|
||||||
|
console.log('Time:', new Date().toLocaleTimeString());
|
||||||
|
console.log('------------------------');
|
||||||
|
|
||||||
|
// 显示历史句子
|
||||||
|
if (currentSession.sentences.length > 0) {
|
||||||
|
console.log('Recognized Sentences:');
|
||||||
|
currentSession.sentences.forEach((sentence, index) => {
|
||||||
|
console.log(`[${sentence.timestamp}] ${index + 1}. ${sentence.text}`);
|
||||||
|
});
|
||||||
|
console.log('------------------------');
|
||||||
|
}
|
||||||
|
|
||||||
|
// 显示当前正在识别的内容
|
||||||
|
if (currentSession.currentText) {
|
||||||
|
console.log('Recognizing:', currentSession.currentText);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const recognizer = createOnlineRecognizer();
|
||||||
|
const stream = recognizer.createStream();
|
||||||
|
let currentSession = new SpeechSession();
|
||||||
|
let sessionCount = 1;
|
||||||
|
|
||||||
|
function clearConsole() {
|
||||||
|
process.stdout.write('\x1B[2J\x1B[0f');
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
function exitHandler(options, exitCode) {
|
||||||
|
if (options.cleanup) {
|
||||||
|
console.log('\nCleaned up resources...');
|
||||||
|
micInstance.stop();
|
||||||
|
stream.free();
|
||||||
|
recognizer.free();
|
||||||
|
}
|
||||||
|
if (exitCode || exitCode === 0) console.log('Exit code:', exitCode);
|
||||||
|
if (options.exit) process.exit();
|
||||||
|
}
|
||||||
|
|
||||||
|
const micInstance = mic({
|
||||||
|
rate: recognizer.config.featConfig.sampleRate,
|
||||||
|
channels: 1,
|
||||||
|
debug: false, // 关闭调试输出
|
||||||
|
device: 'default',
|
||||||
|
bitwidth: 16,
|
||||||
|
encoding: 'signed-integer',
|
||||||
|
exitOnSilence: 6,
|
||||||
|
fileType: 'raw'
|
||||||
|
});
|
||||||
|
|
||||||
|
const micInputStream = micInstance.getAudioStream();
|
||||||
|
|
||||||
|
function startMic() {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
micInputStream.once('startComplete', () => {
|
||||||
|
console.log('Mic phone started.');
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
|
||||||
|
micInputStream.once('error', (err) => {
|
||||||
|
console.error('Mic phone start error:', err);
|
||||||
|
reject(err);
|
||||||
|
});
|
||||||
|
|
||||||
|
micInstance.start();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
micInputStream.on('data', buffer => {
|
||||||
|
const int16Array = new Int16Array(buffer.buffer);
|
||||||
|
const samples = new Float32Array(int16Array.length);
|
||||||
|
|
||||||
|
for (let i = 0; i < int16Array.length; i++) {
|
||||||
|
samples[i] = int16Array[i] / 32768.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
|
||||||
|
|
||||||
|
while (recognizer.isReady(stream)) {
|
||||||
|
recognizer.decode(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
const isEndpoint = recognizer.isEndpoint(stream);
|
||||||
|
const text = recognizer.getResult(stream).text;
|
||||||
|
|
||||||
|
if (text.length > 0) {
|
||||||
|
// 检查是否需要开始新会话
|
||||||
|
if (currentSession.shouldStartNewSession()) {
|
||||||
|
currentSession.finalizeSentence();
|
||||||
|
sessionCount++;
|
||||||
|
currentSession = new SpeechSession();
|
||||||
|
}
|
||||||
|
|
||||||
|
currentSession.addOrUpdateText(text);
|
||||||
|
formatOutput();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isEndpoint) {
|
||||||
|
if (text.length > 0) {
|
||||||
|
currentSession.finalizeSentence();
|
||||||
|
formatOutput();
|
||||||
|
}
|
||||||
|
recognizer.reset(stream);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
micInputStream.on('error', err => {
|
||||||
|
console.error('Audio stream error:', err);
|
||||||
|
});
|
||||||
|
|
||||||
|
micInputStream.on('close', () => {
|
||||||
|
console.log('Mic phone closed.');
|
||||||
|
});
|
||||||
|
|
||||||
|
process.on('exit', exitHandler.bind(null, {cleanup: true}));
|
||||||
|
process.on('SIGINT', exitHandler.bind(null, {exit: true}));
|
||||||
|
process.on('SIGUSR1', exitHandler.bind(null, {exit: true}));
|
||||||
|
process.on('SIGUSR2', exitHandler.bind(null, {exit: true}));
|
||||||
|
process.on('uncaughtException', exitHandler.bind(null, {exit: true}));
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
try {
|
||||||
|
console.log('Starting ...');
|
||||||
|
await startMic();
|
||||||
|
console.log('Initialized, waiting for speech ...');
|
||||||
|
formatOutput();
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Failed to initialize:', err);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main();
|
||||||
Reference in New Issue
Block a user