diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md
index 4faf217e..fa98a5b7 100644
--- a/nodejs-examples/README.md
+++ b/nodejs-examples/README.md
@@ -1,6 +1,8 @@
 # Introduction
 
-Note: You need `Node >= 18`.
+Note: You need `Node >= 18`.
+
+Note: On Apple Silicon Macs (M1 and later chips), please use the example `test-online-paraformer-microphone-mic.js`.
 
 This directory contains nodejs examples for
 [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
@@ -278,6 +280,26 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
 node ./test-online-paraformer-microphone.js
 ```
 
+
+## ./test-online-paraformer-microphone-mic.js
+
+[./test-online-paraformer-microphone-mic.js](./test-online-paraformer-microphone-mic.js)
+demonstrates how to do real-time speech recognition from a microphone
+with a streaming Paraformer model. In the code we use
+[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
+
+It uses [mic](https://www.npmjs.com/package/mic) for better compatibility; please check its npm page before running it.
+
+You can use the following commands to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+node ./test-online-paraformer-microphone-mic.js
+```
+
+
 ## ./test-online-paraformer.js
 
 [./test-online-paraformer.js](./test-online-paraformer.js) demonstrates how to decode a file using a streaming Paraformer model. In the code we use
diff --git a/nodejs-examples/package.json b/nodejs-examples/package.json
index babb8587..97dbb34b 100644
--- a/nodejs-examples/package.json
+++ b/nodejs-examples/package.json
@@ -1,5 +1,6 @@
 {
   "dependencies": {
+    "mic": "^2.1.2",
     "naudiodon2": "^2.4.0",
     "sherpa-onnx": "^1.10.45",
     "wav": "^1.0.2"
diff --git a/nodejs-examples/test-online-paraformer-microphone-mic.js b/nodejs-examples/test-online-paraformer-microphone-mic.js
new file mode 100644
index 00000000..b03a4e60
--- /dev/null
+++ b/nodejs-examples/test-online-paraformer-microphone-mic.js
@@ -0,0 +1,205 @@
+// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
+const mic = require('mic');  // `mic` offers better compatibility; see https://www.npmjs.com/package/mic
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOnlineRecognizer() {
+  let onlineParaformerModelConfig = {
+    encoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
+    decoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
+  };
+
+  let onlineModelConfig = {
+    paraformer: onlineParaformerModelConfig,
+    tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
+  };
+
+  let recognizerConfig = {
+    modelConfig: onlineModelConfig,
+    enableEndpoint: 1,
+    rule1MinTrailingSilence: 2.4,
+    rule2MinTrailingSilence: 1.2,
+    rule3MinUtteranceLength: 20,
+  };
+
+  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
+}
+
+/**
+ * SpeechSession manages one recognition session; formatOutput renders it.
+ * Sample output:
+=== Automated Speech Recognition ===
+Current Session #1
+Time: 8:44:46 PM
+------------------------
+Recognized Sentences:
+[8:44:43 PM] 1. it's so great three result is great great 她还支持中文
+[8:44:46 PM] 2. 很厉
+------------------------
+Recognizing: 真的很厉害太厉害
+
+*/
+class SpeechSession {
+  constructor() {
+    this.startTime = Date.now();
+    this.sentences = [];
+    this.currentText = '';
+    this.lastUpdateTime = Date.now();
+  }
+
+  addOrUpdateText(text) {
+    this.currentText = text;
+    this.lastUpdateTime = Date.now();
+  }
+
+  finalizeSentence() {
+    if (this.currentText.trim()) {
+      this.sentences.push({
+        text: this.currentText.trim(),
+        timestamp: new Date().toLocaleTimeString()
+      });
+    }
+    this.currentText = '';
+  }
+
+  shouldStartNewSession() {
+    return Date.now() - this.lastUpdateTime > 10000;  // 10 seconds of silence
+  }
+}
+
+function formatOutput() {
+  clearConsole();
+  console.log('\n=== Automated Speech Recognition ===');
+  console.log(`Current Session #${sessionCount}`);
+  console.log('Time:', new Date().toLocaleTimeString());
+  console.log('------------------------');
+
+  // Display the finalized sentences of this session
+  if (currentSession.sentences.length > 0) {
+    console.log('Recognized Sentences:');
+    currentSession.sentences.forEach((sentence, index) => {
+      console.log(`[${sentence.timestamp}] ${index + 1}. ${sentence.text}`);
+    });
+    console.log('------------------------');
+  }
+
+  // Display the text that is still being recognized
+  if (currentSession.currentText) {
+    console.log('Recognizing:', currentSession.currentText);
+  }
+}
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+let currentSession = new SpeechSession();
+let sessionCount = 1;
+
+function clearConsole() {
+  process.stdout.write('\x1B[2J\x1B[0f');
+}
+
+function exitHandler(options, exitCode) {
+  if (options.cleanup) {
+    console.log('\nCleaned up resources...');
+    micInstance.stop();
+    stream.free();
+    recognizer.free();
+  }
+  if (exitCode || exitCode === 0) console.log('Exit code:', exitCode);
+  if (options.exit) process.exit();
+}
+
+const micInstance = mic({
+  rate: recognizer.config.featConfig.sampleRate,
+  channels: 1,
+  debug: false,  // disable debug output
+  device: 'default',
+  bitwidth: 16,
+  encoding: 'signed-integer',
+  exitOnSilence: 6,
+  fileType: 'raw'
+});
+
+const micInputStream = micInstance.getAudioStream();
+
+function startMic() {
+  return new Promise((resolve, reject) => {
+    micInputStream.once('startComplete', () => {
+      console.log('Microphone started.');
+      resolve();
+    });
+
+    micInputStream.once('error', (err) => {
+      console.error('Microphone start error:', err);
+      reject(err);
+    });
+
+    micInstance.start();
+  });
+}
+
+micInputStream.on('data', buffer => {
+  // Reinterpret the raw bytes as 16-bit signed PCM and normalize to [-1, 1)
+  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.length / 2);
+  const samples = new Float32Array(int16Array.length);
+
+  for (let i = 0; i < int16Array.length; i++) {
+    samples[i] = int16Array[i] / 32768.0;
+  }
+
+  stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
+
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  const text = recognizer.getResult(stream).text;
+
+  if (text.length > 0) {
+    // Start a new session after a long pause
+    if (currentSession.shouldStartNewSession()) {
+      currentSession.finalizeSentence();
+      sessionCount++;
+      currentSession = new SpeechSession();
+    }
+
+    currentSession.addOrUpdateText(text);
+    formatOutput();
+  }
+
+  if (isEndpoint) {
+    if (text.length > 0) {
+      currentSession.finalizeSentence();
+      formatOutput();
+    }
+    recognizer.reset(stream);
+  }
+});
+
+micInputStream.on('error', err => {
+  console.error('Audio stream error:', err);
+});
+
+micInputStream.on('close', () => {
+  console.log('Microphone closed.');
+});
+
+process.on('exit', exitHandler.bind(null, {cleanup: true}));
+process.on('SIGINT', exitHandler.bind(null, {exit: true}));
+process.on('SIGUSR1', exitHandler.bind(null, {exit: true}));
+process.on('SIGUSR2', exitHandler.bind(null, {exit: true}));
+process.on('uncaughtException', exitHandler.bind(null, {exit: true}));
+
+async function main() {
+  try {
+    console.log('Starting ...');
+    await startMic();
+    console.log('Initialized, waiting for speech ...');
+    formatOutput();
+  } catch (err) {
+    console.error('Failed to initialize:', err);
+    process.exit(1);
+  }
+}
+
+main();
\ No newline at end of file
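
Note on the example above (not part of the diff): the `data` handler reinterprets each raw `mic` Buffer as 16-bit signed PCM, normalizes it to `Float32` samples in [-1, 1), feeds them to `acceptWaveform()`, and drains the decoder with `isReady()`/`decode()` before checking `isEndpoint()`. The sketch below exercises the same conversion and decode loop from a raw PCM file instead of a microphone, which is handy for verifying the model setup on a machine without audio hardware. The file name `test.raw` and the 0.2-second chunk size are assumptions for illustration only; the model paths and sherpa-onnx calls are the ones used in the diff.

```js
// Minimal sketch, assuming a hypothetical file `test.raw` that contains
// 16-bit signed little-endian mono PCM at the model's sample rate (16 kHz).
const fs = require('fs');
const sherpa_onnx = require('sherpa-onnx');

const recognizer = sherpa_onnx.createOnlineRecognizer({
  modelConfig: {
    paraformer: {
      encoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
      decoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
    },
    tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
  },
});
const stream = recognizer.createStream();
const sampleRate = recognizer.config.featConfig.sampleRate;

// Same Buffer -> Float32 conversion as in test-online-paraformer-microphone-mic.js
const buffer = fs.readFileSync('./test.raw');  // hypothetical input file
const int16 = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.length / 2);
const samples = new Float32Array(int16.length);
for (let i = 0; i < int16.length; i++) {
  samples[i] = int16[i] / 32768.0;
}

// Feed the audio in 0.2 s chunks to mimic streaming `data` events
const chunkSize = 0.2 * sampleRate;
for (let offset = 0; offset < samples.length; offset += chunkSize) {
  stream.acceptWaveform(sampleRate, samples.subarray(offset, offset + chunkSize));
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }
}
console.log(recognizer.getResult(stream).text);

stream.free();
recognizer.free();
```

A suitable `test.raw` can be produced with, for example, `sox input.wav -t raw -r 16000 -e signed-integer -b 16 -c 1 test.raw`.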