diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 0dd17982..c2d28381 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,13 @@ git status ls -lh ls -lh node_modules +# asr with offline dolphin ctc +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +node ./test-offline-dolphin-ctc.js +rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 + # speech enhancement curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav @@ -56,7 +63,7 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/m tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 rm matcha-icefall-en_US-ljspeech.tar.bz2 -wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx node ./test-offline-tts-matcha-en.js diff --git a/.github/workflows/wasm-simd-hf-space-vad-asr.yaml b/.github/workflows/wasm-simd-hf-space-vad-asr.yaml index 18c1c1d6..8daa3121 100644 --- a/.github/workflows/wasm-simd-hf-space-vad-asr.yaml +++ b/.github/workflows/wasm-simd-hf-space-vad-asr.yaml @@ -21,8 +21,8 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - total: ["8"] - index: ["0", "1", "2", "3", "4", "5", "6", "7"] + total: ["11"] + index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] steps: - uses: actions/checkout@v4 diff --git a/README.md b/README.md index f1bbee18..c7e67216 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ We also have spaces built using WebAssembly. They are listed below: |VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]| |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]| |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]| +|VAD + speech recognition (多语种及多种中文方言) with [Dolphin][Dolphin]-base |[Click me][wasm-hf-vad-asr-multi-lang-dolphin-base]| [地址][wasm-ms-vad-asr-multi-lang-dolphin-base]| |Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]| |Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]| |Speaker diarization |[Click me][wasm-hf-speaker-diarization]|[地址][wasm-ms-speaker-diarization]| @@ -390,6 +391,10 @@ It uses TTS from sherpa-onnx. 
See also [✨ Speak command that uses the new glob [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small +[Dolphin]: https://github.com/DataoceanAI/Dolphin +[wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc +[wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc + [wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en [wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en [wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 9b89dad3..64f7b184 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -140,6 +140,20 @@ node ./test-offline-tts-vits-zh.js In the following, we demonstrate how to decode files and how to perform speech recognition with a microphone with `nodejs`. +## ./test-offline-dolphin-ctc.js + +[./test-offline-dolphin-ctc.js](./test-offline-dolphin-ctc.js) demonstrates +how to decode a file with a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model. + +You can use the following command to run it: + +```bash +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 +node ./test-offline-dolphin-ctc.js +``` + ## ./test-offline-nemo-ctc.js [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates diff --git a/nodejs-examples/test-offline-dolphin-ctc.js b/nodejs-examples/test-offline-dolphin-ctc.js new file mode 100644 index 00000000..65ef6d44 --- /dev/null +++ b/nodejs-examples/test-offline-dolphin-ctc.js @@ -0,0 +1,37 @@ +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineRecognizer() { + let config = { + modelConfig: { + dolphin: { + model: + './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx', + }, + tokens: + './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt', + } + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + +const recognizer = createOfflineRecognizer(); +const stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); + +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); + +stream.free(); +recognizer.free(); diff --git a/scripts/wasm/generate-vad-asr.py b/scripts/wasm/generate-vad-asr.py index 6176e084..ab2cd722 100755 --- a/scripts/wasm/generate-vad-asr.py +++ b/scripts/wasm/generate-vad-asr.py @@ -197,6 +197,21 @@ def get_models(): git diff """, ), + 
Model( + model_name="sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc", + short_name="vad-asr-multi_lang-dolphin_ctc", + cmd=""" + pushd $model_name + mv model.int8.onnx ../dolphin.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's%Zipformer%Dolphin (多种中文方言及非常多种语言)%g' ../index.html + git diff + """, + ), ] return models diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 215e0f30..ebcda516 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -39,6 +39,10 @@ function freeConfig(config, Module) { freeConfig(config.fireRedAsr, Module) } + if ('dolphin' in config) { + freeConfig(config.dolphin, Module) + } + if ('moonshine' in config) { freeConfig(config.moonshine, Module) } @@ -562,6 +566,23 @@ function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) { } } +function initSherpaOnnxOfflineDolphinModelConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model || '') + 1; + + const buffer = Module._malloc(n); + + const len = 1 * 4; // 1 pointer + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model || '', buffer, n); + + Module.setValue(ptr, buffer, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; @@ -769,6 +790,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { }; } + if (!('dolphin' in config)) { + config.dolphin = { + model: '', + }; + } + if (!('whisper' in config)) { config.whisper = { encoder: '', @@ -832,8 +859,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const fireRedAsr = initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module); + const dolphin = + initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module); + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + - tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len; + tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len + + dolphin.len; const ptr = Module._malloc(len); @@ -932,10 +963,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset); offset += fireRedAsr.len; + Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset); + offset += dolphin.len; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, - senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr + senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr, + dolphin: dolphin } } diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index 4dc68c42..3dc5611d 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -13,6 +13,7 @@ extern "C" { static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); +static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); 
static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, ""); @@ -29,7 +30,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 + sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + sizeof(SherpaOnnxOfflineMoonshineModelConfig) + - sizeof(SherpaOnnxOfflineFireRedAsrModelConfig), + sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) + + sizeof(SherpaOnnxOfflineDolphinModelConfig), ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); @@ -73,6 +75,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { auto sense_voice = &model_config->sense_voice; auto moonshine = &model_config->moonshine; auto fire_red_asr = &model_config->fire_red_asr; + auto dolphin = &model_config->dolphin; fprintf(stdout, "----------offline transducer model config----------\n"); fprintf(stdout, "encoder: %s\n", transducer->encoder); @@ -110,6 +113,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "encoder: %s\n", fire_red_asr->encoder); fprintf(stdout, "decoder: %s\n", fire_red_asr->decoder); + fprintf(stdout, "----------offline Dolphin model config----------\n"); + fprintf(stdout, "model: %s\n", dolphin->model); + fprintf(stdout, "tokens: %s\n", model_config->tokens); fprintf(stdout, "num_threads: %d\n", model_config->num_threads); fprintf(stdout, "provider: %s\n", model_config->provider); diff --git a/wasm/vad-asr/app-vad-asr.js b/wasm/vad-asr/app-vad-asr.js index d116a218..159e48cc 100644 --- a/wasm/vad-asr/app-vad-asr.js +++ b/wasm/vad-asr/app-vad-asr.js @@ -15,7 +15,7 @@ let resultList = []; clearBtn.onclick = function() { resultList = []; textArea.value = getDisplayResult(); - textArea.scrollTop = textArea.scrollHeight; // auto scroll + textArea.scrollTop = textArea.scrollHeight; // auto scroll }; function getDisplayResult() { @@ -46,11 +46,11 @@ let audioCtx; let mediaStream; let expectedSampleRate = 16000; -let recordSampleRate; // the sampleRate of the microphone -let recorder = null; // the microphone -let leftchannel = []; // TODO: Use a single channel +let recordSampleRate; // the sampleRate of the microphone +let recorder = null; // the microphone +let leftchannel = []; // TODO: Use a single channel -let recordingLength = 0; // number of samples so far +let recordingLength = 0; // number of samples so far let vad = null; let buffer = null; @@ -73,48 +73,50 @@ function createOfflineRecognizerSenseVoice() {} function initOfflineRecognizer() { let config = { - modelConfig : { - debug : 1, - tokens : './tokens.txt', + modelConfig: { + debug: 1, + tokens: './tokens.txt', }, }; if (fileExists('sense-voice.onnx') == 1) { config.modelConfig.senseVoice = { - model : './sense-voice.onnx', - useInverseTextNormalization : 1, + model: './sense-voice.onnx', + useInverseTextNormalization: 1, }; } else if (fileExists('whisper-encoder.onnx')) { config.modelConfig.whisper = { - encoder : './whisper-encoder.onnx', - decoder : './whisper-decoder.onnx', + encoder: './whisper-encoder.onnx', + decoder: './whisper-decoder.onnx', }; } else if (fileExists('transducer-encoder.onnx')) { config.modelConfig.transducer = { - encoder : './transducer-encoder.onnx', - decoder : './transducer-decoder.onnx', - joiner : './transducer-joiner.onnx', + encoder: './transducer-encoder.onnx', + decoder: './transducer-decoder.onnx', + joiner: './transducer-joiner.onnx', }; config.modelConfig.modelType = 'transducer'; } else if (fileExists('nemo-transducer-encoder.onnx')) { 
     config.modelConfig.transducer = {
-      encoder : './nemo-transducer-encoder.onnx',
-      decoder : './nemo-transducer-decoder.onnx',
-      joiner : './nemo-transducer-joiner.onnx',
+      encoder: './nemo-transducer-encoder.onnx',
+      decoder: './nemo-transducer-decoder.onnx',
+      joiner: './nemo-transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'nemo_transducer';
   } else if (fileExists('paraformer.onnx')) {
     config.modelConfig.paraformer = {
-      model : './paraformer.onnx',
+      model: './paraformer.onnx',
     };
   } else if (fileExists('telespeech.onnx')) {
     config.modelConfig.telespeechCtc = './telespeech.onnx';
   } else if (fileExists('moonshine-preprocessor.onnx')) {
     config.modelConfig.moonshine = {
-      preprocessor : './moonshine-preprocessor.onnx',
-      encoder : './moonshine-encoder.onnx',
-      uncachedDecoder : './moonshine-uncached-decoder.onnx',
-      cachedDecoder : './moonshine-cached-decoder.onnx'
+      preprocessor: './moonshine-preprocessor.onnx',
+      encoder: './moonshine-encoder.onnx',
+      uncachedDecoder: './moonshine-uncached-decoder.onnx',
+      cachedDecoder: './moonshine-cached-decoder.onnx'
     };
+  } else if (fileExists('dolphin.onnx')) {
+    config.modelConfig.dolphin = {model: './dolphin.onnx'};
   } else {
     console.log('Please specify a model.');
     alert('Please specify a model.');
@@ -133,7 +135,7 @@ Module.locateFile = function(path, scriptDirectory = '') {
 Module.setStatus = function(status) {
   console.log(`status ${status}`);
   const statusElement = document.getElementById('status');
-  if (status == "Running...") {
+  if (status == 'Running...') {
     status = 'Model downloaded. Initializing recognizer...'
   }
   statusElement.textContent = status;
@@ -170,11 +172,11 @@ if (navigator.mediaDevices.getUserMedia) {
   console.log('getUserMedia supported.');
 
   // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
-  const constraints = {audio : true};
+  const constraints = {audio: true};
 
   let onSuccess = function(stream) {
     if (!audioCtx) {
-      audioCtx = new AudioContext({sampleRate : expectedSampleRate});
+      audioCtx = new AudioContext({sampleRate: expectedSampleRate});
     }
     console.log(audioCtx);
     recordSampleRate = audioCtx.sampleRate;
@@ -299,7 +301,7 @@ if (navigator.mediaDevices.getUserMedia) {
     }
 
     textArea.value = getDisplayResult();
-    textArea.scrollTop = textArea.scrollHeight; // auto scroll
+    textArea.scrollTop = textArea.scrollHeight;  // auto scroll
   };
 
   startBtn.onclick = function() {
@@ -330,8 +332,9 @@ if (navigator.mediaDevices.getUserMedia) {
     };
   };
 
-  let onError = function(
-      err) { console.log('The following error occured: ' + err); };
+  let onError = function(err) {
+    console.log('The following error occurred: ' + err);
+  };
 
   navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
 } else {
@@ -364,22 +367,22 @@ function toWav(samples) {
   // http://soundfile.sapp.org/doc/WaveFormat/
 
   // F F I R
-  view.setUint32(0, 0x46464952, true); // chunkID
-  view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
+  view.setUint32(0, 0x46464952, true);               // chunkID
+  view.setUint32(4, 36 + samples.length * 2, true);  // chunkSize
   // E V A W
-  view.setUint32(8, 0x45564157, true); // format
-  //
+  view.setUint32(8, 0x45564157, true);  // format
+  //
   // t m f
-  view.setUint32(12, 0x20746d66, true); // subchunk1ID
-  view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
-  view.setUint32(20, 1, true); // audioFormat, 1 for PCM
-  view.setUint16(22, 1, true); // numChannels: 1 channel
-  view.setUint32(24, expectedSampleRate, true); // sampleRate
-  view.setUint32(28, expectedSampleRate * 2, true); // byteRate
-  view.setUint16(32, 2, true); // blockAlign
-  view.setUint16(34, 16, true); // bitsPerSample
-  view.setUint32(36, 0x61746164, true); // Subchunk2ID
-  view.setUint32(40, samples.length * 2, true); // subchunk2Size
+  view.setUint32(12, 0x20746d66, true);  // subchunk1ID
+  view.setUint32(16, 16, true);          // subchunk1Size, 16 for PCM
+  view.setUint32(20, 1, true);           // audioFormat, 1 for PCM
+  view.setUint16(22, 1, true);           // numChannels: 1 channel
+  view.setUint32(24, expectedSampleRate, true);      // sampleRate
+  view.setUint32(28, expectedSampleRate * 2, true);  // byteRate
+  view.setUint16(32, 2, true);           // blockAlign
+  view.setUint16(34, 16, true);          // bitsPerSample
+  view.setUint32(36, 0x61746164, true);  // Subchunk2ID
+  view.setUint32(40, samples.length * 2, true);  // subchunk2Size
 
   let offset = 44;
   for (let i = 0; i < samples.length; ++i) {
@@ -387,7 +390,7 @@ function toWav(samples) {
     offset += 2;
   }
 
-  return new Blob([ view ], {type : 'audio/wav'});
+  return new Blob([view], {type: 'audio/wav'});
 }
 
 // this function is copied from
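For readers skimming the patch, the complete decode path that the new `dolphin` config enables via the `sherpa-onnx` npm package is condensed below. This is a minimal sketch of what `nodejs-examples/test-offline-dolphin-ctc.js` above already does, assuming the model tarball has been downloaded and unpacked as in the README instructions; every call used here (`createOfflineRecognizer`, `readWave`, `createStream`, `acceptWaveform`, `decode`, `getResult`, `free`) appears in the diff itself.

```js
// Minimal sketch: decode one file with the new offline Dolphin CTC config.
const sherpa_onnx = require('sherpa-onnx');

const dir = './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02';

// Only `dolphin.model` and `tokens` are needed; the `dolphin` entry maps to
// the one-pointer SherpaOnnxOfflineDolphinModelConfig struct on the C side.
const recognizer = sherpa_onnx.createOfflineRecognizer({
  modelConfig: {
    dolphin: {model: `${dir}/model.int8.onnx`},
    tokens: `${dir}/tokens.txt`,
  },
});

const wave = sherpa_onnx.readWave(`${dir}/test_wavs/0.wav`);
const stream = recognizer.createStream();
stream.acceptWaveform(wave.sampleRate, wave.samples);

recognizer.decode(stream);
console.log(recognizer.getResult(stream).text);

stream.free();      // release native resources when done
recognizer.free();
```

This is also why the wasm-side changes are so small: `initSherpaOnnxOfflineDolphinModelConfig` marshals a single model-path pointer (4 bytes on wasm32), and the new `static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "")` pins that layout against the C API.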