diff --git a/.github/workflows/npm.yaml b/.github/workflows/npm.yaml index f2ba35e3..c608a307 100644 --- a/.github/workflows/npm.yaml +++ b/.github/workflows/npm.yaml @@ -9,6 +9,7 @@ concurrency: permissions: contents: read + id-token: write jobs: nodejs: @@ -20,10 +21,20 @@ jobs: python-version: ["3.8"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: @@ -31,28 +42,38 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: 13 registry-url: 'https://registry.npmjs.org' - name: Display node version shell: bash run: | node --version - npm --version - cd nodejs-examples - - npm install npm@6.14.4 -g - npm install npm@6.14.4 - npm --version - name: Build nodejs package shell: bash env: NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} run: | + ./build-wasm-simd-nodejs.sh + cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.js ./scripts/nodejs/ + cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.wasm ./scripts/nodejs/ + + SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" + cd scripts/nodejs - ./run.sh + + owner=${{ github.repository_owner }} + echo "owner: $owner" + + sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g ./package.json + sed -i.bak s/k2-fsa/$owner/g ./package.json + + rm package.json.bak + + git diff + npm install - rm run.sh npm ci + # see https://docs.npmjs.com/generating-provenance-statements npm publish --provenance --access public diff --git a/.github/workflows/test-nodejs-npm.yaml b/.github/workflows/test-nodejs-npm.yaml index 50c17ac0..cc49ac0c 100644 --- a/.github/workflows/test-nodejs-npm.yaml +++ b/.github/workflows/test-nodejs-npm.yaml @@ 
-40,7 +40,6 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: 13 registry-url: 'https://registry.npmjs.org' - name: Display node version diff --git a/.github/workflows/test-nodejs.yaml b/.github/workflows/test-nodejs.yaml index 2a24b7bd..5aed0955 100644 --- a/.github/workflows/test-nodejs.yaml +++ b/.github/workflows/test-nodejs.yaml @@ -24,7 +24,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest] #, windows-2019] + os: [ubuntu-latest] #, macos-latest] #, windows-2019] python-version: ["3.8"] steps: @@ -32,49 +32,38 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2 - with: - key: ${{ matrix.os }}-Release-ON + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 - - name: Configure CMake + - name: View emsdk version shell: bash run: | - export CMAKE_CXX_COMPILER_LAUNCHER=ccache - export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" - cmake --version - - mkdir build - cd build - cmake -D CMAKE_BUILD_TYPE=Release -D BUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=./install .. - cmake --build . 
--target install --config Release - - ls -lh install/lib + emcc -v + echo "--------------------" + emcc --check - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Copy files + - uses: actions/setup-node@v4 + with: + registry-url: 'https://registry.npmjs.org' + + - name: Display node version shell: bash run: | - os=${{ matrix.os }} - if [[ $os == 'ubuntu-latest' ]]; then - mkdir -p scripts/nodejs/lib/linux-x64 - dst=scripts/nodejs/lib/linux-x64 - elif [[ $os == 'macos-latest' ]]; then - mkdir -p scripts/nodejs/lib/osx-x64 - dst=scripts/nodejs/lib/osx-x64 - elif [[ $os == 'windows-2019' ]]; then - mkdir -p scripts/nodejs/lib/win-x64 - dst=scripts/nodejs/lib/win-x64 - fi - ls -lh build/install/lib/ + node --version - rm -rf build/install/lib/pkgconfig - - cp -v build/install/lib/* $dst/ + - name: Build nodejs package + shell: bash + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + run: | + ./build-wasm-simd-nodejs.sh + cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.js ./scripts/nodejs/ + cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.wasm ./scripts/nodejs/ - name: replace files shell: bash @@ -89,17 +78,6 @@ jobs: git diff cp *.js ../scripts/nodejs - - uses: actions/setup-node@v4 - with: - node-version: 13 - registry-url: 'https://registry.npmjs.org' - - - name: Display node version - shell: bash - run: | - node --version - npm --version - - name: Run tests shell: bash run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e8e67e6..e890bfb7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF) option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) +option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) 
option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON) @@ -108,6 +109,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}") message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") +message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") if(SHERPA_ONNX_ENABLE_WASM_TTS) if(NOT SHERPA_ONNX_ENABLE_WASM) @@ -121,6 +123,12 @@ if(SHERPA_ONNX_ENABLE_WASM_ASR) endif() endif() +if(SHERPA_ONNX_ENABLE_WASM_NODEJS) + if(NOT SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for NodeJS") + endif() +endif() + if(SHERPA_ONNX_ENABLE_WASM) add_definitions(-DSHERPA_ONNX_ENABLE_WASM=1) endif() diff --git a/build-wasm-simd-nodejs.sh b/build-wasm-simd-nodejs.sh new file mode 100755 index 00000000..21a3b25d --- /dev/null +++ b/build-wasm-simd-nodejs.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Xiaomi Corporation +# +# This script is to build sherpa-onnx for WebAssembly (NodeJS) +# +# Please use NodeJS >= 18 + +set -ex + +if [ x"$EMSCRIPTEN" == x"" ]; then + if ! command -v emcc &> /dev/null; then + echo "Please install emscripten first" + echo "" + echo "You can use the following commands to install it:" + echo "" + echo "git clone https://github.com/emscripten-core/emsdk.git" + echo "cd emsdk" + echo "git pull" + echo "./emsdk install latest" + echo "./emsdk activate latest" + echo "source ./emsdk_env.sh" + exit 1 + else + EMSCRIPTEN=$(dirname $(realpath $(which emcc))) + fi +fi + +export EMSCRIPTEN=$EMSCRIPTEN +echo "EMSCRIPTEN: $EMSCRIPTEN" +if [ ! 
-f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" + echo "Please make sure you have installed emsdk correctly" + exit 1 +fi + +mkdir -p build-wasm-simd-nodejs +pushd build-wasm-simd-nodejs + +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON + +cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \ + \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DSHERPA_ONNX_ENABLE_GPU=OFF \ + -DSHERPA_ONNX_ENABLE_WASM=ON \ + -DSHERPA_ONNX_ENABLE_WASM_NODEJS=ON \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \ + .. +make -j10 +make install + +ls -lh install/bin/wasm/nodejs diff --git a/nodejs-examples/.gitignore b/nodejs-examples/.gitignore index 1c2d5f33..2ed39bf7 100644 --- a/nodejs-examples/.gitignore +++ b/nodejs-examples/.gitignore @@ -1,3 +1,4 @@ node_modules lib package-lock.json +*.tar.bz2 diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index ead8b529..f2dc14c9 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -2,38 +2,18 @@ This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx). -Before you continue, please first install the npm package `sherpa-onnx` by +Before you continue, please first run ```bash -npm install sherpa-onnx +cd ./nodejs-examples + +npm i ``` In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) for text-to-speech and speech-to-text. 
-**Caution**: If you get the following error: -``` -/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67 - if (match = err.match(/^(([^ \t()])+\.so([^ \t:()])*):([ \t])*/)) { - ^ - -TypeError: Cannot read properties of null (reading 'match') - at new DynamicLibrary (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67:21) - at Object.Library (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/library.js:47:10) - at Object. (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/sherpa-onnx3/index.js:268:28) - at Module._compile (node:internal/modules/cjs/loader:1376:14) - at Module._extensions..js (node:internal/modules/cjs/loader:1435:10) - at Module.load (node:internal/modules/cjs/loader:1207:32) - at Module._load (node:internal/modules/cjs/loader:1023:12) - at Module.require (node:internal/modules/cjs/loader:1235:19) - at require (node:internal/modules/helpers:176:18) - at Object. (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/test-offline-tts-zh.js:3:21) -``` - -Please downgrade your node to version v13.14.0. See also -https://github.com/node-ffi-napi/node-ffi-napi/issues/244 -and -https://github.com/node-ffi-napi/node-ffi-napi/issues/97 . +Note: You need `Node >= 18`. # Text-to-speech @@ -71,13 +51,7 @@ node ./test-offline-tts-zh.js # Speech-to-text In the following, we demonstrate how to decode files and how to perform -speech recognition with a microphone with `nodejs`. We need to install two additional -npm packages: - - -```bash -npm install wav naudiodon2 -``` +speech recognition with a microphone with `nodejs`. 
## ./test-offline-nemo-ctc.js @@ -200,60 +174,3 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 node ./test-online-zipformer2-ctc.js ``` - -## ./test-vad-microphone-offline-paraformer.js - -[./test-vad-microphone-offline-paraformer.js](./test-vad-microphone-offline-paraformer.js) -demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad) -with non-streaming Paraformer for speech recognition from microphone. - -You can use the following command to run it: - -```bash -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 -tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 -node ./test-vad-microphone-offline-paraformer.js -``` - -## ./test-vad-microphone-offline-transducer.js - -[./test-vad-microphone-offline-transducer.js](./test-vad-microphone-offline-transducer.js) -demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad) -with a non-streaming transducer model for speech recognition from microphone. - -You can use the following command to run it: - -```bash -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 -tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 -node ./test-vad-microphone-offline-transducer.js -``` - -## ./test-vad-microphone-offline-whisper.js - -[./test-vad-microphone-offline-whisper.js](./test-vad-microphone-offline-whisper.js) -demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad) -with whisper for speech recognition from microphone. 
- -You can use the following command to run it: - -```bash -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 -tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 -node ./test-vad-microphone-offline-whisper.js -``` - -## ./test-vad-microphone.js - -[./test-vad-microphone.js](./test-vad-microphone.js) -demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad). - -You can use the following command to run it: - -```bash -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx -node ./test-vad-microphone.js -``` diff --git a/nodejs-examples/package.json b/nodejs-examples/package.json index 096928bf..003f4d47 100644 --- a/nodejs-examples/package.json +++ b/nodejs-examples/package.json @@ -1,7 +1,7 @@ { "dependencies": { "naudiodon2": "^2.4.0", - "sherpa-onnx": "^1.8.12", + "sherpa-onnx": "*", "wav": "^1.0.2" } } diff --git a/nodejs-examples/test-offline-nemo-ctc.js b/nodejs-examples/test-offline-nemo-ctc.js index 1cef7169..46fb869a 100644 --- a/nodejs-examples/test-offline-nemo-ctc.js +++ b/nodejs-examples/test-offline-nemo-ctc.js @@ -1,4 +1,4 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) // const fs = require('fs'); const {Readable} = require('stream'); @@ -6,32 +6,58 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOfflineRecognizer() { + let featConfig = { + sampleRate: 16000, + featureDim: 80, + }; - // test online recognizer - const nemoCtc = new sherpa_onnx.OfflineNemoEncDecCtcModelConfig(); - nemoCtc.model = './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx'; - const tokens = 
'./sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt'; + let modelConfig = { + transducer: { + encoder: '', + decoder: '', + joiner: '', + }, + paraformer: { + model: '', + }, + nemoCtc: { + model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx', + }, + whisper: { + encoder: '', + decoder: '', + }, + tdnn: { + model: '', + }, + tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt', + numThreads: 1, + debug: 0, + provider: 'cpu', + modelType: 'nemo_ctc', + }; - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.nemoCtc = nemoCtc; - modelConfig.tokens = tokens; - modelConfig.modelType = 'nemo_ctc'; + let lmConfig = { + model: '', + scale: 1.0, + }; - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let config = { + featConfig: featConfig, + modelConfig: modelConfig, + lmConfig: lmConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + hotwordsFile: '', + hotwordsScore: 1.5, + }; - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer; + return sherpa_onnx.createOfflineRecognizer(config); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); +const recognizer = createOfflineRecognizer(); +const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav'; @@ -72,8 +98,8 @@ fs.createReadStream(waveFilename, {highWaterMark: 4096}) stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); recognizer.decode(stream); - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); stream.free(); recognizer.free(); diff --git a/nodejs-examples/test-offline-paraformer.js b/nodejs-examples/test-offline-paraformer.js index c96977b4..a7d6b63e 100644 --- 
a/nodejs-examples/test-offline-paraformer.js +++ b/nodejs-examples/test-offline-paraformer.js @@ -6,32 +6,59 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOfflineRecognizer() { + let featConfig = { + sampleRate: 16000, + featureDim: 80, + }; - // test online recognizer - const paraformer = new sherpa_onnx.OfflineParaformerModelConfig(); - paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.onnx'; - const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt'; + let modelConfig = { + transducer: { + encoder: '', + decoder: '', + joiner: '', + }, + paraformer: { + model: './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx', + }, + nemoCtc: { + model: '', + }, + whisper: { + encoder: '', + decoder: '', + }, + tdnn: { + model: '', + }, + tokens: './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt', + numThreads: 1, + debug: 0, + provider: 'cpu', + modelType: 'paraformer', + }; - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.paraformer = paraformer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'paraformer'; + let lmConfig = { + model: '', + scale: 1.0, + }; - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let config = { + featConfig: featConfig, + modelConfig: modelConfig, + lmConfig: lmConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + hotwordsFile: '', + hotwordsScore: 1.5, + }; - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer; + return sherpa_onnx.createOfflineRecognizer(config); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); + +const recognizer = 
createOfflineRecognizer(); +const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav'; @@ -71,8 +98,8 @@ fs.createReadStream(waveFilename, {'highWaterMark': 4096}) stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); recognizer.decode(stream); - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); stream.free(); recognizer.free(); diff --git a/nodejs-examples/test-offline-transducer.js b/nodejs-examples/test-offline-transducer.js index d86cb67b..46bdf23d 100644 --- a/nodejs-examples/test-offline-transducer.js +++ b/nodejs-examples/test-offline-transducer.js @@ -6,37 +6,60 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOfflineRecognizer() { + let featConfig = { + sampleRate: 16000, + featureDim: 80, + }; - // test online recognizer - const transducer = new sherpa_onnx.OfflineTransducerModelConfig(); - transducer.encoder = - './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx'; - transducer.decoder = - './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx'; - transducer.joiner = - './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx'; - const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt'; + let modelConfig = { + transducer: { + encoder: + './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx', + decoder: + './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx', + joiner: + './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx', + }, + paraformer: { + model: '', + }, + nemoCtc: { + model: '', + }, + whisper: { + encoder: '', + decoder: '', + }, + tdnn: { + model: '', + }, + tokens: 
'./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt', + numThreads: 1, + debug: 0, + provider: 'cpu', + modelType: 'transducer', + }; - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.transducer = transducer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'transducer'; + let lmConfig = { + model: '', + scale: 1.0, + }; - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let config = { + featConfig: featConfig, + modelConfig: modelConfig, + lmConfig: lmConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + hotwordsFile: '', + hotwordsScore: 1.5, + }; - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer; + return sherpa_onnx.createOfflineRecognizer(config); } - -recognizer = createRecognizer(); -stream = recognizer.createStream(); +const recognizer = createOfflineRecognizer(); +const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav'; @@ -76,8 +99,8 @@ fs.createReadStream(waveFilename, {'highWaterMark': 4096}) stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); recognizer.decode(stream); - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); stream.free(); recognizer.free(); diff --git a/nodejs-examples/test-offline-tts-en.js b/nodejs-examples/test-offline-tts-en.js index 8f0e0c02..a8778017 100644 --- a/nodejs-examples/test-offline-tts-en.js +++ b/nodejs-examples/test-offline-tts-en.js @@ -1,28 +1,45 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) const sherpa_onnx = require('sherpa-onnx'); function createOfflineTts() { - const vits = new 
sherpa_onnx.OfflineTtsVitsModelConfig(); - vits.model = 'vits-piper-en_US-amy-low/en_US-amy-low.onnx' - vits.tokens = './vits-piper-en_US-amy-low/tokens.txt'; - vits.dataDir = './vits-piper-en_US-amy-low/espeak-ng-data' + let offlineTtsVitsModelConfig = { + model: './vits-piper-en_US-amy-low/en_US-amy-low.onnx', + lexicon: '', + tokens: './vits-piper-en_US-amy-low/tokens.txt', + dataDir: './vits-piper-en_US-amy-low/espeak-ng-data', + noiseScale: 0.667, + noiseScaleW: 0.8, + lengthScale: 1.0, + }; + let offlineTtsModelConfig = { + offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, + numThreads: 1, + debug: 1, + provider: 'cpu', + }; - const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); - modelConfig.vits = vits; + let offlineTtsConfig = { + offlineTtsModelConfig: offlineTtsModelConfig, + ruleFsts: '', + maxNumSentences: 1, + }; - const config = new sherpa_onnx.OfflineTtsConfig(); - config.model = modelConfig; - - return new sherpa_onnx.OfflineTts(config); + return sherpa_onnx.createOfflineTts(offlineTtsConfig); } + const tts = createOfflineTts(); const speakerId = 0; const speed = 1.0; -const audio = tts.generate( - '“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”', - speakerId, speed); -audio.save('./test-en.wav'); +const audio = tts.generate({ + text: + '“Today as always, men fall into two groups: slaves and free men. 
Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”', + sid: speakerId, + speed: speed +}); + +tts.save('./test-en.wav', audio); console.log('Saved to test-en.wav successfully.'); + tts.free(); diff --git a/nodejs-examples/test-offline-tts-zh.js b/nodejs-examples/test-offline-tts-zh.js index 16555c82..bc808803 100644 --- a/nodejs-examples/test-offline-tts-zh.js +++ b/nodejs-examples/test-offline-tts-zh.js @@ -3,25 +3,37 @@ const sherpa_onnx = require('sherpa-onnx'); function createOfflineTts() { - const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); - vits.model = './vits-zh-aishell3/vits-aishell3.onnx'; - vits.lexicon = './vits-zh-aishell3/lexicon.txt'; - vits.tokens = './vits-zh-aishell3/tokens.txt'; + let offlineTtsVitsModelConfig = { + model: './vits-zh-aishell3/vits-aishell3.onnx', + lexicon: './vits-zh-aishell3/lexicon.txt', + tokens: './vits-zh-aishell3/tokens.txt', + dataDir: '', + noiseScale: 0.667, + noiseScaleW: 0.8, + lengthScale: 1.0, + }; + let offlineTtsModelConfig = { + offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, + numThreads: 1, + debug: 1, + provider: 'cpu', + }; - const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); - modelConfig.vits = vits; + let offlineTtsConfig = { + offlineTtsModelConfig: offlineTtsModelConfig, + ruleFsts: './vits-zh-aishell3/rule.fst', + maxNumSentences: 1, + }; - const config = new sherpa_onnx.OfflineTtsConfig(); - config.model = modelConfig; - config.ruleFsts = './vits-zh-aishell3/rule.fst'; - - return new sherpa_onnx.OfflineTts(config); + return sherpa_onnx.createOfflineTts(offlineTtsConfig); } + const tts = createOfflineTts(); const speakerId = 66; const speed = 1.0; -const audio = tts.generate('3年前中国总人口是1411778724人', speakerId, speed); -audio.save('./test-zh.wav'); +const audio = tts.generate( + {text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed}); +tts.save('./test-zh.wav', audio); 
console.log('Saved to test-zh.wav successfully.'); tts.free(); diff --git a/nodejs-examples/test-offline-whisper.js b/nodejs-examples/test-offline-whisper.js index 1dd320bd..1012ce15 100644 --- a/nodejs-examples/test-offline-whisper.js +++ b/nodejs-examples/test-offline-whisper.js @@ -6,32 +6,58 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOfflineRecognizer() { + let featConfig = { + sampleRate: 16000, + featureDim: 80, + }; - // test online recognizer - const whisper = new sherpa_onnx.OfflineWhisperModelConfig(); - whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; - whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; - const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; + let modelConfig = { + transducer: { + encoder: '', + decoder: '', + joiner: '', + }, + paraformer: { + model: '', + }, + nemoCtc: { + model: '', + }, + whisper: { + encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx', + decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx', + }, + tdnn: { + model: '', + }, + tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt', + numThreads: 1, + debug: 0, + provider: 'cpu', + modelType: 'whisper', + }; - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.whisper = whisper; - modelConfig.tokens = tokens; - modelConfig.modelType = 'whisper'; + let lmConfig = { + model: '', + scale: 1.0, + }; - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let config = { + featConfig: featConfig, + modelConfig: modelConfig, + lmConfig: lmConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + 
hotwordsFile: '', + hotwordsScore: 1.5, + }; - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer; + return sherpa_onnx.createOfflineRecognizer(config); } -recognizer = createRecognizer(); + +recognizer = createOfflineRecognizer(); stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav'; @@ -72,8 +98,8 @@ fs.createReadStream(waveFilename, {'highWaterMark': 4096}) stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); recognizer.decode(stream); - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); stream.free(); recognizer.free(); diff --git a/nodejs-examples/test-online-paraformer-microphone.js b/nodejs-examples/test-online-paraformer-microphone.js index 60b28f6f..4b76f4cd 100644 --- a/nodejs-examples/test-online-paraformer-microphone.js +++ b/nodejs-examples/test-online-paraformer-microphone.js @@ -5,37 +5,58 @@ console.log(portAudio.getDevices()); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: '', + decoder: '', + joiner: '', + }; - const paraformer = new sherpa_onnx.OnlineParaformerModelConfig(); - paraformer.encoder = - './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx'; - paraformer.decoder = - './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx'; - const tokens = - './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt'; + let onlineParaformerModelConfig = { + encoder: + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx', + decoder: + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx', + }; - const modelConfig = new sherpa_onnx.OnlineModelConfig(); - 
modelConfig.paraformer = paraformer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'paraformer'; + let onlineZipformer2CtcModelConfig = { + model: '', + }; - const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; - recognizerConfig.enableEndpoint = 1; + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: 'paraformer', + }; - const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); - return recognizer; + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + }; + + return sherpa_onnx.createOnlineRecognizer(recognizerConfig); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); -display = new sherpa_onnx.Display(50); +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); let lastText = ''; let segmentIndex = 0; @@ -61,11 +82,11 @@ ai.on('data', data => { } const isEndpoint = recognizer.isEndpoint(stream); - const text = recognizer.getResult(stream).text; + const text = recognizer.getResult(stream); if (text.length > 0 && lastText != text) { lastText = text; - display.print(segmentIndex, lastText); + console.log(segmentIndex, lastText); } if (isEndpoint) { if (text.length > 0) { diff --git a/nodejs-examples/test-online-paraformer.js b/nodejs-examples/test-online-paraformer.js index 
e2b6a01b..09982988 100644 --- a/nodejs-examples/test-online-paraformer.js +++ b/nodejs-examples/test-online-paraformer.js @@ -6,34 +6,58 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: '', + decoder: '', + joiner: '', + }; - const paraformer = new sherpa_onnx.OnlineParaformerModelConfig(); - paraformer.encoder = - './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx'; - paraformer.decoder = - './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx'; - const tokens = - './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt'; + let onlineParaformerModelConfig = { + encoder: + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx', + decoder: + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx', + }; - const modelConfig = new sherpa_onnx.OnlineModelConfig(); - modelConfig.paraformer = paraformer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'paraformer'; + let onlineZipformer2CtcModelConfig = { + model: '', + }; - const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: 'paraformer', + }; - const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); - return recognizer; + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: 
featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + }; + + return sherpa_onnx.createOnlineRecognizer(recognizerConfig); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); + +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav'; @@ -47,8 +71,8 @@ function decode(samples) { while (recognizer.isReady(stream)) { recognizer.decode(stream); } - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); } reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { diff --git a/nodejs-examples/test-online-transducer-microphone.js b/nodejs-examples/test-online-transducer-microphone.js index f16f10d7..9fa7c92c 100644 --- a/nodejs-examples/test-online-transducer-microphone.js +++ b/nodejs-examples/test-online-transducer-microphone.js @@ -5,39 +5,60 @@ const portAudio = require('naudiodon2'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx', + decoder: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx', + joiner: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx', + }; - // test online recognizer - const transducer = new sherpa_onnx.OnlineTransducerModelConfig(); - transducer.encoder = - 
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx'; - transducer.decoder = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx'; - transducer.joiner = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx'; - const tokens = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt'; + let onlineParaformerModelConfig = { + encoder: '', + decoder: '', + }; - const modelConfig = new sherpa_onnx.OnlineModelConfig(); - modelConfig.transducer = transducer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'zipformer'; + let onlineZipformer2CtcModelConfig = { + model: '', + }; - const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; - recognizerConfig.enableEndpoint = 1; + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: 'zipformer', + }; - const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); - return recognizer; + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + }; + + return sherpa_onnx.createOnlineRecognizer(recognizerConfig); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); -display = new sherpa_onnx.Display(50); + +const recognizer = 
createOnlineRecognizer(); +const stream = recognizer.createStream(); let lastText = ''; let segmentIndex = 0; @@ -63,11 +84,11 @@ ai.on('data', data => { } const isEndpoint = recognizer.isEndpoint(stream); - const text = recognizer.getResult(stream).text; + const text = recognizer.getResult(stream); if (text.length > 0 && lastText != text) { lastText = text; - display.print(segmentIndex, lastText); + console.log(segmentIndex, lastText); } if (isEndpoint) { if (text.length > 0) { diff --git a/nodejs-examples/test-online-transducer.js b/nodejs-examples/test-online-transducer.js index 822b97da..4293cbc9 100644 --- a/nodejs-examples/test-online-transducer.js +++ b/nodejs-examples/test-online-transducer.js @@ -6,37 +6,60 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx', + decoder: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx', + joiner: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx', + }; - // test online recognizer - const transducer = new sherpa_onnx.OnlineTransducerModelConfig(); - transducer.encoder = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx'; - transducer.decoder = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx'; - transducer.joiner = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx'; - const tokens = - './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt'; + let onlineParaformerModelConfig = { + encoder: '', + decoder: '', + }; - const modelConfig = new 
sherpa_onnx.OnlineModelConfig(); - modelConfig.transducer = transducer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'zipformer'; + let onlineZipformer2CtcModelConfig = { + model: '', + }; - const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: 'zipformer', + }; - recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); - return recognizer; + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + }; + + return sherpa_onnx.createOnlineRecognizer(recognizerConfig); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); + +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav'; @@ -50,8 +73,8 @@ function decode(samples) { while (recognizer.isReady(stream)) { recognizer.decode(stream); } - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); } reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { diff --git a/nodejs-examples/test-online-zipformer2-ctc.js b/nodejs-examples/test-online-zipformer2-ctc.js index 015d7944..4f3506a2 100644 --- 
a/nodejs-examples/test-online-zipformer2-ctc.js +++ b/nodejs-examples/test-online-zipformer2-ctc.js @@ -6,32 +6,58 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); -function createRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: '', + decoder: '', + joiner: '', + }; - // test online recognizer - const zipformer2Ctc = new sherpa_onnx.OnlineZipformer2CtcModelConfig(); - zipformer2Ctc.model = - './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx'; - const tokens = - './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt'; + let onlineParaformerModelConfig = { + encoder: '', + decoder: '', + }; - const modelConfig = new sherpa_onnx.OnlineModelConfig(); - modelConfig.zipformer2Ctc = zipformer2Ctc; - modelConfig.tokens = tokens; + let onlineZipformer2CtcModelConfig = { + model: + './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx', + }; - const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: + './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: '', + }; - recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); - return recognizer; + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + 
maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + }; + + return sherpa_onnx.createOnlineRecognizer(recognizerConfig); } -recognizer = createRecognizer(); -stream = recognizer.createStream(); + +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav'; @@ -45,8 +71,8 @@ function decode(samples) { while (recognizer.isReady(stream)) { recognizer.decode(stream); } - const r = recognizer.getResult(stream); - console.log(r.text); + const text = recognizer.getResult(stream); + console.log(text); } reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { diff --git a/nodejs-examples/test-vad-microphone-offline-paraformer.js b/nodejs-examples/test-vad-microphone-offline-paraformer.js deleted file mode 100644 index f5311bea..00000000 --- a/nodejs-examples/test-vad-microphone-offline-paraformer.js +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) -// -const sherpa_onnx = require('sherpa-onnx3'); -const portAudio = require('naudiodon2'); -console.log(portAudio.getDevices()); - -function createOfflineRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; - - // test online recognizer - const paraformer = new sherpa_onnx.OfflineParaformerModelConfig(); - paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx'; - const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt'; - - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.paraformer = paraformer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'paraformer'; - - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = 
featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; - - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer -} - -function createVad() { - const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); - sileroVadModelConfig.model = './silero_vad.onnx'; - sileroVadModelConfig.minSpeechDuration = 0.3; // seconds - sileroVadModelConfig.minSilenceDuration = 0.3; // seconds - sileroVadModelConfig.windowSize = 512; - - const vadModelConfig = new sherpa_onnx.VadModelConfig(); - vadModelConfig.sileroVad = sileroVadModelConfig; - vadModelConfig.sampleRate = 16000; - - const bufferSizeInSeconds = 60; - const vad = new sherpa_onnx.VoiceActivityDetector( - vadModelConfig, bufferSizeInSeconds); - return vad; -} - -const recognizer = createOfflineRecognizer(); -const vad = createVad(); - -const bufferSizeInSeconds = 30; -const buffer = - new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); - -var ai = new portAudio.AudioIO({ - inOptions: { - channelCount: 1, - sampleFormat: portAudio.SampleFormatFloat32, - sampleRate: vad.config.sampleRate, - deviceId: -1, // Use -1 or omit the deviceId to select the default device - closeOnError: true // Close the stream if an audio error is detected, if - // set false then just log the error - } -}); - -let printed = false; -let index = 0; -ai.on('data', data => { - const windowSize = vad.config.sileroVad.windowSize; - buffer.push(new Float32Array(data.buffer)); - while (buffer.size() > windowSize) { - const samples = buffer.get(buffer.head(), windowSize); - buffer.pop(windowSize); - vad.acceptWaveform(samples) - } - - while (!vad.isEmpty()) { - const segment = vad.front(); - vad.pop(); - const stream = recognizer.createStream(); - stream.acceptWaveform( - recognizer.config.featConfig.sampleRate, segment.samples); - recognizer.decode(stream); - const r = recognizer.getResult(stream); - stream.free(); - if 
(r.text.length > 0) { - console.log(`${index}: ${r.text}`); - index += 1; - } - } -}); - -ai.on('close', () => { - console.log('Free resources'); - recognizer.free(); - vad.free(); - buffer.free(); -}); - -ai.start(); -console.log('Started! Please speak') diff --git a/nodejs-examples/test-vad-microphone-offline-transducer.js b/nodejs-examples/test-vad-microphone-offline-transducer.js deleted file mode 100644 index 4cf6d717..00000000 --- a/nodejs-examples/test-vad-microphone-offline-transducer.js +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) -// -const sherpa_onnx = require('sherpa-onnx'); -const portAudio = require('naudiodon2'); -console.log(portAudio.getDevices()); - -function createOfflineRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; - - // test online recognizer - const transducer = new sherpa_onnx.OfflineTransducerModelConfig(); - transducer.encoder = - './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx'; - transducer.decoder = - './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx'; - transducer.joiner = - './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx'; - const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt'; - - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.transducer = transducer; - modelConfig.tokens = tokens; - modelConfig.modelType = 'transducer'; - - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; - - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer; -} - -function createVad() { - const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); - sileroVadModelConfig.model = './silero_vad.onnx'; - 
sileroVadModelConfig.minSpeechDuration = 0.3; // seconds - sileroVadModelConfig.minSilenceDuration = 0.3; // seconds - sileroVadModelConfig.windowSize = 512; - - const vadModelConfig = new sherpa_onnx.VadModelConfig(); - vadModelConfig.sileroVad = sileroVadModelConfig; - vadModelConfig.sampleRate = 16000; - - const bufferSizeInSeconds = 60; - const vad = new sherpa_onnx.VoiceActivityDetector( - vadModelConfig, bufferSizeInSeconds); - return vad; -} - -const recognizer = createOfflineRecognizer(); -const vad = createVad(); - -const bufferSizeInSeconds = 30; -const buffer = - new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); - -const ai = new portAudio.AudioIO({ - inOptions: { - channelCount: 1, - closeOnError: true, // Close the stream if an audio error is detected, if - // set false then just log the error - deviceId: -1, // Use -1 or omit the deviceId to select the default device - sampleFormat: portAudio.SampleFormatFloat32, - sampleRate: vad.config.sampleRate - } -}); - -let printed = false; -let index = 0; -ai.on('data', data => { - const windowSize = vad.config.sileroVad.windowSize; - buffer.push(new Float32Array(data.buffer)); - while (buffer.size() > windowSize) { - const samples = buffer.get(buffer.head(), windowSize); - buffer.pop(windowSize); - vad.acceptWaveform(samples) - } - - while (!vad.isEmpty()) { - const segment = vad.front(); - vad.pop(); - const stream = recognizer.createStream(); - stream.acceptWaveform( - recognizer.config.featConfig.sampleRate, segment.samples); - recognizer.decode(stream); - const r = recognizer.getResult(stream); - stream.free(); - if (r.text.length > 0) { - console.log(`${index}: ${r.text}`); - index += 1; - } - } -}); - -ai.on('close', () => { - console.log('Free resources'); - recognizer.free(); - vad.free(); - buffer.free(); -}); - -ai.start(); -console.log('Started! 
Please speak') diff --git a/nodejs-examples/test-vad-microphone-offline-whisper.js b/nodejs-examples/test-vad-microphone-offline-whisper.js deleted file mode 100644 index 07a344b8..00000000 --- a/nodejs-examples/test-vad-microphone-offline-whisper.js +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) -// -const sherpa_onnx = require('sherpa-onnx'); -const portAudio = require('naudiodon2'); -console.log(portAudio.getDevices()); - -function createOfflineRecognizer() { - const featConfig = new sherpa_onnx.FeatureConfig(); - featConfig.sampleRate = 16000; - featConfig.featureDim = 80; - - // test online recognizer - const whisper = new sherpa_onnx.OfflineWhisperModelConfig(); - whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; - whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; - const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; - - const modelConfig = new sherpa_onnx.OfflineModelConfig(); - modelConfig.whisper = whisper; - modelConfig.tokens = tokens; - modelConfig.modelType = 'whisper'; - - const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); - recognizerConfig.featConfig = featConfig; - recognizerConfig.modelConfig = modelConfig; - recognizerConfig.decodingMethod = 'greedy_search'; - - const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); - return recognizer; -} - -function createVad() { - const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); - sileroVadModelConfig.model = './silero_vad.onnx'; - sileroVadModelConfig.minSpeechDuration = 0.3; // seconds - sileroVadModelConfig.minSilenceDuration = 0.3; // seconds - sileroVadModelConfig.windowSize = 512; - - const vadModelConfig = new sherpa_onnx.VadModelConfig(); - vadModelConfig.sileroVad = sileroVadModelConfig; - vadModelConfig.sampleRate = 16000; - - const bufferSizeInSeconds = 60; - const vad = new sherpa_onnx.VoiceActivityDetector( - vadModelConfig, 
bufferSizeInSeconds); - return vad; -} - -const recognizer = createOfflineRecognizer(); -const vad = createVad(); - -const bufferSizeInSeconds = 30; -const buffer = - new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); - -const ai = new portAudio.AudioIO({ - inOptions: { - channelCount: 1, - closeOnError: true, // Close the stream if an audio error is detected, if - // set false then just log the error - deviceId: -1, // Use -1 or omit the deviceId to select the default device - sampleFormat: portAudio.SampleFormatFloat32, - sampleRate: vad.config.sampleRate - } -}); - -let printed = false; -let index = 0; -ai.on('data', data => { - const windowSize = vad.config.sileroVad.windowSize; - buffer.push(new Float32Array(data.buffer)); - while (buffer.size() > windowSize) { - const samples = buffer.get(buffer.head(), windowSize); - buffer.pop(windowSize); - vad.acceptWaveform(samples) - } - - while (!vad.isEmpty()) { - const segment = vad.front(); - vad.pop(); - const stream = recognizer.createStream(); - stream.acceptWaveform( - recognizer.config.featConfig.sampleRate, segment.samples); - recognizer.decode(stream); - const r = recognizer.getResult(stream); - stream.free(); - if (r.text.length > 0) { - console.log(`${index}: ${r.text}`); - index += 1; - } - } -}); - -ai.on('close', () => { - console.log('Free resources'); - recognizer.free(); - vad.free(); - buffer.free(); -}); - -ai.start(); -console.log('Started! 
Please speak') diff --git a/nodejs-examples/test-vad-microphone.js b/nodejs-examples/test-vad-microphone.js deleted file mode 100644 index ec65b50f..00000000 --- a/nodejs-examples/test-vad-microphone.js +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) - -const sherpa_onnx = require('sherpa-onnx'); -const portAudio = require('naudiodon2'); -console.log(portAudio.getDevices()); - -function createVad() { - const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); - sileroVadModelConfig.model = './silero_vad.onnx'; - sileroVadModelConfig.minSpeechDuration = 0.3; // seconds - sileroVadModelConfig.minSilenceDuration = 0.3; // seconds - sileroVadModelConfig.windowSize = 512; - - const vadModelConfig = new sherpa_onnx.VadModelConfig(); - vadModelConfig.sileroVad = sileroVadModelConfig; - vadModelConfig.sampleRate = 16000; - - const bufferSizeInSeconds = 60; - const vad = new sherpa_onnx.VoiceActivityDetector( - vadModelConfig, bufferSizeInSeconds); - return vad; -} -vad = createVad(); -const bufferSizeInSeconds = 30; -const buffer = - new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); - -const ai = new portAudio.AudioIO({ - inOptions: { - channelCount: 1, - closeOnError: true, // Close the stream if an audio error is detected, if - // set false then just log the error - deviceId: -1, // Use -1 or omit the deviceId to select the default device - sampleFormat: portAudio.SampleFormatFloat32, - sampleRate: vad.config.sampleRate - } -}); - -let printed = false; -let index = 0; -ai.on('data', data => { - const windowSize = vad.config.sileroVad.windowSize; - buffer.push(new Float32Array(data.buffer)); - while (buffer.size() > windowSize) { - const samples = buffer.get(buffer.head(), windowSize); - buffer.pop(windowSize); - vad.acceptWaveform(samples) - if (vad.isDetected() && !printed) { - console.log(`${index}: Detected speech`) - printed = true; - } - - if (!vad.isDetected()) { - printed = 
false; - } - - while (!vad.isEmpty()) { - const segment = vad.front(); - vad.pop(); - const duration = segment.samples.length / vad.config.sampleRate; - console.log(`${index} End of speech. Duration: ${duration} seconds`); - index += 1; - } - } -}); - -ai.on('close', () => { - console.log('Free resources'); - vad.free(); - buffer.free(); -}); - -ai.start(); -console.log('Started! Please speak') diff --git a/scripts/nodejs/.clang-format b/scripts/nodejs/.clang-format deleted file mode 100644 index f62c7209..00000000 --- a/scripts/nodejs/.clang-format +++ /dev/null @@ -1,3 +0,0 @@ -Language: JavaScript -JavaScriptQuotes: Double - diff --git a/scripts/nodejs/README.md b/scripts/nodejs/README.md index ed520597..27b8a3e5 100644 --- a/scripts/nodejs/README.md +++ b/scripts/nodejs/README.md @@ -7,3 +7,5 @@ It processes everything locally without accessing the Internet. Please refer to https://github.com/k2-fsa/sherpa-onnx/tree/master/nodejs-examples for examples. + +You need Node >= 18 for this package. diff --git a/scripts/nodejs/index.js b/scripts/nodejs/index.js index da6178b3..3163c0ab 100644 --- a/scripts/nodejs/index.js +++ b/scripts/nodejs/index.js @@ -1,726 +1,26 @@ -// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) -// -// Please use -// -// npm install ffi-napi ref-struct-napi -// -// before you use this file -// -// -// Please use node 13. node 16, 18, 20, and 21 are known not working. 
-// See also -// https://github.com/node-ffi-napi/node-ffi-napi/issues/244 -// and -// https://github.com/node-ffi-napi/node-ffi-napi/issues/97 -"use strict" +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) +'use strict' -const debug = require("debug")("sherpa-onnx"); -const os = require("os"); -const path = require("path"); -const ffi = require("ffi-napi"); -const ref = require("ref-napi"); -const fs = require("fs"); -var ArrayType = require("ref-array-napi"); +const wasmModule = require('./sherpa-onnx-wasm-nodejs.js')(); +const sherpa_onnx_asr = require('./sherpa-onnx-asr.js'); +const sherpa_onnx_tts = require('./sherpa-onnx-tts.js'); -const FloatArray = ArrayType(ref.types.float); -const StructType = require("ref-struct-napi"); -const cstring = ref.types.CString; -const cstringPtr = ref.refType(cstring); -const int32_t = ref.types.int32; -const float = ref.types.float; -const floatPtr = ref.refType(float); - -const SherpaOnnxOnlineTransducerModelConfig = StructType({ - "encoder" : cstring, - "decoder" : cstring, - "joiner" : cstring, -}); - -const SherpaOnnxOnlineParaformerModelConfig = StructType({ - "encoder" : cstring, - "decoder" : cstring, -}); - -const SherpaOnnxOnlineZipformer2CtcModelConfig = StructType({ - "model" : cstring, -}); - -const SherpaOnnxOnlineModelConfig = StructType({ - "transducer" : SherpaOnnxOnlineTransducerModelConfig, - "paraformer" : SherpaOnnxOnlineParaformerModelConfig, - "zipformer2Ctc" : SherpaOnnxOnlineZipformer2CtcModelConfig, - "tokens" : cstring, - "numThreads" : int32_t, - "provider" : cstring, - "debug" : int32_t, - "modelType" : cstring, -}); - -const SherpaOnnxFeatureConfig = StructType({ - "sampleRate" : int32_t, - "featureDim" : int32_t, -}); - -const SherpaOnnxOnlineRecognizerConfig = StructType({ - "featConfig" : SherpaOnnxFeatureConfig, - "modelConfig" : SherpaOnnxOnlineModelConfig, - "decodingMethod" : cstring, - "maxActivePaths" : int32_t, - "enableEndpoint" : int32_t, - 
"rule1MinTrailingSilence" : float, - "rule2MinTrailingSilence" : float, - "rule3MinUtteranceLength" : float, - "hotwordsFile" : cstring, - "hotwordsScore" : float, -}); - -const SherpaOnnxOnlineRecognizerResult = StructType({ - "text" : cstring, - "tokens" : cstring, - "tokensArr" : cstringPtr, - "timestamps" : floatPtr, - "count" : int32_t, - "json" : cstring, -}); - -const SherpaOnnxOnlineRecognizerPtr = ref.refType(ref.types.void); -const SherpaOnnxOnlineStreamPtr = ref.refType(ref.types.void); -const SherpaOnnxOnlineStreamPtrPtr = ref.refType(SherpaOnnxOnlineStreamPtr); -const SherpaOnnxOnlineRecognizerResultPtr = - ref.refType(SherpaOnnxOnlineRecognizerResult); - -const SherpaOnnxOnlineRecognizerConfigPtr = - ref.refType(SherpaOnnxOnlineRecognizerConfig); - -const SherpaOnnxOfflineTransducerModelConfig = StructType({ - "encoder" : cstring, - "decoder" : cstring, - "joiner" : cstring, -}); - -const SherpaOnnxOfflineParaformerModelConfig = StructType({ - "model" : cstring, -}); - -const SherpaOnnxOfflineNemoEncDecCtcModelConfig = StructType({ - "model" : cstring, -}); - -const SherpaOnnxOfflineWhisperModelConfig = StructType({ - "encoder" : cstring, - "decoder" : cstring, -}); - -const SherpaOnnxOfflineTdnnModelConfig = StructType({ - "model" : cstring, -}); - -const SherpaOnnxOfflineLMConfig = StructType({ - "model" : cstring, - "scale" : float, -}); - -const SherpaOnnxOfflineModelConfig = StructType({ - "transducer" : SherpaOnnxOfflineTransducerModelConfig, - "paraformer" : SherpaOnnxOfflineParaformerModelConfig, - "nemoCtc" : SherpaOnnxOfflineNemoEncDecCtcModelConfig, - "whisper" : SherpaOnnxOfflineWhisperModelConfig, - "tdnn" : SherpaOnnxOfflineTdnnModelConfig, - "tokens" : cstring, - "numThreads" : int32_t, - "debug" : int32_t, - "provider" : cstring, - "modelType" : cstring, -}); - -const SherpaOnnxOfflineRecognizerConfig = StructType({ - "featConfig" : SherpaOnnxFeatureConfig, - "modelConfig" : SherpaOnnxOfflineModelConfig, - "lmConfig" : 
SherpaOnnxOfflineLMConfig, - "decodingMethod" : cstring, - "maxActivePaths" : int32_t, - "hotwordsFile" : cstring, - "hotwordsScore" : float, -}); - -const SherpaOnnxOfflineRecognizerResult = StructType({ - "text" : cstring, - "timestamps" : floatPtr, - "count" : int32_t, -}); - -const SherpaOnnxOfflineRecognizerPtr = ref.refType(ref.types.void); -const SherpaOnnxOfflineStreamPtr = ref.refType(ref.types.void); -const SherpaOnnxOfflineStreamPtrPtr = ref.refType(SherpaOnnxOfflineStreamPtr); -const SherpaOnnxOfflineRecognizerResultPtr = - ref.refType(SherpaOnnxOfflineRecognizerResult); - -const SherpaOnnxOfflineRecognizerConfigPtr = - ref.refType(SherpaOnnxOfflineRecognizerConfig); - -// vad -const SherpaOnnxSileroVadModelConfig = StructType({ - "model" : cstring, - "threshold" : float, - "minSilenceDuration" : float, - "minSpeechDuration" : float, - "windowSize" : int32_t, -}); - -const SherpaOnnxVadModelConfig = StructType({ - "sileroVad" : SherpaOnnxSileroVadModelConfig, - "sampleRate" : int32_t, - "numThreads" : int32_t, - "provider" : cstring, - "debug" : int32_t, -}); - -const SherpaOnnxSpeechSegment = StructType({ - "start" : int32_t, - "samples" : FloatArray, - "n" : int32_t, -}); - -const SherpaOnnxVadModelConfigPtr = ref.refType(SherpaOnnxVadModelConfig); -const SherpaOnnxSpeechSegmentPtr = ref.refType(SherpaOnnxSpeechSegment); -const SherpaOnnxCircularBufferPtr = ref.refType(ref.types.void); -const SherpaOnnxVoiceActivityDetectorPtr = ref.refType(ref.types.void); - -// tts -const SherpaOnnxOfflineTtsVitsModelConfig = StructType({ - "model" : cstring, - "lexicon" : cstring, - "tokens" : cstring, - "dataDir" : cstring, - "noiseScale" : float, - "noiseScaleW" : float, - "lengthScale" : float, -}); - -const SherpaOnnxOfflineTtsModelConfig = StructType({ - "vits" : SherpaOnnxOfflineTtsVitsModelConfig, - "numThreads" : int32_t, - "debug" : int32_t, - "provider" : cstring, -}); - -const SherpaOnnxOfflineTtsConfig = StructType({ - "model" : 
SherpaOnnxOfflineTtsModelConfig, - "ruleFsts" : cstring, - "maxNumSentences" : int32_t, -}); - -const SherpaOnnxGeneratedAudio = StructType({ - "samples" : FloatArray, - "n" : int32_t, - "sampleRate" : int32_t, -}); - -const SherpaOnnxOfflineTtsVitsModelConfigPtr = - ref.refType(SherpaOnnxOfflineTtsVitsModelConfig); -const SherpaOnnxOfflineTtsConfigPtr = ref.refType(SherpaOnnxOfflineTtsConfig); -const SherpaOnnxGeneratedAudioPtr = ref.refType(SherpaOnnxGeneratedAudio); -const SherpaOnnxOfflineTtsPtr = ref.refType(ref.types.void); - -const SherpaOnnxDisplayPtr = ref.refType(ref.types.void); - -let soname; -if (os.platform() == "win32") { - // see https://nodejs.org/api/process.html#processarch - if (process.arch == "x64") { - let currentPath = process.env.Path; - let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x64")); - process.env.Path = currentPath + path.delimiter + dllDirectory; - - soname = path.join(__dirname, "lib", "win-x64", "sherpa-onnx-c-api.dll") - } else if (process.arch == "ia32") { - let currentPath = process.env.Path; - let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x86")); - process.env.Path = currentPath + path.delimiter + dllDirectory; - - soname = path.join(__dirname, "lib", "win-x86", "sherpa-onnx-c-api.dll") - } else { - throw new Error( - `Support only Windows x86 and x64 for now. Given ${process.arch}`); - } -} else if (os.platform() == "darwin") { - if (process.arch == "x64") { - soname = - path.join(__dirname, "lib", "osx-x64", "libsherpa-onnx-c-api.dylib"); - } else if (process.arch == "arm64") { - soname = - path.join(__dirname, "lib", "osx-arm64", "libsherpa-onnx-c-api.dylib"); - } else { - throw new Error( - `Support only macOS x64 and arm64 for now. Given ${process.arch}`); - } -} else if (os.platform() == "linux") { - if (process.arch == "x64") { - soname = - path.join(__dirname, "lib", "linux-x64", "libsherpa-onnx-c-api.so"); - } else { - throw new Error(`Support only Linux x64 for now. 
Given ${process.arch}`); - } -} else { - throw new Error(`Unsupported platform ${os.platform()}`); +function createOnlineRecognizer(config) { + return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config); } -if (!fs.existsSync(soname)) { - throw new Error(`Cannot find file ${soname}. Please make sure you have run - ./build.sh`); +function createOfflineRecognizer(config) { + return new sherpa_onnx_asr.OfflineRecognizer(config, wasmModule); } -debug("soname ", soname) - -const libsherpa_onnx = ffi.Library(soname, { - // online asr - "CreateOnlineRecognizer" : [ - SherpaOnnxOnlineRecognizerPtr, [ SherpaOnnxOnlineRecognizerConfigPtr ] - ], - "DestroyOnlineRecognizer" : [ "void", [ SherpaOnnxOnlineRecognizerPtr ] ], - "CreateOnlineStream" : - [ SherpaOnnxOnlineStreamPtr, [ SherpaOnnxOnlineRecognizerPtr ] ], - "CreateOnlineStreamWithHotwords" : - [ SherpaOnnxOnlineStreamPtr, [ SherpaOnnxOnlineRecognizerPtr, cstring ] ], - "DestroyOnlineStream" : [ "void", [ SherpaOnnxOnlineStreamPtr ] ], - "AcceptWaveform" : - [ "void", [ SherpaOnnxOnlineStreamPtr, int32_t, floatPtr, int32_t ] ], - "IsOnlineStreamReady" : - [ int32_t, [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], - "DecodeOnlineStream" : - [ "void", [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], - "DecodeMultipleOnlineStreams" : [ - "void", - [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtrPtr, int32_t ] - ], - "GetOnlineStreamResult" : [ - SherpaOnnxOnlineRecognizerResultPtr, - [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] - ], - "DestroyOnlineRecognizerResult" : - [ "void", [ SherpaOnnxOnlineRecognizerResultPtr ] ], - "Reset" : - [ "void", [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], - "InputFinished" : [ "void", [ SherpaOnnxOnlineStreamPtr ] ], - "IsEndpoint" : - [ int32_t, [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], - - // offline asr - "CreateOfflineRecognizer" : [ - SherpaOnnxOfflineRecognizerPtr, [ 
SherpaOnnxOfflineRecognizerConfigPtr ] - ], - "DestroyOfflineRecognizer" : [ "void", [ SherpaOnnxOfflineRecognizerPtr ] ], - "CreateOfflineStream" : - [ SherpaOnnxOfflineStreamPtr, [ SherpaOnnxOfflineRecognizerPtr ] ], - "DestroyOfflineStream" : [ "void", [ SherpaOnnxOfflineStreamPtr ] ], - "AcceptWaveformOffline" : - [ "void", [ SherpaOnnxOfflineStreamPtr, int32_t, floatPtr, int32_t ] ], - "DecodeOfflineStream" : [ - "void", [ SherpaOnnxOfflineRecognizerPtr, SherpaOnnxOfflineStreamPtr ] - ], - "DecodeMultipleOfflineStreams" : [ - "void", - [ SherpaOnnxOfflineRecognizerPtr, SherpaOnnxOfflineStreamPtrPtr, int32_t ] - ], - "GetOfflineStreamResult" : - [ SherpaOnnxOfflineRecognizerResultPtr, [ SherpaOnnxOfflineStreamPtr ] ], - "DestroyOfflineRecognizerResult" : - [ "void", [ SherpaOnnxOfflineRecognizerResultPtr ] ], - - // vad - "SherpaOnnxCreateCircularBuffer" : - [ SherpaOnnxCircularBufferPtr, [ int32_t ] ], - "SherpaOnnxDestroyCircularBuffer" : - [ "void", [ SherpaOnnxCircularBufferPtr ] ], - "SherpaOnnxCircularBufferPush" : - [ "void", [ SherpaOnnxCircularBufferPtr, floatPtr, int32_t ] ], - "SherpaOnnxCircularBufferGet" : - [ FloatArray, [ SherpaOnnxCircularBufferPtr, int32_t, int32_t ] ], - "SherpaOnnxCircularBufferFree" : [ "void", [ FloatArray ] ], - "SherpaOnnxCircularBufferPop" : - [ "void", [ SherpaOnnxCircularBufferPtr, int32_t ] ], - "SherpaOnnxCircularBufferSize" : [ int32_t, [ SherpaOnnxCircularBufferPtr ] ], - "SherpaOnnxCircularBufferHead" : [ int32_t, [ SherpaOnnxCircularBufferPtr ] ], - "SherpaOnnxCircularBufferReset" : [ "void", [ SherpaOnnxCircularBufferPtr ] ], - "SherpaOnnxCreateVoiceActivityDetector" : [ - SherpaOnnxVoiceActivityDetectorPtr, [ SherpaOnnxVadModelConfigPtr, float ] - ], - "SherpaOnnxDestroyVoiceActivityDetector" : - [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], - "SherpaOnnxVoiceActivityDetectorAcceptWaveform" : - [ "void", [ SherpaOnnxVoiceActivityDetectorPtr, floatPtr, int32_t ] ], - "SherpaOnnxVoiceActivityDetectorEmpty" 
: - [ int32_t, [ SherpaOnnxVoiceActivityDetectorPtr ] ], - "SherpaOnnxVoiceActivityDetectorDetected" : - [ int32_t, [ SherpaOnnxVoiceActivityDetectorPtr ] ], - "SherpaOnnxVoiceActivityDetectorPop" : - [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], - "SherpaOnnxVoiceActivityDetectorClear" : - [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], - "SherpaOnnxVoiceActivityDetectorFront" : - [ SherpaOnnxSpeechSegmentPtr, [ SherpaOnnxVoiceActivityDetectorPtr ] ], - "SherpaOnnxDestroySpeechSegment" : [ "void", [ SherpaOnnxSpeechSegmentPtr ] ], - "SherpaOnnxVoiceActivityDetectorReset" : - [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], - // tts - "SherpaOnnxCreateOfflineTts" : - [ SherpaOnnxOfflineTtsPtr, [ SherpaOnnxOfflineTtsConfigPtr ] ], - "SherpaOnnxDestroyOfflineTts" : [ "void", [ SherpaOnnxOfflineTtsPtr ] ], - "SherpaOnnxOfflineTtsGenerate" : [ - SherpaOnnxGeneratedAudioPtr, - [ SherpaOnnxOfflineTtsPtr, cstring, int32_t, float ] - ], - "SherpaOnnxDestroyOfflineTtsGeneratedAudio" : - [ "void", [ SherpaOnnxGeneratedAudioPtr ] ], - "SherpaOnnxWriteWave" : [ "void", [ floatPtr, int32_t, int32_t, cstring ] ], - - // display - "CreateDisplay" : [ SherpaOnnxDisplayPtr, [ int32_t ] ], - "DestroyDisplay" : [ "void", [ SherpaOnnxDisplayPtr ] ], - "SherpaOnnxPrint" : [ "void", [ SherpaOnnxDisplayPtr, int32_t, cstring ] ], -}); - -class Display { - constructor(maxWordPerLine) { - this.handle = libsherpa_onnx.CreateDisplay(maxWordPerLine); - } - free() { - if (this.handle) { - libsherpa_onnx.DestroyDisplay(this.handle); - this.handle = null; - } - } - - print(idx, s) { libsherpa_onnx.SherpaOnnxPrint(this.handle, idx, s); } -}; - -class OnlineResult { - constructor(text) { this.text = Buffer.from(text, "utf-8").toString(); } -}; - -class OnlineStream { - constructor(handle) { this.handle = handle } - - free() { - if (this.handle) { - libsherpa_onnx.DestroyOnlineStream(this.handle); - this.handle = null; - } - } - - /** - * @param sampleRate {Number} - * @param samples 
{Float32Array} Containing samples in the range [-1, 1] - */ - acceptWaveform(sampleRate, samples) { - libsherpa_onnx.AcceptWaveform(this.handle, sampleRate, samples, - samples.length); - } -}; - -class OnlineRecognizer { - constructor(config) { - this.config = config; - this.recognizer_handle = - libsherpa_onnx.CreateOnlineRecognizer(config.ref()); - } - - free() { - if (this.recognizer_handle) { - libsherpa_onnx.DestroyOnlineRecognizer(this.recognizer_handle); - this.recognizer_handle = null; - } - } - - createStream() { - let handle = libsherpa_onnx.CreateOnlineStream(this.recognizer_handle); - return new OnlineStream(handle); - } - - isReady(stream) { - return libsherpa_onnx.IsOnlineStreamReady(this.recognizer_handle, - stream.handle) - } - - isEndpoint(stream) { - return libsherpa_onnx.IsEndpoint(this.recognizer_handle, stream.handle); - } - - reset(stream) { libsherpa_onnx.Reset(this.recognizer_handle, stream.handle); } - - decode(stream) { - libsherpa_onnx.DecodeOnlineStream(this.recognizer_handle, stream.handle) - } - - getResult(stream) { - let handle = libsherpa_onnx.GetOnlineStreamResult(this.recognizer_handle, - stream.handle); - let r = handle.deref(); - let ans = new OnlineResult(r.text); - libsherpa_onnx.DestroyOnlineRecognizerResult(handle); - - return ans - } -}; - -class OfflineResult { - constructor(text) { this.text = Buffer.from(text, "utf-8").toString(); } -}; - -class OfflineStream { - constructor(handle) { this.handle = handle } - - free() { - if (this.handle) { - libsherpa_onnx.DestroyOfflineStream(this.handle); - this.handle = null; - } - } - - /** - * @param sampleRate {Number} - * @param samples {Float32Array} Containing samples in the range [-1, 1] - */ - acceptWaveform(sampleRate, samples) { - libsherpa_onnx.AcceptWaveformOffline(this.handle, sampleRate, samples, - samples.length); - } -}; - -class OfflineRecognizer { - constructor(config) { - this.config = config; - this.recognizer_handle = - 
libsherpa_onnx.CreateOfflineRecognizer(config.ref()); - } - - free() { - if (this.recognizer_handle) { - libsherpa_onnx.DestroyOfflineRecognizer(this.recognizer_handle); - this.recognizer_handle = null; - } - } - - createStream() { - let handle = libsherpa_onnx.CreateOfflineStream(this.recognizer_handle); - return new OfflineStream(handle); - } - - decode(stream) { - libsherpa_onnx.DecodeOfflineStream(this.recognizer_handle, stream.handle) - } - - getResult(stream) { - let handle = libsherpa_onnx.GetOfflineStreamResult(stream.handle); - let r = handle.deref(); - let ans = new OfflineResult(r.text); - libsherpa_onnx.DestroyOfflineRecognizerResult(handle); - - return ans - } -}; - -class SpeechSegment { - constructor(start, samples) { - this.start = start; - this.samples = samples; - } -}; - -// this buffer holds only float entries. -class CircularBuffer { - /** - * @param capacity {int} The capacity of the circular buffer. - */ - constructor(capacity) { - this.handle = libsherpa_onnx.SherpaOnnxCreateCircularBuffer(capacity); - } - - free() { - if (this.handle) { - libsherpa_onnx.SherpaOnnxDestroyCircularBuffer(this.handle); - this.handle = null; - } - } - - /** - * @param samples {Float32Array} - */ - push(samples) { - libsherpa_onnx.SherpaOnnxCircularBufferPush(this.handle, samples, - samples.length); - } - - get(startIndex, n) { - let data = - libsherpa_onnx.SherpaOnnxCircularBufferGet(this.handle, startIndex, n); - - // https://tootallnate.github.io/ref/#exports-reinterpret - const buffer = data.buffer.reinterpret(n * ref.sizeof.float).buffer; - - // create a copy since we are going to free the buffer at the end - let s = new Float32Array(buffer).slice(0); - libsherpa_onnx.SherpaOnnxCircularBufferFree(data); - return s; - } - - pop(n) { libsherpa_onnx.SherpaOnnxCircularBufferPop(this.handle, n); } - - size() { return libsherpa_onnx.SherpaOnnxCircularBufferSize(this.handle); } - - head() { return libsherpa_onnx.SherpaOnnxCircularBufferHead(this.handle); } - - 
reset() { libsherpa_onnx.SherpaOnnxCircularBufferReset(this.handle); } -}; - -class VoiceActivityDetector { - constructor(config, bufferSizeInSeconds) { - this.config = config; - this.handle = libsherpa_onnx.SherpaOnnxCreateVoiceActivityDetector( - config.ref(), bufferSizeInSeconds); - } - - free() { - if (this.handle) { - libsherpa_onnx.SherpaOnnxDestroyVoiceActivityDetector(this.handle); - } - } - - acceptWaveform(samples) { - libsherpa_onnx.SherpaOnnxVoiceActivityDetectorAcceptWaveform( - this.handle, samples, samples.length); - } - - isEmpty() { - return libsherpa_onnx.SherpaOnnxVoiceActivityDetectorEmpty(this.handle); - } - - isDetected() { - return libsherpa_onnx.SherpaOnnxVoiceActivityDetectorDetected(this.handle); - } - pop() { libsherpa_onnx.SherpaOnnxVoiceActivityDetectorPop(this.handle); } - - clear() { libsherpa_onnx.SherpaOnnxVoiceActivityDetectorClear(this.handle); } - - reset() { libsherpa_onnx.SherpaOnnxVoiceActivityDetectorReset(this.handle); } - - front() { - let segment = - libsherpa_onnx.SherpaOnnxVoiceActivityDetectorFront(this.handle); - - let buffer = - segment.deref() - .samples.buffer.reinterpret(segment.deref().n * ref.sizeof.float) - .buffer; - - let samples = new Float32Array(buffer).slice(0); - let ans = new SpeechSegment(segment.deref().start, samples); - - libsherpa_onnx.SherpaOnnxDestroySpeechSegment(segment); - return ans; - } -}; - -class GeneratedAudio { - constructor(sampleRate, samples) { - this.sampleRate = sampleRate; - this.samples = samples; - } - save(filename) { - libsherpa_onnx.SherpaOnnxWriteWave(this.samples, this.samples.length, - this.sampleRate, filename); - } -}; - -class OfflineTts { - constructor(config) { - this.config = config; - this.handle = libsherpa_onnx.SherpaOnnxCreateOfflineTts(config.ref()); - } - - free() { - if (this.handle) { - libsherpa_onnx.SherpaOnnxDestroyOfflineTts(this.handle); - this.handle = null; - } - } - generate(text, sid, speed) { - let r = 
libsherpa_onnx.SherpaOnnxOfflineTtsGenerate(this.handle, text, sid, - speed); - const buffer = - r.deref() - .samples.buffer.reinterpret(r.deref().n * ref.sizeof.float) - .buffer; - let samples = new Float32Array(buffer).slice(0); - let sampleRate = r.deref().sampleRate; - - let generatedAudio = new GeneratedAudio(sampleRate, samples); - - libsherpa_onnx.SherpaOnnxDestroyOfflineTtsGeneratedAudio(r); - - return generatedAudio; - } -}; - -// online asr -const OnlineTransducerModelConfig = SherpaOnnxOnlineTransducerModelConfig; -const OnlineModelConfig = SherpaOnnxOnlineModelConfig; -const FeatureConfig = SherpaOnnxFeatureConfig; -const OnlineRecognizerConfig = SherpaOnnxOnlineRecognizerConfig; -const OnlineParaformerModelConfig = SherpaOnnxOnlineParaformerModelConfig; -const OnlineZipformer2CtcModelConfig = SherpaOnnxOnlineZipformer2CtcModelConfig; - -// offline asr -const OfflineTransducerModelConfig = SherpaOnnxOfflineTransducerModelConfig; -const OfflineModelConfig = SherpaOnnxOfflineModelConfig; -const OfflineRecognizerConfig = SherpaOnnxOfflineRecognizerConfig; -const OfflineParaformerModelConfig = SherpaOnnxOfflineParaformerModelConfig; -const OfflineWhisperModelConfig = SherpaOnnxOfflineWhisperModelConfig; -const OfflineNemoEncDecCtcModelConfig = - SherpaOnnxOfflineNemoEncDecCtcModelConfig; -const OfflineTdnnModelConfig = SherpaOnnxOfflineTdnnModelConfig; - -// vad -const SileroVadModelConfig = SherpaOnnxSileroVadModelConfig; -const VadModelConfig = SherpaOnnxVadModelConfig; - -// tts -const OfflineTtsVitsModelConfig = SherpaOnnxOfflineTtsVitsModelConfig; -const OfflineTtsModelConfig = SherpaOnnxOfflineTtsModelConfig; -const OfflineTtsConfig = SherpaOnnxOfflineTtsConfig; +function createOfflineTts(config) { + return sherpa_onnx_tts.createOfflineTts(wasmModule, config); +} +// Note: online means streaming and offline means non-streaming here. +// Both of them don't require internet connection. 
module.exports = { - // online asr - OnlineTransducerModelConfig, - OnlineModelConfig, - FeatureConfig, - OnlineRecognizerConfig, - OnlineRecognizer, - OnlineStream, - OnlineParaformerModelConfig, - OnlineZipformer2CtcModelConfig, - - // offline asr - OfflineRecognizer, - OfflineStream, - OfflineTransducerModelConfig, - OfflineModelConfig, - OfflineRecognizerConfig, - OfflineParaformerModelConfig, - OfflineWhisperModelConfig, - OfflineNemoEncDecCtcModelConfig, - OfflineTdnnModelConfig, - // vad - SileroVadModelConfig, - VadModelConfig, - CircularBuffer, - VoiceActivityDetector, - // tts - OfflineTtsVitsModelConfig, - OfflineTtsModelConfig, - OfflineTtsConfig, - OfflineTts, - - // - Display, + createOnlineRecognizer, + createOfflineRecognizer, + createOfflineTts, }; diff --git a/scripts/nodejs/package.json b/scripts/nodejs/package.json index bfe671ff..49f47d29 100644 --- a/scripts/nodejs/package.json +++ b/scripts/nodejs/package.json @@ -1,7 +1,7 @@ { - "name": "sherpa-onnx2", - "version": "1.8.10", - "description": "Real-time speech recognition with Next-gen Kaldi", + "name": "sherpa-onnx", + "version": "SHERPA_ONNX_VERSION", + "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" @@ -11,15 +11,30 @@ "url": "git+https://github.com/k2-fsa/sherpa-onnx.git" }, "keywords": [ - "speech-to-text", - "text-to-speech", + "speech to text", + "text to speech", + "transcription", "real-time speech recognition", - "without internet connect", + "without internet connection", "embedded systems", "open source", "zipformer", "asr", - "speech" + "tts", + "stt", + "c++", + "onnxruntime", + "onnx", + "ai", + "next-gen kaldi", + "offline", + "privacy", + "open source", + "streaming speech recognition", + "speech", + "recognition", + "WebAssembly", + "wasm" ], "author": "The next-gen Kaldi team", "license": "Apache-2.0", @@ -28,10 +43,5 @@ }, "homepage": 
"https://github.com/k2-fsa/sherpa-onnx#readme", "dependencies": { - "ffi-napi": "^4.0.3", - "npm": "^6.14.18", - "ref-array-napi": "^1.2.2", - "ref-napi": "^3.0.3", - "ref-struct-napi": "^1.1.1" } } diff --git a/scripts/nodejs/package.json.in b/scripts/nodejs/package.json.in deleted file mode 100644 index b097edc9..00000000 --- a/scripts/nodejs/package.json.in +++ /dev/null @@ -1,50 +0,0 @@ -{ - "name": "sherpa-onnx", - "version": "SHERPA_ONNX_VERSION", - "description": "Real-time speech recognition with Next-gen Kaldi", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "repository": { - "type": "git", - "url": "git+https://github.com/k2-fsa/sherpa-onnx.git" - }, - "keywords": [ - "speech to text", - "text to speech", - "transcription", - "real-time speech recognition", - "without internet connect", - "embedded systems", - "open source", - "zipformer", - "asr", - "tts", - "stt", - "c++", - "onnxruntime", - "onnx", - "ai", - "next-gen kaldi", - "offline", - "privacy", - "open source", - "streaming speech recognition", - "speech", - "recognition" - ], - "author": "The next-gen Kaldi team", - "license": "Apache-2.0", - "bugs": { - "url": "https://github.com/k2-fsa/sherpa-onnx/issues" - }, - "homepage": "https://github.com/k2-fsa/sherpa-onnx#readme", - "dependencies": { - "ffi-napi": "^4.0.3", - "npm": "^6.14.18", - "ref-array-napi": "^1.2.2", - "ref-napi": "^3.0.3", - "ref-struct-napi": "^1.1.1" - } -} diff --git a/scripts/nodejs/run.sh b/scripts/nodejs/run.sh deleted file mode 100755 index 5ce3034d..00000000 --- a/scripts/nodejs/run.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env bash -set -ex - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -SHERPA_ONNX_DIR=$(realpath $SCRIPT_DIR/../..) 
-echo "SCRIPT_DIR: $SCRIPT_DIR" -echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" - -SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" $SHERPA_ONNX_DIR/CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) - -echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" -sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g ./package.json.in - -cp package.json.in package.json -rm package.json.in -rm package.json.in.bak -rm .clang-format - -function windows_x64() { - echo "Process Windows (x64)" - mkdir -p lib/win-x64 - dst=$(realpath lib/win-x64) - mkdir t - cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl - unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl - - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll $dst - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib $dst - rm -fv $dst/sherpa-onnx-portaudio.dll - - cd .. - rm -rf t -} - -function windows_x86() { - echo "Process Windows (x86)" - mkdir -p lib/win-x86 - dst=$(realpath lib/win-x86) - mkdir t - cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win32.whl - unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win32.whl - - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll $dst - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib $dst - rm -fv $dst/sherpa-onnx-portaudio.dll - - cd .. 
- rm -rf t -} - -function linux_x64() { - echo "Process Linux (x64)" - mkdir -p lib/linux-x64 - dst=$(realpath lib/linux-x64) - mkdir t - cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_28_x86_64.whl - unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_28_x86_64.whl - - cp -v sherpa_onnx/lib/*.so* $dst - rm -v $dst/libcargs.so - rm -v $dst/libsherpa-onnx-portaudio.so - rm -v $dst/libsherpa-onnx-fst.so - rm -v $dst/libonnxruntime.so - - cd .. - rm -rf t -} - -function osx_x64() { - echo "Process osx-x64" - mkdir -p lib/osx-x64 - dst=$(realpath lib/osx-x64) - mkdir t - cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl - unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl - - cp -v sherpa_onnx/lib/*.dylib $dst/ - rm -v $dst/libonnxruntime.dylib - rm -v $dst/libcargs.dylib - rm -v $dst/libsherpa-onnx-fst.dylib - rm -v $dst/libsherpa-onnx-portaudio.dylib - - cd .. - rm -rf t -} - -function osx_arm64() { - echo "Process osx-arm64" - mkdir -p lib/osx-arm64 - dst=$(realpath lib/osx-arm64) - mkdir t - cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_arm64.whl - unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_arm64.whl - - cp -v sherpa_onnx/lib/*.dylib $dst/ - rm -v $dst/libonnxruntime.dylib - rm -v $dst/libcargs.dylib - rm -v $dst/libsherpa-onnx-fst.dylib - rm -v $dst/libsherpa-onnx-portaudio.dylib - - cd .. 
- rm -rf t -} - -windows_x64 -ls -lh lib/win-x64 - -windows_x86 -ls -lh lib/win-x86 - -linux_x64 -ls -lh lib/linux-x64 - -osx_x64 -ls -lh lib/osx-x64 - -osx_arm64 -ls -lh lib/osx-arm64 diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index b2fdac61..407b359a 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -94,6 +94,11 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str()); } + if (!recognizer_config.Validate()) { + SHERPA_ONNX_LOGE("Errors in config!"); + return nullptr; + } + SherpaOnnxOnlineRecognizer *recognizer = new SherpaOnnxOnlineRecognizer; recognizer->impl = @@ -324,6 +329,11 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( SHERPA_ONNX_LOGE("%s", recognizer_config.ToString().c_str()); } + if (!recognizer_config.Validate()) { + SHERPA_ONNX_LOGE("Errors in config"); + return nullptr; + } + SherpaOnnxOfflineRecognizer *recognizer = new SherpaOnnxOfflineRecognizer; recognizer->impl = @@ -480,6 +490,11 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( SHERPA_ONNX_LOGE("%s", vad_config.ToString().c_str()); } + if (!vad_config.Validate()) { + SHERPA_ONNX_LOGE("Errors in config"); + return nullptr; + } + SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector; p->impl = std::make_unique( vad_config, buffer_size_in_seconds); @@ -570,6 +585,11 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( SHERPA_ONNX_LOGE("%s\n", tts_config.ToString().c_str()); } + if (!tts_config.Validate()) { + SHERPA_ONNX_LOGE("Errors in config"); + return nullptr; + } + SherpaOnnxOfflineTts *tts = new SherpaOnnxOfflineTts; tts->impl = std::make_unique(tts_config); diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index dc077a23..c5d283f1 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -5,3 +5,7 @@ endif() if(SHERPA_ONNX_ENABLE_WASM_ASR) add_subdirectory(asr) endif() + +if(SHERPA_ONNX_ENABLE_WASM_NODEJS) + 
add_subdirectory(nodejs) +endif() diff --git a/wasm/asr/app-asr.js b/wasm/asr/app-asr.js index cb27db97..0f6ec257 100644 --- a/wasm/asr/app-asr.js +++ b/wasm/asr/app-asr.js @@ -45,7 +45,7 @@ Module.onRuntimeInitialized = function() { startBtn.disabled = false; - recognizer = createRecognizer(); + recognizer = createOnlineRecognizer(Module); console.log('recognizer is created!', recognizer); }; diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index d0821722..97b19783 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -1,161 +1,181 @@ -function freeConfig(config) { +function freeConfig(config, Module) { if ('buffer' in config) { - _free(config.buffer); + Module._free(config.buffer); } if ('config' in config) { - freeConfig(config.config) + freeConfig(config.config, Module) } if ('transducer' in config) { - freeConfig(config.transducer) + freeConfig(config.transducer, Module) } if ('paraformer' in config) { - freeConfig(config.paraformer) + freeConfig(config.paraformer, Module) } if ('ctc' in config) { - freeConfig(config.ctc) + freeConfig(config.ctc, Module) } if ('feat' in config) { - freeConfig(config.feat) + freeConfig(config.feat, Module) } if ('model' in config) { - freeConfig(config.model) + freeConfig(config.model, Module) } - _free(config.ptr); + if ('nemoCtc' in config) { + freeConfig(config.nemoCtc, Module) + } + + if ('whisper' in config) { + freeConfig(config.whisper, Module) + } + + if ('tdnn' in config) { + freeConfig(config.tdnn, Module) + } + + if ('lm' in config) { + freeConfig(config.lm, Module) + } + + Module._free(config.ptr); } // The user should free the returned pointers -function initSherpaOnnxOnlineTransducerModelConfig(config) { - let encoderLen = lengthBytesUTF8(config.encoder) + 1; - let decoderLen = lengthBytesUTF8(config.decoder) + 1; - let joinerLen = lengthBytesUTF8(config.joiner) + 1; +function initSherpaOnnxOnlineTransducerModelConfig(config, Module) { + const encoderLen = 
Module.lengthBytesUTF8(config.encoder) + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; + const joinerLen = Module.lengthBytesUTF8(config.joiner) + 1; - let n = encoderLen + decoderLen + joinerLen; + const n = encoderLen + decoderLen + joinerLen; - let buffer = _malloc(n); + const buffer = Module._malloc(n); - let len = 3 * 4; // 3 pointers - let ptr = _malloc(len); + const len = 3 * 4; // 3 pointers + const ptr = Module._malloc(len); let offset = 0; - stringToUTF8(config.encoder, buffer + offset, encoderLen); + Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); offset += encoderLen; - stringToUTF8(config.decoder, buffer + offset, decoderLen); + Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); offset += decoderLen; - stringToUTF8(config.joiner, buffer + offset, joinerLen); + Module.stringToUTF8(config.joiner, buffer + offset, joinerLen); offset = 0; - setValue(ptr, buffer + offset, 'i8*'); + Module.setValue(ptr, buffer + offset, 'i8*'); offset += encoderLen; - setValue(ptr + 4, buffer + offset, 'i8*'); + Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += decoderLen; - setValue(ptr + 8, buffer + offset, 'i8*'); + Module.setValue(ptr + 8, buffer + offset, 'i8*'); return { buffer: buffer, ptr: ptr, len: len, } } -function initSherpaOnnxOnlineParaformerModelConfig(config) { - let encoderLen = lengthBytesUTF8(config.encoder) + 1; - let decoderLen = lengthBytesUTF8(config.decoder) + 1; +function initSherpaOnnxOnlineParaformerModelConfig(config, Module) { + const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; - let n = encoderLen + decoderLen; - let buffer = _malloc(n); + const n = encoderLen + decoderLen; + const buffer = Module._malloc(n); - let len = 2 * 4; // 2 pointers - let ptr = _malloc(len); + const len = 2 * 4; // 2 pointers + const ptr = Module._malloc(len); let offset = 0; - stringToUTF8(config.encoder, buffer + offset, 
encoderLen); + Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); offset += encoderLen; - stringToUTF8(config.decoder, buffer + offset, decoderLen); + Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); offset = 0; - setValue(ptr, buffer + offset, 'i8*'); + Module.setValue(ptr, buffer + offset, 'i8*'); offset += encoderLen; - setValue(ptr + 4, buffer + offset, 'i8*'); + Module.setValue(ptr + 4, buffer + offset, 'i8*'); return { buffer: buffer, ptr: ptr, len: len, } } -function initSherpaOnnxOnlineZipformer2CtcModelConfig(config) { - let n = lengthBytesUTF8(config.model) + 1; - let buffer = _malloc(n); +function initSherpaOnnxOnlineZipformer2CtcModelConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model) + 1; + const buffer = Module._malloc(n); - let len = 1 * 4; // 1 pointer - let ptr = _malloc(len); + const len = 1 * 4; // 1 pointer + const ptr = Module._malloc(len); - stringToUTF8(config.model, buffer, n); + Module.stringToUTF8(config.model, buffer, n); - setValue(ptr, buffer, 'i8*'); + Module.setValue(ptr, buffer, 'i8*'); return { buffer: buffer, ptr: ptr, len: len, } } -function initSherpaOnnxOnlineModelConfig(config) { - let transducer = initSherpaOnnxOnlineTransducerModelConfig(config.transducer); - let paraformer = initSherpaOnnxOnlineParaformerModelConfig(config.paraformer); - let ctc = initSherpaOnnxOnlineZipformer2CtcModelConfig(config.zipformer2Ctc); +function initSherpaOnnxOnlineModelConfig(config, Module) { + const transducer = + initSherpaOnnxOnlineTransducerModelConfig(config.transducer, Module); + const paraformer = + initSherpaOnnxOnlineParaformerModelConfig(config.paraformer, Module); + const ctc = initSherpaOnnxOnlineZipformer2CtcModelConfig( + config.zipformer2Ctc, Module); - let len = transducer.len + paraformer.len + ctc.len + 5 * 4; - let ptr = _malloc(len); + const len = transducer.len + paraformer.len + ctc.len + 5 * 4; + const ptr = Module._malloc(len); let offset = 0; - 
_CopyHeap(transducer.ptr, transducer.len, ptr + offset); + Module._CopyHeap(transducer.ptr, transducer.len, ptr + offset); offset += transducer.len; - _CopyHeap(paraformer.ptr, paraformer.len, ptr + offset); + Module._CopyHeap(paraformer.ptr, paraformer.len, ptr + offset); offset += paraformer.len; - _CopyHeap(ctc.ptr, ctc.len, ptr + offset); + Module._CopyHeap(ctc.ptr, ctc.len, ptr + offset); offset += ctc.len; - let tokensLen = lengthBytesUTF8(config.tokens) + 1; - let providerLen = lengthBytesUTF8(config.provider) + 1; - let modelTypeLen = lengthBytesUTF8(config.modelType) + 1; - let bufferLen = tokensLen + providerLen + modelTypeLen; - let buffer = _malloc(bufferLen); + const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; + const providerLen = Module.lengthBytesUTF8(config.provider) + 1; + const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; + const bufferLen = tokensLen + providerLen + modelTypeLen; + const buffer = Module._malloc(bufferLen); offset = 0; - stringToUTF8(config.tokens, buffer, tokensLen); + Module.stringToUTF8(config.tokens, buffer, tokensLen); offset += tokensLen; - stringToUTF8(config.provider, buffer + offset, providerLen); + Module.stringToUTF8(config.provider, buffer + offset, providerLen); offset += providerLen; - stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); offset = transducer.len + paraformer.len + ctc.len; - setValue(ptr + offset, buffer, 'i8*'); // tokens + Module.setValue(ptr + offset, buffer, 'i8*'); // tokens offset += 4; - setValue(ptr + offset, config.numThreads, 'i32'); + Module.setValue(ptr + offset, config.numThreads, 'i32'); offset += 4; - setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider + Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider offset += 4; - setValue(ptr + offset, config.debug, 'i32'); + Module.setValue(ptr + offset, config.debug, 'i32'); offset += 4; - setValue(ptr + 
offset, buffer + tokensLen + providerLen, 'i8*'); // modelType + Module.setValue( + ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType offset += 4; return { @@ -164,63 +184,63 @@ function initSherpaOnnxOnlineModelConfig(config) { } } -function initSherpaOnnxFeatureConfig(config) { - let len = 2 * 4; // 2 pointers - let ptr = _malloc(len); +function initSherpaOnnxFeatureConfig(config, Module) { + const len = 2 * 4; // 2 pointers + const ptr = Module._malloc(len); - setValue(ptr, config.sampleRate, 'i32'); - setValue(ptr + 4, config.featureDim, 'i32'); + Module.setValue(ptr, config.sampleRate, 'i32'); + Module.setValue(ptr + 4, config.featureDim, 'i32'); return {ptr: ptr, len: len}; } -function initSherpaOnnxOnlineRecognizerConfig(config) { - let feat = initSherpaOnnxFeatureConfig(config.featConfig); - let model = initSherpaOnnxOnlineModelConfig(config.modelConfig); +function initSherpaOnnxOnlineRecognizerConfig(config, Module) { + const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module); + const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module); - let len = feat.len + model.len + 8 * 4; - let ptr = _malloc(len); + const len = feat.len + model.len + 8 * 4; + const ptr = Module._malloc(len); let offset = 0; - _CopyHeap(feat.ptr, feat.len, ptr + offset); + Module._CopyHeap(feat.ptr, feat.len, ptr + offset); offset += feat.len; - _CopyHeap(model.ptr, model.len, ptr + offset); + Module._CopyHeap(model.ptr, model.len, ptr + offset); offset += model.len; - let decodingMethodLen = lengthBytesUTF8(config.decodingMethod) + 1; - let hotwordsFileLen = lengthBytesUTF8(config.hotwordsFile) + 1; - let bufferLen = decodingMethodLen + hotwordsFileLen; - let buffer = _malloc(bufferLen); + const decodingMethodLen = Module.lengthBytesUTF8(config.decodingMethod) + 1; + const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile) + 1; + const bufferLen = decodingMethodLen + hotwordsFileLen; + const buffer = Module._malloc(bufferLen); 
offset = 0; - stringToUTF8(config.decodingMethod, buffer, decodingMethodLen); + Module.stringToUTF8(config.decodingMethod, buffer, decodingMethodLen); offset += decodingMethodLen; - stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + Module.stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); offset = feat.len + model.len; - setValue(ptr + offset, buffer, 'i8*'); // decoding method + Module.setValue(ptr + offset, buffer, 'i8*'); // decoding method offset += 4; - setValue(ptr + offset, config.maxActivePaths, 'i32'); + Module.setValue(ptr + offset, config.maxActivePaths, 'i32'); offset += 4; - setValue(ptr + offset, config.enableEndpoint, 'i32'); + Module.setValue(ptr + offset, config.enableEndpoint, 'i32'); offset += 4; - setValue(ptr + offset, config.rule1MinTrailingSilence, 'float'); + Module.setValue(ptr + offset, config.rule1MinTrailingSilence, 'float'); offset += 4; - setValue(ptr + offset, config.rule2MinTrailingSilence, 'float'); + Module.setValue(ptr + offset, config.rule2MinTrailingSilence, 'float'); offset += 4; - setValue(ptr + offset, config.rule3MinUtteranceLength, 'float'); + Module.setValue(ptr + offset, config.rule3MinUtteranceLength, 'float'); offset += 4; - setValue(ptr + offset, buffer + decodingMethodLen, 'i8*'); + Module.setValue(ptr + offset, buffer + decodingMethodLen, 'i8*'); offset += 4; - setValue(ptr + offset, config.hotwordsScore, 'float'); + Module.setValue(ptr + offset, config.hotwordsScore, 'float'); offset += 4; return { @@ -229,21 +249,21 @@ function initSherpaOnnxOnlineRecognizerConfig(config) { } -function createRecognizer() { - let onlineTransducerModelConfig = { +function createOnlineRecognizer(Module, myConfig) { + const onlineTransducerModelConfig = { encoder: '', decoder: '', joiner: '', - } + }; - let onlineParaformerModelConfig = { + const onlineParaformerModelConfig = { encoder: '', decoder: '', - } + }; - let onlineZipformer2CtcModelConfig = { + const onlineZipformer2CtcModelConfig = { 
model: '', - } + }; let type = 0; @@ -266,7 +286,7 @@ function createRecognizer() { } - let onlineModelConfig = { + const onlineModelConfig = { transducer: onlineTransducerModelConfig, paraformer: onlineParaformerModelConfig, zipformer2Ctc: onlineZipformer2CtcModelConfig, @@ -275,12 +295,12 @@ function createRecognizer() { provider: 'cpu', debug: 1, modelType: '', - } + }; - let featureConfig = { + const featureConfig = { sampleRate: 16000, featureDim: 80, - } + }; let recognizerConfig = { featConfig: featureConfig, @@ -293,23 +313,336 @@ function createRecognizer() { rule3MinUtteranceLength: 20, hotwordsFile: '', hotwordsScore: 1.5, + }; + if (myConfig) { + recognizerConfig = myConfig; } - return new OnlineRecognizer(recognizerConfig); + return new OnlineRecognizer(recognizerConfig, Module); } -class OnlineStream { - constructor(handle) { +function initSherpaOnnxOfflineTransducerModelConfig(config, Module) { + const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; + const joinerLen = Module.lengthBytesUTF8(config.joiner) + 1; + + const n = encoderLen + decoderLen + joinerLen; + + const buffer = Module._malloc(n); + + const len = 3 * 4; // 3 pointers + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); + offset += encoderLen; + + Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); + offset += decoderLen; + + Module.stringToUTF8(config.joiner, buffer + offset, joinerLen); + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += encoderLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + offset += decoderLen; + + Module.setValue(ptr + 8, buffer + offset, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineParaformerModelConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model) + 1; + + const buffer = Module._malloc(n); + + 
const len = 1 * 4; // 1 pointer + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model, buffer, n); + + Module.setValue(ptr, buffer, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model) + 1; + + const buffer = Module._malloc(n); + + const len = 1 * 4; // 1 pointer + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model, buffer, n); + + Module.setValue(ptr, buffer, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { + const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; + + const n = encoderLen + decoderLen; + const buffer = Module._malloc(n); + + const len = 2 * 4; // 2 pointers + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); + offset += encoderLen; + + Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += encoderLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineTdnnModelConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model) + 1; + const buffer = Module._malloc(n); + + const len = 1 * 4; // 1 pointer + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model, buffer, n); + + Module.setValue(ptr, buffer, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineLMConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model) + 1; + const buffer = Module._malloc(n); + + const len = 2 * 4; + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model, buffer, n); + Module.setValue(ptr, buffer, 
'i8*'); + Module.setValue(ptr + 4, config.scale, 'float'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineModelConfig(config, Module) { + const transducer = + initSherpaOnnxOfflineTransducerModelConfig(config.transducer, Module); + const paraformer = + initSherpaOnnxOfflineParaformerModelConfig(config.paraformer, Module); + const nemoCtc = + initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config.nemoCtc, Module); + const whisper = + initSherpaOnnxOfflineWhisperModelConfig(config.whisper, Module); + const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); + + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + + tdnn.len + 5 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module._CopyHeap(transducer.ptr, transducer.len, ptr + offset); + offset += transducer.len; + + Module._CopyHeap(paraformer.ptr, paraformer.len, ptr + offset); + offset += paraformer.len; + + Module._CopyHeap(nemoCtc.ptr, nemoCtc.len, ptr + offset); + offset += nemoCtc.len; + + Module._CopyHeap(whisper.ptr, whisper.len, ptr + offset); + offset += whisper.len; + + Module._CopyHeap(tdnn.ptr, tdnn.len, ptr + offset); + offset += tdnn.len; + + const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; + const providerLen = Module.lengthBytesUTF8(config.provider) + 1; + const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; + const bufferLen = tokensLen + providerLen + modelTypeLen; + const buffer = Module._malloc(bufferLen); + + offset = 0; + Module.stringToUTF8(config.tokens, buffer, tokensLen); + offset += tokensLen; + + Module.stringToUTF8(config.provider, buffer + offset, providerLen); + offset += providerLen; + + Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + + offset = + transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; + Module.setValue(ptr + offset, buffer, 'i8*'); // tokens + offset += 4; + + Module.setValue(ptr + offset, config.numThreads, 
'i32'); + offset += 4; + + Module.setValue(ptr + offset, config.debug, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider + offset += 4; + + Module.setValue( + ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType + offset += 4; + + return { + buffer: buffer, ptr: ptr, len: len, transducer: transducer, + paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn + } +} + +function initSherpaOnnxOfflineRecognizerConfig(config, Module) { + const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module); + const model = initSherpaOnnxOfflineModelConfig(config.modelConfig, Module); + const lm = initSherpaOnnxOfflineLMConfig(config.lmConfig, Module); + + const len = feat.len + model.len + lm.len + 4 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module._CopyHeap(feat.ptr, feat.len, ptr + offset); + offset += feat.len; + + Module._CopyHeap(model.ptr, model.len, ptr + offset); + offset += model.len; + + Module._CopyHeap(lm.ptr, lm.len, ptr + offset); + offset += lm.len; + + const decodingMethodLen = Module.lengthBytesUTF8(config.decodingMethod) + 1; + const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile) + 1; + const bufferLen = decodingMethodLen + hotwordsFileLen; + const buffer = Module._malloc(bufferLen); + + offset = 0; + Module.stringToUTF8(config.decodingMethod, buffer, decodingMethodLen); + offset += decodingMethodLen; + + Module.stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + + offset = feat.len + model.len + lm.len; + + Module.setValue(ptr + offset, buffer, 'i8*'); // decoding method + offset += 4; + + Module.setValue(ptr + offset, config.maxActivePaths, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, buffer + decodingMethodLen, 'i8*'); + offset += 4; + + Module.setValue(ptr + offset, config.hotwordsScore, 'float'); + offset += 4; + + return { + buffer: buffer, ptr: ptr, len: len, feat: feat, model: model, lm: lm + } +} + 
+class OfflineStream { + constructor(handle, Module) { this.handle = handle; - this.pointer = null; // buffer - this.n = 0; // buffer size + this.Module = Module; } free() { if (this.handle) { - _DestroyOnlineStream(this.handle); + this.Module._DestroyOfflineStream(this.handle); this.handle = null; - _free(this.pointer); + } + } + + /** + * @param sampleRate {Number} + * @param samples {Float32Array} Containing samples in the range [-1, 1] + */ + acceptWaveform(sampleRate, samples) { + const pointer = + this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT); + this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT); + this.Module._AcceptWaveformOffline( + this.handle, sampleRate, pointer, samples.length); + this.Module._free(pointer); + } +}; + +class OfflineRecognizer { + constructor(configObj, Module) { + this.config = configObj; + const config = initSherpaOnnxOfflineRecognizerConfig(configObj, Module); + const handle = Module._CreateOfflineRecognizer(config.ptr); + freeConfig(config, Module); + + this.handle = handle; + this.Module = Module; + } + + free() { + this.Module._DestroyOfflineRecognizer(this.handle); + this.handle = 0 + } + + createStream() { + const handle = this.Module._CreateOfflineStream(this.handle); + return new OfflineStream(handle, this.Module); + } + + decode(stream) { + this.Module._DecodeOfflineStream(this.handle, stream.handle); + } + + getResult(stream) { + const r = this.Module._GetOfflineStreamResult(stream.handle); + + const textPtr = this.Module.getValue(r, 'i8*'); + const text = this.Module.UTF8ToString(textPtr); + + this.Module._DestroyOfflineRecognizerResult(r); + return text; + } +}; + +class OnlineStream { + constructor(handle, Module) { + this.handle = handle; + this.pointer = null; // buffer + this.n = 0; // buffer size + this.Module = Module; + } + + free() { + if (this.handle) { + this.Module._DestroyOnlineStream(this.handle); + this.handle = null; + this.Module._free(this.pointer); this.pointer = null; 
this.n = 0; } @@ -321,61 +654,73 @@ class OnlineStream { */ acceptWaveform(sampleRate, samples) { if (this.n < samples.length) { - _free(this.pointer); - this.pointer = _malloc(samples.length * samples.BYTES_PER_ELEMENT); + this.Module._free(this.pointer); + this.pointer = + this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT); this.n = samples.length } - Module.HEAPF32.set(samples, this.pointer / samples.BYTES_PER_ELEMENT); - _AcceptWaveform(this.handle, sampleRate, this.pointer, samples.length); + this.Module.HEAPF32.set(samples, this.pointer / samples.BYTES_PER_ELEMENT); + this.Module._AcceptWaveform( + this.handle, sampleRate, this.pointer, samples.length); } inputFinished() { - _InputFinished(this.handle); + this.Module._InputFinished(this.handle); } }; class OnlineRecognizer { - constructor(configObj) { - let config = initSherpaOnnxOnlineRecognizerConfig(configObj) - let handle = _CreateOnlineRecognizer(config.ptr); + constructor(configObj, Module) { + this.config = configObj; + const config = initSherpaOnnxOnlineRecognizerConfig(configObj, Module) + const handle = Module._CreateOnlineRecognizer(config.ptr); - freeConfig(config); + freeConfig(config, Module); this.handle = handle; + this.Module = Module; } free() { - _DestroyOnlineRecognizer(this.handle); + this.Module._DestroyOnlineRecognizer(this.handle); this.handle = 0 } createStream() { - let handle = _CreateOnlineStream(this.handle); - return new OnlineStream(handle); + const handle = this.Module._CreateOnlineStream(this.handle); + return new OnlineStream(handle, this.Module); } isReady(stream) { - return _IsOnlineStreamReady(this.handle, stream.handle) == 1; + return this.Module._IsOnlineStreamReady(this.handle, stream.handle) == 1; } decode(stream) { - return _DecodeOnlineStream(this.handle, stream.handle); + this.Module._DecodeOnlineStream(this.handle, stream.handle); } isEndpoint(stream) { - return _IsEndpoint(this.handle, stream.handle) == 1; + return this.Module._IsEndpoint(this.handle, 
stream.handle) == 1; } reset(stream) { - _Reset(this.handle, stream.handle); + this.Module._Reset(this.handle, stream.handle); } getResult(stream) { - let r = _GetOnlineStreamResult(this.handle, stream.handle); - let textPtr = getValue(r, 'i8*'); - let text = UTF8ToString(textPtr); - _DestroyOnlineRecognizerResult(r); + const r = this.Module._GetOnlineStreamResult(this.handle, stream.handle); + const textPtr = this.Module.getValue(r, 'i8*'); + const text = this.Module.UTF8ToString(textPtr); + this.Module._DestroyOnlineRecognizerResult(r); return text; } } + +if (typeof process == 'object' && typeof process.versions == 'object' && + typeof process.versions.node == 'string') { + module.exports = { + createOnlineRecognizer, + OfflineRecognizer, + }; +} diff --git a/wasm/asr/sherpa-onnx-wasm-main-asr.cc b/wasm/asr/sherpa-onnx-wasm-main-asr.cc index 23676631..951391e1 100644 --- a/wasm/asr/sherpa-onnx-wasm-main-asr.cc +++ b/wasm/asr/sherpa-onnx-wasm-main-asr.cc @@ -1,4 +1,4 @@ -// wasm/sherpa-onnx-wasm-asr-main.cc +// wasm/sherpa-onnx-wasm-main-asr.cc // // Copyright (c) 2024 Xiaomi Corporation #include diff --git a/wasm/nodejs/CMakeLists.txt b/wasm/nodejs/CMakeLists.txt new file mode 100644 index 00000000..faff50ea --- /dev/null +++ b/wasm/nodejs/CMakeLists.txt @@ -0,0 +1,76 @@ +if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH}) + message(FATAL_ERROR "Please use ./build-wasm-simd-nodejs.sh to build for wasm NodeJS") +endif() + +set(exported_functions + #tts + PrintOfflineTtsConfig + SherpaOnnxCreateOfflineTts + SherpaOnnxDestroyOfflineTts + SherpaOnnxDestroyOfflineTtsGeneratedAudio + SherpaOnnxOfflineTtsGenerate + SherpaOnnxOfflineTtsGenerateWithCallback + SherpaOnnxOfflineTtsNumSpeakers + SherpaOnnxOfflineTtsSampleRate + SherpaOnnxWriteWave + # streaming asr + AcceptWaveform + CreateOnlineRecognizer + CreateOnlineStream + DecodeOnlineStream + DestroyOnlineRecognizer + DestroyOnlineRecognizerResult + DestroyOnlineStream + GetOnlineStreamResult + InputFinished + 
IsEndpoint + IsOnlineStreamReady + Reset + # non-streaming ASR + PrintOfflineRecognizerConfig + CreateOfflineRecognizer + DestroyOfflineRecognizer + CreateOfflineStream + DestroyOfflineStream + AcceptWaveformOffline + DecodeOfflineStream + DecodeMultipleOfflineStreams + GetOfflineStreamResult + DestroyOfflineRecognizerResult +) + + +set(mangled_exported_functions) +foreach(x IN LISTS exported_functions) + list(APPEND mangled_exported_functions "_${x}") +endforeach() +list(JOIN mangled_exported_functions "," all_exported_functions) + +include_directories(${CMAKE_SOURCE_DIR}) +set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1") +string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB +string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ") +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue'] ") +string(APPEND MY_FLAGS " -sNODERAWFS=1 ") +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ") +string(APPEND MY_FLAGS " -sMODULARIZE=1 -sWASM_ASYNC_COMPILATION=0 ") + +message(STATUS "MY_FLAGS: ${MY_FLAGS}") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MY_FLAGS}") # CMake's variable is CMAKE_EXE_LINKER_FLAGS; the misspelled name was never read + +add_executable(sherpa-onnx-wasm-nodejs sherpa-onnx-wasm-nodejs.cc) +target_link_libraries(sherpa-onnx-wasm-nodejs sherpa-onnx-core sherpa-onnx-c-api) +install(TARGETS sherpa-onnx-wasm-nodejs DESTINATION bin/wasm/nodejs) + +install( + FILES + ${CMAKE_SOURCE_DIR}/wasm/asr/sherpa-onnx-asr.js + ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js" + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm" + DESTINATION + bin/wasm/nodejs +) diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc new file mode 100644 index 00000000..edbf250e 
--- /dev/null +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -0,0 +1,104 @@ +// wasm/sherpa-onnx-wasm-nodejs.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include <stdio.h> + +#include <algorithm> +#include <memory> + +#include "sherpa-onnx/c-api/c-api.h" + +extern "C" { + +static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); + +static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); +static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 2 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, ""); +static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, ""); + +static_assert(sizeof(SherpaOnnxOfflineModelConfig) == + sizeof(SherpaOnnxOfflineTransducerModelConfig) + + sizeof(SherpaOnnxOfflineParaformerModelConfig) + + sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + + sizeof(SherpaOnnxOfflineWhisperModelConfig) + + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 5 * 4, + ""); +static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == + sizeof(SherpaOnnxFeatureConfig) + + sizeof(SherpaOnnxOfflineLMConfig) + + sizeof(SherpaOnnxOfflineModelConfig) + 4 * 4, + ""); + +void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) { + auto tts_model_config = &tts_config->model; + auto vits_model_config = &tts_model_config->vits; + fprintf(stdout, "----------vits model config----------\n"); + fprintf(stdout, "model: %s\n", vits_model_config->model); + fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); + fprintf(stdout, "tokens: %s\n", vits_model_config->tokens); + fprintf(stdout, "data_dir: %s\n", vits_model_config->data_dir); + fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); + fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); + fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); + + fprintf(stdout, 
"----------tts model config----------\n"); + fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); + fprintf(stdout, "debug: %d\n", tts_model_config->debug); + fprintf(stdout, "provider: %s\n", tts_model_config->provider); + + fprintf(stdout, "----------tts config----------\n"); + fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); + fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); +} + +void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { + auto model_config = &config->model_config; + auto feat = &config->feat_config; + auto transducer = &model_config->transducer; + auto paraformer = &model_config->paraformer; + auto nemo_ctc = &model_config->nemo_ctc; + auto whisper = &model_config->whisper; + auto tdnn = &model_config->tdnn; + + fprintf(stdout, "----------offline transducer model config----------\n"); + fprintf(stdout, "encoder: %s\n", transducer->encoder); + fprintf(stdout, "decoder: %s\n", transducer->decoder); + fprintf(stdout, "joiner: %s\n", transducer->joiner); + + fprintf(stdout, "----------offline paraformer model config----------\n"); + fprintf(stdout, "model: %s\n", paraformer->model); + + fprintf(stdout, "----------offline nemo_ctc model config----------\n"); + fprintf(stdout, "model: %s\n", nemo_ctc->model); + + fprintf(stdout, "----------offline whisper model config----------\n"); + fprintf(stdout, "encoder: %s\n", whisper->encoder); + fprintf(stdout, "decoder: %s\n", whisper->decoder); + + fprintf(stdout, "----------offline tdnn model config----------\n"); + fprintf(stdout, "model: %s\n", tdnn->model); + + fprintf(stdout, "tokens: %s\n", model_config->tokens); + fprintf(stdout, "num_threads: %d\n", model_config->num_threads); + fprintf(stdout, "provider: %s\n", model_config->provider); + fprintf(stdout, "debug: %d\n", model_config->debug); + fprintf(stdout, "model type: %s\n", model_config->model_type); + + fprintf(stdout, "----------feat config----------\n"); + 
fprintf(stdout, "sample rate: %d\n", feat->sample_rate); + fprintf(stdout, "feat dim: %d\n", feat->feature_dim); + + fprintf(stdout, "----------recognizer config----------\n"); + fprintf(stdout, "decoding method: %s\n", config->decoding_method); + fprintf(stdout, "max active paths: %d\n", config->max_active_paths); + fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file); + fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score); +} + +void CopyHeap(const char *src, int32_t num_bytes, char *dst) { + std::copy(src, src + num_bytes, dst); +} +} diff --git a/wasm/tts/app-tts.js b/wasm/tts/app-tts.js index 8359080c..d883e511 100644 --- a/wasm/tts/app-tts.js +++ b/wasm/tts/app-tts.js @@ -22,7 +22,7 @@ Module.onRuntimeInitialized = function() { console.log('Model files downloaded!'); console.log('Initializing tts ......'); - tts = initSherpaOnnxOfflineTts() + tts = createOfflineTts(Module) if (tts.numSpeakers > 1) { speakerIdLabel.innerHTML = `Speaker ID (0 - ${tts.numSpeakers - 1}):`; } diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index f32b09a1..c291d8a4 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -1,109 +1,109 @@ -function freeConfig(config) { +function freeConfig(config, Module) { if ('buffer' in config) { - _free(config.buffer); + Module._free(config.buffer); } if ('config' in config) { - freeConfig(config.config) + freeConfig(config.config, Module) } - _free(config.ptr); + Module._free(config.ptr); } // The user should free the returned pointers -function initSherpaOnnxOfflineTtsVitsModelConfig(config) { - let modelLen = lengthBytesUTF8(config.model) + 1; - let lexiconLen = lengthBytesUTF8(config.lexicon) + 1; - let tokensLen = lengthBytesUTF8(config.tokens) + 1; - let dataDirLen = lengthBytesUTF8(config.dataDir) + 1; +function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { + const modelLen = Module.lengthBytesUTF8(config.model) + 1; + const lexiconLen = 
Module.lengthBytesUTF8(config.lexicon) + 1; + const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; + const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1; - let n = modelLen + lexiconLen + tokensLen + dataDirLen; + const n = modelLen + lexiconLen + tokensLen + dataDirLen; - let buffer = _malloc(n); + const buffer = Module._malloc(n); - let len = 7 * 4; - let ptr = _malloc(len); + const len = 7 * 4; + const ptr = Module._malloc(len); let offset = 0; - stringToUTF8(config.model, buffer + offset, modelLen); + Module.stringToUTF8(config.model, buffer + offset, modelLen); offset += modelLen; - stringToUTF8(config.lexicon, buffer + offset, lexiconLen); + Module.stringToUTF8(config.lexicon, buffer + offset, lexiconLen); offset += lexiconLen; - stringToUTF8(config.tokens, buffer + offset, tokensLen); + Module.stringToUTF8(config.tokens, buffer + offset, tokensLen); offset += tokensLen; - stringToUTF8(config.dataDir, buffer + offset, dataDirLen); + Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen); offset += dataDirLen; offset = 0; - setValue(ptr, buffer + offset, 'i8*'); + Module.setValue(ptr, buffer + offset, 'i8*'); offset += modelLen; - setValue(ptr + 4, buffer + offset, 'i8*'); + Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += lexiconLen; - setValue(ptr + 8, buffer + offset, 'i8*'); + Module.setValue(ptr + 8, buffer + offset, 'i8*'); offset += tokensLen; - setValue(ptr + 12, buffer + offset, 'i8*'); + Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += dataDirLen; - setValue(ptr + 16, config.noiseScale, 'float'); - setValue(ptr + 20, config.noiseScaleW, 'float'); - setValue(ptr + 24, config.lengthScale, 'float'); + Module.setValue(ptr + 16, config.noiseScale, 'float'); + Module.setValue(ptr + 20, config.noiseScaleW, 'float'); + Module.setValue(ptr + 24, config.lengthScale, 'float'); return { buffer: buffer, ptr: ptr, len: len, } } -function initSherpaOnnxOfflineTtsModelConfig(config) { - let vitsModelConfig = - 
initSherpaOnnxOfflineTtsVitsModelConfig(config.offlineTtsVitsModelConfig); +function initSherpaOnnxOfflineTtsModelConfig(config, Module) { + const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( + config.offlineTtsVitsModelConfig, Module); - let len = vitsModelConfig.len + 3 * 4; - let ptr = _malloc(len); + const len = vitsModelConfig.len + 3 * 4; + const ptr = Module._malloc(len); let offset = 0; - _CopyHeap(vitsModelConfig.ptr, vitsModelConfig.len, ptr + offset); + Module._CopyHeap(vitsModelConfig.ptr, vitsModelConfig.len, ptr + offset); offset += vitsModelConfig.len; - setValue(ptr + offset, config.numThreads, 'i32'); + Module.setValue(ptr + offset, config.numThreads, 'i32'); offset += 4; - setValue(ptr + offset, config.debug, 'i32'); + Module.setValue(ptr + offset, config.debug, 'i32'); offset += 4; - let providerLen = lengthBytesUTF8(config.provider) + 1; - let buffer = _malloc(providerLen); - stringToUTF8(config.provider, buffer, providerLen); - setValue(ptr + offset, buffer, 'i8*'); + const providerLen = Module.lengthBytesUTF8(config.provider) + 1; + const buffer = Module._malloc(providerLen); + Module.stringToUTF8(config.provider, buffer, providerLen); + Module.setValue(ptr + offset, buffer, 'i8*'); return { buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, } } -function initSherpaOnnxOfflineTtsConfig(config) { - let modelConfig = - initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig); - let len = modelConfig.len + 2 * 4; - let ptr = _malloc(len); +function initSherpaOnnxOfflineTtsConfig(config, Module) { + const modelConfig = + initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); + const len = modelConfig.len + 2 * 4; + const ptr = Module._malloc(len); let offset = 0; - _CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset); + Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset); offset += modelConfig.len; - let ruleFstsLen = lengthBytesUTF8(config.ruleFsts) + 1; - let buffer = 
_malloc(ruleFstsLen); - stringToUTF8(config.ruleFsts, buffer, ruleFstsLen); - setValue(ptr + offset, buffer, 'i8*'); + const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts) + 1; + const buffer = Module._malloc(ruleFstsLen); + Module.stringToUTF8(config.ruleFsts, buffer, ruleFstsLen); + Module.setValue(ptr + offset, buffer, 'i8*'); offset += 4; - setValue(ptr + offset, config.maxNumSentences, 'i32'); + Module.setValue(ptr + offset, config.maxNumSentences, 'i32'); return { buffer: buffer, ptr: ptr, len: len, config: modelConfig, @@ -111,19 +111,21 @@ function initSherpaOnnxOfflineTtsConfig(config) { } class OfflineTts { - constructor(configObj) { - let config = initSherpaOnnxOfflineTtsConfig(configObj) - let handle = _SherpaOnnxCreateOfflineTts(config.ptr); + constructor(configObj, Module) { + console.log(configObj) + const config = initSherpaOnnxOfflineTtsConfig(configObj, Module) + const handle = Module._SherpaOnnxCreateOfflineTts(config.ptr); - freeConfig(config); + freeConfig(config, Module); this.handle = handle; - this.sampleRate = _SherpaOnnxOfflineTtsSampleRate(this.handle); - this.numSpeakers = _SherpaOnnxOfflineTtsNumSpeakers(this.handle); + this.sampleRate = Module._SherpaOnnxOfflineTtsSampleRate(this.handle); + this.numSpeakers = Module._SherpaOnnxOfflineTtsNumSpeakers(this.handle); + this.Module = Module } free() { - _SherpaOnnxDestroyOfflineTts(this.handle); + this.Module._SherpaOnnxDestroyOfflineTts(this.handle); this.handle = 0 } @@ -133,29 +135,45 @@ class OfflineTts { // speed: 1.0 // } generate(config) { - let textLen = lengthBytesUTF8(config.text) + 1; - let textPtr = _malloc(textLen); - stringToUTF8(config.text, textPtr, textLen); + const textLen = this.Module.lengthBytesUTF8(config.text) + 1; + const textPtr = this.Module._malloc(textLen); + this.Module.stringToUTF8(config.text, textPtr, textLen); - let h = _SherpaOnnxOfflineTtsGenerate( + const h = this.Module._SherpaOnnxOfflineTtsGenerate( this.handle, textPtr, config.sid, config.speed); + this.Module._free(textPtr); // release the UTF-8 text buffer allocated above (previously leaked on every call) 
- let numSamples = HEAP32[h / 4 + 1]; - let sampleRate = HEAP32[h / 4 + 2]; + const numSamples = this.Module.HEAP32[h / 4 + 1]; + const sampleRate = this.Module.HEAP32[h / 4 + 2]; - let samplesPtr = HEAP32[h / 4] / 4; - let samples = new Float32Array(numSamples); + const samplesPtr = this.Module.HEAP32[h / 4] / 4; + const samples = new Float32Array(numSamples); for (let i = 0; i < numSamples; i++) { - samples[i] = HEAPF32[samplesPtr + i]; + samples[i] = this.Module.HEAPF32[samplesPtr + i]; } - _SherpaOnnxDestroyOfflineTtsGeneratedAudio(h); + this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h); return {samples: samples, sampleRate: sampleRate}; } + save(filename, audio) { + const samples = audio.samples; + const sampleRate = audio.sampleRate; + const ptr = this.Module._malloc(samples.length * 4); + for (let i = 0; i < samples.length; i++) { + this.Module.HEAPF32[ptr / 4 + i] = samples[i]; + } + + const filenameLen = this.Module.lengthBytesUTF8(filename) + 1; + const buffer = this.Module._malloc(filenameLen); + this.Module.stringToUTF8(filename, buffer, filenameLen); + this.Module._SherpaOnnxWriteWave(ptr, samples.length, sampleRate, buffer); + this.Module._free(buffer); + this.Module._free(ptr); + } } -function initSherpaOnnxOfflineTts() { - let offlineTtsVitsModelConfig = { +function createOfflineTts(Module, myConfig) { + const offlineTtsVitsModelConfig = { model: './model.onnx', lexicon: '', tokens: './tokens.txt', @@ -164,7 +182,7 @@ function initSherpaOnnxOfflineTts() { noiseScaleW: 0.8, lengthScale: 1.0, }; - let offlineTtsModelConfig = { + const offlineTtsModelConfig = { offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, numThreads: 1, debug: 1, @@ -176,5 +194,16 @@ function initSherpaOnnxOfflineTts() { maxNumSentences: 1, } - return new OfflineTts(offlineTtsConfig); + if (myConfig) { + offlineTtsConfig = myConfig; + } + + return new OfflineTts(offlineTtsConfig, Module); +} + +if (typeof process == 'object' && typeof process.versions == 'object' && + 
typeof process.versions.node == 'string') { + module.exports = { + createOfflineTts, + }; } diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc index 2441ae98..71701419 100644 --- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc +++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc @@ -1,4 +1,4 @@ -// wasm/sherpa-onnx-wasm-main.cc +// wasm/sherpa-onnx-wasm-main-tts.cc // // Copyright (c) 2024 Xiaomi Corporation #include