Add WebAssembly for SenseVoice (#1158)

This commit is contained in:
Fangjun Kuang
2024-07-21 15:39:55 +08:00
committed by GitHub
parent c3260ef842
commit 70d14353bb
21 changed files with 383 additions and 351 deletions

View File

@@ -88,6 +88,21 @@ tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
node ./test-offline-paraformer.js
```
## ./test-offline-sense-voice.js
[./test-offline-sense-voice.js](./test-offline-sense-voice.js) demonstrates
how to decode a file with a non-streaming SenseVoice model.
You can use the following command to run it:
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
node ./test-offline-sense-voice.js
```
## ./test-offline-transducer.js
[./test-offline-transducer.js](./test-offline-transducer.js) demonstrates

View File

@@ -13,27 +13,9 @@ function createOfflineRecognizer() {
};
let modelConfig = {
transducer: {
encoder: '',
decoder: '',
joiner: '',
},
paraformer: {
model: '',
},
nemoCtc: {
model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
},
whisper: {
encoder: '',
decoder: '',
language: '',
task: '',
tailPaddings: -1,
},
tdnn: {
model: '',
},
tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
numThreads: 1,
debug: 0,
@@ -41,19 +23,11 @@ function createOfflineRecognizer() {
modelType: 'nemo_ctc',
};
let lmConfig = {
model: '',
scale: 1.0,
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
lmConfig: lmConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',
hotwordsScore: 1.5,
};
return sherpa_onnx.createOfflineRecognizer(config);

View File

@@ -13,27 +13,9 @@ function createOfflineRecognizer() {
};
let modelConfig = {
transducer: {
encoder: '',
decoder: '',
joiner: '',
},
paraformer: {
model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx',
},
nemoCtc: {
model: '',
},
whisper: {
encoder: '',
decoder: '',
language: '',
task: '',
tailPaddings: -1,
},
tdnn: {
model: '',
},
tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt',
numThreads: 1,
debug: 0,
@@ -41,19 +23,11 @@ function createOfflineRecognizer() {
modelType: 'paraformer',
};
let lmConfig = {
model: '',
scale: 1.0,
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
lmConfig: lmConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',
hotwordsScore: 1.5,
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
ruleFsts: './itn_zh_number.fst',
};

View File

@@ -13,27 +13,9 @@ function createOfflineRecognizer() {
};
let modelConfig = {
transducer: {
encoder: '',
decoder: '',
joiner: '',
},
paraformer: {
model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx',
},
nemoCtc: {
model: '',
},
whisper: {
encoder: '',
decoder: '',
language: '',
task: '',
tailPaddings: -1,
},
tdnn: {
model: '',
},
tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt',
numThreads: 1,
debug: 0,
@@ -41,19 +23,10 @@ function createOfflineRecognizer() {
modelType: 'paraformer',
};
let lmConfig = {
model: '',
scale: 1.0,
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
lmConfig: lmConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',
hotwordsScore: 1.5,
};
return sherpa_onnx.createOfflineRecognizer(config);

View File

@@ -0,0 +1,101 @@
// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang)
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
// Construct a non-streaming recognizer backed by the SenseVoice int8 model.
// All paths are relative to the working directory; the model archive must be
// downloaded and extracted beforehand (see the README for the wget commands).
function createOfflineRecognizer() {
  const config = {
    featConfig: {
      sampleRate: 16000,
      featureDim: 80,
    },
    modelConfig: {
      senseVoice: {
        model:
            './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
        language: '',  // empty: let the model detect the language
        useInverseTextNormalization: 1,
      },
      tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
      numThreads: 1,
      debug: 0,
      provider: 'cpu',
    },
    decodingMethod: 'greedy_search',
  };
  return sherpa_onnx.createOfflineRecognizer(config);
}
// Decode a single wave file with the non-streaming SenseVoice recognizer.
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();

const waveFilename =
    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';

const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];  // Float32Array chunks collected from the wave stream

// Validate the wave header before any samples are accepted.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  if (sampleRate != recognizer.config.featConfig.sampleRate) {
    throw new Error(`Only support sampleRate ${
        recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
  }
  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }
  if (channels != 1) {
    // Fixed: previously interpolated the undefined name `channel`, which
    // raised a ReferenceError instead of this message.
    throw new Error(`Only a single channel. Given ${channels}`);
  }
  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});

fs.createReadStream(waveFilename, {'highWaterMark': 4096})
    .pipe(reader)
    .on('finish', function(err) {
      // tail padding: append 0.5 s of silence so the model sees the
      // utterance end before decoding.
      buf.push(
          new Float32Array(recognizer.config.featConfig.sampleRate * 0.5));

      // Concatenate all chunks in O(n). The previous
      // `buf.reduce((a, b) => [...a, ...b], [])` was quadratic in the total
      // sample count and boxed every float through a plain array.
      const totalLength = buf.reduce((sum, chunk) => sum + chunk.length, 0);
      const flattened = new Float32Array(totalLength);
      let offset = 0;
      for (const chunk of buf) {
        flattened.set(chunk, offset);
        offset += chunk.length;
      }

      stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
      recognizer.decode(stream);
      const text = recognizer.getResult(stream).text;
      console.log(text);
      stream.free();
      recognizer.free();
    });

// Convert each incoming chunk of 16-bit PCM into floats in [-1, 1).
readable.on('readable', function() {
  let chunk;
  while ((chunk = readable.read()) != null) {
    const int16Samples = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);
    const floatSamples = new Float32Array(int16Samples.length);
    for (let i = 0; i < floatSamples.length; i++) {
      floatSamples[i] = int16Samples[i] / 32768.0;
    }
    buf.push(floatSamples);
  }
});

View File

@@ -21,22 +21,6 @@ function createOfflineRecognizer() {
joiner:
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx',
},
paraformer: {
model: '',
},
nemoCtc: {
model: '',
},
whisper: {
encoder: '',
decoder: '',
language: '',
task: '',
tailPaddings: -1,
},
tdnn: {
model: '',
},
tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt',
numThreads: 1,
debug: 0,
@@ -44,15 +28,9 @@ function createOfflineRecognizer() {
modelType: 'transducer',
};
let lmConfig = {
model: '',
scale: 1.0,
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
lmConfig: lmConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',

View File

@@ -5,10 +5,8 @@ const sherpa_onnx = require('sherpa-onnx');
function createOfflineTts() {
let offlineTtsVitsModelConfig = {
model: './vits-piper-en_US-amy-low/en_US-amy-low.onnx',
lexicon: '',
tokens: './vits-piper-en_US-amy-low/tokens.txt',
dataDir: './vits-piper-en_US-amy-low/espeak-ng-data',
dictDir: '',
noiseScale: 0.667,
noiseScaleW: 0.8,
lengthScale: 1.0,
@@ -22,8 +20,6 @@ function createOfflineTts() {
let offlineTtsConfig = {
offlineTtsModelConfig: offlineTtsModelConfig,
ruleFsts: '',
ruleFars: '',
maxNumSentences: 1,
};

View File

@@ -7,8 +7,6 @@ function createOfflineTts() {
model: './vits-icefall-zh-aishell3/model.onnx',
lexicon: './vits-icefall-zh-aishell3/lexicon.txt',
tokens: './vits-icefall-zh-aishell3/tokens.txt',
dataDir: '',
dictDir: '',
noiseScale: 0.667,
noiseScaleW: 0.8,
lengthScale: 1.0,
@@ -31,7 +29,6 @@ function createOfflineTts() {
return sherpa_onnx.createOfflineTts(offlineTtsConfig);
}
const tts = createOfflineTts();
const speakerId = 66;
const speed = 1.0;

View File

@@ -13,17 +13,6 @@ function createOfflineRecognizer() {
};
let modelConfig = {
transducer: {
encoder: '',
decoder: '',
joiner: '',
},
paraformer: {
model: '',
},
nemoCtc: {
model: '',
},
whisper: {
encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
@@ -31,9 +20,6 @@ function createOfflineRecognizer() {
task: 'transcribe',
tailPaddings: -1,
},
tdnn: {
model: '',
},
tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
numThreads: 1,
debug: 0,
@@ -41,19 +27,10 @@ function createOfflineRecognizer() {
modelType: 'whisper',
};
let lmConfig = {
model: '',
scale: 1.0,
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
lmConfig: lmConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',
hotwordsScore: 1.5,
};
return sherpa_onnx.createOfflineRecognizer(config);

View File

@@ -6,12 +6,6 @@ console.log(portAudio.getDevices());
const sherpa_onnx = require('sherpa-onnx');
function createOnlineRecognizer() {
let onlineTransducerModelConfig = {
encoder: '',
decoder: '',
joiner: '',
};
let onlineParaformerModelConfig = {
encoder:
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
@@ -19,14 +13,8 @@ function createOnlineRecognizer() {
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
};
let onlineZipformer2CtcModelConfig = {
model: '',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
numThreads: 1,
provider: 'cpu',
@@ -48,12 +36,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);

View File

@@ -7,12 +7,6 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOnlineRecognizer() {
let onlineTransducerModelConfig = {
encoder: '',
decoder: '',
joiner: '',
};
let onlineParaformerModelConfig = {
encoder:
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
@@ -20,14 +14,8 @@ function createOnlineRecognizer() {
'./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
};
let onlineZipformer2CtcModelConfig = {
model: '',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
numThreads: 1,
provider: 'cpu',
@@ -49,12 +37,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);

View File

@@ -16,19 +16,8 @@ function createOnlineRecognizer() {
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx',
};
let onlineParaformerModelConfig = {
encoder: '',
decoder: '',
};
let onlineZipformer2CtcModelConfig = {
model: '',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens:
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
numThreads: 1,
@@ -51,12 +40,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
},
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
ruleFsts: './itn_zh_number.fst',
};

View File

@@ -15,19 +15,8 @@ function createOnlineRecognizer() {
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx',
};
let onlineParaformerModelConfig = {
encoder: '',
decoder: '',
};
let onlineZipformer2CtcModelConfig = {
model: '',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens:
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
numThreads: 1,
@@ -50,12 +39,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);

View File

@@ -16,19 +16,8 @@ function createOnlineRecognizer() {
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx',
};
let onlineParaformerModelConfig = {
encoder: '',
decoder: '',
};
let onlineZipformer2CtcModelConfig = {
model: '',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens:
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
numThreads: 1,
@@ -51,12 +40,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);

View File

@@ -7,25 +7,12 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOnlineRecognizer() {
let onlineTransducerModelConfig = {
encoder: '',
decoder: '',
joiner: '',
};
let onlineParaformerModelConfig = {
encoder: '',
decoder: '',
};
let onlineZipformer2CtcModelConfig = {
model:
'./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
numThreads: 1,
@@ -48,8 +35,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
maxActive: 3000,

View File

@@ -7,32 +7,18 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOnlineRecognizer() {
let onlineTransducerModelConfig = {
encoder: '',
decoder: '',
joiner: '',
};
let onlineParaformerModelConfig = {
encoder: '',
decoder: '',
};
let onlineZipformer2CtcModelConfig = {
model:
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx',
};
let onlineModelConfig = {
transducer: onlineTransducerModelConfig,
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
tokens:
'./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: '',
};
let featureConfig = {
@@ -49,12 +35,6 @@ function createOnlineRecognizer() {
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);